diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23282 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 33200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00030120481927710846, + "grad_norm": 34.184687640160305, + "learning_rate": 6.02409638554217e-08, + "loss": 1.9054, + "step": 10 + }, + { + "epoch": 0.0006024096385542169, + "grad_norm": 17.14476558690921, + "learning_rate": 1.204819277108434e-07, + "loss": 2.1905, + "step": 20 + }, + { + "epoch": 0.0009036144578313253, + "grad_norm": 33.892200398551054, + "learning_rate": 1.8072289156626505e-07, + "loss": 1.7507, + "step": 30 + }, + { + "epoch": 0.0012048192771084338, + "grad_norm": 22.97431512371491, + "learning_rate": 2.409638554216868e-07, + "loss": 1.8622, + "step": 40 + }, + { + "epoch": 0.0015060240963855422, + "grad_norm": 20.59352371925957, + "learning_rate": 3.0120481927710845e-07, + "loss": 1.9969, + "step": 50 + }, + { + "epoch": 0.0018072289156626507, + "grad_norm": 11.716491338439093, + "learning_rate": 3.614457831325301e-07, + "loss": 1.5547, + "step": 60 + }, + { + "epoch": 0.002108433734939759, + "grad_norm": 8.45595968752092, + "learning_rate": 4.216867469879518e-07, + "loss": 1.36, + "step": 70 + }, + { + "epoch": 0.0024096385542168677, + "grad_norm": 4.2568241543074, + "learning_rate": 4.819277108433736e-07, + "loss": 1.1727, + "step": 80 + }, + { + "epoch": 0.002710843373493976, + "grad_norm": 6.321702055594082, + "learning_rate": 5.421686746987952e-07, + "loss": 1.6102, + "step": 90 + }, + { + "epoch": 0.0030120481927710845, + "grad_norm": 5.267243903465454, + "learning_rate": 6.024096385542169e-07, + "loss": 1.2673, + "step": 100 + }, + { + "epoch": 0.0033132530120481927, + "grad_norm": 5.9212115360469015, + "learning_rate": 6.626506024096387e-07, + "loss": 1.2851, + "step": 110 + }, + { + "epoch": 0.0036144578313253013, + "grad_norm": 2.7909893738719256, + "learning_rate": 7.228915662650602e-07, + "loss": 1.2658, + "step": 120 + }, + { + "epoch": 0.00391566265060241, + "grad_norm": 4.700991426756883, + "learning_rate": 7.83132530120482e-07, + "loss": 1.2353, + "step": 130 + }, + { + "epoch": 0.004216867469879518, + "grad_norm": 2.982566988838653, + "learning_rate": 8.433734939759036e-07, + "loss": 1.1382, + "step": 140 + }, + { + "epoch": 0.004518072289156626, + "grad_norm": 4.751028531989113, + "learning_rate": 9.036144578313254e-07, + "loss": 1.2219, + "step": 150 + }, + { + "epoch": 0.004819277108433735, + "grad_norm": 4.058570665682753, + "learning_rate": 9.638554216867472e-07, + "loss": 1.4589, + "step": 160 + }, + { + "epoch": 0.0051204819277108436, + "grad_norm": 4.391684695638581, + "learning_rate": 1.0240963855421688e-06, + "loss": 1.2259, + "step": 170 + }, + { + "epoch": 0.005421686746987952, + "grad_norm": 2.749288984807557, + "learning_rate": 1.0843373493975905e-06, + "loss": 1.1787, + "step": 180 + }, + { + "epoch": 0.00572289156626506, + "grad_norm": 4.7615868068825575, + "learning_rate": 1.1445783132530121e-06, + "loss": 1.0488, + "step": 190 + }, + { + "epoch": 0.006024096385542169, + "grad_norm": 4.7129578011308535, + "learning_rate": 1.2048192771084338e-06, + "loss": 1.1802, + "step": 200 + }, + { + "epoch": 0.006325301204819277, + "grad_norm": 4.575454200356594, + "learning_rate": 1.2650602409638555e-06, + "loss": 1.2244, + "step": 210 + }, + { + "epoch": 0.006626506024096385, + "grad_norm": 4.406374457009423, + "learning_rate": 1.3253012048192773e-06, + "loss": 1.211, + "step": 220 + }, + { + "epoch": 0.0069277108433734936, + "grad_norm": 4.197678974172092, + "learning_rate": 1.385542168674699e-06, + "loss": 1.1268, + "step": 230 + }, + { + "epoch": 0.007228915662650603, + "grad_norm": 4.575232860771412, + "learning_rate": 1.4457831325301204e-06, + "loss": 1.1966, + "step": 240 + }, + { + "epoch": 0.007530120481927711, + "grad_norm": 4.099565929616023, + "learning_rate": 1.5060240963855425e-06, + "loss": 1.4051, + "step": 250 + }, + { + "epoch": 0.00783132530120482, + "grad_norm": 4.270337306856698, + "learning_rate": 1.566265060240964e-06, + "loss": 1.3423, + "step": 260 + }, + { + "epoch": 0.008132530120481927, + "grad_norm": 3.982952502682325, + "learning_rate": 1.6265060240963856e-06, + "loss": 1.3128, + "step": 270 + }, + { + "epoch": 0.008433734939759036, + "grad_norm": 4.004693789896972, + "learning_rate": 1.6867469879518073e-06, + "loss": 1.3504, + "step": 280 + }, + { + "epoch": 0.008734939759036145, + "grad_norm": 4.592468092346859, + "learning_rate": 1.7469879518072292e-06, + "loss": 1.2639, + "step": 290 + }, + { + "epoch": 0.009036144578313253, + "grad_norm": 4.060501115633273, + "learning_rate": 1.8072289156626508e-06, + "loss": 1.2481, + "step": 300 + }, + { + "epoch": 0.009337349397590362, + "grad_norm": 4.815336571061053, + "learning_rate": 1.8674698795180723e-06, + "loss": 1.2346, + "step": 310 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 4.053695477874738, + "learning_rate": 1.9277108433734943e-06, + "loss": 0.9776, + "step": 320 + }, + { + "epoch": 0.009939759036144578, + "grad_norm": 3.919326425143882, + "learning_rate": 1.987951807228916e-06, + "loss": 1.3553, + "step": 330 + }, + { + "epoch": 0.010240963855421687, + "grad_norm": 3.044358040142568, + "learning_rate": 2.0481927710843377e-06, + "loss": 1.2019, + "step": 340 + }, + { + "epoch": 0.010542168674698794, + "grad_norm": 2.8849085807871373, + "learning_rate": 2.1084337349397595e-06, + "loss": 1.1237, + "step": 350 + }, + { + "epoch": 0.010843373493975903, + "grad_norm": 4.338427499373051, + "learning_rate": 2.168674698795181e-06, + "loss": 1.2304, + "step": 360 + }, + { + "epoch": 0.011144578313253013, + "grad_norm": 4.230190025931408, + "learning_rate": 2.2289156626506024e-06, + "loss": 1.1195, + "step": 370 + }, + { + "epoch": 0.01144578313253012, + "grad_norm": 4.124234826305292, + "learning_rate": 2.2891566265060243e-06, + "loss": 1.279, + "step": 380 + }, + { + "epoch": 0.011746987951807229, + "grad_norm": 4.32999826606396, + "learning_rate": 2.349397590361446e-06, + "loss": 1.1222, + "step": 390 + }, + { + "epoch": 0.012048192771084338, + "grad_norm": 4.037705852563037, + "learning_rate": 2.4096385542168676e-06, + "loss": 1.171, + "step": 400 + }, + { + "epoch": 0.012349397590361445, + "grad_norm": 3.6215123183400806, + "learning_rate": 2.469879518072289e-06, + "loss": 1.3193, + "step": 410 + }, + { + "epoch": 0.012650602409638554, + "grad_norm": 4.0649621509685785, + "learning_rate": 2.530120481927711e-06, + "loss": 1.2043, + "step": 420 + }, + { + "epoch": 0.012951807228915663, + "grad_norm": 3.833813479206042, + "learning_rate": 2.590361445783133e-06, + "loss": 1.129, + "step": 430 + }, + { + "epoch": 0.01325301204819277, + "grad_norm": 3.951963927513688, + "learning_rate": 2.6506024096385547e-06, + "loss": 1.1101, + "step": 440 + }, + { + "epoch": 0.01355421686746988, + "grad_norm": 4.271519936836857, + "learning_rate": 2.710843373493976e-06, + "loss": 1.271, + "step": 450 + }, + { + "epoch": 0.013855421686746987, + "grad_norm": 2.6764256226165317, + "learning_rate": 2.771084337349398e-06, + "loss": 1.1224, + "step": 460 + }, + { + "epoch": 0.014156626506024096, + "grad_norm": 4.634735718452594, + "learning_rate": 2.83132530120482e-06, + "loss": 1.1163, + "step": 470 + }, + { + "epoch": 0.014457831325301205, + "grad_norm": 2.7293250507262155, + "learning_rate": 2.891566265060241e-06, + "loss": 0.9871, + "step": 480 + }, + { + "epoch": 0.014759036144578313, + "grad_norm": 3.866232843232568, + "learning_rate": 2.9518072289156627e-06, + "loss": 1.3567, + "step": 490 + }, + { + "epoch": 0.015060240963855422, + "grad_norm": 4.239742910872224, + "learning_rate": 3.012048192771085e-06, + "loss": 0.9524, + "step": 500 + }, + { + "epoch": 0.01536144578313253, + "grad_norm": 2.644873772813341, + "learning_rate": 3.072289156626506e-06, + "loss": 1.0942, + "step": 510 + }, + { + "epoch": 0.01566265060240964, + "grad_norm": 4.032437393625629, + "learning_rate": 3.132530120481928e-06, + "loss": 1.1671, + "step": 520 + }, + { + "epoch": 0.015963855421686747, + "grad_norm": 2.88176103182694, + "learning_rate": 3.1927710843373494e-06, + "loss": 1.0398, + "step": 530 + }, + { + "epoch": 0.016265060240963854, + "grad_norm": 4.0032840796024765, + "learning_rate": 3.2530120481927713e-06, + "loss": 1.0952, + "step": 540 + }, + { + "epoch": 0.016566265060240965, + "grad_norm": 3.902519176881758, + "learning_rate": 3.313253012048193e-06, + "loss": 1.1107, + "step": 550 + }, + { + "epoch": 0.016867469879518072, + "grad_norm": 4.340144191056268, + "learning_rate": 3.3734939759036146e-06, + "loss": 1.1925, + "step": 560 + }, + { + "epoch": 0.01716867469879518, + "grad_norm": 3.7467344008008316, + "learning_rate": 3.4337349397590364e-06, + "loss": 1.2664, + "step": 570 + }, + { + "epoch": 0.01746987951807229, + "grad_norm": 3.8308835387585694, + "learning_rate": 3.4939759036144583e-06, + "loss": 1.1655, + "step": 580 + }, + { + "epoch": 0.017771084337349398, + "grad_norm": 2.7014599545087865, + "learning_rate": 3.5542168674698798e-06, + "loss": 0.983, + "step": 590 + }, + { + "epoch": 0.018072289156626505, + "grad_norm": 2.6573476355219485, + "learning_rate": 3.6144578313253016e-06, + "loss": 1.0783, + "step": 600 + }, + { + "epoch": 0.018373493975903616, + "grad_norm": 4.238081108120648, + "learning_rate": 3.6746987951807235e-06, + "loss": 1.163, + "step": 610 + }, + { + "epoch": 0.018674698795180723, + "grad_norm": 4.306356593664175, + "learning_rate": 3.7349397590361445e-06, + "loss": 1.0491, + "step": 620 + }, + { + "epoch": 0.01897590361445783, + "grad_norm": 3.9183023904130625, + "learning_rate": 3.7951807228915664e-06, + "loss": 0.885, + "step": 630 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 2.754654672572441, + "learning_rate": 3.855421686746989e-06, + "loss": 0.9848, + "step": 640 + }, + { + "epoch": 0.01957831325301205, + "grad_norm": 2.9070743973349047, + "learning_rate": 3.91566265060241e-06, + "loss": 1.0814, + "step": 650 + }, + { + "epoch": 0.019879518072289156, + "grad_norm": 3.832054275721117, + "learning_rate": 3.975903614457832e-06, + "loss": 1.0426, + "step": 660 + }, + { + "epoch": 0.020180722891566263, + "grad_norm": 2.9112570734345375, + "learning_rate": 4.036144578313254e-06, + "loss": 1.0199, + "step": 670 + }, + { + "epoch": 0.020481927710843374, + "grad_norm": 3.884916864538904, + "learning_rate": 4.096385542168675e-06, + "loss": 1.104, + "step": 680 + }, + { + "epoch": 0.02078313253012048, + "grad_norm": 4.1678404584613205, + "learning_rate": 4.156626506024097e-06, + "loss": 1.0895, + "step": 690 + }, + { + "epoch": 0.02108433734939759, + "grad_norm": 3.864285436996809, + "learning_rate": 4.216867469879519e-06, + "loss": 1.2234, + "step": 700 + }, + { + "epoch": 0.0213855421686747, + "grad_norm": 5.16849448714143, + "learning_rate": 4.27710843373494e-06, + "loss": 1.1659, + "step": 710 + }, + { + "epoch": 0.021686746987951807, + "grad_norm": 4.095326962423422, + "learning_rate": 4.337349397590362e-06, + "loss": 1.1602, + "step": 720 + }, + { + "epoch": 0.021987951807228914, + "grad_norm": 2.952963894247243, + "learning_rate": 4.397590361445783e-06, + "loss": 0.9866, + "step": 730 + }, + { + "epoch": 0.022289156626506025, + "grad_norm": 4.342880994909028, + "learning_rate": 4.457831325301205e-06, + "loss": 1.0471, + "step": 740 + }, + { + "epoch": 0.022590361445783132, + "grad_norm": 4.188011930874192, + "learning_rate": 4.518072289156627e-06, + "loss": 1.0198, + "step": 750 + }, + { + "epoch": 0.02289156626506024, + "grad_norm": 3.9036198525868993, + "learning_rate": 4.578313253012049e-06, + "loss": 1.2074, + "step": 760 + }, + { + "epoch": 0.02319277108433735, + "grad_norm": 4.308339600081488, + "learning_rate": 4.63855421686747e-06, + "loss": 1.2214, + "step": 770 + }, + { + "epoch": 0.023493975903614458, + "grad_norm": 4.200789362568133, + "learning_rate": 4.698795180722892e-06, + "loss": 1.2115, + "step": 780 + }, + { + "epoch": 0.023795180722891565, + "grad_norm": 3.8186451735550135, + "learning_rate": 4.759036144578314e-06, + "loss": 1.0943, + "step": 790 + }, + { + "epoch": 0.024096385542168676, + "grad_norm": 3.7520522634243676, + "learning_rate": 4.819277108433735e-06, + "loss": 1.0688, + "step": 800 + }, + { + "epoch": 0.024397590361445783, + "grad_norm": 2.8150643850127186, + "learning_rate": 4.8795180722891575e-06, + "loss": 1.099, + "step": 810 + }, + { + "epoch": 0.02469879518072289, + "grad_norm": 3.0519094509153892, + "learning_rate": 4.939759036144578e-06, + "loss": 1.0965, + "step": 820 + }, + { + "epoch": 0.025, + "grad_norm": 4.194014232032205, + "learning_rate": 5e-06, + "loss": 0.9908, + "step": 830 + }, + { + "epoch": 0.02530120481927711, + "grad_norm": 4.302589861076283, + "learning_rate": 5.060240963855422e-06, + "loss": 1.1164, + "step": 840 + }, + { + "epoch": 0.025602409638554216, + "grad_norm": 4.907180215758534, + "learning_rate": 5.120481927710844e-06, + "loss": 1.0914, + "step": 850 + }, + { + "epoch": 0.025903614457831327, + "grad_norm": 4.373876586118721, + "learning_rate": 5.180722891566266e-06, + "loss": 1.0364, + "step": 860 + }, + { + "epoch": 0.026204819277108434, + "grad_norm": 4.722558680398194, + "learning_rate": 5.240963855421687e-06, + "loss": 1.1332, + "step": 870 + }, + { + "epoch": 0.02650602409638554, + "grad_norm": 2.931284980772947, + "learning_rate": 5.301204819277109e-06, + "loss": 1.0539, + "step": 880 + }, + { + "epoch": 0.026807228915662652, + "grad_norm": 4.041583561605744, + "learning_rate": 5.361445783132531e-06, + "loss": 1.1339, + "step": 890 + }, + { + "epoch": 0.02710843373493976, + "grad_norm": 4.082971119962101, + "learning_rate": 5.421686746987952e-06, + "loss": 1.2437, + "step": 900 + }, + { + "epoch": 0.027409638554216867, + "grad_norm": 4.617617434680252, + "learning_rate": 5.4819277108433745e-06, + "loss": 1.092, + "step": 910 + }, + { + "epoch": 0.027710843373493974, + "grad_norm": 4.0349654871489, + "learning_rate": 5.542168674698796e-06, + "loss": 1.1967, + "step": 920 + }, + { + "epoch": 0.028012048192771085, + "grad_norm": 2.6027878449239426, + "learning_rate": 5.602409638554217e-06, + "loss": 1.0091, + "step": 930 + }, + { + "epoch": 0.028313253012048192, + "grad_norm": 4.485028893262391, + "learning_rate": 5.66265060240964e-06, + "loss": 1.0483, + "step": 940 + }, + { + "epoch": 0.0286144578313253, + "grad_norm": 4.522486623622015, + "learning_rate": 5.722891566265061e-06, + "loss": 1.2027, + "step": 950 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 2.8472374483590546, + "learning_rate": 5.783132530120482e-06, + "loss": 0.9967, + "step": 960 + }, + { + "epoch": 0.029216867469879518, + "grad_norm": 4.7385115596889635, + "learning_rate": 5.843373493975905e-06, + "loss": 1.1533, + "step": 970 + }, + { + "epoch": 0.029518072289156625, + "grad_norm": 3.076634710050929, + "learning_rate": 5.9036144578313255e-06, + "loss": 1.1005, + "step": 980 + }, + { + "epoch": 0.029819277108433736, + "grad_norm": 2.689123057316776, + "learning_rate": 5.963855421686747e-06, + "loss": 1.1132, + "step": 990 + }, + { + "epoch": 0.030120481927710843, + "grad_norm": 3.707408578720797, + "learning_rate": 6.02409638554217e-06, + "loss": 1.1946, + "step": 1000 + }, + { + "epoch": 0.03042168674698795, + "grad_norm": 5.063000512597904, + "learning_rate": 6.084337349397591e-06, + "loss": 1.0708, + "step": 1010 + }, + { + "epoch": 0.03072289156626506, + "grad_norm": 4.06276247660459, + "learning_rate": 6.144578313253012e-06, + "loss": 0.9929, + "step": 1020 + }, + { + "epoch": 0.03102409638554217, + "grad_norm": 4.322190402987572, + "learning_rate": 6.2048192771084344e-06, + "loss": 1.0097, + "step": 1030 + }, + { + "epoch": 0.03132530120481928, + "grad_norm": 4.1985515286316195, + "learning_rate": 6.265060240963856e-06, + "loss": 1.1831, + "step": 1040 + }, + { + "epoch": 0.03162650602409638, + "grad_norm": 4.738056079854521, + "learning_rate": 6.325301204819277e-06, + "loss": 1.0026, + "step": 1050 + }, + { + "epoch": 0.031927710843373494, + "grad_norm": 7.908138501617392, + "learning_rate": 6.385542168674699e-06, + "loss": 1.0075, + "step": 1060 + }, + { + "epoch": 0.032228915662650605, + "grad_norm": 4.095549042393854, + "learning_rate": 6.445783132530121e-06, + "loss": 1.1996, + "step": 1070 + }, + { + "epoch": 0.03253012048192771, + "grad_norm": 4.607760763528683, + "learning_rate": 6.5060240963855425e-06, + "loss": 1.0659, + "step": 1080 + }, + { + "epoch": 0.03283132530120482, + "grad_norm": 3.599600156442471, + "learning_rate": 6.566265060240964e-06, + "loss": 0.9888, + "step": 1090 + }, + { + "epoch": 0.03313253012048193, + "grad_norm": 4.57889685904033, + "learning_rate": 6.626506024096386e-06, + "loss": 1.0804, + "step": 1100 + }, + { + "epoch": 0.033433734939759034, + "grad_norm": 2.8468610531916267, + "learning_rate": 6.686746987951808e-06, + "loss": 0.9436, + "step": 1110 + }, + { + "epoch": 0.033734939759036145, + "grad_norm": 4.028682492647136, + "learning_rate": 6.746987951807229e-06, + "loss": 1.042, + "step": 1120 + }, + { + "epoch": 0.034036144578313256, + "grad_norm": 4.487223110665026, + "learning_rate": 6.8072289156626514e-06, + "loss": 1.1325, + "step": 1130 + }, + { + "epoch": 0.03433734939759036, + "grad_norm": 4.528149873897056, + "learning_rate": 6.867469879518073e-06, + "loss": 1.2997, + "step": 1140 + }, + { + "epoch": 0.03463855421686747, + "grad_norm": 3.5686330335011553, + "learning_rate": 6.927710843373494e-06, + "loss": 0.9743, + "step": 1150 + }, + { + "epoch": 0.03493975903614458, + "grad_norm": 3.2377711419187203, + "learning_rate": 6.987951807228917e-06, + "loss": 1.1301, + "step": 1160 + }, + { + "epoch": 0.035240963855421685, + "grad_norm": 4.098978665123173, + "learning_rate": 7.048192771084338e-06, + "loss": 1.1878, + "step": 1170 + }, + { + "epoch": 0.035542168674698796, + "grad_norm": 3.7882630819857774, + "learning_rate": 7.1084337349397595e-06, + "loss": 1.1188, + "step": 1180 + }, + { + "epoch": 0.03584337349397591, + "grad_norm": 4.4587101564262515, + "learning_rate": 7.168674698795182e-06, + "loss": 1.1766, + "step": 1190 + }, + { + "epoch": 0.03614457831325301, + "grad_norm": 3.8989607392844507, + "learning_rate": 7.228915662650603e-06, + "loss": 1.0636, + "step": 1200 + }, + { + "epoch": 0.03644578313253012, + "grad_norm": 3.839191587235316, + "learning_rate": 7.289156626506025e-06, + "loss": 1.196, + "step": 1210 + }, + { + "epoch": 0.03674698795180723, + "grad_norm": 3.78409926911461, + "learning_rate": 7.349397590361447e-06, + "loss": 1.0951, + "step": 1220 + }, + { + "epoch": 0.037048192771084336, + "grad_norm": 2.7074286505344913, + "learning_rate": 7.4096385542168684e-06, + "loss": 0.9479, + "step": 1230 + }, + { + "epoch": 0.03734939759036145, + "grad_norm": 4.381090966336082, + "learning_rate": 7.469879518072289e-06, + "loss": 1.1028, + "step": 1240 + }, + { + "epoch": 0.03765060240963856, + "grad_norm": 4.764045150202293, + "learning_rate": 7.530120481927712e-06, + "loss": 1.0773, + "step": 1250 + }, + { + "epoch": 0.03795180722891566, + "grad_norm": 3.032725440916371, + "learning_rate": 7.590361445783133e-06, + "loss": 1.0261, + "step": 1260 + }, + { + "epoch": 0.03825301204819277, + "grad_norm": 4.035121309975671, + "learning_rate": 7.650602409638555e-06, + "loss": 1.062, + "step": 1270 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 3.5854891100719564, + "learning_rate": 7.710843373493977e-06, + "loss": 1.0601, + "step": 1280 + }, + { + "epoch": 0.03885542168674699, + "grad_norm": 3.8237871025587435, + "learning_rate": 7.771084337349398e-06, + "loss": 1.1306, + "step": 1290 + }, + { + "epoch": 0.0391566265060241, + "grad_norm": 4.115558747304762, + "learning_rate": 7.83132530120482e-06, + "loss": 1.0698, + "step": 1300 + }, + { + "epoch": 0.0394578313253012, + "grad_norm": 3.7410956006810547, + "learning_rate": 7.891566265060243e-06, + "loss": 1.146, + "step": 1310 + }, + { + "epoch": 0.03975903614457831, + "grad_norm": 3.6105166014436936, + "learning_rate": 7.951807228915663e-06, + "loss": 1.1621, + "step": 1320 + }, + { + "epoch": 0.04006024096385542, + "grad_norm": 3.973739829544077, + "learning_rate": 8.012048192771085e-06, + "loss": 1.2101, + "step": 1330 + }, + { + "epoch": 0.04036144578313253, + "grad_norm": 2.5733017305600114, + "learning_rate": 8.072289156626508e-06, + "loss": 1.0769, + "step": 1340 + }, + { + "epoch": 0.04066265060240964, + "grad_norm": 3.133485976297979, + "learning_rate": 8.132530120481928e-06, + "loss": 1.0562, + "step": 1350 + }, + { + "epoch": 0.04096385542168675, + "grad_norm": 4.01743747234379, + "learning_rate": 8.19277108433735e-06, + "loss": 1.0677, + "step": 1360 + }, + { + "epoch": 0.04126506024096385, + "grad_norm": 5.793222896431323, + "learning_rate": 8.253012048192773e-06, + "loss": 1.1337, + "step": 1370 + }, + { + "epoch": 0.04156626506024096, + "grad_norm": 4.267870315133769, + "learning_rate": 8.313253012048194e-06, + "loss": 1.1537, + "step": 1380 + }, + { + "epoch": 0.041867469879518074, + "grad_norm": 3.56150217463621, + "learning_rate": 8.373493975903614e-06, + "loss": 1.015, + "step": 1390 + }, + { + "epoch": 0.04216867469879518, + "grad_norm": 3.44868228410339, + "learning_rate": 8.433734939759038e-06, + "loss": 1.2518, + "step": 1400 + }, + { + "epoch": 0.04246987951807229, + "grad_norm": 4.069954456635347, + "learning_rate": 8.493975903614459e-06, + "loss": 1.0554, + "step": 1410 + }, + { + "epoch": 0.0427710843373494, + "grad_norm": 2.889978527209783, + "learning_rate": 8.55421686746988e-06, + "loss": 0.8878, + "step": 1420 + }, + { + "epoch": 0.0430722891566265, + "grad_norm": 3.8526229033524673, + "learning_rate": 8.614457831325302e-06, + "loss": 1.1049, + "step": 1430 + }, + { + "epoch": 0.043373493975903614, + "grad_norm": 3.992363809147142, + "learning_rate": 8.674698795180724e-06, + "loss": 1.1341, + "step": 1440 + }, + { + "epoch": 0.043674698795180725, + "grad_norm": 3.6448879341956895, + "learning_rate": 8.734939759036145e-06, + "loss": 1.1601, + "step": 1450 + }, + { + "epoch": 0.04397590361445783, + "grad_norm": 4.514705884392051, + "learning_rate": 8.795180722891567e-06, + "loss": 1.0113, + "step": 1460 + }, + { + "epoch": 0.04427710843373494, + "grad_norm": 4.0363185397570245, + "learning_rate": 8.855421686746989e-06, + "loss": 1.0451, + "step": 1470 + }, + { + "epoch": 0.04457831325301205, + "grad_norm": 3.925379146933244, + "learning_rate": 8.91566265060241e-06, + "loss": 1.1947, + "step": 1480 + }, + { + "epoch": 0.044879518072289154, + "grad_norm": 2.6181945942087608, + "learning_rate": 8.975903614457832e-06, + "loss": 1.0535, + "step": 1490 + }, + { + "epoch": 0.045180722891566265, + "grad_norm": 3.110805657773211, + "learning_rate": 9.036144578313254e-06, + "loss": 1.0878, + "step": 1500 + }, + { + "epoch": 0.045481927710843376, + "grad_norm": 3.4913255776936003, + "learning_rate": 9.096385542168675e-06, + "loss": 1.1069, + "step": 1510 + }, + { + "epoch": 0.04578313253012048, + "grad_norm": 2.7603777767228563, + "learning_rate": 9.156626506024097e-06, + "loss": 1.0823, + "step": 1520 + }, + { + "epoch": 0.04608433734939759, + "grad_norm": 4.280614356362033, + "learning_rate": 9.21686746987952e-06, + "loss": 1.1414, + "step": 1530 + }, + { + "epoch": 0.0463855421686747, + "grad_norm": 2.991791650219649, + "learning_rate": 9.27710843373494e-06, + "loss": 0.9234, + "step": 1540 + }, + { + "epoch": 0.046686746987951805, + "grad_norm": 4.513985825827475, + "learning_rate": 9.337349397590362e-06, + "loss": 1.0021, + "step": 1550 + }, + { + "epoch": 0.046987951807228916, + "grad_norm": 3.8650087083781033, + "learning_rate": 9.397590361445785e-06, + "loss": 1.0892, + "step": 1560 + }, + { + "epoch": 0.047289156626506026, + "grad_norm": 4.69885903360439, + "learning_rate": 9.457831325301205e-06, + "loss": 1.0488, + "step": 1570 + }, + { + "epoch": 0.04759036144578313, + "grad_norm": 4.178287377782754, + "learning_rate": 9.518072289156628e-06, + "loss": 1.1808, + "step": 1580 + }, + { + "epoch": 0.04789156626506024, + "grad_norm": 4.041913650511557, + "learning_rate": 9.57831325301205e-06, + "loss": 1.1977, + "step": 1590 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 2.4573674173962474, + "learning_rate": 9.63855421686747e-06, + "loss": 1.0869, + "step": 1600 + }, + { + "epoch": 0.048493975903614456, + "grad_norm": 3.7242981224219056, + "learning_rate": 9.698795180722893e-06, + "loss": 0.98, + "step": 1610 + }, + { + "epoch": 0.04879518072289157, + "grad_norm": 3.3696425145200015, + "learning_rate": 9.759036144578315e-06, + "loss": 1.1795, + "step": 1620 + }, + { + "epoch": 0.04909638554216868, + "grad_norm": 3.765251027998519, + "learning_rate": 9.819277108433736e-06, + "loss": 1.1529, + "step": 1630 + }, + { + "epoch": 0.04939759036144578, + "grad_norm": 4.380167625859237, + "learning_rate": 9.879518072289156e-06, + "loss": 1.0109, + "step": 1640 + }, + { + "epoch": 0.04969879518072289, + "grad_norm": 3.5901053438352757, + "learning_rate": 9.93975903614458e-06, + "loss": 1.0782, + "step": 1650 + }, + { + "epoch": 0.05, + "grad_norm": 4.697835290455968, + "learning_rate": 1e-05, + "loss": 1.1538, + "step": 1660 + }, + { + "epoch": 0.05030120481927711, + "grad_norm": 5.149787332226158, + "learning_rate": 9.999997519630741e-06, + "loss": 1.1527, + "step": 1670 + }, + { + "epoch": 0.05060240963855422, + "grad_norm": 4.2386612321855575, + "learning_rate": 9.999990078525426e-06, + "loss": 1.0683, + "step": 1680 + }, + { + "epoch": 0.05090361445783133, + "grad_norm": 3.6473005117732313, + "learning_rate": 9.999977676691436e-06, + "loss": 1.2378, + "step": 1690 + }, + { + "epoch": 0.05120481927710843, + "grad_norm": 3.7901644794972937, + "learning_rate": 9.999960314141076e-06, + "loss": 1.1079, + "step": 1700 + }, + { + "epoch": 0.05150602409638554, + "grad_norm": 3.68394417330946, + "learning_rate": 9.999937990891573e-06, + "loss": 1.0808, + "step": 1710 + }, + { + "epoch": 0.051807228915662654, + "grad_norm": 4.8171466442202036, + "learning_rate": 9.999910706965073e-06, + "loss": 1.0813, + "step": 1720 + }, + { + "epoch": 0.05210843373493976, + "grad_norm": 4.210089621064192, + "learning_rate": 9.999878462388647e-06, + "loss": 1.1659, + "step": 1730 + }, + { + "epoch": 0.05240963855421687, + "grad_norm": 2.401098250310252, + "learning_rate": 9.99984125719429e-06, + "loss": 1.0514, + "step": 1740 + }, + { + "epoch": 0.05271084337349398, + "grad_norm": 3.9199440778618113, + "learning_rate": 9.999799091418908e-06, + "loss": 1.1456, + "step": 1750 + }, + { + "epoch": 0.05301204819277108, + "grad_norm": 2.433777634678354, + "learning_rate": 9.99975196510434e-06, + "loss": 0.8795, + "step": 1760 + }, + { + "epoch": 0.053313253012048194, + "grad_norm": 3.815132735539651, + "learning_rate": 9.999699878297342e-06, + "loss": 1.2158, + "step": 1770 + }, + { + "epoch": 0.053614457831325305, + "grad_norm": 4.387848610690783, + "learning_rate": 9.999642831049591e-06, + "loss": 1.1993, + "step": 1780 + }, + { + "epoch": 0.05391566265060241, + "grad_norm": 4.701283174117646, + "learning_rate": 9.999580823417688e-06, + "loss": 1.0755, + "step": 1790 + }, + { + "epoch": 0.05421686746987952, + "grad_norm": 3.752459770285841, + "learning_rate": 9.99951385546315e-06, + "loss": 1.0101, + "step": 1800 + }, + { + "epoch": 0.05451807228915663, + "grad_norm": 5.337745045286134, + "learning_rate": 9.99944192725242e-06, + "loss": 1.1359, + "step": 1810 + }, + { + "epoch": 0.054819277108433734, + "grad_norm": 3.1352783689331547, + "learning_rate": 9.999365038856866e-06, + "loss": 1.0291, + "step": 1820 + }, + { + "epoch": 0.055120481927710845, + "grad_norm": 2.768514712096848, + "learning_rate": 9.999283190352768e-06, + "loss": 1.0069, + "step": 1830 + }, + { + "epoch": 0.05542168674698795, + "grad_norm": 3.858630636143342, + "learning_rate": 9.999196381821334e-06, + "loss": 1.1378, + "step": 1840 + }, + { + "epoch": 0.05572289156626506, + "grad_norm": 3.378119057719515, + "learning_rate": 9.99910461334869e-06, + "loss": 0.9906, + "step": 1850 + }, + { + "epoch": 0.05602409638554217, + "grad_norm": 3.001219254715605, + "learning_rate": 9.999007885025883e-06, + "loss": 1.0809, + "step": 1860 + }, + { + "epoch": 0.056325301204819274, + "grad_norm": 3.5393439832915425, + "learning_rate": 9.998906196948884e-06, + "loss": 0.9938, + "step": 1870 + }, + { + "epoch": 0.056626506024096385, + "grad_norm": 3.7850574927214007, + "learning_rate": 9.998799549218581e-06, + "loss": 1.2124, + "step": 1880 + }, + { + "epoch": 0.056927710843373495, + "grad_norm": 2.4388623812023242, + "learning_rate": 9.998687941940784e-06, + "loss": 0.9052, + "step": 1890 + }, + { + "epoch": 0.0572289156626506, + "grad_norm": 2.5699202902016633, + "learning_rate": 9.998571375226224e-06, + "loss": 0.932, + "step": 1900 + }, + { + "epoch": 0.05753012048192771, + "grad_norm": 2.5661310794398946, + "learning_rate": 9.998449849190555e-06, + "loss": 1.1105, + "step": 1910 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 3.8414058457846245, + "learning_rate": 9.998323363954345e-06, + "loss": 1.1434, + "step": 1920 + }, + { + "epoch": 0.058132530120481925, + "grad_norm": 3.7746141534608326, + "learning_rate": 9.99819191964309e-06, + "loss": 1.1568, + "step": 1930 + }, + { + "epoch": 0.058433734939759036, + "grad_norm": 4.604015815450119, + "learning_rate": 9.998055516387199e-06, + "loss": 1.0162, + "step": 1940 + }, + { + "epoch": 0.058734939759036146, + "grad_norm": 3.960228642260117, + "learning_rate": 9.997914154322005e-06, + "loss": 0.9437, + "step": 1950 + }, + { + "epoch": 0.05903614457831325, + "grad_norm": 6.1648613306726965, + "learning_rate": 9.997767833587759e-06, + "loss": 1.182, + "step": 1960 + }, + { + "epoch": 0.05933734939759036, + "grad_norm": 2.549375334776112, + "learning_rate": 9.997616554329634e-06, + "loss": 0.9646, + "step": 1970 + }, + { + "epoch": 0.05963855421686747, + "grad_norm": 4.236515755123189, + "learning_rate": 9.997460316697722e-06, + "loss": 1.0929, + "step": 1980 + }, + { + "epoch": 0.059939759036144576, + "grad_norm": 5.0300970638199844, + "learning_rate": 9.997299120847035e-06, + "loss": 1.2331, + "step": 1990 + }, + { + "epoch": 0.060240963855421686, + "grad_norm": 3.1979172866204717, + "learning_rate": 9.9971329669375e-06, + "loss": 1.0689, + "step": 2000 + }, + { + "epoch": 0.0605421686746988, + "grad_norm": 4.921490012301479, + "learning_rate": 9.996961855133966e-06, + "loss": 1.1924, + "step": 2010 + }, + { + "epoch": 0.0608433734939759, + "grad_norm": 5.084404905580949, + "learning_rate": 9.996785785606204e-06, + "loss": 1.1251, + "step": 2020 + }, + { + "epoch": 0.06114457831325301, + "grad_norm": 2.7468146978263994, + "learning_rate": 9.9966047585289e-06, + "loss": 1.0865, + "step": 2030 + }, + { + "epoch": 0.06144578313253012, + "grad_norm": 4.910435596688258, + "learning_rate": 9.996418774081658e-06, + "loss": 1.0844, + "step": 2040 + }, + { + "epoch": 0.061746987951807226, + "grad_norm": 3.4800396481633586, + "learning_rate": 9.996227832449003e-06, + "loss": 1.1514, + "step": 2050 + }, + { + "epoch": 0.06204819277108434, + "grad_norm": 3.65181405646942, + "learning_rate": 9.996031933820379e-06, + "loss": 1.2223, + "step": 2060 + }, + { + "epoch": 0.06234939759036145, + "grad_norm": 3.074297815221104, + "learning_rate": 9.995831078390145e-06, + "loss": 1.1549, + "step": 2070 + }, + { + "epoch": 0.06265060240963856, + "grad_norm": 3.8273029757375583, + "learning_rate": 9.995625266357579e-06, + "loss": 1.1299, + "step": 2080 + }, + { + "epoch": 0.06295180722891566, + "grad_norm": 3.7573449297064365, + "learning_rate": 9.995414497926876e-06, + "loss": 1.0365, + "step": 2090 + }, + { + "epoch": 0.06325301204819277, + "grad_norm": 10.118518034769624, + "learning_rate": 9.99519877330715e-06, + "loss": 1.0804, + "step": 2100 + }, + { + "epoch": 0.06355421686746988, + "grad_norm": 3.190883815111035, + "learning_rate": 9.994978092712435e-06, + "loss": 1.1677, + "step": 2110 + }, + { + "epoch": 0.06385542168674699, + "grad_norm": 3.9555260125968363, + "learning_rate": 9.994752456361674e-06, + "loss": 1.0777, + "step": 2120 + }, + { + "epoch": 0.06415662650602409, + "grad_norm": 2.905744251020082, + "learning_rate": 9.994521864478734e-06, + "loss": 1.051, + "step": 2130 + }, + { + "epoch": 0.06445783132530121, + "grad_norm": 2.320618136489521, + "learning_rate": 9.994286317292396e-06, + "loss": 1.0803, + "step": 2140 + }, + { + "epoch": 0.06475903614457831, + "grad_norm": 2.4877820734765046, + "learning_rate": 9.994045815036357e-06, + "loss": 0.9119, + "step": 2150 + }, + { + "epoch": 0.06506024096385542, + "grad_norm": 3.151171444449592, + "learning_rate": 9.993800357949231e-06, + "loss": 1.1343, + "step": 2160 + }, + { + "epoch": 0.06536144578313254, + "grad_norm": 2.506532317945994, + "learning_rate": 9.993549946274549e-06, + "loss": 1.1076, + "step": 2170 + }, + { + "epoch": 0.06566265060240964, + "grad_norm": 2.3254966136883524, + "learning_rate": 9.993294580260754e-06, + "loss": 1.0453, + "step": 2180 + }, + { + "epoch": 0.06596385542168674, + "grad_norm": 3.1805029580613837, + "learning_rate": 9.993034260161208e-06, + "loss": 1.117, + "step": 2190 + }, + { + "epoch": 0.06626506024096386, + "grad_norm": 2.9968452761944553, + "learning_rate": 9.992768986234187e-06, + "loss": 1.1851, + "step": 2200 + }, + { + "epoch": 0.06656626506024096, + "grad_norm": 2.434212792136939, + "learning_rate": 9.992498758742882e-06, + "loss": 1.0131, + "step": 2210 + }, + { + "epoch": 0.06686746987951807, + "grad_norm": 3.349590146400896, + "learning_rate": 9.992223577955398e-06, + "loss": 1.1821, + "step": 2220 + }, + { + "epoch": 0.06716867469879519, + "grad_norm": 3.764825932197075, + "learning_rate": 9.991943444144758e-06, + "loss": 1.0021, + "step": 2230 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 2.592028510712079, + "learning_rate": 9.991658357588892e-06, + "loss": 1.0018, + "step": 2240 + }, + { + "epoch": 0.0677710843373494, + "grad_norm": 3.1938995334707863, + "learning_rate": 9.991368318570648e-06, + "loss": 0.9243, + "step": 2250 + }, + { + "epoch": 0.06807228915662651, + "grad_norm": 3.595601219077997, + "learning_rate": 9.99107332737779e-06, + "loss": 1.0054, + "step": 2260 + }, + { + "epoch": 0.06837349397590362, + "grad_norm": 2.3885187420780536, + "learning_rate": 9.990773384302992e-06, + "loss": 1.0097, + "step": 2270 + }, + { + "epoch": 0.06867469879518072, + "grad_norm": 2.4930420792745798, + "learning_rate": 9.990468489643842e-06, + "loss": 1.0304, + "step": 2280 + }, + { + "epoch": 0.06897590361445784, + "grad_norm": 2.5449190333509093, + "learning_rate": 9.99015864370284e-06, + "loss": 1.007, + "step": 2290 + }, + { + "epoch": 0.06927710843373494, + "grad_norm": 3.5462243746153477, + "learning_rate": 9.9898438467874e-06, + "loss": 1.0589, + "step": 2300 + }, + { + "epoch": 0.06957831325301204, + "grad_norm": 2.7165842252202994, + "learning_rate": 9.989524099209846e-06, + "loss": 1.1116, + "step": 2310 + }, + { + "epoch": 0.06987951807228916, + "grad_norm": 3.1862255813692397, + "learning_rate": 9.989199401287415e-06, + "loss": 1.1819, + "step": 2320 + }, + { + "epoch": 0.07018072289156627, + "grad_norm": 2.6637679123123466, + "learning_rate": 9.988869753342256e-06, + "loss": 0.9118, + "step": 2330 + }, + { + "epoch": 0.07048192771084337, + "grad_norm": 3.2336481003567537, + "learning_rate": 9.988535155701427e-06, + "loss": 1.1251, + "step": 2340 + }, + { + "epoch": 0.07078313253012049, + "grad_norm": 3.5066824578309967, + "learning_rate": 9.9881956086969e-06, + "loss": 1.038, + "step": 2350 + }, + { + "epoch": 0.07108433734939759, + "grad_norm": 3.7795574130684857, + "learning_rate": 9.987851112665554e-06, + "loss": 1.0597, + "step": 2360 + }, + { + "epoch": 0.0713855421686747, + "grad_norm": 4.5414605529212295, + "learning_rate": 9.98750166794918e-06, + "loss": 1.0951, + "step": 2370 + }, + { + "epoch": 0.07168674698795181, + "grad_norm": 2.585582041219885, + "learning_rate": 9.987147274894482e-06, + "loss": 0.9117, + "step": 2380 + }, + { + "epoch": 0.07198795180722892, + "grad_norm": 4.080742697926455, + "learning_rate": 9.986787933853066e-06, + "loss": 0.9999, + "step": 2390 + }, + { + "epoch": 0.07228915662650602, + "grad_norm": 2.7035924145421633, + "learning_rate": 9.986423645181453e-06, + "loss": 1.0319, + "step": 2400 + }, + { + "epoch": 0.07259036144578314, + "grad_norm": 2.491404655175915, + "learning_rate": 9.986054409241073e-06, + "loss": 1.0551, + "step": 2410 + }, + { + "epoch": 0.07289156626506024, + "grad_norm": 3.6349460804068268, + "learning_rate": 9.985680226398261e-06, + "loss": 1.0337, + "step": 2420 + }, + { + "epoch": 0.07319277108433735, + "grad_norm": 3.10110982779257, + "learning_rate": 9.98530109702426e-06, + "loss": 1.1683, + "step": 2430 + }, + { + "epoch": 0.07349397590361446, + "grad_norm": 3.8403291806956057, + "learning_rate": 9.984917021495226e-06, + "loss": 1.1914, + "step": 2440 + }, + { + "epoch": 0.07379518072289157, + "grad_norm": 2.555025383200647, + "learning_rate": 9.984528000192216e-06, + "loss": 1.1094, + "step": 2450 + }, + { + "epoch": 0.07409638554216867, + "grad_norm": 3.605108875645977, + "learning_rate": 9.984134033501198e-06, + "loss": 1.0844, + "step": 2460 + }, + { + "epoch": 0.07439759036144579, + "grad_norm": 3.4843104645151812, + "learning_rate": 9.983735121813043e-06, + "loss": 1.0617, + "step": 2470 + }, + { + "epoch": 0.0746987951807229, + "grad_norm": 3.2471284257271167, + "learning_rate": 9.983331265523532e-06, + "loss": 1.1092, + "step": 2480 + }, + { + "epoch": 0.075, + "grad_norm": 1.9449811473314356, + "learning_rate": 9.98292246503335e-06, + "loss": 0.998, + "step": 2490 + }, + { + "epoch": 0.07530120481927711, + "grad_norm": 3.941148943349073, + "learning_rate": 9.982508720748087e-06, + "loss": 1.2207, + "step": 2500 + }, + { + "epoch": 0.07560240963855422, + "grad_norm": 3.4412112592806383, + "learning_rate": 9.98209003307824e-06, + "loss": 1.0887, + "step": 2510 + }, + { + "epoch": 0.07590361445783132, + "grad_norm": 4.818457829096565, + "learning_rate": 9.981666402439206e-06, + "loss": 1.0562, + "step": 2520 + }, + { + "epoch": 0.07620481927710844, + "grad_norm": 3.80293562021422, + "learning_rate": 9.981237829251293e-06, + "loss": 1.0833, + "step": 2530 + }, + { + "epoch": 0.07650602409638554, + "grad_norm": 2.4087631061355363, + "learning_rate": 9.980804313939704e-06, + "loss": 1.0543, + "step": 2540 + }, + { + "epoch": 0.07680722891566265, + "grad_norm": 3.5487519559148266, + "learning_rate": 9.980365856934555e-06, + "loss": 0.9845, + "step": 2550 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 2.4850575123163727, + "learning_rate": 9.97992245867086e-06, + "loss": 0.9657, + "step": 2560 + }, + { + "epoch": 0.07740963855421687, + "grad_norm": 3.555607525747383, + "learning_rate": 9.97947411958853e-06, + "loss": 1.1084, + "step": 2570 + }, + { + "epoch": 0.07771084337349397, + "grad_norm": 2.40737881814081, + "learning_rate": 9.979020840132389e-06, + "loss": 1.1145, + "step": 2580 + }, + { + "epoch": 0.07801204819277109, + "grad_norm": 2.9290379516100717, + "learning_rate": 9.978562620752155e-06, + "loss": 1.1361, + "step": 2590 + }, + { + "epoch": 0.0783132530120482, + "grad_norm": 4.920614149434376, + "learning_rate": 9.978099461902449e-06, + "loss": 1.0711, + "step": 2600 + }, + { + "epoch": 0.0786144578313253, + "grad_norm": 2.396143608817259, + "learning_rate": 9.977631364042796e-06, + "loss": 0.9605, + "step": 2610 + }, + { + "epoch": 0.0789156626506024, + "grad_norm": 21.085877420807975, + "learning_rate": 9.977158327637614e-06, + "loss": 0.9583, + "step": 2620 + }, + { + "epoch": 0.07921686746987952, + "grad_norm": 2.294929049480722, + "learning_rate": 9.97668035315623e-06, + "loss": 1.0063, + "step": 2630 + }, + { + "epoch": 0.07951807228915662, + "grad_norm": 6.789153018446023, + "learning_rate": 9.976197441072859e-06, + "loss": 1.1275, + "step": 2640 + }, + { + "epoch": 0.07981927710843373, + "grad_norm": 7.742954844534115, + "learning_rate": 9.975709591866625e-06, + "loss": 1.1735, + "step": 2650 + }, + { + "epoch": 0.08012048192771085, + "grad_norm": 16.04533601383964, + "learning_rate": 9.975216806021546e-06, + "loss": 1.1783, + "step": 2660 + }, + { + "epoch": 0.08042168674698795, + "grad_norm": 2.1758648326982875, + "learning_rate": 9.974719084026539e-06, + "loss": 1.0405, + "step": 2670 + }, + { + "epoch": 0.08072289156626505, + "grad_norm": 6.064945451747976, + "learning_rate": 9.974216426375415e-06, + "loss": 1.0942, + "step": 2680 + }, + { + "epoch": 0.08102409638554217, + "grad_norm": 8.445513916077669, + "learning_rate": 9.973708833566888e-06, + "loss": 1.1507, + "step": 2690 + }, + { + "epoch": 0.08132530120481928, + "grad_norm": 7.596599663105275, + "learning_rate": 9.973196306104565e-06, + "loss": 1.0625, + "step": 2700 + }, + { + "epoch": 0.08162650602409638, + "grad_norm": 3.8107562681697886, + "learning_rate": 9.972678844496946e-06, + "loss": 1.054, + "step": 2710 + }, + { + "epoch": 0.0819277108433735, + "grad_norm": 2.60289693386569, + "learning_rate": 9.972156449257431e-06, + "loss": 1.1578, + "step": 2720 + }, + { + "epoch": 0.0822289156626506, + "grad_norm": 34.67041330752499, + "learning_rate": 9.971629120904311e-06, + "loss": 0.9634, + "step": 2730 + }, + { + "epoch": 0.0825301204819277, + "grad_norm": 4.124388244266611, + "learning_rate": 9.971096859960778e-06, + "loss": 1.083, + "step": 2740 + }, + { + "epoch": 0.08283132530120482, + "grad_norm": 2.5613783895358324, + "learning_rate": 9.970559666954912e-06, + "loss": 1.0909, + "step": 2750 + }, + { + "epoch": 0.08313253012048193, + "grad_norm": 2.4217897863699687, + "learning_rate": 9.970017542419685e-06, + "loss": 1.0532, + "step": 2760 + }, + { + "epoch": 0.08343373493975903, + "grad_norm": 3.489663688491791, + "learning_rate": 9.969470486892969e-06, + "loss": 1.059, + "step": 2770 + }, + { + "epoch": 0.08373493975903615, + "grad_norm": 3.441237385708804, + "learning_rate": 9.968918500917519e-06, + "loss": 1.1639, + "step": 2780 + }, + { + "epoch": 0.08403614457831325, + "grad_norm": 2.423276371430195, + "learning_rate": 9.96836158504099e-06, + "loss": 0.9466, + "step": 2790 + }, + { + "epoch": 0.08433734939759036, + "grad_norm": 3.289694596045945, + "learning_rate": 9.967799739815925e-06, + "loss": 0.9739, + "step": 2800 + }, + { + "epoch": 0.08463855421686747, + "grad_norm": 3.9050353977577235, + "learning_rate": 9.967232965799756e-06, + "loss": 1.0265, + "step": 2810 + }, + { + "epoch": 0.08493975903614458, + "grad_norm": 2.532904826528985, + "learning_rate": 9.966661263554807e-06, + "loss": 1.1686, + "step": 2820 + }, + { + "epoch": 0.08524096385542168, + "grad_norm": 2.3416221435459534, + "learning_rate": 9.96608463364829e-06, + "loss": 1.0797, + "step": 2830 + }, + { + "epoch": 0.0855421686746988, + "grad_norm": 3.0805759527658867, + "learning_rate": 9.965503076652309e-06, + "loss": 1.1602, + "step": 2840 + }, + { + "epoch": 0.0858433734939759, + "grad_norm": 2.887601186210108, + "learning_rate": 9.964916593143851e-06, + "loss": 1.1548, + "step": 2850 + }, + { + "epoch": 0.086144578313253, + "grad_norm": 3.3377312336528187, + "learning_rate": 9.964325183704801e-06, + "loss": 1.1498, + "step": 2860 + }, + { + "epoch": 0.08644578313253012, + "grad_norm": 3.7551887104103807, + "learning_rate": 9.963728848921918e-06, + "loss": 1.133, + "step": 2870 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 3.927673372140471, + "learning_rate": 9.963127589386856e-06, + "loss": 1.1079, + "step": 2880 + }, + { + "epoch": 0.08704819277108433, + "grad_norm": 3.646635001753062, + "learning_rate": 9.962521405696155e-06, + "loss": 1.1541, + "step": 2890 + }, + { + "epoch": 0.08734939759036145, + "grad_norm": 2.2738856304274138, + "learning_rate": 9.961910298451237e-06, + "loss": 1.0623, + "step": 2900 + }, + { + "epoch": 0.08765060240963855, + "grad_norm": 2.5961700941930217, + "learning_rate": 9.96129426825841e-06, + "loss": 1.054, + "step": 2910 + }, + { + "epoch": 0.08795180722891566, + "grad_norm": 3.151722234942918, + "learning_rate": 9.960673315728869e-06, + "loss": 1.0582, + "step": 2920 + }, + { + "epoch": 0.08825301204819277, + "grad_norm": 2.4318221200620447, + "learning_rate": 9.960047441478688e-06, + "loss": 1.0786, + "step": 2930 + }, + { + "epoch": 0.08855421686746988, + "grad_norm": 3.2436328140102466, + "learning_rate": 9.959416646128832e-06, + "loss": 0.9924, + "step": 2940 + }, + { + "epoch": 0.08885542168674698, + "grad_norm": 3.4902327614333366, + "learning_rate": 9.958780930305136e-06, + "loss": 1.0438, + "step": 2950 + }, + { + "epoch": 0.0891566265060241, + "grad_norm": 2.529586079380723, + "learning_rate": 9.95814029463833e-06, + "loss": 1.0668, + "step": 2960 + }, + { + "epoch": 0.0894578313253012, + "grad_norm": 3.5179299925095635, + "learning_rate": 9.957494739764015e-06, + "loss": 1.1369, + "step": 2970 + }, + { + "epoch": 0.08975903614457831, + "grad_norm": 3.209633091677148, + "learning_rate": 9.95684426632268e-06, + "loss": 1.0598, + "step": 2980 + }, + { + "epoch": 0.09006024096385543, + "grad_norm": 2.870985968805948, + "learning_rate": 9.956188874959686e-06, + "loss": 1.1513, + "step": 2990 + }, + { + "epoch": 0.09036144578313253, + "grad_norm": 3.458606378245846, + "learning_rate": 9.955528566325285e-06, + "loss": 1.0767, + "step": 3000 + }, + { + "epoch": 0.09066265060240963, + "grad_norm": 3.44900113672603, + "learning_rate": 9.954863341074597e-06, + "loss": 1.0477, + "step": 3010 + }, + { + "epoch": 0.09096385542168675, + "grad_norm": 2.4691131879834836, + "learning_rate": 9.954193199867622e-06, + "loss": 0.9465, + "step": 3020 + }, + { + "epoch": 0.09126506024096386, + "grad_norm": 3.452353799076586, + "learning_rate": 9.95351814336924e-06, + "loss": 1.0809, + "step": 3030 + }, + { + "epoch": 0.09156626506024096, + "grad_norm": 3.5876569387258, + "learning_rate": 9.95283817224921e-06, + "loss": 1.2361, + "step": 3040 + }, + { + "epoch": 0.09186746987951808, + "grad_norm": 2.3053326000738608, + "learning_rate": 9.952153287182161e-06, + "loss": 1.0254, + "step": 3050 + }, + { + "epoch": 0.09216867469879518, + "grad_norm": 3.312836075166249, + "learning_rate": 9.951463488847598e-06, + "loss": 1.0892, + "step": 3060 + }, + { + "epoch": 0.09246987951807228, + "grad_norm": 3.3874341809742066, + "learning_rate": 9.950768777929906e-06, + "loss": 0.9769, + "step": 3070 + }, + { + "epoch": 0.0927710843373494, + "grad_norm": 4.679167740714582, + "learning_rate": 9.950069155118341e-06, + "loss": 1.1189, + "step": 3080 + }, + { + "epoch": 0.0930722891566265, + "grad_norm": 3.6434603101544174, + "learning_rate": 9.949364621107032e-06, + "loss": 1.1087, + "step": 3090 + }, + { + "epoch": 0.09337349397590361, + "grad_norm": 4.565814160232158, + "learning_rate": 9.948655176594979e-06, + "loss": 1.1342, + "step": 3100 + }, + { + "epoch": 0.09367469879518073, + "grad_norm": 2.300775890404092, + "learning_rate": 9.947940822286058e-06, + "loss": 1.0865, + "step": 3110 + }, + { + "epoch": 0.09397590361445783, + "grad_norm": 3.7003026630602194, + "learning_rate": 9.947221558889011e-06, + "loss": 1.0942, + "step": 3120 + }, + { + "epoch": 0.09427710843373494, + "grad_norm": 3.2932522022342354, + "learning_rate": 9.946497387117456e-06, + "loss": 1.0463, + "step": 3130 + }, + { + "epoch": 0.09457831325301205, + "grad_norm": 2.2104973463777147, + "learning_rate": 9.945768307689878e-06, + "loss": 0.9945, + "step": 3140 + }, + { + "epoch": 0.09487951807228916, + "grad_norm": 3.454729841387448, + "learning_rate": 9.945034321329631e-06, + "loss": 1.0486, + "step": 3150 + }, + { + "epoch": 0.09518072289156626, + "grad_norm": 2.526763130559842, + "learning_rate": 9.944295428764939e-06, + "loss": 1.0415, + "step": 3160 + }, + { + "epoch": 0.09548192771084338, + "grad_norm": 3.3192206957471155, + "learning_rate": 9.94355163072889e-06, + "loss": 1.0743, + "step": 3170 + }, + { + "epoch": 0.09578313253012048, + "grad_norm": 3.6483424767916173, + "learning_rate": 9.942802927959444e-06, + "loss": 0.9687, + "step": 3180 + }, + { + "epoch": 0.09608433734939759, + "grad_norm": 3.550875633521195, + "learning_rate": 9.942049321199423e-06, + "loss": 1.1543, + "step": 3190 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 3.29660823020748, + "learning_rate": 9.941290811196518e-06, + "loss": 1.167, + "step": 3200 + }, + { + "epoch": 0.09668674698795181, + "grad_norm": 3.3986787350060603, + "learning_rate": 9.940527398703282e-06, + "loss": 1.0601, + "step": 3210 + }, + { + "epoch": 0.09698795180722891, + "grad_norm": 2.9548349101010243, + "learning_rate": 9.939759084477132e-06, + "loss": 1.1153, + "step": 3220 + }, + { + "epoch": 0.09728915662650603, + "grad_norm": 2.7975966103960115, + "learning_rate": 9.938985869280352e-06, + "loss": 1.0727, + "step": 3230 + }, + { + "epoch": 0.09759036144578313, + "grad_norm": 3.224219184432969, + "learning_rate": 9.938207753880082e-06, + "loss": 1.0709, + "step": 3240 + }, + { + "epoch": 0.09789156626506024, + "grad_norm": 4.89642920300496, + "learning_rate": 9.937424739048332e-06, + "loss": 1.0952, + "step": 3250 + }, + { + "epoch": 0.09819277108433735, + "grad_norm": 2.3077224024701293, + "learning_rate": 9.936636825561962e-06, + "loss": 1.077, + "step": 3260 + }, + { + "epoch": 0.09849397590361446, + "grad_norm": 3.525517406827571, + "learning_rate": 9.935844014202704e-06, + "loss": 1.09, + "step": 3270 + }, + { + "epoch": 0.09879518072289156, + "grad_norm": 3.3837534625306174, + "learning_rate": 9.935046305757144e-06, + "loss": 1.1552, + "step": 3280 + }, + { + "epoch": 0.09909638554216868, + "grad_norm": 3.3450320696159404, + "learning_rate": 9.934243701016722e-06, + "loss": 1.1078, + "step": 3290 + }, + { + "epoch": 0.09939759036144578, + "grad_norm": 3.2315444842214975, + "learning_rate": 9.933436200777744e-06, + "loss": 1.0719, + "step": 3300 + }, + { + "epoch": 0.09969879518072289, + "grad_norm": 3.681112830375248, + "learning_rate": 9.932623805841369e-06, + "loss": 1.0665, + "step": 3310 + }, + { + "epoch": 0.1, + "grad_norm": 2.4855398408295777, + "learning_rate": 9.931806517013612e-06, + "loss": 0.9008, + "step": 3320 + }, + { + "epoch": 0.10030120481927711, + "grad_norm": 4.2067150888802365, + "learning_rate": 9.930984335105346e-06, + "loss": 1.0229, + "step": 3330 + }, + { + "epoch": 0.10060240963855421, + "grad_norm": 3.6811653466226653, + "learning_rate": 9.930157260932295e-06, + "loss": 0.9746, + "step": 3340 + }, + { + "epoch": 0.10090361445783133, + "grad_norm": 3.867774500184461, + "learning_rate": 9.929325295315039e-06, + "loss": 1.1569, + "step": 3350 + }, + { + "epoch": 0.10120481927710843, + "grad_norm": 3.2169003704498436, + "learning_rate": 9.928488439079012e-06, + "loss": 1.1525, + "step": 3360 + }, + { + "epoch": 0.10150602409638554, + "grad_norm": 2.429935006013882, + "learning_rate": 9.927646693054498e-06, + "loss": 0.9991, + "step": 3370 + }, + { + "epoch": 0.10180722891566266, + "grad_norm": 2.4064532640598353, + "learning_rate": 9.92680005807663e-06, + "loss": 1.0455, + "step": 3380 + }, + { + "epoch": 0.10210843373493976, + "grad_norm": 5.643497312447238, + "learning_rate": 9.925948534985404e-06, + "loss": 1.0037, + "step": 3390 + }, + { + "epoch": 0.10240963855421686, + "grad_norm": 2.4185176278194738, + "learning_rate": 9.925092124625649e-06, + "loss": 0.958, + "step": 3400 + }, + { + "epoch": 0.10271084337349398, + "grad_norm": 2.427364695140659, + "learning_rate": 9.92423082784705e-06, + "loss": 1.0585, + "step": 3410 + }, + { + "epoch": 0.10301204819277109, + "grad_norm": 5.887559987596885, + "learning_rate": 9.923364645504147e-06, + "loss": 1.0812, + "step": 3420 + }, + { + "epoch": 0.10331325301204819, + "grad_norm": 4.6406899191484685, + "learning_rate": 9.922493578456316e-06, + "loss": 1.0793, + "step": 3430 + }, + { + "epoch": 0.10361445783132531, + "grad_norm": 4.74836161300959, + "learning_rate": 9.921617627567786e-06, + "loss": 1.0573, + "step": 3440 + }, + { + "epoch": 0.10391566265060241, + "grad_norm": 3.8895481633008626, + "learning_rate": 9.920736793707627e-06, + "loss": 1.0817, + "step": 3450 + }, + { + "epoch": 0.10421686746987951, + "grad_norm": 2.2925826881912723, + "learning_rate": 9.91985107774976e-06, + "loss": 0.9974, + "step": 3460 + }, + { + "epoch": 0.10451807228915663, + "grad_norm": 2.500834179568333, + "learning_rate": 9.918960480572943e-06, + "loss": 1.161, + "step": 3470 + }, + { + "epoch": 0.10481927710843374, + "grad_norm": 3.140445484276692, + "learning_rate": 9.918065003060782e-06, + "loss": 0.933, + "step": 3480 + }, + { + "epoch": 0.10512048192771084, + "grad_norm": 2.0696270131923695, + "learning_rate": 9.917164646101723e-06, + "loss": 0.9928, + "step": 3490 + }, + { + "epoch": 0.10542168674698796, + "grad_norm": 3.5201557272388504, + "learning_rate": 9.916259410589051e-06, + "loss": 0.9544, + "step": 3500 + }, + { + "epoch": 0.10572289156626506, + "grad_norm": 3.4034684537817044, + "learning_rate": 9.915349297420895e-06, + "loss": 1.0712, + "step": 3510 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 3.8292368668618546, + "learning_rate": 9.914434307500223e-06, + "loss": 1.0634, + "step": 3520 + }, + { + "epoch": 0.10632530120481928, + "grad_norm": 3.217098721578885, + "learning_rate": 9.913514441734837e-06, + "loss": 1.1015, + "step": 3530 + }, + { + "epoch": 0.10662650602409639, + "grad_norm": 2.215923928809928, + "learning_rate": 9.91258970103738e-06, + "loss": 0.9373, + "step": 3540 + }, + { + "epoch": 0.10692771084337349, + "grad_norm": 3.249713627496291, + "learning_rate": 9.911660086325335e-06, + "loss": 1.0283, + "step": 3550 + }, + { + "epoch": 0.10722891566265061, + "grad_norm": 3.2075055711658504, + "learning_rate": 9.910725598521014e-06, + "loss": 1.1366, + "step": 3560 + }, + { + "epoch": 0.10753012048192771, + "grad_norm": 3.1526632646499437, + "learning_rate": 9.909786238551567e-06, + "loss": 1.0981, + "step": 3570 + }, + { + "epoch": 0.10783132530120482, + "grad_norm": 4.012541444087307, + "learning_rate": 9.90884200734898e-06, + "loss": 1.077, + "step": 3580 + }, + { + "epoch": 0.10813253012048193, + "grad_norm": 3.412507998567453, + "learning_rate": 9.907892905850067e-06, + "loss": 1.0265, + "step": 3590 + }, + { + "epoch": 0.10843373493975904, + "grad_norm": 3.3760447554613493, + "learning_rate": 9.906938934996477e-06, + "loss": 0.9877, + "step": 3600 + }, + { + "epoch": 0.10873493975903614, + "grad_norm": 3.152908418084381, + "learning_rate": 9.905980095734693e-06, + "loss": 1.0545, + "step": 3610 + }, + { + "epoch": 0.10903614457831326, + "grad_norm": 4.9422576933511335, + "learning_rate": 9.905016389016022e-06, + "loss": 1.0797, + "step": 3620 + }, + { + "epoch": 0.10933734939759036, + "grad_norm": 2.5713962627640394, + "learning_rate": 9.904047815796606e-06, + "loss": 1.0536, + "step": 3630 + }, + { + "epoch": 0.10963855421686747, + "grad_norm": 3.8361775301205445, + "learning_rate": 9.90307437703741e-06, + "loss": 1.0602, + "step": 3640 + }, + { + "epoch": 0.10993975903614457, + "grad_norm": 4.388280339546095, + "learning_rate": 9.902096073704232e-06, + "loss": 1.1544, + "step": 3650 + }, + { + "epoch": 0.11024096385542169, + "grad_norm": 3.53370160147256, + "learning_rate": 9.901112906767691e-06, + "loss": 1.1583, + "step": 3660 + }, + { + "epoch": 0.11054216867469879, + "grad_norm": 5.587251441249992, + "learning_rate": 9.900124877203234e-06, + "loss": 1.0497, + "step": 3670 + }, + { + "epoch": 0.1108433734939759, + "grad_norm": 2.499547777526994, + "learning_rate": 9.899131985991135e-06, + "loss": 0.9369, + "step": 3680 + }, + { + "epoch": 0.11114457831325301, + "grad_norm": 7.181643890173013, + "learning_rate": 9.898134234116484e-06, + "loss": 0.9935, + "step": 3690 + }, + { + "epoch": 0.11144578313253012, + "grad_norm": 4.832575337671559, + "learning_rate": 9.897131622569203e-06, + "loss": 1.1319, + "step": 3700 + }, + { + "epoch": 0.11174698795180722, + "grad_norm": 3.3856814879475814, + "learning_rate": 9.896124152344026e-06, + "loss": 1.0827, + "step": 3710 + }, + { + "epoch": 0.11204819277108434, + "grad_norm": 3.821240493677679, + "learning_rate": 9.895111824440518e-06, + "loss": 0.96, + "step": 3720 + }, + { + "epoch": 0.11234939759036144, + "grad_norm": 2.2862337649984044, + "learning_rate": 9.894094639863052e-06, + "loss": 1.075, + "step": 3730 + }, + { + "epoch": 0.11265060240963855, + "grad_norm": 3.7350976360281924, + "learning_rate": 9.89307259962083e-06, + "loss": 1.1295, + "step": 3740 + }, + { + "epoch": 0.11295180722891567, + "grad_norm": 5.33116955217106, + "learning_rate": 9.892045704727864e-06, + "loss": 1.0976, + "step": 3750 + }, + { + "epoch": 0.11325301204819277, + "grad_norm": 4.9096763724568175, + "learning_rate": 9.891013956202986e-06, + "loss": 1.0943, + "step": 3760 + }, + { + "epoch": 0.11355421686746987, + "grad_norm": 5.946510447864238, + "learning_rate": 9.889977355069846e-06, + "loss": 0.9954, + "step": 3770 + }, + { + "epoch": 0.11385542168674699, + "grad_norm": 2.5896161633818893, + "learning_rate": 9.8889359023569e-06, + "loss": 1.0303, + "step": 3780 + }, + { + "epoch": 0.1141566265060241, + "grad_norm": 2.682308487012938, + "learning_rate": 9.887889599097427e-06, + "loss": 0.9402, + "step": 3790 + }, + { + "epoch": 0.1144578313253012, + "grad_norm": 8.090386094644277, + "learning_rate": 9.886838446329513e-06, + "loss": 0.993, + "step": 3800 + }, + { + "epoch": 0.11475903614457832, + "grad_norm": 4.9136633248414, + "learning_rate": 9.885782445096054e-06, + "loss": 1.0911, + "step": 3810 + }, + { + "epoch": 0.11506024096385542, + "grad_norm": 11.66087514078214, + "learning_rate": 9.884721596444765e-06, + "loss": 1.0986, + "step": 3820 + }, + { + "epoch": 0.11536144578313252, + "grad_norm": 2.803940405000577, + "learning_rate": 9.88365590142816e-06, + "loss": 0.974, + "step": 3830 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 2.0741399724434197, + "learning_rate": 9.882585361103567e-06, + "loss": 0.9336, + "step": 3840 + }, + { + "epoch": 0.11596385542168675, + "grad_norm": 4.285118614701219, + "learning_rate": 9.881509976533122e-06, + "loss": 1.0348, + "step": 3850 + }, + { + "epoch": 0.11626506024096385, + "grad_norm": 6.401548377417817, + "learning_rate": 9.880429748783765e-06, + "loss": 1.084, + "step": 3860 + }, + { + "epoch": 0.11656626506024097, + "grad_norm": 2.238506957174242, + "learning_rate": 9.879344678927237e-06, + "loss": 0.9916, + "step": 3870 + }, + { + "epoch": 0.11686746987951807, + "grad_norm": 2.1766726826908824, + "learning_rate": 9.878254768040094e-06, + "loss": 0.9253, + "step": 3880 + }, + { + "epoch": 0.11716867469879517, + "grad_norm": 9.605422303557702, + "learning_rate": 9.877160017203685e-06, + "loss": 1.0702, + "step": 3890 + }, + { + "epoch": 0.11746987951807229, + "grad_norm": 6.72304912503531, + "learning_rate": 9.876060427504163e-06, + "loss": 1.0878, + "step": 3900 + }, + { + "epoch": 0.1177710843373494, + "grad_norm": 6.676548799781242, + "learning_rate": 9.874956000032486e-06, + "loss": 1.0945, + "step": 3910 + }, + { + "epoch": 0.1180722891566265, + "grad_norm": 2.2583690591099885, + "learning_rate": 9.87384673588441e-06, + "loss": 0.9939, + "step": 3920 + }, + { + "epoch": 0.11837349397590362, + "grad_norm": 7.146130842348655, + "learning_rate": 9.872732636160485e-06, + "loss": 1.0195, + "step": 3930 + }, + { + "epoch": 0.11867469879518072, + "grad_norm": 7.463148927538527, + "learning_rate": 9.871613701966067e-06, + "loss": 1.0251, + "step": 3940 + }, + { + "epoch": 0.11897590361445783, + "grad_norm": 2.4988058155353463, + "learning_rate": 9.870489934411302e-06, + "loss": 1.0683, + "step": 3950 + }, + { + "epoch": 0.11927710843373494, + "grad_norm": 4.866698415262084, + "learning_rate": 9.869361334611134e-06, + "loss": 1.1394, + "step": 3960 + }, + { + "epoch": 0.11957831325301205, + "grad_norm": 6.302578574320336, + "learning_rate": 9.8682279036853e-06, + "loss": 1.0607, + "step": 3970 + }, + { + "epoch": 0.11987951807228915, + "grad_norm": 3.895202506220607, + "learning_rate": 9.867089642758329e-06, + "loss": 1.1389, + "step": 3980 + }, + { + "epoch": 0.12018072289156627, + "grad_norm": 3.3532605127940087, + "learning_rate": 9.865946552959546e-06, + "loss": 1.0315, + "step": 3990 + }, + { + "epoch": 0.12048192771084337, + "grad_norm": 2.287650829129172, + "learning_rate": 9.864798635423067e-06, + "loss": 1.0001, + "step": 4000 + }, + { + "epoch": 0.12078313253012048, + "grad_norm": 3.1202934651229777, + "learning_rate": 9.863645891287793e-06, + "loss": 1.033, + "step": 4010 + }, + { + "epoch": 0.1210843373493976, + "grad_norm": 3.2086966960342784, + "learning_rate": 9.862488321697414e-06, + "loss": 1.1562, + "step": 4020 + }, + { + "epoch": 0.1213855421686747, + "grad_norm": 2.417660159644811, + "learning_rate": 9.861325927800415e-06, + "loss": 0.9891, + "step": 4030 + }, + { + "epoch": 0.1216867469879518, + "grad_norm": 3.2444385489033545, + "learning_rate": 9.860158710750062e-06, + "loss": 1.1242, + "step": 4040 + }, + { + "epoch": 0.12198795180722892, + "grad_norm": 2.234151030168644, + "learning_rate": 9.858986671704405e-06, + "loss": 0.9638, + "step": 4050 + }, + { + "epoch": 0.12228915662650602, + "grad_norm": 3.235452061511477, + "learning_rate": 9.857809811826277e-06, + "loss": 0.9994, + "step": 4060 + }, + { + "epoch": 0.12259036144578313, + "grad_norm": 2.309966366479653, + "learning_rate": 9.856628132283303e-06, + "loss": 0.9584, + "step": 4070 + }, + { + "epoch": 0.12289156626506025, + "grad_norm": 2.814561986665927, + "learning_rate": 9.85544163424788e-06, + "loss": 1.0083, + "step": 4080 + }, + { + "epoch": 0.12319277108433735, + "grad_norm": 4.05930623486242, + "learning_rate": 9.854250318897188e-06, + "loss": 0.9834, + "step": 4090 + }, + { + "epoch": 0.12349397590361445, + "grad_norm": 3.755891333932205, + "learning_rate": 9.853054187413192e-06, + "loss": 1.1235, + "step": 4100 + }, + { + "epoch": 0.12379518072289157, + "grad_norm": 2.310425777436963, + "learning_rate": 9.851853240982628e-06, + "loss": 1.0613, + "step": 4110 + }, + { + "epoch": 0.12409638554216867, + "grad_norm": 3.083914087842926, + "learning_rate": 9.850647480797011e-06, + "loss": 1.1484, + "step": 4120 + }, + { + "epoch": 0.12439759036144578, + "grad_norm": 3.9021725996989582, + "learning_rate": 9.849436908052636e-06, + "loss": 1.0013, + "step": 4130 + }, + { + "epoch": 0.1246987951807229, + "grad_norm": 4.261045854576861, + "learning_rate": 9.84822152395057e-06, + "loss": 0.988, + "step": 4140 + }, + { + "epoch": 0.125, + "grad_norm": 3.0533766104396562, + "learning_rate": 9.847001329696653e-06, + "loss": 1.1048, + "step": 4150 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 2.8009853883701235, + "learning_rate": 9.845776326501497e-06, + "loss": 0.9714, + "step": 4160 + }, + { + "epoch": 0.1256024096385542, + "grad_norm": 3.7077863547315464, + "learning_rate": 9.844546515580486e-06, + "loss": 1.0074, + "step": 4170 + }, + { + "epoch": 0.12590361445783133, + "grad_norm": 3.5851684616175374, + "learning_rate": 9.843311898153775e-06, + "loss": 0.9621, + "step": 4180 + }, + { + "epoch": 0.12620481927710844, + "grad_norm": 2.0936533248577427, + "learning_rate": 9.842072475446287e-06, + "loss": 0.9984, + "step": 4190 + }, + { + "epoch": 0.12650602409638553, + "grad_norm": 3.236837577173862, + "learning_rate": 9.840828248687714e-06, + "loss": 0.9921, + "step": 4200 + }, + { + "epoch": 0.12680722891566265, + "grad_norm": 4.302891239940509, + "learning_rate": 9.839579219112508e-06, + "loss": 1.0077, + "step": 4210 + }, + { + "epoch": 0.12710843373493977, + "grad_norm": 4.72174446683809, + "learning_rate": 9.838325387959894e-06, + "loss": 1.1164, + "step": 4220 + }, + { + "epoch": 0.12740963855421686, + "grad_norm": 3.53934622620043, + "learning_rate": 9.837066756473858e-06, + "loss": 1.1238, + "step": 4230 + }, + { + "epoch": 0.12771084337349398, + "grad_norm": 2.3670759836719353, + "learning_rate": 9.835803325903146e-06, + "loss": 1.0793, + "step": 4240 + }, + { + "epoch": 0.1280120481927711, + "grad_norm": 3.757187886266233, + "learning_rate": 9.834535097501271e-06, + "loss": 1.0342, + "step": 4250 + }, + { + "epoch": 0.12831325301204818, + "grad_norm": 2.2064521983256453, + "learning_rate": 9.833262072526502e-06, + "loss": 1.016, + "step": 4260 + }, + { + "epoch": 0.1286144578313253, + "grad_norm": 3.504463888375646, + "learning_rate": 9.831984252241864e-06, + "loss": 1.1306, + "step": 4270 + }, + { + "epoch": 0.12891566265060242, + "grad_norm": 3.4352599178275462, + "learning_rate": 9.83070163791515e-06, + "loss": 1.0684, + "step": 4280 + }, + { + "epoch": 0.1292168674698795, + "grad_norm": 2.4608514162669373, + "learning_rate": 9.829414230818897e-06, + "loss": 1.0528, + "step": 4290 + }, + { + "epoch": 0.12951807228915663, + "grad_norm": 3.697431973052916, + "learning_rate": 9.828122032230405e-06, + "loss": 1.0048, + "step": 4300 + }, + { + "epoch": 0.12981927710843374, + "grad_norm": 3.5673373327538562, + "learning_rate": 9.826825043431727e-06, + "loss": 1.0399, + "step": 4310 + }, + { + "epoch": 0.13012048192771083, + "grad_norm": 3.521636320056176, + "learning_rate": 9.825523265709667e-06, + "loss": 1.1146, + "step": 4320 + }, + { + "epoch": 0.13042168674698795, + "grad_norm": 2.212955928751166, + "learning_rate": 9.824216700355781e-06, + "loss": 1.0379, + "step": 4330 + }, + { + "epoch": 0.13072289156626507, + "grad_norm": 4.556411036688682, + "learning_rate": 9.822905348666373e-06, + "loss": 0.979, + "step": 4340 + }, + { + "epoch": 0.13102409638554216, + "grad_norm": 4.1305780843730515, + "learning_rate": 9.8215892119425e-06, + "loss": 1.1257, + "step": 4350 + }, + { + "epoch": 0.13132530120481928, + "grad_norm": 3.736857665673749, + "learning_rate": 9.820268291489962e-06, + "loss": 1.037, + "step": 4360 + }, + { + "epoch": 0.1316265060240964, + "grad_norm": 3.4496883762116344, + "learning_rate": 9.818942588619307e-06, + "loss": 1.1464, + "step": 4370 + }, + { + "epoch": 0.13192771084337349, + "grad_norm": 4.722706276570147, + "learning_rate": 9.81761210464583e-06, + "loss": 1.0105, + "step": 4380 + }, + { + "epoch": 0.1322289156626506, + "grad_norm": 3.674024436386717, + "learning_rate": 9.816276840889569e-06, + "loss": 0.9943, + "step": 4390 + }, + { + "epoch": 0.13253012048192772, + "grad_norm": 4.306251454283166, + "learning_rate": 9.8149367986753e-06, + "loss": 1.1416, + "step": 4400 + }, + { + "epoch": 0.1328313253012048, + "grad_norm": 2.318543131903423, + "learning_rate": 9.81359197933254e-06, + "loss": 1.0301, + "step": 4410 + }, + { + "epoch": 0.13313253012048193, + "grad_norm": 3.21024883306362, + "learning_rate": 9.812242384195553e-06, + "loss": 1.1388, + "step": 4420 + }, + { + "epoch": 0.13343373493975905, + "grad_norm": 4.890422036619223, + "learning_rate": 9.810888014603338e-06, + "loss": 1.1272, + "step": 4430 + }, + { + "epoch": 0.13373493975903614, + "grad_norm": 3.7271056320387834, + "learning_rate": 9.809528871899625e-06, + "loss": 0.9663, + "step": 4440 + }, + { + "epoch": 0.13403614457831325, + "grad_norm": 2.2886395005193227, + "learning_rate": 9.808164957432887e-06, + "loss": 1.0486, + "step": 4450 + }, + { + "epoch": 0.13433734939759037, + "grad_norm": 2.273780413684844, + "learning_rate": 9.806796272556329e-06, + "loss": 1.0658, + "step": 4460 + }, + { + "epoch": 0.13463855421686746, + "grad_norm": 3.4815418700381815, + "learning_rate": 9.805422818627887e-06, + "loss": 1.151, + "step": 4470 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 3.143963049514939, + "learning_rate": 9.80404459701023e-06, + "loss": 1.1618, + "step": 4480 + }, + { + "epoch": 0.1352409638554217, + "grad_norm": 3.8333974143775444, + "learning_rate": 9.80266160907076e-06, + "loss": 1.1307, + "step": 4490 + }, + { + "epoch": 0.1355421686746988, + "grad_norm": 3.796359402263128, + "learning_rate": 9.801273856181603e-06, + "loss": 1.0186, + "step": 4500 + }, + { + "epoch": 0.1358433734939759, + "grad_norm": 3.526380098069618, + "learning_rate": 9.799881339719615e-06, + "loss": 1.1202, + "step": 4510 + }, + { + "epoch": 0.13614457831325302, + "grad_norm": 2.3799394298701637, + "learning_rate": 9.798484061066379e-06, + "loss": 1.1168, + "step": 4520 + }, + { + "epoch": 0.1364457831325301, + "grad_norm": 2.3420014684851416, + "learning_rate": 9.797082021608199e-06, + "loss": 1.0472, + "step": 4530 + }, + { + "epoch": 0.13674698795180723, + "grad_norm": 3.8824369336877993, + "learning_rate": 9.795675222736109e-06, + "loss": 1.1207, + "step": 4540 + }, + { + "epoch": 0.13704819277108435, + "grad_norm": 3.7880901774233027, + "learning_rate": 9.794263665845862e-06, + "loss": 1.1036, + "step": 4550 + }, + { + "epoch": 0.13734939759036144, + "grad_norm": 2.2568634543924944, + "learning_rate": 9.792847352337926e-06, + "loss": 1.0919, + "step": 4560 + }, + { + "epoch": 0.13765060240963856, + "grad_norm": 3.1778188343088205, + "learning_rate": 9.791426283617497e-06, + "loss": 1.1278, + "step": 4570 + }, + { + "epoch": 0.13795180722891567, + "grad_norm": 3.224250552306772, + "learning_rate": 9.790000461094483e-06, + "loss": 0.982, + "step": 4580 + }, + { + "epoch": 0.13825301204819276, + "grad_norm": 3.1585896158608344, + "learning_rate": 9.788569886183511e-06, + "loss": 1.1179, + "step": 4590 + }, + { + "epoch": 0.13855421686746988, + "grad_norm": 3.462414772444446, + "learning_rate": 9.787134560303925e-06, + "loss": 1.0827, + "step": 4600 + }, + { + "epoch": 0.138855421686747, + "grad_norm": 2.0866169715331084, + "learning_rate": 9.785694484879778e-06, + "loss": 0.988, + "step": 4610 + }, + { + "epoch": 0.1391566265060241, + "grad_norm": 3.389979691122731, + "learning_rate": 9.784249661339838e-06, + "loss": 1.1036, + "step": 4620 + }, + { + "epoch": 0.1394578313253012, + "grad_norm": 5.210842898777085, + "learning_rate": 9.782800091117581e-06, + "loss": 1.0532, + "step": 4630 + }, + { + "epoch": 0.13975903614457832, + "grad_norm": 2.2035718957581456, + "learning_rate": 9.7813457756512e-06, + "loss": 0.9751, + "step": 4640 + }, + { + "epoch": 0.14006024096385541, + "grad_norm": 5.163817863609091, + "learning_rate": 9.779886716383585e-06, + "loss": 1.0447, + "step": 4650 + }, + { + "epoch": 0.14036144578313253, + "grad_norm": 2.1874523291711427, + "learning_rate": 9.778422914762342e-06, + "loss": 1.0163, + "step": 4660 + }, + { + "epoch": 0.14066265060240965, + "grad_norm": 3.3593565331746476, + "learning_rate": 9.776954372239778e-06, + "loss": 1.0752, + "step": 4670 + }, + { + "epoch": 0.14096385542168674, + "grad_norm": 6.561474234949798, + "learning_rate": 9.775481090272903e-06, + "loss": 1.0352, + "step": 4680 + }, + { + "epoch": 0.14126506024096386, + "grad_norm": 2.2764790120471474, + "learning_rate": 9.774003070323433e-06, + "loss": 1.0816, + "step": 4690 + }, + { + "epoch": 0.14156626506024098, + "grad_norm": 4.284699327442535, + "learning_rate": 9.772520313857777e-06, + "loss": 1.0362, + "step": 4700 + }, + { + "epoch": 0.14186746987951807, + "grad_norm": 5.622862962661047, + "learning_rate": 9.771032822347052e-06, + "loss": 1.1013, + "step": 4710 + }, + { + "epoch": 0.14216867469879518, + "grad_norm": 3.5193476260860708, + "learning_rate": 9.76954059726707e-06, + "loss": 1.0359, + "step": 4720 + }, + { + "epoch": 0.1424698795180723, + "grad_norm": 3.1635234057670396, + "learning_rate": 9.768043640098337e-06, + "loss": 1.0838, + "step": 4730 + }, + { + "epoch": 0.1427710843373494, + "grad_norm": 5.3147053949696845, + "learning_rate": 9.766541952326055e-06, + "loss": 1.0217, + "step": 4740 + }, + { + "epoch": 0.1430722891566265, + "grad_norm": 3.9502160134496602, + "learning_rate": 9.765035535440122e-06, + "loss": 1.0724, + "step": 4750 + }, + { + "epoch": 0.14337349397590363, + "grad_norm": 3.2204727664784274, + "learning_rate": 9.763524390935124e-06, + "loss": 1.0666, + "step": 4760 + }, + { + "epoch": 0.14367469879518072, + "grad_norm": 3.1741778820868247, + "learning_rate": 9.762008520310341e-06, + "loss": 0.963, + "step": 4770 + }, + { + "epoch": 0.14397590361445783, + "grad_norm": 4.733626504373592, + "learning_rate": 9.760487925069742e-06, + "loss": 0.9357, + "step": 4780 + }, + { + "epoch": 0.14427710843373495, + "grad_norm": 4.318346784048267, + "learning_rate": 9.75896260672198e-06, + "loss": 1.0624, + "step": 4790 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 2.327064879984706, + "learning_rate": 9.757432566780394e-06, + "loss": 1.0039, + "step": 4800 + }, + { + "epoch": 0.14487951807228916, + "grad_norm": 3.655317981523251, + "learning_rate": 9.755897806763015e-06, + "loss": 1.1628, + "step": 4810 + }, + { + "epoch": 0.14518072289156628, + "grad_norm": 3.1522492880450566, + "learning_rate": 9.754358328192546e-06, + "loss": 1.0239, + "step": 4820 + }, + { + "epoch": 0.14548192771084337, + "grad_norm": 3.351343494801427, + "learning_rate": 9.752814132596382e-06, + "loss": 1.1085, + "step": 4830 + }, + { + "epoch": 0.14578313253012049, + "grad_norm": 3.278068913765995, + "learning_rate": 9.75126522150659e-06, + "loss": 1.0416, + "step": 4840 + }, + { + "epoch": 0.1460843373493976, + "grad_norm": 3.2726257917049435, + "learning_rate": 9.74971159645992e-06, + "loss": 1.0441, + "step": 4850 + }, + { + "epoch": 0.1463855421686747, + "grad_norm": 2.3833415910525386, + "learning_rate": 9.748153258997795e-06, + "loss": 1.0152, + "step": 4860 + }, + { + "epoch": 0.1466867469879518, + "grad_norm": 3.231056424379859, + "learning_rate": 9.746590210666318e-06, + "loss": 1.1212, + "step": 4870 + }, + { + "epoch": 0.14698795180722893, + "grad_norm": 4.140471833383135, + "learning_rate": 9.745022453016265e-06, + "loss": 0.9856, + "step": 4880 + }, + { + "epoch": 0.14728915662650602, + "grad_norm": 2.0937714976689143, + "learning_rate": 9.743449987603082e-06, + "loss": 1.0324, + "step": 4890 + }, + { + "epoch": 0.14759036144578314, + "grad_norm": 4.2575104490906845, + "learning_rate": 9.741872815986888e-06, + "loss": 1.0921, + "step": 4900 + }, + { + "epoch": 0.14789156626506025, + "grad_norm": 2.9103859190839247, + "learning_rate": 9.740290939732467e-06, + "loss": 0.9322, + "step": 4910 + }, + { + "epoch": 0.14819277108433734, + "grad_norm": 2.721531202849386, + "learning_rate": 9.738704360409276e-06, + "loss": 1.1442, + "step": 4920 + }, + { + "epoch": 0.14849397590361446, + "grad_norm": 2.050446101251248, + "learning_rate": 9.737113079591438e-06, + "loss": 1.0194, + "step": 4930 + }, + { + "epoch": 0.14879518072289158, + "grad_norm": 3.3305624788137864, + "learning_rate": 9.735517098857735e-06, + "loss": 0.9831, + "step": 4940 + }, + { + "epoch": 0.14909638554216867, + "grad_norm": 2.420983075223732, + "learning_rate": 9.733916419791618e-06, + "loss": 1.0134, + "step": 4950 + }, + { + "epoch": 0.1493975903614458, + "grad_norm": 2.8134241849328574, + "learning_rate": 9.732311043981195e-06, + "loss": 1.0931, + "step": 4960 + }, + { + "epoch": 0.1496987951807229, + "grad_norm": 3.1472681936635483, + "learning_rate": 9.730700973019239e-06, + "loss": 1.0581, + "step": 4970 + }, + { + "epoch": 0.15, + "grad_norm": 3.0810334766317093, + "learning_rate": 9.729086208503174e-06, + "loss": 1.0027, + "step": 4980 + }, + { + "epoch": 0.1503012048192771, + "grad_norm": 2.9206618313633395, + "learning_rate": 9.727466752035088e-06, + "loss": 0.9923, + "step": 4990 + }, + { + "epoch": 0.15060240963855423, + "grad_norm": 3.2699697753281587, + "learning_rate": 9.725842605221721e-06, + "loss": 1.1164, + "step": 5000 + }, + { + "epoch": 0.15090361445783132, + "grad_norm": 6.773061296964998, + "learning_rate": 9.724213769674466e-06, + "loss": 1.0413, + "step": 5010 + }, + { + "epoch": 0.15120481927710844, + "grad_norm": 3.348064035041207, + "learning_rate": 9.722580247009367e-06, + "loss": 1.0733, + "step": 5020 + }, + { + "epoch": 0.15150602409638556, + "grad_norm": 4.123785046042495, + "learning_rate": 9.720942038847123e-06, + "loss": 1.0695, + "step": 5030 + }, + { + "epoch": 0.15180722891566265, + "grad_norm": 6.6390263479865075, + "learning_rate": 9.719299146813074e-06, + "loss": 1.0102, + "step": 5040 + }, + { + "epoch": 0.15210843373493976, + "grad_norm": 3.183406080712184, + "learning_rate": 9.717651572537217e-06, + "loss": 1.0317, + "step": 5050 + }, + { + "epoch": 0.15240963855421688, + "grad_norm": 2.209915307656265, + "learning_rate": 9.715999317654186e-06, + "loss": 0.9702, + "step": 5060 + }, + { + "epoch": 0.15271084337349397, + "grad_norm": 3.71668545598457, + "learning_rate": 9.714342383803259e-06, + "loss": 1.1026, + "step": 5070 + }, + { + "epoch": 0.1530120481927711, + "grad_norm": 3.3454059076645373, + "learning_rate": 9.712680772628365e-06, + "loss": 1.024, + "step": 5080 + }, + { + "epoch": 0.1533132530120482, + "grad_norm": 3.7676266694645033, + "learning_rate": 9.711014485778063e-06, + "loss": 1.0529, + "step": 5090 + }, + { + "epoch": 0.1536144578313253, + "grad_norm": 2.3911391873701313, + "learning_rate": 9.709343524905558e-06, + "loss": 1.0187, + "step": 5100 + }, + { + "epoch": 0.15391566265060241, + "grad_norm": 4.183322203598176, + "learning_rate": 9.70766789166869e-06, + "loss": 0.9653, + "step": 5110 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 3.4195694386180646, + "learning_rate": 9.705987587729932e-06, + "loss": 1.0815, + "step": 5120 + }, + { + "epoch": 0.15451807228915662, + "grad_norm": 4.601648726229802, + "learning_rate": 9.7043026147564e-06, + "loss": 1.07, + "step": 5130 + }, + { + "epoch": 0.15481927710843374, + "grad_norm": 2.1251037089708418, + "learning_rate": 9.702612974419828e-06, + "loss": 1.0668, + "step": 5140 + }, + { + "epoch": 0.15512048192771086, + "grad_norm": 2.1782790790900224, + "learning_rate": 9.700918668396595e-06, + "loss": 1.0139, + "step": 5150 + }, + { + "epoch": 0.15542168674698795, + "grad_norm": 3.251835279628236, + "learning_rate": 9.699219698367698e-06, + "loss": 1.1551, + "step": 5160 + }, + { + "epoch": 0.15572289156626506, + "grad_norm": 3.53702536080039, + "learning_rate": 9.697516066018771e-06, + "loss": 0.9878, + "step": 5170 + }, + { + "epoch": 0.15602409638554218, + "grad_norm": 3.2199102356574096, + "learning_rate": 9.695807773040066e-06, + "loss": 1.0814, + "step": 5180 + }, + { + "epoch": 0.15632530120481927, + "grad_norm": 2.279979865855527, + "learning_rate": 9.694094821126463e-06, + "loss": 0.8722, + "step": 5190 + }, + { + "epoch": 0.1566265060240964, + "grad_norm": 3.414618660368347, + "learning_rate": 9.692377211977463e-06, + "loss": 1.1059, + "step": 5200 + }, + { + "epoch": 0.15692771084337348, + "grad_norm": 2.1338305587010895, + "learning_rate": 9.690654947297187e-06, + "loss": 0.9732, + "step": 5210 + }, + { + "epoch": 0.1572289156626506, + "grad_norm": 3.3926239553083364, + "learning_rate": 9.688928028794378e-06, + "loss": 1.0287, + "step": 5220 + }, + { + "epoch": 0.15753012048192772, + "grad_norm": 3.2978616424156866, + "learning_rate": 9.687196458182393e-06, + "loss": 1.0199, + "step": 5230 + }, + { + "epoch": 0.1578313253012048, + "grad_norm": 4.073536092982219, + "learning_rate": 9.685460237179205e-06, + "loss": 1.0905, + "step": 5240 + }, + { + "epoch": 0.15813253012048192, + "grad_norm": 3.332084115789668, + "learning_rate": 9.683719367507403e-06, + "loss": 1.0608, + "step": 5250 + }, + { + "epoch": 0.15843373493975904, + "grad_norm": 3.9475889479040536, + "learning_rate": 9.681973850894187e-06, + "loss": 1.0316, + "step": 5260 + }, + { + "epoch": 0.15873493975903613, + "grad_norm": 4.379157434561739, + "learning_rate": 9.680223689071364e-06, + "loss": 1.0919, + "step": 5270 + }, + { + "epoch": 0.15903614457831325, + "grad_norm": 3.0403670956453666, + "learning_rate": 9.678468883775357e-06, + "loss": 0.9766, + "step": 5280 + }, + { + "epoch": 0.15933734939759037, + "grad_norm": 3.8321049042223034, + "learning_rate": 9.676709436747192e-06, + "loss": 1.0237, + "step": 5290 + }, + { + "epoch": 0.15963855421686746, + "grad_norm": 3.410946507894495, + "learning_rate": 9.674945349732498e-06, + "loss": 1.0378, + "step": 5300 + }, + { + "epoch": 0.15993975903614457, + "grad_norm": 3.1167832390652057, + "learning_rate": 9.673176624481514e-06, + "loss": 1.073, + "step": 5310 + }, + { + "epoch": 0.1602409638554217, + "grad_norm": 2.627554451968262, + "learning_rate": 9.67140326274907e-06, + "loss": 0.8978, + "step": 5320 + }, + { + "epoch": 0.16054216867469878, + "grad_norm": 2.91273188611948, + "learning_rate": 9.669625266294607e-06, + "loss": 1.0324, + "step": 5330 + }, + { + "epoch": 0.1608433734939759, + "grad_norm": 3.6348902919380652, + "learning_rate": 9.667842636882161e-06, + "loss": 1.014, + "step": 5340 + }, + { + "epoch": 0.16114457831325302, + "grad_norm": 3.1362788924676175, + "learning_rate": 9.66605537628036e-06, + "loss": 1.1956, + "step": 5350 + }, + { + "epoch": 0.1614457831325301, + "grad_norm": 2.325065922275079, + "learning_rate": 9.664263486262435e-06, + "loss": 1.139, + "step": 5360 + }, + { + "epoch": 0.16174698795180723, + "grad_norm": 2.244733354608646, + "learning_rate": 9.662466968606204e-06, + "loss": 0.9976, + "step": 5370 + }, + { + "epoch": 0.16204819277108434, + "grad_norm": 2.2571621339890933, + "learning_rate": 9.660665825094075e-06, + "loss": 1.0131, + "step": 5380 + }, + { + "epoch": 0.16234939759036143, + "grad_norm": 3.162499695346246, + "learning_rate": 9.658860057513051e-06, + "loss": 1.0787, + "step": 5390 + }, + { + "epoch": 0.16265060240963855, + "grad_norm": 2.4532344817953757, + "learning_rate": 9.65704966765472e-06, + "loss": 1.0239, + "step": 5400 + }, + { + "epoch": 0.16295180722891567, + "grad_norm": 4.524706172091528, + "learning_rate": 9.655234657315255e-06, + "loss": 1.0448, + "step": 5410 + }, + { + "epoch": 0.16325301204819276, + "grad_norm": 3.6682300562837824, + "learning_rate": 9.653415028295416e-06, + "loss": 1.0477, + "step": 5420 + }, + { + "epoch": 0.16355421686746988, + "grad_norm": 3.296338353127837, + "learning_rate": 9.651590782400544e-06, + "loss": 1.0617, + "step": 5430 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 4.166905656035678, + "learning_rate": 9.649761921440559e-06, + "loss": 1.1264, + "step": 5440 + }, + { + "epoch": 0.16415662650602408, + "grad_norm": 3.0669389796408577, + "learning_rate": 9.64792844722996e-06, + "loss": 1.1601, + "step": 5450 + }, + { + "epoch": 0.1644578313253012, + "grad_norm": 3.158083945931002, + "learning_rate": 9.646090361587828e-06, + "loss": 1.0575, + "step": 5460 + }, + { + "epoch": 0.16475903614457832, + "grad_norm": 3.9595012649954433, + "learning_rate": 9.644247666337811e-06, + "loss": 0.9377, + "step": 5470 + }, + { + "epoch": 0.1650602409638554, + "grad_norm": 7.823524146210229, + "learning_rate": 9.642400363308138e-06, + "loss": 1.1025, + "step": 5480 + }, + { + "epoch": 0.16536144578313253, + "grad_norm": 3.293888596495116, + "learning_rate": 9.640548454331605e-06, + "loss": 0.9792, + "step": 5490 + }, + { + "epoch": 0.16566265060240964, + "grad_norm": 2.2819294235665177, + "learning_rate": 9.638691941245581e-06, + "loss": 0.9978, + "step": 5500 + }, + { + "epoch": 0.16596385542168673, + "grad_norm": 3.3342689743151426, + "learning_rate": 9.636830825891999e-06, + "loss": 1.0293, + "step": 5510 + }, + { + "epoch": 0.16626506024096385, + "grad_norm": 3.3822531399355977, + "learning_rate": 9.63496511011736e-06, + "loss": 0.9436, + "step": 5520 + }, + { + "epoch": 0.16656626506024097, + "grad_norm": 2.255208646786481, + "learning_rate": 9.633094795772732e-06, + "loss": 1.0041, + "step": 5530 + }, + { + "epoch": 0.16686746987951806, + "grad_norm": 4.442125013498681, + "learning_rate": 9.631219884713744e-06, + "loss": 1.051, + "step": 5540 + }, + { + "epoch": 0.16716867469879518, + "grad_norm": 3.956265628275085, + "learning_rate": 9.62934037880058e-06, + "loss": 1.0325, + "step": 5550 + }, + { + "epoch": 0.1674698795180723, + "grad_norm": 4.556155077351476, + "learning_rate": 9.627456279897992e-06, + "loss": 1.09, + "step": 5560 + }, + { + "epoch": 0.16777108433734939, + "grad_norm": 3.899010336394591, + "learning_rate": 9.625567589875282e-06, + "loss": 1.0844, + "step": 5570 + }, + { + "epoch": 0.1680722891566265, + "grad_norm": 4.1025563838568075, + "learning_rate": 9.62367431060631e-06, + "loss": 1.0263, + "step": 5580 + }, + { + "epoch": 0.16837349397590362, + "grad_norm": 3.261371165963412, + "learning_rate": 9.621776443969488e-06, + "loss": 0.9759, + "step": 5590 + }, + { + "epoch": 0.1686746987951807, + "grad_norm": 3.8351126648010445, + "learning_rate": 9.619873991847782e-06, + "loss": 1.0331, + "step": 5600 + }, + { + "epoch": 0.16897590361445783, + "grad_norm": 10.385110842644648, + "learning_rate": 9.617966956128704e-06, + "loss": 1.026, + "step": 5610 + }, + { + "epoch": 0.16927710843373495, + "grad_norm": 3.9904039368640944, + "learning_rate": 9.616055338704313e-06, + "loss": 0.87, + "step": 5620 + }, + { + "epoch": 0.16957831325301204, + "grad_norm": 4.251155600976589, + "learning_rate": 9.614139141471221e-06, + "loss": 0.8925, + "step": 5630 + }, + { + "epoch": 0.16987951807228915, + "grad_norm": 5.092732483832327, + "learning_rate": 9.612218366330574e-06, + "loss": 1.0587, + "step": 5640 + }, + { + "epoch": 0.17018072289156627, + "grad_norm": 5.56784543715876, + "learning_rate": 9.610293015188067e-06, + "loss": 1.1863, + "step": 5650 + }, + { + "epoch": 0.17048192771084336, + "grad_norm": 2.085502556311897, + "learning_rate": 9.608363089953933e-06, + "loss": 0.9125, + "step": 5660 + }, + { + "epoch": 0.17078313253012048, + "grad_norm": 3.805266632887387, + "learning_rate": 9.606428592542942e-06, + "loss": 1.1807, + "step": 5670 + }, + { + "epoch": 0.1710843373493976, + "grad_norm": 2.176487423411828, + "learning_rate": 9.604489524874404e-06, + "loss": 1.0128, + "step": 5680 + }, + { + "epoch": 0.1713855421686747, + "grad_norm": 3.511119177564224, + "learning_rate": 9.602545888872153e-06, + "loss": 0.8843, + "step": 5690 + }, + { + "epoch": 0.1716867469879518, + "grad_norm": 4.65210599013388, + "learning_rate": 9.60059768646457e-06, + "loss": 1.0541, + "step": 5700 + }, + { + "epoch": 0.17198795180722892, + "grad_norm": 5.177770181975888, + "learning_rate": 9.598644919584557e-06, + "loss": 1.1533, + "step": 5710 + }, + { + "epoch": 0.172289156626506, + "grad_norm": 2.7868109404114447, + "learning_rate": 9.596687590169547e-06, + "loss": 0.9214, + "step": 5720 + }, + { + "epoch": 0.17259036144578313, + "grad_norm": 3.976355975955576, + "learning_rate": 9.594725700161503e-06, + "loss": 1.1111, + "step": 5730 + }, + { + "epoch": 0.17289156626506025, + "grad_norm": 3.8197839655454024, + "learning_rate": 9.592759251506905e-06, + "loss": 1.075, + "step": 5740 + }, + { + "epoch": 0.17319277108433734, + "grad_norm": 4.164754554800956, + "learning_rate": 9.590788246156762e-06, + "loss": 1.0444, + "step": 5750 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 2.055612662165044, + "learning_rate": 9.588812686066604e-06, + "loss": 1.0374, + "step": 5760 + }, + { + "epoch": 0.17379518072289157, + "grad_norm": 2.2807599206085913, + "learning_rate": 9.586832573196477e-06, + "loss": 1.0652, + "step": 5770 + }, + { + "epoch": 0.17409638554216866, + "grad_norm": 3.578007650323301, + "learning_rate": 9.584847909510947e-06, + "loss": 0.9844, + "step": 5780 + }, + { + "epoch": 0.17439759036144578, + "grad_norm": 2.2768377854701187, + "learning_rate": 9.582858696979092e-06, + "loss": 0.998, + "step": 5790 + }, + { + "epoch": 0.1746987951807229, + "grad_norm": 17.390522458209965, + "learning_rate": 9.580864937574503e-06, + "loss": 0.9881, + "step": 5800 + }, + { + "epoch": 0.175, + "grad_norm": 2.999305191298459, + "learning_rate": 9.578866633275289e-06, + "loss": 1.0092, + "step": 5810 + }, + { + "epoch": 0.1753012048192771, + "grad_norm": 3.827981839453712, + "learning_rate": 9.576863786064056e-06, + "loss": 1.0557, + "step": 5820 + }, + { + "epoch": 0.17560240963855422, + "grad_norm": 3.397036893241917, + "learning_rate": 9.574856397927932e-06, + "loss": 1.0643, + "step": 5830 + }, + { + "epoch": 0.17590361445783131, + "grad_norm": 2.2348195275879577, + "learning_rate": 9.572844470858537e-06, + "loss": 1.0859, + "step": 5840 + }, + { + "epoch": 0.17620481927710843, + "grad_norm": 2.948518431736692, + "learning_rate": 9.570828006852002e-06, + "loss": 1.1199, + "step": 5850 + }, + { + "epoch": 0.17650602409638555, + "grad_norm": 2.07783596473763, + "learning_rate": 9.568807007908956e-06, + "loss": 0.9549, + "step": 5860 + }, + { + "epoch": 0.17680722891566264, + "grad_norm": 3.563921820764686, + "learning_rate": 9.56678147603453e-06, + "loss": 0.9696, + "step": 5870 + }, + { + "epoch": 0.17710843373493976, + "grad_norm": 5.512277139507565, + "learning_rate": 9.564751413238348e-06, + "loss": 1.1727, + "step": 5880 + }, + { + "epoch": 0.17740963855421688, + "grad_norm": 3.4600431387569155, + "learning_rate": 9.562716821534536e-06, + "loss": 1.0864, + "step": 5890 + }, + { + "epoch": 0.17771084337349397, + "grad_norm": 4.093428608264867, + "learning_rate": 9.560677702941708e-06, + "loss": 1.0945, + "step": 5900 + }, + { + "epoch": 0.17801204819277108, + "grad_norm": 2.09148292103574, + "learning_rate": 9.558634059482967e-06, + "loss": 0.8993, + "step": 5910 + }, + { + "epoch": 0.1783132530120482, + "grad_norm": 3.281318600861496, + "learning_rate": 9.556585893185916e-06, + "loss": 0.9695, + "step": 5920 + }, + { + "epoch": 0.1786144578313253, + "grad_norm": 3.9318585105629578, + "learning_rate": 9.554533206082633e-06, + "loss": 1.1362, + "step": 5930 + }, + { + "epoch": 0.1789156626506024, + "grad_norm": 3.518532574219804, + "learning_rate": 9.55247600020969e-06, + "loss": 1.0634, + "step": 5940 + }, + { + "epoch": 0.17921686746987953, + "grad_norm": 4.155557513315787, + "learning_rate": 9.550414277608138e-06, + "loss": 0.9822, + "step": 5950 + }, + { + "epoch": 0.17951807228915662, + "grad_norm": 3.7175217066617, + "learning_rate": 9.54834804032351e-06, + "loss": 1.0538, + "step": 5960 + }, + { + "epoch": 0.17981927710843373, + "grad_norm": 3.498416998598756, + "learning_rate": 9.546277290405816e-06, + "loss": 1.0399, + "step": 5970 + }, + { + "epoch": 0.18012048192771085, + "grad_norm": 3.458768120804074, + "learning_rate": 9.544202029909553e-06, + "loss": 1.0953, + "step": 5980 + }, + { + "epoch": 0.18042168674698794, + "grad_norm": 3.1602574880014482, + "learning_rate": 9.542122260893678e-06, + "loss": 0.946, + "step": 5990 + }, + { + "epoch": 0.18072289156626506, + "grad_norm": 4.6804582295334205, + "learning_rate": 9.540037985421635e-06, + "loss": 0.9814, + "step": 6000 + }, + { + "epoch": 0.18102409638554218, + "grad_norm": 3.6710468665569405, + "learning_rate": 9.537949205561328e-06, + "loss": 1.1269, + "step": 6010 + }, + { + "epoch": 0.18132530120481927, + "grad_norm": 4.293441762407758, + "learning_rate": 9.53585592338514e-06, + "loss": 1.0217, + "step": 6020 + }, + { + "epoch": 0.18162650602409638, + "grad_norm": 3.761636872276159, + "learning_rate": 9.533758140969913e-06, + "loss": 1.0593, + "step": 6030 + }, + { + "epoch": 0.1819277108433735, + "grad_norm": 2.0282710989656683, + "learning_rate": 9.53165586039696e-06, + "loss": 1.0833, + "step": 6040 + }, + { + "epoch": 0.1822289156626506, + "grad_norm": 3.6331857587787644, + "learning_rate": 9.52954908375205e-06, + "loss": 1.1333, + "step": 6050 + }, + { + "epoch": 0.1825301204819277, + "grad_norm": 2.1858602036458006, + "learning_rate": 9.527437813125418e-06, + "loss": 1.0902, + "step": 6060 + }, + { + "epoch": 0.18283132530120483, + "grad_norm": 4.342594825620618, + "learning_rate": 9.525322050611757e-06, + "loss": 1.0631, + "step": 6070 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 4.44971842951979, + "learning_rate": 9.523201798310216e-06, + "loss": 1.0879, + "step": 6080 + }, + { + "epoch": 0.18343373493975904, + "grad_norm": 3.7729594924080487, + "learning_rate": 9.521077058324396e-06, + "loss": 1.0204, + "step": 6090 + }, + { + "epoch": 0.18373493975903615, + "grad_norm": 3.510001474937845, + "learning_rate": 9.518947832762358e-06, + "loss": 0.9884, + "step": 6100 + }, + { + "epoch": 0.18403614457831324, + "grad_norm": 5.146451089418775, + "learning_rate": 9.516814123736604e-06, + "loss": 1.063, + "step": 6110 + }, + { + "epoch": 0.18433734939759036, + "grad_norm": 4.001807019595258, + "learning_rate": 9.51467593336409e-06, + "loss": 1.004, + "step": 6120 + }, + { + "epoch": 0.18463855421686748, + "grad_norm": 2.0613442946765694, + "learning_rate": 9.512533263766217e-06, + "loss": 0.9519, + "step": 6130 + }, + { + "epoch": 0.18493975903614457, + "grad_norm": 3.9775071093580885, + "learning_rate": 9.510386117068828e-06, + "loss": 0.9807, + "step": 6140 + }, + { + "epoch": 0.1852409638554217, + "grad_norm": 2.1522460573260003, + "learning_rate": 9.508234495402211e-06, + "loss": 0.9739, + "step": 6150 + }, + { + "epoch": 0.1855421686746988, + "grad_norm": 5.011296767337308, + "learning_rate": 9.506078400901092e-06, + "loss": 1.0179, + "step": 6160 + }, + { + "epoch": 0.1858433734939759, + "grad_norm": 16.124866692782845, + "learning_rate": 9.503917835704638e-06, + "loss": 1.0036, + "step": 6170 + }, + { + "epoch": 0.186144578313253, + "grad_norm": 2.0356647653715245, + "learning_rate": 9.501752801956442e-06, + "loss": 0.9634, + "step": 6180 + }, + { + "epoch": 0.18644578313253013, + "grad_norm": 7.487542173226552, + "learning_rate": 9.499583301804544e-06, + "loss": 0.9596, + "step": 6190 + }, + { + "epoch": 0.18674698795180722, + "grad_norm": 2.021399937359315, + "learning_rate": 9.497409337401405e-06, + "loss": 1.027, + "step": 6200 + }, + { + "epoch": 0.18704819277108434, + "grad_norm": 7.259619648806638, + "learning_rate": 9.495230910903921e-06, + "loss": 1.0857, + "step": 6210 + }, + { + "epoch": 0.18734939759036146, + "grad_norm": 6.860284296746412, + "learning_rate": 9.493048024473413e-06, + "loss": 1.0305, + "step": 6220 + }, + { + "epoch": 0.18765060240963854, + "grad_norm": 4.694009851591792, + "learning_rate": 9.490860680275623e-06, + "loss": 1.0267, + "step": 6230 + }, + { + "epoch": 0.18795180722891566, + "grad_norm": 6.55172407409244, + "learning_rate": 9.488668880480722e-06, + "loss": 0.9902, + "step": 6240 + }, + { + "epoch": 0.18825301204819278, + "grad_norm": 20.98587198042032, + "learning_rate": 9.486472627263302e-06, + "loss": 0.9025, + "step": 6250 + }, + { + "epoch": 0.18855421686746987, + "grad_norm": 2.122218322085193, + "learning_rate": 9.484271922802364e-06, + "loss": 1.0299, + "step": 6260 + }, + { + "epoch": 0.188855421686747, + "grad_norm": 4.352710660528723, + "learning_rate": 9.482066769281339e-06, + "loss": 1.0839, + "step": 6270 + }, + { + "epoch": 0.1891566265060241, + "grad_norm": 3.557693000462193, + "learning_rate": 9.47985716888806e-06, + "loss": 1.019, + "step": 6280 + }, + { + "epoch": 0.1894578313253012, + "grad_norm": 2.052304906280264, + "learning_rate": 9.47764312381478e-06, + "loss": 1.0287, + "step": 6290 + }, + { + "epoch": 0.1897590361445783, + "grad_norm": 2.1546406212728684, + "learning_rate": 9.475424636258154e-06, + "loss": 0.9923, + "step": 6300 + }, + { + "epoch": 0.19006024096385543, + "grad_norm": 3.432155846270581, + "learning_rate": 9.473201708419255e-06, + "loss": 0.9993, + "step": 6310 + }, + { + "epoch": 0.19036144578313252, + "grad_norm": 3.3836314481831162, + "learning_rate": 9.470974342503554e-06, + "loss": 1.0539, + "step": 6320 + }, + { + "epoch": 0.19066265060240964, + "grad_norm": 6.410959391767066, + "learning_rate": 9.468742540720925e-06, + "loss": 1.0704, + "step": 6330 + }, + { + "epoch": 0.19096385542168676, + "grad_norm": 3.3994587418544686, + "learning_rate": 9.466506305285648e-06, + "loss": 0.9905, + "step": 6340 + }, + { + "epoch": 0.19126506024096385, + "grad_norm": 3.122653146876219, + "learning_rate": 9.464265638416397e-06, + "loss": 0.9528, + "step": 6350 + }, + { + "epoch": 0.19156626506024096, + "grad_norm": 3.516254140668866, + "learning_rate": 9.462020542336243e-06, + "loss": 0.9787, + "step": 6360 + }, + { + "epoch": 0.19186746987951808, + "grad_norm": 2.210414613219828, + "learning_rate": 9.459771019272657e-06, + "loss": 1.0348, + "step": 6370 + }, + { + "epoch": 0.19216867469879517, + "grad_norm": 3.9327617237349215, + "learning_rate": 9.457517071457494e-06, + "loss": 1.1597, + "step": 6380 + }, + { + "epoch": 0.1924698795180723, + "grad_norm": 2.4111257611704393, + "learning_rate": 9.455258701127007e-06, + "loss": 1.0841, + "step": 6390 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 1.9627995460222205, + "learning_rate": 9.452995910521828e-06, + "loss": 1.0638, + "step": 6400 + }, + { + "epoch": 0.1930722891566265, + "grad_norm": 11.59027690688135, + "learning_rate": 9.450728701886985e-06, + "loss": 1.0772, + "step": 6410 + }, + { + "epoch": 0.19337349397590362, + "grad_norm": 5.395679302173612, + "learning_rate": 9.448457077471878e-06, + "loss": 1.0666, + "step": 6420 + }, + { + "epoch": 0.19367469879518073, + "grad_norm": 17.619792245169553, + "learning_rate": 9.4461810395303e-06, + "loss": 0.9663, + "step": 6430 + }, + { + "epoch": 0.19397590361445782, + "grad_norm": 5.3293150242242096, + "learning_rate": 9.443900590320413e-06, + "loss": 1.0672, + "step": 6440 + }, + { + "epoch": 0.19427710843373494, + "grad_norm": 2.084201526136716, + "learning_rate": 9.441615732104761e-06, + "loss": 0.9945, + "step": 6450 + }, + { + "epoch": 0.19457831325301206, + "grad_norm": 3.8016068016031275, + "learning_rate": 9.43932646715026e-06, + "loss": 1.0981, + "step": 6460 + }, + { + "epoch": 0.19487951807228915, + "grad_norm": 4.835719823581217, + "learning_rate": 9.4370327977282e-06, + "loss": 1.1119, + "step": 6470 + }, + { + "epoch": 0.19518072289156627, + "grad_norm": 2.344146263507332, + "learning_rate": 9.434734726114238e-06, + "loss": 0.8969, + "step": 6480 + }, + { + "epoch": 0.19548192771084338, + "grad_norm": 3.6141181910894185, + "learning_rate": 9.432432254588402e-06, + "loss": 0.9771, + "step": 6490 + }, + { + "epoch": 0.19578313253012047, + "grad_norm": 3.744360407510648, + "learning_rate": 9.430125385435083e-06, + "loss": 1.0365, + "step": 6500 + }, + { + "epoch": 0.1960843373493976, + "grad_norm": 4.024300378942041, + "learning_rate": 9.427814120943037e-06, + "loss": 1.0536, + "step": 6510 + }, + { + "epoch": 0.1963855421686747, + "grad_norm": 1.9838251481555649, + "learning_rate": 9.42549846340538e-06, + "loss": 1.0738, + "step": 6520 + }, + { + "epoch": 0.1966867469879518, + "grad_norm": 3.8774275284407276, + "learning_rate": 9.423178415119585e-06, + "loss": 1.0219, + "step": 6530 + }, + { + "epoch": 0.19698795180722892, + "grad_norm": 3.4086775694668896, + "learning_rate": 9.420853978387482e-06, + "loss": 1.0954, + "step": 6540 + }, + { + "epoch": 0.19728915662650603, + "grad_norm": 3.190666965732196, + "learning_rate": 9.418525155515257e-06, + "loss": 0.9372, + "step": 6550 + }, + { + "epoch": 0.19759036144578312, + "grad_norm": 3.9995234218922073, + "learning_rate": 9.416191948813446e-06, + "loss": 1.0769, + "step": 6560 + }, + { + "epoch": 0.19789156626506024, + "grad_norm": 4.123718560173802, + "learning_rate": 9.413854360596934e-06, + "loss": 1.0564, + "step": 6570 + }, + { + "epoch": 0.19819277108433736, + "grad_norm": 2.0993768228124416, + "learning_rate": 9.411512393184955e-06, + "loss": 1.0118, + "step": 6580 + }, + { + "epoch": 0.19849397590361445, + "grad_norm": 3.010062786392957, + "learning_rate": 9.409166048901084e-06, + "loss": 0.963, + "step": 6590 + }, + { + "epoch": 0.19879518072289157, + "grad_norm": 3.1623236246645154, + "learning_rate": 9.406815330073244e-06, + "loss": 1.1061, + "step": 6600 + }, + { + "epoch": 0.19909638554216869, + "grad_norm": 3.0914725786047526, + "learning_rate": 9.404460239033696e-06, + "loss": 1.1769, + "step": 6610 + }, + { + "epoch": 0.19939759036144578, + "grad_norm": 2.96996989029304, + "learning_rate": 9.402100778119034e-06, + "loss": 1.0114, + "step": 6620 + }, + { + "epoch": 0.1996987951807229, + "grad_norm": 3.859782714986875, + "learning_rate": 9.399736949670195e-06, + "loss": 1.1542, + "step": 6630 + }, + { + "epoch": 0.2, + "grad_norm": 2.306360658537134, + "learning_rate": 9.397368756032445e-06, + "loss": 1.0491, + "step": 6640 + }, + { + "epoch": 0.2003012048192771, + "grad_norm": 2.8393886148069942, + "learning_rate": 9.394996199555384e-06, + "loss": 1.0969, + "step": 6650 + }, + { + "epoch": 0.20060240963855422, + "grad_norm": 3.3816692979942027, + "learning_rate": 9.392619282592935e-06, + "loss": 1.0189, + "step": 6660 + }, + { + "epoch": 0.20090361445783134, + "grad_norm": 2.2610058631841583, + "learning_rate": 9.390238007503353e-06, + "loss": 1.0917, + "step": 6670 + }, + { + "epoch": 0.20120481927710843, + "grad_norm": 3.2319729498456633, + "learning_rate": 9.387852376649211e-06, + "loss": 0.9724, + "step": 6680 + }, + { + "epoch": 0.20150602409638554, + "grad_norm": 3.1895675787296627, + "learning_rate": 9.385462392397411e-06, + "loss": 0.8884, + "step": 6690 + }, + { + "epoch": 0.20180722891566266, + "grad_norm": 2.017784678080749, + "learning_rate": 9.38306805711917e-06, + "loss": 0.9923, + "step": 6700 + }, + { + "epoch": 0.20210843373493975, + "grad_norm": 3.7886368832812853, + "learning_rate": 9.380669373190021e-06, + "loss": 0.9918, + "step": 6710 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 3.044996454038079, + "learning_rate": 9.378266342989815e-06, + "loss": 0.944, + "step": 6720 + }, + { + "epoch": 0.202710843373494, + "grad_norm": 3.2340893547627925, + "learning_rate": 9.375858968902709e-06, + "loss": 1.023, + "step": 6730 + }, + { + "epoch": 0.20301204819277108, + "grad_norm": 3.386251844194327, + "learning_rate": 9.373447253317178e-06, + "loss": 1.0739, + "step": 6740 + }, + { + "epoch": 0.2033132530120482, + "grad_norm": 1.9994776439159567, + "learning_rate": 9.371031198625998e-06, + "loss": 0.9843, + "step": 6750 + }, + { + "epoch": 0.2036144578313253, + "grad_norm": 3.8385337567154414, + "learning_rate": 9.368610807226249e-06, + "loss": 1.123, + "step": 6760 + }, + { + "epoch": 0.2039156626506024, + "grad_norm": 1.9827397135204707, + "learning_rate": 9.366186081519322e-06, + "loss": 0.9901, + "step": 6770 + }, + { + "epoch": 0.20421686746987952, + "grad_norm": 3.132873673691564, + "learning_rate": 9.3637570239109e-06, + "loss": 0.8976, + "step": 6780 + }, + { + "epoch": 0.20451807228915664, + "grad_norm": 2.9914579709001337, + "learning_rate": 9.36132363681097e-06, + "loss": 1.1023, + "step": 6790 + }, + { + "epoch": 0.20481927710843373, + "grad_norm": 3.602994062694188, + "learning_rate": 9.358885922633807e-06, + "loss": 1.1215, + "step": 6800 + }, + { + "epoch": 0.20512048192771085, + "grad_norm": 2.9406954201360453, + "learning_rate": 9.356443883797987e-06, + "loss": 0.9552, + "step": 6810 + }, + { + "epoch": 0.20542168674698796, + "grad_norm": 3.267769576224149, + "learning_rate": 9.353997522726372e-06, + "loss": 1.0854, + "step": 6820 + }, + { + "epoch": 0.20572289156626505, + "grad_norm": 2.8848588751589426, + "learning_rate": 9.351546841846113e-06, + "loss": 0.9388, + "step": 6830 + }, + { + "epoch": 0.20602409638554217, + "grad_norm": 2.1700260524078407, + "learning_rate": 9.349091843588647e-06, + "loss": 1.0435, + "step": 6840 + }, + { + "epoch": 0.2063253012048193, + "grad_norm": 3.603632113043179, + "learning_rate": 9.346632530389697e-06, + "loss": 1.0195, + "step": 6850 + }, + { + "epoch": 0.20662650602409638, + "grad_norm": 1.91925522962616, + "learning_rate": 9.344168904689266e-06, + "loss": 0.9093, + "step": 6860 + }, + { + "epoch": 0.2069277108433735, + "grad_norm": 3.102453015435755, + "learning_rate": 9.341700968931628e-06, + "loss": 1.0657, + "step": 6870 + }, + { + "epoch": 0.20722891566265061, + "grad_norm": 3.162507393683283, + "learning_rate": 9.339228725565346e-06, + "loss": 1.059, + "step": 6880 + }, + { + "epoch": 0.2075301204819277, + "grad_norm": 3.121578902753025, + "learning_rate": 9.33675217704325e-06, + "loss": 1.0669, + "step": 6890 + }, + { + "epoch": 0.20783132530120482, + "grad_norm": 2.7793956733245264, + "learning_rate": 9.33427132582244e-06, + "loss": 0.9935, + "step": 6900 + }, + { + "epoch": 0.20813253012048194, + "grad_norm": 2.085517781341558, + "learning_rate": 9.331786174364286e-06, + "loss": 0.9213, + "step": 6910 + }, + { + "epoch": 0.20843373493975903, + "grad_norm": 2.010153016809384, + "learning_rate": 9.329296725134426e-06, + "loss": 0.9949, + "step": 6920 + }, + { + "epoch": 0.20873493975903615, + "grad_norm": 4.054808349818256, + "learning_rate": 9.326802980602764e-06, + "loss": 1.0556, + "step": 6930 + }, + { + "epoch": 0.20903614457831327, + "grad_norm": 4.505837092533477, + "learning_rate": 9.324304943243461e-06, + "loss": 0.9794, + "step": 6940 + }, + { + "epoch": 0.20933734939759036, + "grad_norm": 2.979603769778444, + "learning_rate": 9.32180261553494e-06, + "loss": 1.0025, + "step": 6950 + }, + { + "epoch": 0.20963855421686747, + "grad_norm": 2.144729692647981, + "learning_rate": 9.319295999959877e-06, + "loss": 0.9761, + "step": 6960 + }, + { + "epoch": 0.2099397590361446, + "grad_norm": 3.486174659452024, + "learning_rate": 9.316785099005208e-06, + "loss": 0.9381, + "step": 6970 + }, + { + "epoch": 0.21024096385542168, + "grad_norm": 3.184600253664155, + "learning_rate": 9.314269915162115e-06, + "loss": 1.0594, + "step": 6980 + }, + { + "epoch": 0.2105421686746988, + "grad_norm": 6.215920268789802, + "learning_rate": 9.311750450926034e-06, + "loss": 1.0289, + "step": 6990 + }, + { + "epoch": 0.21084337349397592, + "grad_norm": 3.3514667059868275, + "learning_rate": 9.309226708796643e-06, + "loss": 1.0761, + "step": 7000 + }, + { + "epoch": 0.211144578313253, + "grad_norm": 3.25441268151538, + "learning_rate": 9.30669869127787e-06, + "loss": 1.101, + "step": 7010 + }, + { + "epoch": 0.21144578313253012, + "grad_norm": 3.5027058202888646, + "learning_rate": 9.304166400877882e-06, + "loss": 0.9201, + "step": 7020 + }, + { + "epoch": 0.21174698795180724, + "grad_norm": 2.9819940458452736, + "learning_rate": 9.301629840109082e-06, + "loss": 1.0614, + "step": 7030 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 1.924930162325777, + "learning_rate": 9.299089011488115e-06, + "loss": 1.0137, + "step": 7040 + }, + { + "epoch": 0.21234939759036145, + "grad_norm": 2.3023141591532643, + "learning_rate": 9.296543917535859e-06, + "loss": 0.9356, + "step": 7050 + }, + { + "epoch": 0.21265060240963857, + "grad_norm": 2.075460650189826, + "learning_rate": 9.293994560777419e-06, + "loss": 1.013, + "step": 7060 + }, + { + "epoch": 0.21295180722891566, + "grad_norm": 2.044014707663297, + "learning_rate": 9.291440943742138e-06, + "loss": 0.8399, + "step": 7070 + }, + { + "epoch": 0.21325301204819277, + "grad_norm": 3.2085697316677417, + "learning_rate": 9.288883068963581e-06, + "loss": 0.9995, + "step": 7080 + }, + { + "epoch": 0.2135542168674699, + "grad_norm": 3.2393919790610703, + "learning_rate": 9.286320938979533e-06, + "loss": 1.0997, + "step": 7090 + }, + { + "epoch": 0.21385542168674698, + "grad_norm": 3.1739107295653097, + "learning_rate": 9.283754556332012e-06, + "loss": 1.0699, + "step": 7100 + }, + { + "epoch": 0.2141566265060241, + "grad_norm": 3.0110951714632757, + "learning_rate": 9.281183923567245e-06, + "loss": 1.0548, + "step": 7110 + }, + { + "epoch": 0.21445783132530122, + "grad_norm": 4.18574638433286, + "learning_rate": 9.278609043235678e-06, + "loss": 1.0935, + "step": 7120 + }, + { + "epoch": 0.2147590361445783, + "grad_norm": 3.326876881645158, + "learning_rate": 9.276029917891976e-06, + "loss": 1.0521, + "step": 7130 + }, + { + "epoch": 0.21506024096385543, + "grad_norm": 3.210473294446714, + "learning_rate": 9.27344655009501e-06, + "loss": 1.0428, + "step": 7140 + }, + { + "epoch": 0.21536144578313254, + "grad_norm": 2.1345313470502436, + "learning_rate": 9.27085894240786e-06, + "loss": 1.0272, + "step": 7150 + }, + { + "epoch": 0.21566265060240963, + "grad_norm": 2.2664811897689847, + "learning_rate": 9.268267097397824e-06, + "loss": 0.9575, + "step": 7160 + }, + { + "epoch": 0.21596385542168675, + "grad_norm": 3.184695167512654, + "learning_rate": 9.265671017636384e-06, + "loss": 1.0909, + "step": 7170 + }, + { + "epoch": 0.21626506024096387, + "grad_norm": 2.1459242715287945, + "learning_rate": 9.263070705699241e-06, + "loss": 1.08, + "step": 7180 + }, + { + "epoch": 0.21656626506024096, + "grad_norm": 2.2353506779574825, + "learning_rate": 9.260466164166289e-06, + "loss": 1.0423, + "step": 7190 + }, + { + "epoch": 0.21686746987951808, + "grad_norm": 3.0059533014163793, + "learning_rate": 9.257857395621614e-06, + "loss": 1.008, + "step": 7200 + }, + { + "epoch": 0.2171686746987952, + "grad_norm": 3.804731492845275, + "learning_rate": 9.255244402653504e-06, + "loss": 0.9525, + "step": 7210 + }, + { + "epoch": 0.21746987951807228, + "grad_norm": 2.1694769108362753, + "learning_rate": 9.25262718785443e-06, + "loss": 1.033, + "step": 7220 + }, + { + "epoch": 0.2177710843373494, + "grad_norm": 3.279818738883482, + "learning_rate": 9.250005753821059e-06, + "loss": 1.0243, + "step": 7230 + }, + { + "epoch": 0.21807228915662652, + "grad_norm": 3.483345330928486, + "learning_rate": 9.247380103154238e-06, + "loss": 1.1496, + "step": 7240 + }, + { + "epoch": 0.2183734939759036, + "grad_norm": 3.060128588511046, + "learning_rate": 9.244750238459002e-06, + "loss": 1.0503, + "step": 7250 + }, + { + "epoch": 0.21867469879518073, + "grad_norm": 3.4930189068193735, + "learning_rate": 9.242116162344564e-06, + "loss": 1.087, + "step": 7260 + }, + { + "epoch": 0.21897590361445782, + "grad_norm": 1.973294434777584, + "learning_rate": 9.239477877424317e-06, + "loss": 1.0159, + "step": 7270 + }, + { + "epoch": 0.21927710843373494, + "grad_norm": 3.195959866031689, + "learning_rate": 9.23683538631583e-06, + "loss": 1.0611, + "step": 7280 + }, + { + "epoch": 0.21957831325301205, + "grad_norm": 3.6241550357476258, + "learning_rate": 9.234188691640846e-06, + "loss": 0.9758, + "step": 7290 + }, + { + "epoch": 0.21987951807228914, + "grad_norm": 4.35318569238192, + "learning_rate": 9.231537796025275e-06, + "loss": 0.8874, + "step": 7300 + }, + { + "epoch": 0.22018072289156626, + "grad_norm": 3.818817870724282, + "learning_rate": 9.228882702099194e-06, + "loss": 1.1088, + "step": 7310 + }, + { + "epoch": 0.22048192771084338, + "grad_norm": 4.832753181216332, + "learning_rate": 9.226223412496852e-06, + "loss": 1.0271, + "step": 7320 + }, + { + "epoch": 0.22078313253012047, + "grad_norm": 3.563928636536491, + "learning_rate": 9.22355992985666e-06, + "loss": 0.9431, + "step": 7330 + }, + { + "epoch": 0.22108433734939759, + "grad_norm": 2.106032723927942, + "learning_rate": 9.22089225682118e-06, + "loss": 0.9198, + "step": 7340 + }, + { + "epoch": 0.2213855421686747, + "grad_norm": 3.6012656698254513, + "learning_rate": 9.218220396037142e-06, + "loss": 0.9619, + "step": 7350 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 4.2692913382684745, + "learning_rate": 9.215544350155423e-06, + "loss": 1.1123, + "step": 7360 + }, + { + "epoch": 0.2219879518072289, + "grad_norm": 3.1337403317924464, + "learning_rate": 9.212864121831059e-06, + "loss": 0.961, + "step": 7370 + }, + { + "epoch": 0.22228915662650603, + "grad_norm": 4.758260061052883, + "learning_rate": 9.21017971372323e-06, + "loss": 0.9874, + "step": 7380 + }, + { + "epoch": 0.22259036144578312, + "grad_norm": 5.2849022294711085, + "learning_rate": 9.207491128495267e-06, + "loss": 1.024, + "step": 7390 + }, + { + "epoch": 0.22289156626506024, + "grad_norm": 2.1049321654664546, + "learning_rate": 9.204798368814645e-06, + "loss": 0.9812, + "step": 7400 + }, + { + "epoch": 0.22319277108433735, + "grad_norm": 4.772520821330163, + "learning_rate": 9.202101437352974e-06, + "loss": 1.0492, + "step": 7410 + }, + { + "epoch": 0.22349397590361444, + "grad_norm": 4.363365337409341, + "learning_rate": 9.199400336786016e-06, + "loss": 1.0548, + "step": 7420 + }, + { + "epoch": 0.22379518072289156, + "grad_norm": 3.7654678746095813, + "learning_rate": 9.196695069793652e-06, + "loss": 1.0793, + "step": 7430 + }, + { + "epoch": 0.22409638554216868, + "grad_norm": 5.043492783400528, + "learning_rate": 9.193985639059917e-06, + "loss": 1.0579, + "step": 7440 + }, + { + "epoch": 0.22439759036144577, + "grad_norm": 4.163967740388339, + "learning_rate": 9.19127204727296e-06, + "loss": 0.9827, + "step": 7450 + }, + { + "epoch": 0.2246987951807229, + "grad_norm": 3.6302790775841434, + "learning_rate": 9.188554297125065e-06, + "loss": 0.9875, + "step": 7460 + }, + { + "epoch": 0.225, + "grad_norm": 2.296005109334423, + "learning_rate": 9.185832391312644e-06, + "loss": 0.9688, + "step": 7470 + }, + { + "epoch": 0.2253012048192771, + "grad_norm": 9.846179925506112, + "learning_rate": 9.183106332536228e-06, + "loss": 1.0282, + "step": 7480 + }, + { + "epoch": 0.2256024096385542, + "grad_norm": 5.056630245306455, + "learning_rate": 9.18037612350047e-06, + "loss": 0.9888, + "step": 7490 + }, + { + "epoch": 0.22590361445783133, + "grad_norm": 5.431301453348206, + "learning_rate": 9.177641766914143e-06, + "loss": 1.021, + "step": 7500 + }, + { + "epoch": 0.22620481927710842, + "grad_norm": 3.301911571634357, + "learning_rate": 9.174903265490128e-06, + "loss": 0.955, + "step": 7510 + }, + { + "epoch": 0.22650602409638554, + "grad_norm": 3.5962629099331647, + "learning_rate": 9.172160621945428e-06, + "loss": 0.9391, + "step": 7520 + }, + { + "epoch": 0.22680722891566266, + "grad_norm": 2.269448713410476, + "learning_rate": 9.169413839001147e-06, + "loss": 1.0487, + "step": 7530 + }, + { + "epoch": 0.22710843373493975, + "grad_norm": 3.8344296994307996, + "learning_rate": 9.1666629193825e-06, + "loss": 0.9021, + "step": 7540 + }, + { + "epoch": 0.22740963855421686, + "grad_norm": 4.164141091617803, + "learning_rate": 9.163907865818806e-06, + "loss": 1.0118, + "step": 7550 + }, + { + "epoch": 0.22771084337349398, + "grad_norm": 2.0103672358412963, + "learning_rate": 9.161148681043486e-06, + "loss": 0.9915, + "step": 7560 + }, + { + "epoch": 0.22801204819277107, + "grad_norm": 3.8544760596594614, + "learning_rate": 9.158385367794056e-06, + "loss": 1.0487, + "step": 7570 + }, + { + "epoch": 0.2283132530120482, + "grad_norm": 3.237976024176194, + "learning_rate": 9.155617928812134e-06, + "loss": 0.8809, + "step": 7580 + }, + { + "epoch": 0.2286144578313253, + "grad_norm": 3.757167149271603, + "learning_rate": 9.152846366843428e-06, + "loss": 1.1089, + "step": 7590 + }, + { + "epoch": 0.2289156626506024, + "grad_norm": 2.2128471871795075, + "learning_rate": 9.150070684637736e-06, + "loss": 1.0245, + "step": 7600 + }, + { + "epoch": 0.22921686746987951, + "grad_norm": 4.482785826969745, + "learning_rate": 9.147290884948943e-06, + "loss": 1.0492, + "step": 7610 + }, + { + "epoch": 0.22951807228915663, + "grad_norm": 2.0738750013951046, + "learning_rate": 9.144506970535023e-06, + "loss": 0.9468, + "step": 7620 + }, + { + "epoch": 0.22981927710843372, + "grad_norm": 3.7297022255307763, + "learning_rate": 9.14171894415803e-06, + "loss": 0.9987, + "step": 7630 + }, + { + "epoch": 0.23012048192771084, + "grad_norm": 3.190077549805851, + "learning_rate": 9.138926808584098e-06, + "loss": 1.0469, + "step": 7640 + }, + { + "epoch": 0.23042168674698796, + "grad_norm": 1.9866370370988622, + "learning_rate": 9.136130566583436e-06, + "loss": 0.8609, + "step": 7650 + }, + { + "epoch": 0.23072289156626505, + "grad_norm": 3.07394379390921, + "learning_rate": 9.133330220930333e-06, + "loss": 1.0099, + "step": 7660 + }, + { + "epoch": 0.23102409638554217, + "grad_norm": 3.0863871770055353, + "learning_rate": 9.130525774403143e-06, + "loss": 1.0596, + "step": 7670 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 3.0317240789476902, + "learning_rate": 9.12771722978429e-06, + "loss": 1.0161, + "step": 7680 + }, + { + "epoch": 0.23162650602409637, + "grad_norm": 3.18180990792581, + "learning_rate": 9.124904589860269e-06, + "loss": 1.0686, + "step": 7690 + }, + { + "epoch": 0.2319277108433735, + "grad_norm": 3.1169520703622404, + "learning_rate": 9.122087857421628e-06, + "loss": 1.0192, + "step": 7700 + }, + { + "epoch": 0.2322289156626506, + "grad_norm": 2.9204592873855035, + "learning_rate": 9.119267035262989e-06, + "loss": 0.8653, + "step": 7710 + }, + { + "epoch": 0.2325301204819277, + "grad_norm": 4.344511588071381, + "learning_rate": 9.116442126183018e-06, + "loss": 1.0525, + "step": 7720 + }, + { + "epoch": 0.23283132530120482, + "grad_norm": 2.9238280827556165, + "learning_rate": 9.113613132984446e-06, + "loss": 0.9894, + "step": 7730 + }, + { + "epoch": 0.23313253012048193, + "grad_norm": 3.391027204375839, + "learning_rate": 9.110780058474052e-06, + "loss": 1.0166, + "step": 7740 + }, + { + "epoch": 0.23343373493975902, + "grad_norm": 3.2196892544578857, + "learning_rate": 9.107942905462662e-06, + "loss": 0.9522, + "step": 7750 + }, + { + "epoch": 0.23373493975903614, + "grad_norm": 3.7064446670426556, + "learning_rate": 9.10510167676515e-06, + "loss": 1.0051, + "step": 7760 + }, + { + "epoch": 0.23403614457831326, + "grad_norm": 2.9697209146836476, + "learning_rate": 9.102256375200438e-06, + "loss": 1.0173, + "step": 7770 + }, + { + "epoch": 0.23433734939759035, + "grad_norm": 2.2055470227618548, + "learning_rate": 9.099407003591485e-06, + "loss": 0.9593, + "step": 7780 + }, + { + "epoch": 0.23463855421686747, + "grad_norm": 3.247569268304636, + "learning_rate": 9.096553564765286e-06, + "loss": 0.9129, + "step": 7790 + }, + { + "epoch": 0.23493975903614459, + "grad_norm": 3.5284419683223796, + "learning_rate": 9.093696061552876e-06, + "loss": 1.026, + "step": 7800 + }, + { + "epoch": 0.23524096385542168, + "grad_norm": 2.9504014391893354, + "learning_rate": 9.090834496789319e-06, + "loss": 0.9459, + "step": 7810 + }, + { + "epoch": 0.2355421686746988, + "grad_norm": 2.8731903409884842, + "learning_rate": 9.08796887331371e-06, + "loss": 1.0403, + "step": 7820 + }, + { + "epoch": 0.2358433734939759, + "grad_norm": 3.9923686182409535, + "learning_rate": 9.085099193969172e-06, + "loss": 0.9725, + "step": 7830 + }, + { + "epoch": 0.236144578313253, + "grad_norm": 2.887965087195908, + "learning_rate": 9.082225461602849e-06, + "loss": 1.0674, + "step": 7840 + }, + { + "epoch": 0.23644578313253012, + "grad_norm": 3.461852254021425, + "learning_rate": 9.079347679065909e-06, + "loss": 1.1639, + "step": 7850 + }, + { + "epoch": 0.23674698795180724, + "grad_norm": 3.2103801236890024, + "learning_rate": 9.076465849213538e-06, + "loss": 1.0201, + "step": 7860 + }, + { + "epoch": 0.23704819277108433, + "grad_norm": 3.0429166797677483, + "learning_rate": 9.073579974904936e-06, + "loss": 0.9673, + "step": 7870 + }, + { + "epoch": 0.23734939759036144, + "grad_norm": 2.306017900996041, + "learning_rate": 9.070690059003316e-06, + "loss": 0.9704, + "step": 7880 + }, + { + "epoch": 0.23765060240963856, + "grad_norm": 2.98592283815459, + "learning_rate": 9.067796104375903e-06, + "loss": 1.1364, + "step": 7890 + }, + { + "epoch": 0.23795180722891565, + "grad_norm": 3.4606592270355945, + "learning_rate": 9.064898113893925e-06, + "loss": 1.0417, + "step": 7900 + }, + { + "epoch": 0.23825301204819277, + "grad_norm": 3.372271891702563, + "learning_rate": 9.061996090432618e-06, + "loss": 1.0594, + "step": 7910 + }, + { + "epoch": 0.2385542168674699, + "grad_norm": 3.229179997877593, + "learning_rate": 9.059090036871221e-06, + "loss": 1.1316, + "step": 7920 + }, + { + "epoch": 0.23885542168674698, + "grad_norm": 3.2308663576330185, + "learning_rate": 9.056179956092961e-06, + "loss": 1.0451, + "step": 7930 + }, + { + "epoch": 0.2391566265060241, + "grad_norm": 3.3957658720642376, + "learning_rate": 9.053265850985075e-06, + "loss": 1.0646, + "step": 7940 + }, + { + "epoch": 0.2394578313253012, + "grad_norm": 3.176042552942464, + "learning_rate": 9.050347724438785e-06, + "loss": 0.9957, + "step": 7950 + }, + { + "epoch": 0.2397590361445783, + "grad_norm": 4.48528608034301, + "learning_rate": 9.047425579349298e-06, + "loss": 0.9992, + "step": 7960 + }, + { + "epoch": 0.24006024096385542, + "grad_norm": 2.1282679119321743, + "learning_rate": 9.044499418615817e-06, + "loss": 1.0291, + "step": 7970 + }, + { + "epoch": 0.24036144578313254, + "grad_norm": 2.736142662871743, + "learning_rate": 9.041569245141528e-06, + "loss": 0.9823, + "step": 7980 + }, + { + "epoch": 0.24066265060240963, + "grad_norm": 3.3382841573409188, + "learning_rate": 9.038635061833592e-06, + "loss": 1.0627, + "step": 7990 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 2.024815484120407, + "learning_rate": 9.035696871603155e-06, + "loss": 0.9441, + "step": 8000 + }, + { + "epoch": 0.24126506024096386, + "grad_norm": 4.289359937594581, + "learning_rate": 9.032754677365335e-06, + "loss": 0.9866, + "step": 8010 + }, + { + "epoch": 0.24156626506024095, + "grad_norm": 4.359534389293543, + "learning_rate": 9.029808482039221e-06, + "loss": 1.0458, + "step": 8020 + }, + { + "epoch": 0.24186746987951807, + "grad_norm": 3.268358974643368, + "learning_rate": 9.026858288547877e-06, + "loss": 1.0405, + "step": 8030 + }, + { + "epoch": 0.2421686746987952, + "grad_norm": 2.2008962339669287, + "learning_rate": 9.02390409981833e-06, + "loss": 1.0244, + "step": 8040 + }, + { + "epoch": 0.24246987951807228, + "grad_norm": 2.118757535566222, + "learning_rate": 9.02094591878157e-06, + "loss": 0.9002, + "step": 8050 + }, + { + "epoch": 0.2427710843373494, + "grad_norm": 3.2090755214164566, + "learning_rate": 9.01798374837255e-06, + "loss": 1.1051, + "step": 8060 + }, + { + "epoch": 0.24307228915662651, + "grad_norm": 3.042220359326803, + "learning_rate": 9.015017591530183e-06, + "loss": 0.9293, + "step": 8070 + }, + { + "epoch": 0.2433734939759036, + "grad_norm": 3.426262304434553, + "learning_rate": 9.01204745119733e-06, + "loss": 1.1623, + "step": 8080 + }, + { + "epoch": 0.24367469879518072, + "grad_norm": 2.0858619438910204, + "learning_rate": 9.009073330320814e-06, + "loss": 0.8809, + "step": 8090 + }, + { + "epoch": 0.24397590361445784, + "grad_norm": 3.2517862161832367, + "learning_rate": 9.0060952318514e-06, + "loss": 0.9687, + "step": 8100 + }, + { + "epoch": 0.24427710843373493, + "grad_norm": 5.385008730635654, + "learning_rate": 9.003113158743799e-06, + "loss": 0.9973, + "step": 8110 + }, + { + "epoch": 0.24457831325301205, + "grad_norm": 3.260874636870826, + "learning_rate": 9.000127113956673e-06, + "loss": 1.1001, + "step": 8120 + }, + { + "epoch": 0.24487951807228917, + "grad_norm": 3.748508357440099, + "learning_rate": 8.997137100452617e-06, + "loss": 1.0199, + "step": 8130 + }, + { + "epoch": 0.24518072289156626, + "grad_norm": 2.3309099545563647, + "learning_rate": 8.994143121198165e-06, + "loss": 1.0364, + "step": 8140 + }, + { + "epoch": 0.24548192771084337, + "grad_norm": 3.456354645387568, + "learning_rate": 8.991145179163786e-06, + "loss": 0.9503, + "step": 8150 + }, + { + "epoch": 0.2457831325301205, + "grad_norm": 2.814700506762087, + "learning_rate": 8.988143277323886e-06, + "loss": 1.0362, + "step": 8160 + }, + { + "epoch": 0.24608433734939758, + "grad_norm": 2.12355078102478, + "learning_rate": 8.98513741865679e-06, + "loss": 1.0603, + "step": 8170 + }, + { + "epoch": 0.2463855421686747, + "grad_norm": 2.0907205765314076, + "learning_rate": 8.982127606144757e-06, + "loss": 1.0424, + "step": 8180 + }, + { + "epoch": 0.24668674698795182, + "grad_norm": 2.0393725120506816, + "learning_rate": 8.979113842773963e-06, + "loss": 0.9431, + "step": 8190 + }, + { + "epoch": 0.2469879518072289, + "grad_norm": 3.5145116975187687, + "learning_rate": 8.976096131534509e-06, + "loss": 0.9675, + "step": 8200 + }, + { + "epoch": 0.24728915662650602, + "grad_norm": 3.4096899005064016, + "learning_rate": 8.973074475420407e-06, + "loss": 0.9713, + "step": 8210 + }, + { + "epoch": 0.24759036144578314, + "grad_norm": 3.4456131377311445, + "learning_rate": 8.970048877429589e-06, + "loss": 1.0266, + "step": 8220 + }, + { + "epoch": 0.24789156626506023, + "grad_norm": 3.28229849253706, + "learning_rate": 8.967019340563895e-06, + "loss": 1.0719, + "step": 8230 + }, + { + "epoch": 0.24819277108433735, + "grad_norm": 3.029145331734003, + "learning_rate": 8.963985867829071e-06, + "loss": 1.0857, + "step": 8240 + }, + { + "epoch": 0.24849397590361447, + "grad_norm": 3.5855129324830393, + "learning_rate": 8.960948462234774e-06, + "loss": 0.9337, + "step": 8250 + }, + { + "epoch": 0.24879518072289156, + "grad_norm": 3.0901780732506396, + "learning_rate": 8.957907126794554e-06, + "loss": 1.0781, + "step": 8260 + }, + { + "epoch": 0.24909638554216867, + "grad_norm": 3.121913022289746, + "learning_rate": 8.954861864525868e-06, + "loss": 1.0861, + "step": 8270 + }, + { + "epoch": 0.2493975903614458, + "grad_norm": 2.1150872922991755, + "learning_rate": 8.951812678450064e-06, + "loss": 0.953, + "step": 8280 + }, + { + "epoch": 0.24969879518072288, + "grad_norm": 3.101876179688924, + "learning_rate": 8.948759571592387e-06, + "loss": 0.9801, + "step": 8290 + }, + { + "epoch": 0.25, + "grad_norm": 3.9124587096697256, + "learning_rate": 8.94570254698197e-06, + "loss": 1.0535, + "step": 8300 + }, + { + "epoch": 0.2503012048192771, + "grad_norm": 3.407308686234648, + "learning_rate": 8.94264160765183e-06, + "loss": 0.996, + "step": 8310 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 2.155869125435714, + "learning_rate": 8.939576756638874e-06, + "loss": 0.9361, + "step": 8320 + }, + { + "epoch": 0.2509036144578313, + "grad_norm": 1.8777188079853016, + "learning_rate": 8.936507996983887e-06, + "loss": 0.9956, + "step": 8330 + }, + { + "epoch": 0.2512048192771084, + "grad_norm": 3.155861948679197, + "learning_rate": 8.933435331731528e-06, + "loss": 0.9806, + "step": 8340 + }, + { + "epoch": 0.25150602409638556, + "grad_norm": 2.017966739523534, + "learning_rate": 8.93035876393034e-06, + "loss": 1.0198, + "step": 8350 + }, + { + "epoch": 0.25180722891566265, + "grad_norm": 5.9755373662877265, + "learning_rate": 8.92727829663273e-06, + "loss": 1.0251, + "step": 8360 + }, + { + "epoch": 0.25210843373493974, + "grad_norm": 4.003781774292372, + "learning_rate": 8.924193932894975e-06, + "loss": 1.0323, + "step": 8370 + }, + { + "epoch": 0.2524096385542169, + "grad_norm": 4.131019163865079, + "learning_rate": 8.921105675777224e-06, + "loss": 1.1216, + "step": 8380 + }, + { + "epoch": 0.252710843373494, + "grad_norm": 3.3247654634034123, + "learning_rate": 8.91801352834348e-06, + "loss": 1.0649, + "step": 8390 + }, + { + "epoch": 0.25301204819277107, + "grad_norm": 3.729922888326644, + "learning_rate": 8.91491749366161e-06, + "loss": 0.9558, + "step": 8400 + }, + { + "epoch": 0.2533132530120482, + "grad_norm": 10.224188820758755, + "learning_rate": 8.91181757480334e-06, + "loss": 0.9427, + "step": 8410 + }, + { + "epoch": 0.2536144578313253, + "grad_norm": 2.038080876914104, + "learning_rate": 8.908713774844246e-06, + "loss": 1.0966, + "step": 8420 + }, + { + "epoch": 0.2539156626506024, + "grad_norm": 3.159311381360028, + "learning_rate": 8.905606096863758e-06, + "loss": 0.9847, + "step": 8430 + }, + { + "epoch": 0.25421686746987954, + "grad_norm": 4.105394915861901, + "learning_rate": 8.90249454394515e-06, + "loss": 0.9863, + "step": 8440 + }, + { + "epoch": 0.2545180722891566, + "grad_norm": 2.987840420686699, + "learning_rate": 8.899379119175539e-06, + "loss": 0.9878, + "step": 8450 + }, + { + "epoch": 0.2548192771084337, + "grad_norm": 3.418140227575701, + "learning_rate": 8.896259825645892e-06, + "loss": 0.9998, + "step": 8460 + }, + { + "epoch": 0.25512048192771086, + "grad_norm": 6.400861492734381, + "learning_rate": 8.893136666451008e-06, + "loss": 0.9427, + "step": 8470 + }, + { + "epoch": 0.25542168674698795, + "grad_norm": 2.103983548502195, + "learning_rate": 8.890009644689516e-06, + "loss": 1.0309, + "step": 8480 + }, + { + "epoch": 0.25572289156626504, + "grad_norm": 2.3373928055118256, + "learning_rate": 8.886878763463893e-06, + "loss": 0.9713, + "step": 8490 + }, + { + "epoch": 0.2560240963855422, + "grad_norm": 2.2378882334219425, + "learning_rate": 8.883744025880429e-06, + "loss": 1.0063, + "step": 8500 + }, + { + "epoch": 0.2563253012048193, + "grad_norm": 2.399024881939655, + "learning_rate": 8.880605435049247e-06, + "loss": 1.0015, + "step": 8510 + }, + { + "epoch": 0.25662650602409637, + "grad_norm": 3.5593274010218217, + "learning_rate": 8.877462994084296e-06, + "loss": 0.963, + "step": 8520 + }, + { + "epoch": 0.2569277108433735, + "grad_norm": 3.3842963179891226, + "learning_rate": 8.874316706103339e-06, + "loss": 0.9556, + "step": 8530 + }, + { + "epoch": 0.2572289156626506, + "grad_norm": 3.6922453483063364, + "learning_rate": 8.871166574227961e-06, + "loss": 1.0794, + "step": 8540 + }, + { + "epoch": 0.2575301204819277, + "grad_norm": 2.051465817210662, + "learning_rate": 8.868012601583553e-06, + "loss": 0.9485, + "step": 8550 + }, + { + "epoch": 0.25783132530120484, + "grad_norm": 3.4485292723014718, + "learning_rate": 8.864854791299328e-06, + "loss": 1.0427, + "step": 8560 + }, + { + "epoch": 0.25813253012048193, + "grad_norm": 1.9121000707312048, + "learning_rate": 8.861693146508295e-06, + "loss": 0.8302, + "step": 8570 + }, + { + "epoch": 0.258433734939759, + "grad_norm": 3.426783290699053, + "learning_rate": 8.858527670347277e-06, + "loss": 0.9892, + "step": 8580 + }, + { + "epoch": 0.25873493975903616, + "grad_norm": 2.110351300097445, + "learning_rate": 8.85535836595689e-06, + "loss": 0.9055, + "step": 8590 + }, + { + "epoch": 0.25903614457831325, + "grad_norm": 3.499859287241071, + "learning_rate": 8.852185236481553e-06, + "loss": 1.008, + "step": 8600 + }, + { + "epoch": 0.25933734939759034, + "grad_norm": 1.967989273129067, + "learning_rate": 8.849008285069482e-06, + "loss": 0.9881, + "step": 8610 + }, + { + "epoch": 0.2596385542168675, + "grad_norm": 3.7193727294787355, + "learning_rate": 8.84582751487268e-06, + "loss": 0.9405, + "step": 8620 + }, + { + "epoch": 0.2599397590361446, + "grad_norm": 3.205138528859601, + "learning_rate": 8.84264292904694e-06, + "loss": 0.8871, + "step": 8630 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 3.105480829447051, + "learning_rate": 8.839454530751842e-06, + "loss": 1.0926, + "step": 8640 + }, + { + "epoch": 0.2605421686746988, + "grad_norm": 3.7813810151424043, + "learning_rate": 8.83626232315075e-06, + "loss": 0.8883, + "step": 8650 + }, + { + "epoch": 0.2608433734939759, + "grad_norm": 3.8266788365773072, + "learning_rate": 8.833066309410802e-06, + "loss": 1.0007, + "step": 8660 + }, + { + "epoch": 0.261144578313253, + "grad_norm": 4.250171524841646, + "learning_rate": 8.829866492702918e-06, + "loss": 1.0061, + "step": 8670 + }, + { + "epoch": 0.26144578313253014, + "grad_norm": 6.36063003016339, + "learning_rate": 8.826662876201788e-06, + "loss": 0.9953, + "step": 8680 + }, + { + "epoch": 0.26174698795180723, + "grad_norm": 3.564978005434069, + "learning_rate": 8.823455463085873e-06, + "loss": 0.9286, + "step": 8690 + }, + { + "epoch": 0.2620481927710843, + "grad_norm": 3.1135254464036253, + "learning_rate": 8.820244256537402e-06, + "loss": 1.0545, + "step": 8700 + }, + { + "epoch": 0.26234939759036147, + "grad_norm": 3.647317125831831, + "learning_rate": 8.817029259742365e-06, + "loss": 1.0226, + "step": 8710 + }, + { + "epoch": 0.26265060240963856, + "grad_norm": 2.8942736143999666, + "learning_rate": 8.813810475890512e-06, + "loss": 0.887, + "step": 8720 + }, + { + "epoch": 0.26295180722891565, + "grad_norm": 3.3744023346923755, + "learning_rate": 8.810587908175354e-06, + "loss": 0.9674, + "step": 8730 + }, + { + "epoch": 0.2632530120481928, + "grad_norm": 3.605368502463631, + "learning_rate": 8.807361559794152e-06, + "loss": 0.9999, + "step": 8740 + }, + { + "epoch": 0.2635542168674699, + "grad_norm": 2.7144431832911438, + "learning_rate": 8.804131433947924e-06, + "loss": 0.9657, + "step": 8750 + }, + { + "epoch": 0.26385542168674697, + "grad_norm": 9.828698819825334, + "learning_rate": 8.80089753384143e-06, + "loss": 1.0306, + "step": 8760 + }, + { + "epoch": 0.2641566265060241, + "grad_norm": 3.5497797003483016, + "learning_rate": 8.797659862683177e-06, + "loss": 1.067, + "step": 8770 + }, + { + "epoch": 0.2644578313253012, + "grad_norm": 4.204949003562374, + "learning_rate": 8.794418423685409e-06, + "loss": 0.883, + "step": 8780 + }, + { + "epoch": 0.2647590361445783, + "grad_norm": 2.1131690002869035, + "learning_rate": 8.79117322006412e-06, + "loss": 0.9446, + "step": 8790 + }, + { + "epoch": 0.26506024096385544, + "grad_norm": 3.2653994029648103, + "learning_rate": 8.787924255039023e-06, + "loss": 1.0977, + "step": 8800 + }, + { + "epoch": 0.26536144578313253, + "grad_norm": 3.4847198927088034, + "learning_rate": 8.784671531833576e-06, + "loss": 1.0903, + "step": 8810 + }, + { + "epoch": 0.2656626506024096, + "grad_norm": 2.059740098020506, + "learning_rate": 8.781415053674961e-06, + "loss": 1.022, + "step": 8820 + }, + { + "epoch": 0.26596385542168677, + "grad_norm": 2.1593334065604086, + "learning_rate": 8.778154823794084e-06, + "loss": 0.9914, + "step": 8830 + }, + { + "epoch": 0.26626506024096386, + "grad_norm": 2.0773201565650523, + "learning_rate": 8.774890845425574e-06, + "loss": 1.075, + "step": 8840 + }, + { + "epoch": 0.26656626506024095, + "grad_norm": 2.936156340255884, + "learning_rate": 8.771623121807782e-06, + "loss": 0.9368, + "step": 8850 + }, + { + "epoch": 0.2668674698795181, + "grad_norm": 3.089892911981274, + "learning_rate": 8.76835165618277e-06, + "loss": 1.0487, + "step": 8860 + }, + { + "epoch": 0.2671686746987952, + "grad_norm": 2.246379326838362, + "learning_rate": 8.765076451796316e-06, + "loss": 0.9794, + "step": 8870 + }, + { + "epoch": 0.2674698795180723, + "grad_norm": 3.1692235943927582, + "learning_rate": 8.761797511897907e-06, + "loss": 1.0307, + "step": 8880 + }, + { + "epoch": 0.2677710843373494, + "grad_norm": 2.948244820205797, + "learning_rate": 8.758514839740735e-06, + "loss": 0.9876, + "step": 8890 + }, + { + "epoch": 0.2680722891566265, + "grad_norm": 3.4051597829304416, + "learning_rate": 8.755228438581697e-06, + "loss": 0.9459, + "step": 8900 + }, + { + "epoch": 0.2683734939759036, + "grad_norm": 2.2249274490023856, + "learning_rate": 8.751938311681386e-06, + "loss": 0.9304, + "step": 8910 + }, + { + "epoch": 0.26867469879518074, + "grad_norm": 5.608149002370423, + "learning_rate": 8.748644462304096e-06, + "loss": 1.0476, + "step": 8920 + }, + { + "epoch": 0.26897590361445783, + "grad_norm": 4.532022390089638, + "learning_rate": 8.74534689371781e-06, + "loss": 1.091, + "step": 8930 + }, + { + "epoch": 0.2692771084337349, + "grad_norm": 2.0657552554643455, + "learning_rate": 8.742045609194207e-06, + "loss": 1.0839, + "step": 8940 + }, + { + "epoch": 0.26957831325301207, + "grad_norm": 3.262085649651936, + "learning_rate": 8.738740612008644e-06, + "loss": 1.1189, + "step": 8950 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 3.0214716038681972, + "learning_rate": 8.735431905440168e-06, + "loss": 0.9694, + "step": 8960 + }, + { + "epoch": 0.27018072289156625, + "grad_norm": 2.8653234520295574, + "learning_rate": 8.732119492771506e-06, + "loss": 0.9334, + "step": 8970 + }, + { + "epoch": 0.2704819277108434, + "grad_norm": 3.31071343105019, + "learning_rate": 8.728803377289061e-06, + "loss": 1.0991, + "step": 8980 + }, + { + "epoch": 0.2707831325301205, + "grad_norm": 3.243179632332493, + "learning_rate": 8.725483562282906e-06, + "loss": 1.0835, + "step": 8990 + }, + { + "epoch": 0.2710843373493976, + "grad_norm": 3.761598483639871, + "learning_rate": 8.722160051046793e-06, + "loss": 0.9827, + "step": 9000 + }, + { + "epoch": 0.2713855421686747, + "grad_norm": 1.9217945655360718, + "learning_rate": 8.718832846878131e-06, + "loss": 0.9949, + "step": 9010 + }, + { + "epoch": 0.2716867469879518, + "grad_norm": 2.887794643400935, + "learning_rate": 8.715501953078e-06, + "loss": 1.01, + "step": 9020 + }, + { + "epoch": 0.2719879518072289, + "grad_norm": 2.850209938883259, + "learning_rate": 8.712167372951138e-06, + "loss": 1.0425, + "step": 9030 + }, + { + "epoch": 0.27228915662650605, + "grad_norm": 2.0174305923281577, + "learning_rate": 8.708829109805946e-06, + "loss": 0.9925, + "step": 9040 + }, + { + "epoch": 0.27259036144578314, + "grad_norm": 2.8949800495869997, + "learning_rate": 8.705487166954466e-06, + "loss": 1.1293, + "step": 9050 + }, + { + "epoch": 0.2728915662650602, + "grad_norm": 2.195829337105026, + "learning_rate": 8.702141547712401e-06, + "loss": 1.0801, + "step": 9060 + }, + { + "epoch": 0.27319277108433737, + "grad_norm": 3.0799602793477816, + "learning_rate": 8.698792255399104e-06, + "loss": 1.0336, + "step": 9070 + }, + { + "epoch": 0.27349397590361446, + "grad_norm": 4.01655154679231, + "learning_rate": 8.695439293337564e-06, + "loss": 0.9759, + "step": 9080 + }, + { + "epoch": 0.27379518072289155, + "grad_norm": 4.476127710746531, + "learning_rate": 8.692082664854417e-06, + "loss": 1.013, + "step": 9090 + }, + { + "epoch": 0.2740963855421687, + "grad_norm": 3.316655101014646, + "learning_rate": 8.688722373279933e-06, + "loss": 0.9287, + "step": 9100 + }, + { + "epoch": 0.2743975903614458, + "grad_norm": 2.2120599919290305, + "learning_rate": 8.685358421948014e-06, + "loss": 0.9453, + "step": 9110 + }, + { + "epoch": 0.2746987951807229, + "grad_norm": 2.114406706854489, + "learning_rate": 8.681990814196202e-06, + "loss": 0.9044, + "step": 9120 + }, + { + "epoch": 0.275, + "grad_norm": 2.0479542650125167, + "learning_rate": 8.67861955336566e-06, + "loss": 1.0563, + "step": 9130 + }, + { + "epoch": 0.2753012048192771, + "grad_norm": 1.913614323403774, + "learning_rate": 8.675244642801172e-06, + "loss": 0.9772, + "step": 9140 + }, + { + "epoch": 0.2756024096385542, + "grad_norm": 1.974782501030137, + "learning_rate": 8.671866085851156e-06, + "loss": 0.9173, + "step": 9150 + }, + { + "epoch": 0.27590361445783135, + "grad_norm": 3.9898567048833673, + "learning_rate": 8.668483885867633e-06, + "loss": 0.9698, + "step": 9160 + }, + { + "epoch": 0.27620481927710844, + "grad_norm": 4.0756619027536045, + "learning_rate": 8.665098046206248e-06, + "loss": 0.9545, + "step": 9170 + }, + { + "epoch": 0.2765060240963855, + "grad_norm": 4.6695325960080565, + "learning_rate": 8.661708570226253e-06, + "loss": 1.0066, + "step": 9180 + }, + { + "epoch": 0.2768072289156627, + "grad_norm": 3.3586111674870347, + "learning_rate": 8.658315461290507e-06, + "loss": 0.9848, + "step": 9190 + }, + { + "epoch": 0.27710843373493976, + "grad_norm": 2.316396336595485, + "learning_rate": 8.654918722765479e-06, + "loss": 1.0356, + "step": 9200 + }, + { + "epoch": 0.27740963855421685, + "grad_norm": 3.6734082611914354, + "learning_rate": 8.651518358021234e-06, + "loss": 0.9983, + "step": 9210 + }, + { + "epoch": 0.277710843373494, + "grad_norm": 3.555579508427273, + "learning_rate": 8.648114370431436e-06, + "loss": 1.0605, + "step": 9220 + }, + { + "epoch": 0.2780120481927711, + "grad_norm": 3.0162969039502086, + "learning_rate": 8.644706763373341e-06, + "loss": 0.9616, + "step": 9230 + }, + { + "epoch": 0.2783132530120482, + "grad_norm": 3.1072813195691857, + "learning_rate": 8.641295540227801e-06, + "loss": 1.0275, + "step": 9240 + }, + { + "epoch": 0.2786144578313253, + "grad_norm": 3.152335395360339, + "learning_rate": 8.637880704379254e-06, + "loss": 1.0587, + "step": 9250 + }, + { + "epoch": 0.2789156626506024, + "grad_norm": 2.1320231425099596, + "learning_rate": 8.634462259215719e-06, + "loss": 0.9809, + "step": 9260 + }, + { + "epoch": 0.2792168674698795, + "grad_norm": 2.123634624614154, + "learning_rate": 8.631040208128801e-06, + "loss": 0.9439, + "step": 9270 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 3.7308636123001953, + "learning_rate": 8.627614554513678e-06, + "loss": 0.9259, + "step": 9280 + }, + { + "epoch": 0.27981927710843374, + "grad_norm": 2.209359435710404, + "learning_rate": 8.624185301769106e-06, + "loss": 1.0067, + "step": 9290 + }, + { + "epoch": 0.28012048192771083, + "grad_norm": 4.388990797817878, + "learning_rate": 8.62075245329741e-06, + "loss": 1.0468, + "step": 9300 + }, + { + "epoch": 0.280421686746988, + "grad_norm": 2.0315716687217247, + "learning_rate": 8.617316012504482e-06, + "loss": 0.9971, + "step": 9310 + }, + { + "epoch": 0.28072289156626506, + "grad_norm": 1.9933640334109042, + "learning_rate": 8.613875982799778e-06, + "loss": 0.9349, + "step": 9320 + }, + { + "epoch": 0.28102409638554215, + "grad_norm": 4.010703901407647, + "learning_rate": 8.610432367596318e-06, + "loss": 0.9986, + "step": 9330 + }, + { + "epoch": 0.2813253012048193, + "grad_norm": 2.0715506637193792, + "learning_rate": 8.606985170310673e-06, + "loss": 1.0251, + "step": 9340 + }, + { + "epoch": 0.2816265060240964, + "grad_norm": 3.521705097589883, + "learning_rate": 8.603534394362978e-06, + "loss": 1.0081, + "step": 9350 + }, + { + "epoch": 0.2819277108433735, + "grad_norm": 2.0308301769499537, + "learning_rate": 8.600080043176908e-06, + "loss": 0.9123, + "step": 9360 + }, + { + "epoch": 0.2822289156626506, + "grad_norm": 3.4120511236315316, + "learning_rate": 8.59662212017969e-06, + "loss": 1.0579, + "step": 9370 + }, + { + "epoch": 0.2825301204819277, + "grad_norm": 3.558088171590651, + "learning_rate": 8.593160628802093e-06, + "loss": 1.1084, + "step": 9380 + }, + { + "epoch": 0.2828313253012048, + "grad_norm": 5.2077730453201125, + "learning_rate": 8.589695572478432e-06, + "loss": 1.0583, + "step": 9390 + }, + { + "epoch": 0.28313253012048195, + "grad_norm": 3.829233159935352, + "learning_rate": 8.586226954646552e-06, + "loss": 0.9504, + "step": 9400 + }, + { + "epoch": 0.28343373493975904, + "grad_norm": 2.1213041714506318, + "learning_rate": 8.582754778747836e-06, + "loss": 1.0081, + "step": 9410 + }, + { + "epoch": 0.28373493975903613, + "grad_norm": 3.7570802554284968, + "learning_rate": 8.57927904822719e-06, + "loss": 1.0694, + "step": 9420 + }, + { + "epoch": 0.2840361445783133, + "grad_norm": 3.680727169065133, + "learning_rate": 8.57579976653306e-06, + "loss": 1.0114, + "step": 9430 + }, + { + "epoch": 0.28433734939759037, + "grad_norm": 2.125366849338149, + "learning_rate": 8.5723169371174e-06, + "loss": 0.9267, + "step": 9440 + }, + { + "epoch": 0.28463855421686746, + "grad_norm": 3.5634554268286824, + "learning_rate": 8.568830563435695e-06, + "loss": 1.0245, + "step": 9450 + }, + { + "epoch": 0.2849397590361446, + "grad_norm": 3.383819312317253, + "learning_rate": 8.565340648946942e-06, + "loss": 1.0031, + "step": 9460 + }, + { + "epoch": 0.2852409638554217, + "grad_norm": 5.018489228856675, + "learning_rate": 8.561847197113652e-06, + "loss": 1.0384, + "step": 9470 + }, + { + "epoch": 0.2855421686746988, + "grad_norm": 2.106375987821443, + "learning_rate": 8.558350211401847e-06, + "loss": 1.0437, + "step": 9480 + }, + { + "epoch": 0.2858433734939759, + "grad_norm": 2.051399084986709, + "learning_rate": 8.55484969528105e-06, + "loss": 0.8726, + "step": 9490 + }, + { + "epoch": 0.286144578313253, + "grad_norm": 5.571725247806883, + "learning_rate": 8.55134565222429e-06, + "loss": 1.0646, + "step": 9500 + }, + { + "epoch": 0.2864457831325301, + "grad_norm": 3.740064004961406, + "learning_rate": 8.5478380857081e-06, + "loss": 1.0561, + "step": 9510 + }, + { + "epoch": 0.28674698795180725, + "grad_norm": 2.170774936138277, + "learning_rate": 8.544326999212498e-06, + "loss": 0.945, + "step": 9520 + }, + { + "epoch": 0.28704819277108434, + "grad_norm": 3.5397619884652887, + "learning_rate": 8.540812396221004e-06, + "loss": 1.081, + "step": 9530 + }, + { + "epoch": 0.28734939759036143, + "grad_norm": 4.106927824872186, + "learning_rate": 8.53729428022062e-06, + "loss": 0.9765, + "step": 9540 + }, + { + "epoch": 0.2876506024096386, + "grad_norm": 2.1156632875564267, + "learning_rate": 8.533772654701841e-06, + "loss": 1.0164, + "step": 9550 + }, + { + "epoch": 0.28795180722891567, + "grad_norm": 2.22827357268054, + "learning_rate": 8.530247523158636e-06, + "loss": 0.918, + "step": 9560 + }, + { + "epoch": 0.28825301204819276, + "grad_norm": 4.056245021986871, + "learning_rate": 8.526718889088462e-06, + "loss": 0.8682, + "step": 9570 + }, + { + "epoch": 0.2885542168674699, + "grad_norm": 3.9995029052963624, + "learning_rate": 8.523186755992238e-06, + "loss": 1.0473, + "step": 9580 + }, + { + "epoch": 0.288855421686747, + "grad_norm": 3.9407262805197716, + "learning_rate": 8.519651127374365e-06, + "loss": 1.0374, + "step": 9590 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 3.5140684558085034, + "learning_rate": 8.516112006742708e-06, + "loss": 0.9466, + "step": 9600 + }, + { + "epoch": 0.28945783132530123, + "grad_norm": 4.126326268264751, + "learning_rate": 8.5125693976086e-06, + "loss": 1.1305, + "step": 9610 + }, + { + "epoch": 0.2897590361445783, + "grad_norm": 1.9779681956532646, + "learning_rate": 8.50902330348683e-06, + "loss": 0.8741, + "step": 9620 + }, + { + "epoch": 0.2900602409638554, + "grad_norm": 4.815731857352997, + "learning_rate": 8.505473727895645e-06, + "loss": 1.0245, + "step": 9630 + }, + { + "epoch": 0.29036144578313255, + "grad_norm": 4.386853244257249, + "learning_rate": 8.501920674356755e-06, + "loss": 0.9732, + "step": 9640 + }, + { + "epoch": 0.29066265060240964, + "grad_norm": 7.675980435686392, + "learning_rate": 8.498364146395308e-06, + "loss": 1.0894, + "step": 9650 + }, + { + "epoch": 0.29096385542168673, + "grad_norm": 7.4362537814990395, + "learning_rate": 8.494804147539908e-06, + "loss": 1.0496, + "step": 9660 + }, + { + "epoch": 0.2912650602409639, + "grad_norm": 2.1190568530914233, + "learning_rate": 8.491240681322597e-06, + "loss": 0.9939, + "step": 9670 + }, + { + "epoch": 0.29156626506024097, + "grad_norm": 4.504454100292339, + "learning_rate": 8.487673751278862e-06, + "loss": 0.9213, + "step": 9680 + }, + { + "epoch": 0.29186746987951806, + "grad_norm": 1.9990914564932507, + "learning_rate": 8.484103360947624e-06, + "loss": 0.8883, + "step": 9690 + }, + { + "epoch": 0.2921686746987952, + "grad_norm": 1.96311413820024, + "learning_rate": 8.480529513871237e-06, + "loss": 1.0131, + "step": 9700 + }, + { + "epoch": 0.2924698795180723, + "grad_norm": 1.9959365911392968, + "learning_rate": 8.476952213595486e-06, + "loss": 0.9971, + "step": 9710 + }, + { + "epoch": 0.2927710843373494, + "grad_norm": 4.261444775892076, + "learning_rate": 8.473371463669582e-06, + "loss": 1.091, + "step": 9720 + }, + { + "epoch": 0.29307228915662653, + "grad_norm": 4.291539127103698, + "learning_rate": 8.469787267646156e-06, + "loss": 1.0118, + "step": 9730 + }, + { + "epoch": 0.2933734939759036, + "grad_norm": 2.058077374442829, + "learning_rate": 8.466199629081259e-06, + "loss": 1.0093, + "step": 9740 + }, + { + "epoch": 0.2936746987951807, + "grad_norm": 7.278764807615133, + "learning_rate": 8.462608551534361e-06, + "loss": 1.0131, + "step": 9750 + }, + { + "epoch": 0.29397590361445786, + "grad_norm": 2.192933240945323, + "learning_rate": 8.45901403856834e-06, + "loss": 0.9013, + "step": 9760 + }, + { + "epoch": 0.29427710843373495, + "grad_norm": 4.517739760111507, + "learning_rate": 8.455416093749484e-06, + "loss": 1.0373, + "step": 9770 + }, + { + "epoch": 0.29457831325301204, + "grad_norm": 4.809282144037537, + "learning_rate": 8.451814720647489e-06, + "loss": 1.0661, + "step": 9780 + }, + { + "epoch": 0.2948795180722892, + "grad_norm": 2.220220596520115, + "learning_rate": 8.448209922835441e-06, + "loss": 0.9799, + "step": 9790 + }, + { + "epoch": 0.29518072289156627, + "grad_norm": 3.3384467708607146, + "learning_rate": 8.444601703889839e-06, + "loss": 1.0893, + "step": 9800 + }, + { + "epoch": 0.29548192771084336, + "grad_norm": 3.844421196291859, + "learning_rate": 8.440990067390567e-06, + "loss": 1.0661, + "step": 9810 + }, + { + "epoch": 0.2957831325301205, + "grad_norm": 5.606851916002022, + "learning_rate": 8.437375016920901e-06, + "loss": 0.9668, + "step": 9820 + }, + { + "epoch": 0.2960843373493976, + "grad_norm": 1.9419333318766125, + "learning_rate": 8.433756556067506e-06, + "loss": 0.9926, + "step": 9830 + }, + { + "epoch": 0.2963855421686747, + "grad_norm": 3.5390962538094497, + "learning_rate": 8.430134688420428e-06, + "loss": 1.0604, + "step": 9840 + }, + { + "epoch": 0.29668674698795183, + "grad_norm": 2.08481584778143, + "learning_rate": 8.426509417573099e-06, + "loss": 1.0084, + "step": 9850 + }, + { + "epoch": 0.2969879518072289, + "grad_norm": 5.1937764318739505, + "learning_rate": 8.422880747122317e-06, + "loss": 0.9945, + "step": 9860 + }, + { + "epoch": 0.297289156626506, + "grad_norm": 2.122275842944583, + "learning_rate": 8.419248680668263e-06, + "loss": 0.9689, + "step": 9870 + }, + { + "epoch": 0.29759036144578316, + "grad_norm": 5.668450624299819, + "learning_rate": 8.415613221814483e-06, + "loss": 0.9907, + "step": 9880 + }, + { + "epoch": 0.29789156626506025, + "grad_norm": 4.105107555390713, + "learning_rate": 8.411974374167891e-06, + "loss": 1.0722, + "step": 9890 + }, + { + "epoch": 0.29819277108433734, + "grad_norm": 4.038784462134157, + "learning_rate": 8.408332141338755e-06, + "loss": 0.9445, + "step": 9900 + }, + { + "epoch": 0.2984939759036145, + "grad_norm": 5.71876423304024, + "learning_rate": 8.404686526940715e-06, + "loss": 0.9854, + "step": 9910 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 7.343845231655897, + "learning_rate": 8.401037534590754e-06, + "loss": 1.0499, + "step": 9920 + }, + { + "epoch": 0.29909638554216866, + "grad_norm": 7.343094953013136, + "learning_rate": 8.397385167909214e-06, + "loss": 0.9208, + "step": 9930 + }, + { + "epoch": 0.2993975903614458, + "grad_norm": 2.0254793586076727, + "learning_rate": 8.393729430519781e-06, + "loss": 1.038, + "step": 9940 + }, + { + "epoch": 0.2996987951807229, + "grad_norm": 2.0875635754460737, + "learning_rate": 8.39007032604949e-06, + "loss": 0.8775, + "step": 9950 + }, + { + "epoch": 0.3, + "grad_norm": 4.629135068376105, + "learning_rate": 8.386407858128707e-06, + "loss": 1.0253, + "step": 9960 + }, + { + "epoch": 0.30030120481927713, + "grad_norm": 9.143103512430542, + "learning_rate": 8.382742030391144e-06, + "loss": 1.1477, + "step": 9970 + }, + { + "epoch": 0.3006024096385542, + "grad_norm": 2.1611988930239674, + "learning_rate": 8.379072846473845e-06, + "loss": 0.8932, + "step": 9980 + }, + { + "epoch": 0.3009036144578313, + "grad_norm": 2.1482285938635153, + "learning_rate": 8.37540031001718e-06, + "loss": 0.989, + "step": 9990 + }, + { + "epoch": 0.30120481927710846, + "grad_norm": 3.8113924866377142, + "learning_rate": 8.371724424664851e-06, + "loss": 1.0152, + "step": 10000 + }, + { + "epoch": 0.30150602409638555, + "grad_norm": 4.327424585719604, + "learning_rate": 8.368045194063876e-06, + "loss": 1.0561, + "step": 10010 + }, + { + "epoch": 0.30180722891566264, + "grad_norm": 2.10279887823453, + "learning_rate": 8.364362621864595e-06, + "loss": 0.9944, + "step": 10020 + }, + { + "epoch": 0.3021084337349398, + "grad_norm": 2.037386564834821, + "learning_rate": 8.360676711720666e-06, + "loss": 0.9977, + "step": 10030 + }, + { + "epoch": 0.3024096385542169, + "grad_norm": 2.2070496569550406, + "learning_rate": 8.356987467289055e-06, + "loss": 1.0271, + "step": 10040 + }, + { + "epoch": 0.30271084337349397, + "grad_norm": 2.1707092869165097, + "learning_rate": 8.353294892230038e-06, + "loss": 0.8483, + "step": 10050 + }, + { + "epoch": 0.3030120481927711, + "grad_norm": 3.7454069955763925, + "learning_rate": 8.349598990207192e-06, + "loss": 0.9188, + "step": 10060 + }, + { + "epoch": 0.3033132530120482, + "grad_norm": 3.2981642962310995, + "learning_rate": 8.345899764887402e-06, + "loss": 0.9694, + "step": 10070 + }, + { + "epoch": 0.3036144578313253, + "grad_norm": 3.213099280209126, + "learning_rate": 8.342197219940844e-06, + "loss": 1.1052, + "step": 10080 + }, + { + "epoch": 0.30391566265060244, + "grad_norm": 3.2864018134132014, + "learning_rate": 8.338491359040988e-06, + "loss": 1.0089, + "step": 10090 + }, + { + "epoch": 0.3042168674698795, + "grad_norm": 2.0954484427271014, + "learning_rate": 8.334782185864598e-06, + "loss": 1.0294, + "step": 10100 + }, + { + "epoch": 0.3045180722891566, + "grad_norm": 3.303121187870742, + "learning_rate": 8.331069704091722e-06, + "loss": 1.0381, + "step": 10110 + }, + { + "epoch": 0.30481927710843376, + "grad_norm": 3.490692244009057, + "learning_rate": 8.327353917405685e-06, + "loss": 0.9893, + "step": 10120 + }, + { + "epoch": 0.30512048192771085, + "grad_norm": 4.715938395729553, + "learning_rate": 8.323634829493101e-06, + "loss": 0.9515, + "step": 10130 + }, + { + "epoch": 0.30542168674698794, + "grad_norm": 4.041702284257836, + "learning_rate": 8.319912444043856e-06, + "loss": 1.0521, + "step": 10140 + }, + { + "epoch": 0.3057228915662651, + "grad_norm": 2.1308567518259305, + "learning_rate": 8.316186764751101e-06, + "loss": 0.9391, + "step": 10150 + }, + { + "epoch": 0.3060240963855422, + "grad_norm": 6.398420670580796, + "learning_rate": 8.312457795311263e-06, + "loss": 1.0661, + "step": 10160 + }, + { + "epoch": 0.30632530120481927, + "grad_norm": 6.049901202327995, + "learning_rate": 8.308725539424032e-06, + "loss": 0.8529, + "step": 10170 + }, + { + "epoch": 0.3066265060240964, + "grad_norm": 2.1836920028458158, + "learning_rate": 8.304990000792354e-06, + "loss": 1.0632, + "step": 10180 + }, + { + "epoch": 0.3069277108433735, + "grad_norm": 4.315187827797635, + "learning_rate": 8.301251183122437e-06, + "loss": 1.1366, + "step": 10190 + }, + { + "epoch": 0.3072289156626506, + "grad_norm": 3.8805494967232863, + "learning_rate": 8.297509090123739e-06, + "loss": 1.0369, + "step": 10200 + }, + { + "epoch": 0.30753012048192774, + "grad_norm": 4.9133585436285605, + "learning_rate": 8.29376372550897e-06, + "loss": 1.1049, + "step": 10210 + }, + { + "epoch": 0.30783132530120483, + "grad_norm": 3.019274722378204, + "learning_rate": 8.290015092994083e-06, + "loss": 1.0532, + "step": 10220 + }, + { + "epoch": 0.3081325301204819, + "grad_norm": 2.176444809228001, + "learning_rate": 8.28626319629828e-06, + "loss": 1.0581, + "step": 10230 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 4.955783849926732, + "learning_rate": 8.282508039143992e-06, + "loss": 1.0501, + "step": 10240 + }, + { + "epoch": 0.30873493975903615, + "grad_norm": 4.108722335550171, + "learning_rate": 8.27874962525689e-06, + "loss": 0.9323, + "step": 10250 + }, + { + "epoch": 0.30903614457831324, + "grad_norm": 3.3746658696795206, + "learning_rate": 8.27498795836588e-06, + "loss": 0.9167, + "step": 10260 + }, + { + "epoch": 0.3093373493975904, + "grad_norm": 2.2109002059876834, + "learning_rate": 8.271223042203086e-06, + "loss": 0.8523, + "step": 10270 + }, + { + "epoch": 0.3096385542168675, + "grad_norm": 1.9938075662060806, + "learning_rate": 8.267454880503862e-06, + "loss": 0.962, + "step": 10280 + }, + { + "epoch": 0.30993975903614457, + "grad_norm": 4.895058685699006, + "learning_rate": 8.26368347700678e-06, + "loss": 1.0655, + "step": 10290 + }, + { + "epoch": 0.3102409638554217, + "grad_norm": 4.683607734527515, + "learning_rate": 8.259908835453635e-06, + "loss": 1.0902, + "step": 10300 + }, + { + "epoch": 0.3105421686746988, + "grad_norm": 4.551887962361471, + "learning_rate": 8.256130959589424e-06, + "loss": 1.0172, + "step": 10310 + }, + { + "epoch": 0.3108433734939759, + "grad_norm": 6.323504816735507, + "learning_rate": 8.252349853162358e-06, + "loss": 0.9511, + "step": 10320 + }, + { + "epoch": 0.31114457831325304, + "grad_norm": 2.2244199490191026, + "learning_rate": 8.248565519923855e-06, + "loss": 0.9199, + "step": 10330 + }, + { + "epoch": 0.31144578313253013, + "grad_norm": 3.4041532317646626, + "learning_rate": 8.244777963628532e-06, + "loss": 0.9874, + "step": 10340 + }, + { + "epoch": 0.3117469879518072, + "grad_norm": 2.169175654323326, + "learning_rate": 8.240987188034202e-06, + "loss": 1.0389, + "step": 10350 + }, + { + "epoch": 0.31204819277108437, + "grad_norm": 2.173393683751808, + "learning_rate": 8.237193196901876e-06, + "loss": 0.9878, + "step": 10360 + }, + { + "epoch": 0.31234939759036146, + "grad_norm": 1.8960616181589895, + "learning_rate": 8.233395993995757e-06, + "loss": 0.8596, + "step": 10370 + }, + { + "epoch": 0.31265060240963854, + "grad_norm": 4.881679487431638, + "learning_rate": 8.229595583083225e-06, + "loss": 0.9993, + "step": 10380 + }, + { + "epoch": 0.31295180722891563, + "grad_norm": 4.0622475220613365, + "learning_rate": 8.225791967934853e-06, + "loss": 1.0734, + "step": 10390 + }, + { + "epoch": 0.3132530120481928, + "grad_norm": 3.463187950518435, + "learning_rate": 8.221985152324385e-06, + "loss": 0.9813, + "step": 10400 + }, + { + "epoch": 0.31355421686746987, + "grad_norm": 3.414810584626675, + "learning_rate": 8.21817514002875e-06, + "loss": 1.0017, + "step": 10410 + }, + { + "epoch": 0.31385542168674696, + "grad_norm": 1.946505225650699, + "learning_rate": 8.21436193482804e-06, + "loss": 0.8486, + "step": 10420 + }, + { + "epoch": 0.3141566265060241, + "grad_norm": 4.058793686487053, + "learning_rate": 8.210545540505519e-06, + "loss": 1.0272, + "step": 10430 + }, + { + "epoch": 0.3144578313253012, + "grad_norm": 4.221866633099206, + "learning_rate": 8.20672596084761e-06, + "loss": 1.0307, + "step": 10440 + }, + { + "epoch": 0.3147590361445783, + "grad_norm": 4.250681700188214, + "learning_rate": 8.202903199643903e-06, + "loss": 0.9789, + "step": 10450 + }, + { + "epoch": 0.31506024096385543, + "grad_norm": 4.088554887712706, + "learning_rate": 8.199077260687145e-06, + "loss": 1.1202, + "step": 10460 + }, + { + "epoch": 0.3153614457831325, + "grad_norm": 4.336915914235672, + "learning_rate": 8.195248147773228e-06, + "loss": 0.9602, + "step": 10470 + }, + { + "epoch": 0.3156626506024096, + "grad_norm": 2.086010663424, + "learning_rate": 8.191415864701198e-06, + "loss": 0.9813, + "step": 10480 + }, + { + "epoch": 0.31596385542168676, + "grad_norm": 4.094892650793342, + "learning_rate": 8.187580415273247e-06, + "loss": 0.9845, + "step": 10490 + }, + { + "epoch": 0.31626506024096385, + "grad_norm": 5.0111439904767, + "learning_rate": 8.183741803294707e-06, + "loss": 1.0929, + "step": 10500 + }, + { + "epoch": 0.31656626506024094, + "grad_norm": 2.1263418445804843, + "learning_rate": 8.179900032574049e-06, + "loss": 0.9088, + "step": 10510 + }, + { + "epoch": 0.3168674698795181, + "grad_norm": 8.187585461551897, + "learning_rate": 8.176055106922873e-06, + "loss": 1.0677, + "step": 10520 + }, + { + "epoch": 0.31716867469879517, + "grad_norm": 2.2516961374446334, + "learning_rate": 8.172207030155922e-06, + "loss": 0.9015, + "step": 10530 + }, + { + "epoch": 0.31746987951807226, + "grad_norm": 4.516664383443458, + "learning_rate": 8.168355806091045e-06, + "loss": 1.0555, + "step": 10540 + }, + { + "epoch": 0.3177710843373494, + "grad_norm": 6.12958491028471, + "learning_rate": 8.164501438549234e-06, + "loss": 0.9436, + "step": 10550 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 4.2743258425990485, + "learning_rate": 8.160643931354586e-06, + "loss": 1.0102, + "step": 10560 + }, + { + "epoch": 0.3183734939759036, + "grad_norm": 2.100559407650895, + "learning_rate": 8.156783288334323e-06, + "loss": 0.9872, + "step": 10570 + }, + { + "epoch": 0.31867469879518073, + "grad_norm": 4.482796122449825, + "learning_rate": 8.152919513318766e-06, + "loss": 1.0186, + "step": 10580 + }, + { + "epoch": 0.3189759036144578, + "grad_norm": 4.864044671249022, + "learning_rate": 8.149052610141357e-06, + "loss": 0.9591, + "step": 10590 + }, + { + "epoch": 0.3192771084337349, + "grad_norm": 1.899935810099827, + "learning_rate": 8.145182582638632e-06, + "loss": 0.9434, + "step": 10600 + }, + { + "epoch": 0.31957831325301206, + "grad_norm": 5.821848522611604, + "learning_rate": 8.141309434650228e-06, + "loss": 1.0552, + "step": 10610 + }, + { + "epoch": 0.31987951807228915, + "grad_norm": 6.2963056165551805, + "learning_rate": 8.137433170018885e-06, + "loss": 1.02, + "step": 10620 + }, + { + "epoch": 0.32018072289156624, + "grad_norm": 5.508301247500946, + "learning_rate": 8.133553792590427e-06, + "loss": 1.076, + "step": 10630 + }, + { + "epoch": 0.3204819277108434, + "grad_norm": 1.885883373268772, + "learning_rate": 8.12967130621377e-06, + "loss": 0.9259, + "step": 10640 + }, + { + "epoch": 0.3207831325301205, + "grad_norm": 2.1958518354758754, + "learning_rate": 8.125785714740912e-06, + "loss": 1.0185, + "step": 10650 + }, + { + "epoch": 0.32108433734939756, + "grad_norm": 2.5813327610862618, + "learning_rate": 8.121897022026935e-06, + "loss": 1.0261, + "step": 10660 + }, + { + "epoch": 0.3213855421686747, + "grad_norm": 12.632784671316534, + "learning_rate": 8.118005231929999e-06, + "loss": 0.9618, + "step": 10670 + }, + { + "epoch": 0.3216867469879518, + "grad_norm": 33.71508467064113, + "learning_rate": 8.11411034831133e-06, + "loss": 1.0002, + "step": 10680 + }, + { + "epoch": 0.3219879518072289, + "grad_norm": 2.2510074427985955, + "learning_rate": 8.110212375035234e-06, + "loss": 1.0596, + "step": 10690 + }, + { + "epoch": 0.32228915662650603, + "grad_norm": 12.161236885052919, + "learning_rate": 8.106311315969069e-06, + "loss": 1.0568, + "step": 10700 + }, + { + "epoch": 0.3225903614457831, + "grad_norm": 5.371988165263253, + "learning_rate": 8.102407174983266e-06, + "loss": 0.9959, + "step": 10710 + }, + { + "epoch": 0.3228915662650602, + "grad_norm": 4.483323978123669, + "learning_rate": 8.09849995595131e-06, + "loss": 1.1077, + "step": 10720 + }, + { + "epoch": 0.32319277108433736, + "grad_norm": 6.201584399794071, + "learning_rate": 8.094589662749737e-06, + "loss": 1.0778, + "step": 10730 + }, + { + "epoch": 0.32349397590361445, + "grad_norm": 7.385309985399349, + "learning_rate": 8.090676299258138e-06, + "loss": 0.9272, + "step": 10740 + }, + { + "epoch": 0.32379518072289154, + "grad_norm": 5.064938597035172, + "learning_rate": 8.086759869359145e-06, + "loss": 1.0434, + "step": 10750 + }, + { + "epoch": 0.3240963855421687, + "grad_norm": 2.061147108501892, + "learning_rate": 8.082840376938436e-06, + "loss": 0.9781, + "step": 10760 + }, + { + "epoch": 0.3243975903614458, + "grad_norm": 5.756402561330784, + "learning_rate": 8.078917825884728e-06, + "loss": 0.9888, + "step": 10770 + }, + { + "epoch": 0.32469879518072287, + "grad_norm": 4.540292910517579, + "learning_rate": 8.07499222008977e-06, + "loss": 1.0483, + "step": 10780 + }, + { + "epoch": 0.325, + "grad_norm": 5.000002960734428, + "learning_rate": 8.071063563448341e-06, + "loss": 1.0091, + "step": 10790 + }, + { + "epoch": 0.3253012048192771, + "grad_norm": 2.042460734798289, + "learning_rate": 8.06713185985825e-06, + "loss": 0.8824, + "step": 10800 + }, + { + "epoch": 0.3256024096385542, + "grad_norm": 2.0853705185850457, + "learning_rate": 8.063197113220327e-06, + "loss": 0.9518, + "step": 10810 + }, + { + "epoch": 0.32590361445783134, + "grad_norm": 2.1195196765705737, + "learning_rate": 8.059259327438425e-06, + "loss": 1.0, + "step": 10820 + }, + { + "epoch": 0.3262048192771084, + "grad_norm": 5.880477193857654, + "learning_rate": 8.055318506419403e-06, + "loss": 1.0192, + "step": 10830 + }, + { + "epoch": 0.3265060240963855, + "grad_norm": 1.9781978177363087, + "learning_rate": 8.051374654073142e-06, + "loss": 0.9261, + "step": 10840 + }, + { + "epoch": 0.32680722891566266, + "grad_norm": 2.0116533404843024, + "learning_rate": 8.047427774312528e-06, + "loss": 0.9988, + "step": 10850 + }, + { + "epoch": 0.32710843373493975, + "grad_norm": 4.829421479149449, + "learning_rate": 8.043477871053445e-06, + "loss": 0.955, + "step": 10860 + }, + { + "epoch": 0.32740963855421684, + "grad_norm": 6.453665494173318, + "learning_rate": 8.03952494821478e-06, + "loss": 0.9507, + "step": 10870 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 3.8480244595337014, + "learning_rate": 8.03556900971842e-06, + "loss": 1.0797, + "step": 10880 + }, + { + "epoch": 0.3280120481927711, + "grad_norm": 4.564222918930517, + "learning_rate": 8.031610059489237e-06, + "loss": 0.9727, + "step": 10890 + }, + { + "epoch": 0.32831325301204817, + "grad_norm": 2.3236021716733184, + "learning_rate": 8.027648101455095e-06, + "loss": 1.0288, + "step": 10900 + }, + { + "epoch": 0.3286144578313253, + "grad_norm": 4.169807812805349, + "learning_rate": 8.023683139546841e-06, + "loss": 1.0955, + "step": 10910 + }, + { + "epoch": 0.3289156626506024, + "grad_norm": 2.003645238689451, + "learning_rate": 8.019715177698305e-06, + "loss": 1.0459, + "step": 10920 + }, + { + "epoch": 0.3292168674698795, + "grad_norm": 2.179700787095523, + "learning_rate": 8.015744219846289e-06, + "loss": 0.8858, + "step": 10930 + }, + { + "epoch": 0.32951807228915664, + "grad_norm": 4.444688268717414, + "learning_rate": 8.011770269930572e-06, + "loss": 0.9368, + "step": 10940 + }, + { + "epoch": 0.32981927710843373, + "grad_norm": 8.984812974122574, + "learning_rate": 8.007793331893898e-06, + "loss": 1.0194, + "step": 10950 + }, + { + "epoch": 0.3301204819277108, + "grad_norm": 1.9534289762852177, + "learning_rate": 8.003813409681978e-06, + "loss": 0.8108, + "step": 10960 + }, + { + "epoch": 0.33042168674698796, + "grad_norm": 5.295910267661215, + "learning_rate": 7.999830507243478e-06, + "loss": 0.8803, + "step": 10970 + }, + { + "epoch": 0.33072289156626505, + "grad_norm": 2.123748614518604, + "learning_rate": 7.995844628530034e-06, + "loss": 0.9602, + "step": 10980 + }, + { + "epoch": 0.33102409638554214, + "grad_norm": 12.124457803955242, + "learning_rate": 7.991855777496219e-06, + "loss": 1.0699, + "step": 10990 + }, + { + "epoch": 0.3313253012048193, + "grad_norm": 7.682101489526108, + "learning_rate": 7.987863958099564e-06, + "loss": 1.0692, + "step": 11000 + }, + { + "epoch": 0.3316265060240964, + "grad_norm": 7.738741702593449, + "learning_rate": 7.983869174300544e-06, + "loss": 1.0043, + "step": 11010 + }, + { + "epoch": 0.33192771084337347, + "grad_norm": 6.092864158051944, + "learning_rate": 7.979871430062577e-06, + "loss": 1.0381, + "step": 11020 + }, + { + "epoch": 0.3322289156626506, + "grad_norm": 4.176544518300791, + "learning_rate": 7.975870729352013e-06, + "loss": 1.0028, + "step": 11030 + }, + { + "epoch": 0.3325301204819277, + "grad_norm": 4.871432857202954, + "learning_rate": 7.971867076138139e-06, + "loss": 0.9203, + "step": 11040 + }, + { + "epoch": 0.3328313253012048, + "grad_norm": 4.1801337367747635, + "learning_rate": 7.96786047439317e-06, + "loss": 1.0804, + "step": 11050 + }, + { + "epoch": 0.33313253012048194, + "grad_norm": 11.014746998172521, + "learning_rate": 7.963850928092247e-06, + "loss": 1.0475, + "step": 11060 + }, + { + "epoch": 0.33343373493975903, + "grad_norm": 4.525575667256446, + "learning_rate": 7.959838441213432e-06, + "loss": 0.992, + "step": 11070 + }, + { + "epoch": 0.3337349397590361, + "grad_norm": 4.410829555085609, + "learning_rate": 7.955823017737706e-06, + "loss": 1.0486, + "step": 11080 + }, + { + "epoch": 0.33403614457831327, + "grad_norm": 3.4309869480769413, + "learning_rate": 7.951804661648958e-06, + "loss": 0.9851, + "step": 11090 + }, + { + "epoch": 0.33433734939759036, + "grad_norm": 7.16301588450031, + "learning_rate": 7.947783376933995e-06, + "loss": 0.9675, + "step": 11100 + }, + { + "epoch": 0.33463855421686745, + "grad_norm": 1.7946658805824416, + "learning_rate": 7.943759167582524e-06, + "loss": 0.8823, + "step": 11110 + }, + { + "epoch": 0.3349397590361446, + "grad_norm": 5.01038703406234, + "learning_rate": 7.939732037587155e-06, + "loss": 1.0359, + "step": 11120 + }, + { + "epoch": 0.3352409638554217, + "grad_norm": 4.131627029526289, + "learning_rate": 7.935701990943395e-06, + "loss": 0.9927, + "step": 11130 + }, + { + "epoch": 0.33554216867469877, + "grad_norm": 3.362435113496702, + "learning_rate": 7.931669031649648e-06, + "loss": 1.102, + "step": 11140 + }, + { + "epoch": 0.3358433734939759, + "grad_norm": 2.0904828288683888, + "learning_rate": 7.927633163707202e-06, + "loss": 0.9637, + "step": 11150 + }, + { + "epoch": 0.336144578313253, + "grad_norm": 1.9167001134841364, + "learning_rate": 7.923594391120237e-06, + "loss": 1.0139, + "step": 11160 + }, + { + "epoch": 0.3364457831325301, + "grad_norm": 3.8166751157117487, + "learning_rate": 7.919552717895808e-06, + "loss": 1.1167, + "step": 11170 + }, + { + "epoch": 0.33674698795180724, + "grad_norm": 1.8530000899009555, + "learning_rate": 7.915508148043857e-06, + "loss": 0.998, + "step": 11180 + }, + { + "epoch": 0.33704819277108433, + "grad_norm": 5.089422994663483, + "learning_rate": 7.911460685577193e-06, + "loss": 1.0284, + "step": 11190 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 5.340756030145489, + "learning_rate": 7.907410334511494e-06, + "loss": 0.9679, + "step": 11200 + }, + { + "epoch": 0.33765060240963857, + "grad_norm": 4.2778241487895645, + "learning_rate": 7.90335709886531e-06, + "loss": 1.0368, + "step": 11210 + }, + { + "epoch": 0.33795180722891566, + "grad_norm": 1.7482343695076692, + "learning_rate": 7.899300982660048e-06, + "loss": 0.9494, + "step": 11220 + }, + { + "epoch": 0.33825301204819275, + "grad_norm": 3.5537222269878637, + "learning_rate": 7.895241989919971e-06, + "loss": 1.0937, + "step": 11230 + }, + { + "epoch": 0.3385542168674699, + "grad_norm": 4.70721515455698, + "learning_rate": 7.891180124672205e-06, + "loss": 0.9367, + "step": 11240 + }, + { + "epoch": 0.338855421686747, + "grad_norm": 4.040149729274763, + "learning_rate": 7.887115390946716e-06, + "loss": 0.975, + "step": 11250 + }, + { + "epoch": 0.3391566265060241, + "grad_norm": 4.191861338628631, + "learning_rate": 7.883047792776324e-06, + "loss": 1.0594, + "step": 11260 + }, + { + "epoch": 0.3394578313253012, + "grad_norm": 5.162182052647102, + "learning_rate": 7.878977334196684e-06, + "loss": 1.0286, + "step": 11270 + }, + { + "epoch": 0.3397590361445783, + "grad_norm": 3.785017852331962, + "learning_rate": 7.874904019246294e-06, + "loss": 0.9398, + "step": 11280 + }, + { + "epoch": 0.3400602409638554, + "grad_norm": 4.492677914363335, + "learning_rate": 7.870827851966482e-06, + "loss": 1.0652, + "step": 11290 + }, + { + "epoch": 0.34036144578313254, + "grad_norm": 3.6401117799449745, + "learning_rate": 7.86674883640141e-06, + "loss": 0.9585, + "step": 11300 + }, + { + "epoch": 0.34066265060240963, + "grad_norm": 3.913573078107745, + "learning_rate": 7.862666976598063e-06, + "loss": 1.0016, + "step": 11310 + }, + { + "epoch": 0.3409638554216867, + "grad_norm": 3.6471231467900225, + "learning_rate": 7.85858227660625e-06, + "loss": 1.1002, + "step": 11320 + }, + { + "epoch": 0.34126506024096387, + "grad_norm": 2.0450868462730947, + "learning_rate": 7.854494740478596e-06, + "loss": 0.9459, + "step": 11330 + }, + { + "epoch": 0.34156626506024096, + "grad_norm": 5.270128040029897, + "learning_rate": 7.85040437227054e-06, + "loss": 0.8647, + "step": 11340 + }, + { + "epoch": 0.34186746987951805, + "grad_norm": 2.7680072621035454, + "learning_rate": 7.846311176040331e-06, + "loss": 1.0631, + "step": 11350 + }, + { + "epoch": 0.3421686746987952, + "grad_norm": 3.672272147127404, + "learning_rate": 7.842215155849024e-06, + "loss": 1.0353, + "step": 11360 + }, + { + "epoch": 0.3424698795180723, + "grad_norm": 7.606629385178783, + "learning_rate": 7.83811631576048e-06, + "loss": 0.9552, + "step": 11370 + }, + { + "epoch": 0.3427710843373494, + "grad_norm": 3.535222873623303, + "learning_rate": 7.834014659841349e-06, + "loss": 1.0624, + "step": 11380 + }, + { + "epoch": 0.3430722891566265, + "grad_norm": 2.0884459939916273, + "learning_rate": 7.829910192161083e-06, + "loss": 0.9438, + "step": 11390 + }, + { + "epoch": 0.3433734939759036, + "grad_norm": 4.195423300204089, + "learning_rate": 7.825802916791917e-06, + "loss": 1.0971, + "step": 11400 + }, + { + "epoch": 0.3436746987951807, + "grad_norm": 1.766631150711695, + "learning_rate": 7.821692837808877e-06, + "loss": 0.915, + "step": 11410 + }, + { + "epoch": 0.34397590361445785, + "grad_norm": 1.9538141950054282, + "learning_rate": 7.817579959289768e-06, + "loss": 0.8541, + "step": 11420 + }, + { + "epoch": 0.34427710843373494, + "grad_norm": 4.066235068672363, + "learning_rate": 7.813464285315171e-06, + "loss": 0.9872, + "step": 11430 + }, + { + "epoch": 0.344578313253012, + "grad_norm": 3.5828437974576963, + "learning_rate": 7.809345819968449e-06, + "loss": 1.1013, + "step": 11440 + }, + { + "epoch": 0.34487951807228917, + "grad_norm": 1.8901758058197726, + "learning_rate": 7.805224567335719e-06, + "loss": 0.9521, + "step": 11450 + }, + { + "epoch": 0.34518072289156626, + "grad_norm": 2.072625529214965, + "learning_rate": 7.801100531505877e-06, + "loss": 0.9442, + "step": 11460 + }, + { + "epoch": 0.34548192771084335, + "grad_norm": 2.0061827751587136, + "learning_rate": 7.796973716570576e-06, + "loss": 1.0048, + "step": 11470 + }, + { + "epoch": 0.3457831325301205, + "grad_norm": 5.362888952636561, + "learning_rate": 7.792844126624226e-06, + "loss": 0.9931, + "step": 11480 + }, + { + "epoch": 0.3460843373493976, + "grad_norm": 4.733275942529403, + "learning_rate": 7.788711765763987e-06, + "loss": 0.8837, + "step": 11490 + }, + { + "epoch": 0.3463855421686747, + "grad_norm": 5.216588717287876, + "learning_rate": 7.784576638089773e-06, + "loss": 1.0845, + "step": 11500 + }, + { + "epoch": 0.3466867469879518, + "grad_norm": 5.221184937904206, + "learning_rate": 7.780438747704246e-06, + "loss": 0.9466, + "step": 11510 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 4.503741386955517, + "learning_rate": 7.776298098712798e-06, + "loss": 1.0021, + "step": 11520 + }, + { + "epoch": 0.347289156626506, + "grad_norm": 1.8923775713854136, + "learning_rate": 7.772154695223568e-06, + "loss": 0.8952, + "step": 11530 + }, + { + "epoch": 0.34759036144578315, + "grad_norm": 5.6942666030655005, + "learning_rate": 7.768008541347423e-06, + "loss": 0.9808, + "step": 11540 + }, + { + "epoch": 0.34789156626506024, + "grad_norm": 4.077644152619464, + "learning_rate": 7.76385964119796e-06, + "loss": 0.8846, + "step": 11550 + }, + { + "epoch": 0.3481927710843373, + "grad_norm": 2.09267718532834, + "learning_rate": 7.759707998891501e-06, + "loss": 0.8713, + "step": 11560 + }, + { + "epoch": 0.3484939759036145, + "grad_norm": 10.34967959913251, + "learning_rate": 7.75555361854709e-06, + "loss": 0.9726, + "step": 11570 + }, + { + "epoch": 0.34879518072289156, + "grad_norm": 5.35403644832154, + "learning_rate": 7.751396504286482e-06, + "loss": 0.9894, + "step": 11580 + }, + { + "epoch": 0.34909638554216865, + "grad_norm": 1.8889949713805092, + "learning_rate": 7.747236660234152e-06, + "loss": 0.9279, + "step": 11590 + }, + { + "epoch": 0.3493975903614458, + "grad_norm": 3.626748629240793, + "learning_rate": 7.743074090517277e-06, + "loss": 0.9579, + "step": 11600 + }, + { + "epoch": 0.3496987951807229, + "grad_norm": 2.2815070727483175, + "learning_rate": 7.738908799265744e-06, + "loss": 1.0396, + "step": 11610 + }, + { + "epoch": 0.35, + "grad_norm": 3.9994521502929223, + "learning_rate": 7.734740790612137e-06, + "loss": 0.8925, + "step": 11620 + }, + { + "epoch": 0.3503012048192771, + "grad_norm": 3.6622482362392077, + "learning_rate": 7.73057006869173e-06, + "loss": 1.1075, + "step": 11630 + }, + { + "epoch": 0.3506024096385542, + "grad_norm": 2.069341044213193, + "learning_rate": 7.726396637642503e-06, + "loss": 0.8838, + "step": 11640 + }, + { + "epoch": 0.3509036144578313, + "grad_norm": 1.8822289172236946, + "learning_rate": 7.722220501605114e-06, + "loss": 0.9637, + "step": 11650 + }, + { + "epoch": 0.35120481927710845, + "grad_norm": 2.0548049156522814, + "learning_rate": 7.718041664722904e-06, + "loss": 0.8996, + "step": 11660 + }, + { + "epoch": 0.35150602409638554, + "grad_norm": 3.1647212555037894, + "learning_rate": 7.713860131141898e-06, + "loss": 0.9324, + "step": 11670 + }, + { + "epoch": 0.35180722891566263, + "grad_norm": 2.0307667265535407, + "learning_rate": 7.709675905010796e-06, + "loss": 0.8733, + "step": 11680 + }, + { + "epoch": 0.3521084337349398, + "grad_norm": 3.3587937751270207, + "learning_rate": 7.705488990480967e-06, + "loss": 1.0552, + "step": 11690 + }, + { + "epoch": 0.35240963855421686, + "grad_norm": 3.6672200860743716, + "learning_rate": 7.701299391706449e-06, + "loss": 0.9027, + "step": 11700 + }, + { + "epoch": 0.35271084337349395, + "grad_norm": 2.055809584161785, + "learning_rate": 7.697107112843944e-06, + "loss": 0.9677, + "step": 11710 + }, + { + "epoch": 0.3530120481927711, + "grad_norm": 5.681151115408283, + "learning_rate": 7.692912158052807e-06, + "loss": 1.1032, + "step": 11720 + }, + { + "epoch": 0.3533132530120482, + "grad_norm": 1.9187476703936253, + "learning_rate": 7.688714531495061e-06, + "loss": 0.9076, + "step": 11730 + }, + { + "epoch": 0.3536144578313253, + "grad_norm": 1.993897678107153, + "learning_rate": 7.684514237335365e-06, + "loss": 0.9445, + "step": 11740 + }, + { + "epoch": 0.3539156626506024, + "grad_norm": 3.8781429686090974, + "learning_rate": 7.680311279741033e-06, + "loss": 0.962, + "step": 11750 + }, + { + "epoch": 0.3542168674698795, + "grad_norm": 4.598950729399993, + "learning_rate": 7.676105662882023e-06, + "loss": 0.9035, + "step": 11760 + }, + { + "epoch": 0.3545180722891566, + "grad_norm": 4.469712832430609, + "learning_rate": 7.67189739093092e-06, + "loss": 0.9874, + "step": 11770 + }, + { + "epoch": 0.35481927710843375, + "grad_norm": 2.0608164849855215, + "learning_rate": 7.66768646806296e-06, + "loss": 0.9224, + "step": 11780 + }, + { + "epoch": 0.35512048192771084, + "grad_norm": 2.2705628156118784, + "learning_rate": 7.663472898455996e-06, + "loss": 1.0113, + "step": 11790 + }, + { + "epoch": 0.35542168674698793, + "grad_norm": 6.425444929730967, + "learning_rate": 7.659256686290513e-06, + "loss": 1.0422, + "step": 11800 + }, + { + "epoch": 0.3557228915662651, + "grad_norm": 2.0760865511603757, + "learning_rate": 7.655037835749615e-06, + "loss": 0.9742, + "step": 11810 + }, + { + "epoch": 0.35602409638554217, + "grad_norm": 3.222186139567789, + "learning_rate": 7.650816351019024e-06, + "loss": 1.0131, + "step": 11820 + }, + { + "epoch": 0.35632530120481926, + "grad_norm": 5.848832526960766, + "learning_rate": 7.646592236287078e-06, + "loss": 1.0364, + "step": 11830 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 4.3173339914073585, + "learning_rate": 7.642365495744724e-06, + "loss": 0.9846, + "step": 11840 + }, + { + "epoch": 0.3569277108433735, + "grad_norm": 1.9894810652672887, + "learning_rate": 7.638136133585511e-06, + "loss": 1.0054, + "step": 11850 + }, + { + "epoch": 0.3572289156626506, + "grad_norm": 2.490300414940136, + "learning_rate": 7.633904154005592e-06, + "loss": 0.9597, + "step": 11860 + }, + { + "epoch": 0.3575301204819277, + "grad_norm": 1.9732180468952747, + "learning_rate": 7.629669561203715e-06, + "loss": 0.9413, + "step": 11870 + }, + { + "epoch": 0.3578313253012048, + "grad_norm": 4.519632081109088, + "learning_rate": 7.62543235938122e-06, + "loss": 1.0379, + "step": 11880 + }, + { + "epoch": 0.3581325301204819, + "grad_norm": 7.541543856491166, + "learning_rate": 7.621192552742041e-06, + "loss": 0.9894, + "step": 11890 + }, + { + "epoch": 0.35843373493975905, + "grad_norm": 2.1839219380733037, + "learning_rate": 7.616950145492688e-06, + "loss": 0.9952, + "step": 11900 + }, + { + "epoch": 0.35873493975903614, + "grad_norm": 9.463629219698094, + "learning_rate": 7.612705141842258e-06, + "loss": 0.9865, + "step": 11910 + }, + { + "epoch": 0.35903614457831323, + "grad_norm": 7.0291374448550314, + "learning_rate": 7.608457546002423e-06, + "loss": 1.0408, + "step": 11920 + }, + { + "epoch": 0.3593373493975904, + "grad_norm": 15.025469769876697, + "learning_rate": 7.604207362187423e-06, + "loss": 1.0129, + "step": 11930 + }, + { + "epoch": 0.35963855421686747, + "grad_norm": 8.063578405889139, + "learning_rate": 7.5999545946140696e-06, + "loss": 1.0775, + "step": 11940 + }, + { + "epoch": 0.35993975903614456, + "grad_norm": 6.312545184922915, + "learning_rate": 7.595699247501735e-06, + "loss": 0.9485, + "step": 11950 + }, + { + "epoch": 0.3602409638554217, + "grad_norm": 5.036637979316993, + "learning_rate": 7.591441325072355e-06, + "loss": 0.9692, + "step": 11960 + }, + { + "epoch": 0.3605421686746988, + "grad_norm": 5.121623705611691, + "learning_rate": 7.5871808315504134e-06, + "loss": 0.9424, + "step": 11970 + }, + { + "epoch": 0.3608433734939759, + "grad_norm": 4.708061740133258, + "learning_rate": 7.5829177711629506e-06, + "loss": 0.9425, + "step": 11980 + }, + { + "epoch": 0.36114457831325303, + "grad_norm": 4.403096358482696, + "learning_rate": 7.578652148139553e-06, + "loss": 0.9203, + "step": 11990 + }, + { + "epoch": 0.3614457831325301, + "grad_norm": 4.495083693777487, + "learning_rate": 7.574383966712348e-06, + "loss": 1.028, + "step": 12000 + }, + { + "epoch": 0.3617469879518072, + "grad_norm": 3.7829270017291474, + "learning_rate": 7.570113231116005e-06, + "loss": 1.041, + "step": 12010 + }, + { + "epoch": 0.36204819277108435, + "grad_norm": 4.045104830847792, + "learning_rate": 7.56583994558772e-06, + "loss": 1.0033, + "step": 12020 + }, + { + "epoch": 0.36234939759036144, + "grad_norm": 5.584787634293018, + "learning_rate": 7.561564114367226e-06, + "loss": 0.9923, + "step": 12030 + }, + { + "epoch": 0.36265060240963853, + "grad_norm": 3.878966372313198, + "learning_rate": 7.557285741696777e-06, + "loss": 1.1086, + "step": 12040 + }, + { + "epoch": 0.3629518072289157, + "grad_norm": 3.2923754142678288, + "learning_rate": 7.553004831821153e-06, + "loss": 0.9805, + "step": 12050 + }, + { + "epoch": 0.36325301204819277, + "grad_norm": 3.4821449007447116, + "learning_rate": 7.5487213889876475e-06, + "loss": 1.0026, + "step": 12060 + }, + { + "epoch": 0.36355421686746986, + "grad_norm": 3.4609487186400054, + "learning_rate": 7.544435417446068e-06, + "loss": 0.7465, + "step": 12070 + }, + { + "epoch": 0.363855421686747, + "grad_norm": 3.3833798650840685, + "learning_rate": 7.540146921448735e-06, + "loss": 0.9664, + "step": 12080 + }, + { + "epoch": 0.3641566265060241, + "grad_norm": 1.984224008796806, + "learning_rate": 7.535855905250464e-06, + "loss": 0.9476, + "step": 12090 + }, + { + "epoch": 0.3644578313253012, + "grad_norm": 4.128395853880073, + "learning_rate": 7.531562373108583e-06, + "loss": 0.9898, + "step": 12100 + }, + { + "epoch": 0.36475903614457833, + "grad_norm": 3.523795842940874, + "learning_rate": 7.527266329282905e-06, + "loss": 0.9142, + "step": 12110 + }, + { + "epoch": 0.3650602409638554, + "grad_norm": 1.998039113548706, + "learning_rate": 7.5229677780357435e-06, + "loss": 1.0372, + "step": 12120 + }, + { + "epoch": 0.3653614457831325, + "grad_norm": 3.7965229995688725, + "learning_rate": 7.5186667236318936e-06, + "loss": 0.8416, + "step": 12130 + }, + { + "epoch": 0.36566265060240966, + "grad_norm": 3.481348140636192, + "learning_rate": 7.514363170338639e-06, + "loss": 1.0853, + "step": 12140 + }, + { + "epoch": 0.36596385542168675, + "grad_norm": 2.9042009507627604, + "learning_rate": 7.510057122425737e-06, + "loss": 0.9719, + "step": 12150 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 5.666203153319428, + "learning_rate": 7.505748584165426e-06, + "loss": 0.9615, + "step": 12160 + }, + { + "epoch": 0.366566265060241, + "grad_norm": 2.0590145327402185, + "learning_rate": 7.501437559832413e-06, + "loss": 0.9807, + "step": 12170 + }, + { + "epoch": 0.36686746987951807, + "grad_norm": 2.077984305278314, + "learning_rate": 7.497124053703868e-06, + "loss": 0.9644, + "step": 12180 + }, + { + "epoch": 0.36716867469879516, + "grad_norm": 3.243407402314189, + "learning_rate": 7.492808070059428e-06, + "loss": 0.9872, + "step": 12190 + }, + { + "epoch": 0.3674698795180723, + "grad_norm": 3.755668320775745, + "learning_rate": 7.488489613181186e-06, + "loss": 1.0369, + "step": 12200 + }, + { + "epoch": 0.3677710843373494, + "grad_norm": 3.0652712536883095, + "learning_rate": 7.48416868735369e-06, + "loss": 0.9233, + "step": 12210 + }, + { + "epoch": 0.3680722891566265, + "grad_norm": 3.4261592542583794, + "learning_rate": 7.479845296863934e-06, + "loss": 1.1334, + "step": 12220 + }, + { + "epoch": 0.36837349397590363, + "grad_norm": 5.704405789481728, + "learning_rate": 7.4755194460013615e-06, + "loss": 1.0497, + "step": 12230 + }, + { + "epoch": 0.3686746987951807, + "grad_norm": 3.1187485791328675, + "learning_rate": 7.4711911390578565e-06, + "loss": 1.0918, + "step": 12240 + }, + { + "epoch": 0.3689759036144578, + "grad_norm": 1.9253129895742902, + "learning_rate": 7.466860380327738e-06, + "loss": 1.0079, + "step": 12250 + }, + { + "epoch": 0.36927710843373496, + "grad_norm": 4.269061577068835, + "learning_rate": 7.462527174107757e-06, + "loss": 1.0405, + "step": 12260 + }, + { + "epoch": 0.36957831325301205, + "grad_norm": 3.8701615824129516, + "learning_rate": 7.458191524697095e-06, + "loss": 0.9703, + "step": 12270 + }, + { + "epoch": 0.36987951807228914, + "grad_norm": 3.3658365668570505, + "learning_rate": 7.453853436397358e-06, + "loss": 0.971, + "step": 12280 + }, + { + "epoch": 0.3701807228915663, + "grad_norm": 2.1062448579153044, + "learning_rate": 7.449512913512569e-06, + "loss": 0.9751, + "step": 12290 + }, + { + "epoch": 0.3704819277108434, + "grad_norm": 2.0748820681020748, + "learning_rate": 7.445169960349167e-06, + "loss": 1.0067, + "step": 12300 + }, + { + "epoch": 0.37078313253012046, + "grad_norm": 3.859794676968757, + "learning_rate": 7.440824581216005e-06, + "loss": 0.9573, + "step": 12310 + }, + { + "epoch": 0.3710843373493976, + "grad_norm": 3.466358057065326, + "learning_rate": 7.436476780424339e-06, + "loss": 0.9606, + "step": 12320 + }, + { + "epoch": 0.3713855421686747, + "grad_norm": 2.1083525878028824, + "learning_rate": 7.432126562287833e-06, + "loss": 0.8662, + "step": 12330 + }, + { + "epoch": 0.3716867469879518, + "grad_norm": 3.797578592911251, + "learning_rate": 7.427773931122541e-06, + "loss": 1.0214, + "step": 12340 + }, + { + "epoch": 0.37198795180722893, + "grad_norm": 3.6345295936608104, + "learning_rate": 7.4234188912469185e-06, + "loss": 1.0191, + "step": 12350 + }, + { + "epoch": 0.372289156626506, + "grad_norm": 3.74131550894578, + "learning_rate": 7.419061446981809e-06, + "loss": 0.9515, + "step": 12360 + }, + { + "epoch": 0.3725903614457831, + "grad_norm": 4.015919698762564, + "learning_rate": 7.41470160265044e-06, + "loss": 0.9016, + "step": 12370 + }, + { + "epoch": 0.37289156626506026, + "grad_norm": 1.8975136407931068, + "learning_rate": 7.410339362578422e-06, + "loss": 0.9815, + "step": 12380 + }, + { + "epoch": 0.37319277108433735, + "grad_norm": 2.027588364097527, + "learning_rate": 7.40597473109374e-06, + "loss": 1.0008, + "step": 12390 + }, + { + "epoch": 0.37349397590361444, + "grad_norm": 4.210884740114786, + "learning_rate": 7.4016077125267524e-06, + "loss": 1.0322, + "step": 12400 + }, + { + "epoch": 0.3737951807228916, + "grad_norm": 1.900718721757264, + "learning_rate": 7.397238311210189e-06, + "loss": 1.0023, + "step": 12410 + }, + { + "epoch": 0.3740963855421687, + "grad_norm": 4.013336922349029, + "learning_rate": 7.392866531479142e-06, + "loss": 1.0003, + "step": 12420 + }, + { + "epoch": 0.37439759036144576, + "grad_norm": 2.1955865782421173, + "learning_rate": 7.38849237767106e-06, + "loss": 0.9563, + "step": 12430 + }, + { + "epoch": 0.3746987951807229, + "grad_norm": 2.0046519737550335, + "learning_rate": 7.384115854125748e-06, + "loss": 0.9102, + "step": 12440 + }, + { + "epoch": 0.375, + "grad_norm": 3.9016136604452996, + "learning_rate": 7.379736965185369e-06, + "loss": 0.9413, + "step": 12450 + }, + { + "epoch": 0.3753012048192771, + "grad_norm": 3.2483313959778872, + "learning_rate": 7.375355715194423e-06, + "loss": 1.0819, + "step": 12460 + }, + { + "epoch": 0.37560240963855424, + "grad_norm": 3.204785265080706, + "learning_rate": 7.37097210849976e-06, + "loss": 1.0078, + "step": 12470 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 3.3390830579513957, + "learning_rate": 7.366586149450565e-06, + "loss": 1.0894, + "step": 12480 + }, + { + "epoch": 0.3762048192771084, + "grad_norm": 3.2214701749722265, + "learning_rate": 7.362197842398355e-06, + "loss": 0.9376, + "step": 12490 + }, + { + "epoch": 0.37650602409638556, + "grad_norm": 4.187817334567737, + "learning_rate": 7.3578071916969815e-06, + "loss": 1.0763, + "step": 12500 + }, + { + "epoch": 0.37680722891566265, + "grad_norm": 1.9445817326027879, + "learning_rate": 7.353414201702618e-06, + "loss": 0.8968, + "step": 12510 + }, + { + "epoch": 0.37710843373493974, + "grad_norm": 3.3085143469005405, + "learning_rate": 7.349018876773757e-06, + "loss": 1.0455, + "step": 12520 + }, + { + "epoch": 0.3774096385542169, + "grad_norm": 1.8688818785972638, + "learning_rate": 7.344621221271213e-06, + "loss": 0.958, + "step": 12530 + }, + { + "epoch": 0.377710843373494, + "grad_norm": 3.2987717394592617, + "learning_rate": 7.340221239558108e-06, + "loss": 0.9878, + "step": 12540 + }, + { + "epoch": 0.37801204819277107, + "grad_norm": 3.082266542193966, + "learning_rate": 7.335818935999874e-06, + "loss": 1.0674, + "step": 12550 + }, + { + "epoch": 0.3783132530120482, + "grad_norm": 3.2534423955558274, + "learning_rate": 7.331414314964248e-06, + "loss": 1.1027, + "step": 12560 + }, + { + "epoch": 0.3786144578313253, + "grad_norm": 3.43243228305782, + "learning_rate": 7.3270073808212615e-06, + "loss": 0.958, + "step": 12570 + }, + { + "epoch": 0.3789156626506024, + "grad_norm": 2.832033942525752, + "learning_rate": 7.322598137943249e-06, + "loss": 0.9465, + "step": 12580 + }, + { + "epoch": 0.37921686746987954, + "grad_norm": 1.937079304513958, + "learning_rate": 7.318186590704824e-06, + "loss": 0.9161, + "step": 12590 + }, + { + "epoch": 0.3795180722891566, + "grad_norm": 3.182373336700091, + "learning_rate": 7.3137727434828995e-06, + "loss": 1.0651, + "step": 12600 + }, + { + "epoch": 0.3798192771084337, + "grad_norm": 2.1070515774968333, + "learning_rate": 7.309356600656659e-06, + "loss": 0.998, + "step": 12610 + }, + { + "epoch": 0.38012048192771086, + "grad_norm": 2.9732502478244887, + "learning_rate": 7.3049381666075715e-06, + "loss": 0.9811, + "step": 12620 + }, + { + "epoch": 0.38042168674698795, + "grad_norm": 3.514134853004432, + "learning_rate": 7.300517445719375e-06, + "loss": 0.9795, + "step": 12630 + }, + { + "epoch": 0.38072289156626504, + "grad_norm": 3.107817337048043, + "learning_rate": 7.296094442378078e-06, + "loss": 0.9908, + "step": 12640 + }, + { + "epoch": 0.3810240963855422, + "grad_norm": 3.4623836273364055, + "learning_rate": 7.2916691609719525e-06, + "loss": 0.947, + "step": 12650 + }, + { + "epoch": 0.3813253012048193, + "grad_norm": 3.086429428881401, + "learning_rate": 7.2872416058915305e-06, + "loss": 0.9981, + "step": 12660 + }, + { + "epoch": 0.38162650602409637, + "grad_norm": 3.8489801730440503, + "learning_rate": 7.2828117815296025e-06, + "loss": 0.9235, + "step": 12670 + }, + { + "epoch": 0.3819277108433735, + "grad_norm": 3.4102823216410205, + "learning_rate": 7.278379692281209e-06, + "loss": 0.9588, + "step": 12680 + }, + { + "epoch": 0.3822289156626506, + "grad_norm": 2.949334287853416, + "learning_rate": 7.2739453425436345e-06, + "loss": 0.917, + "step": 12690 + }, + { + "epoch": 0.3825301204819277, + "grad_norm": 3.1229075873163845, + "learning_rate": 7.269508736716411e-06, + "loss": 1.0317, + "step": 12700 + }, + { + "epoch": 0.38283132530120484, + "grad_norm": 3.351920270635086, + "learning_rate": 7.2650698792013065e-06, + "loss": 1.0268, + "step": 12710 + }, + { + "epoch": 0.38313253012048193, + "grad_norm": 2.929317217211371, + "learning_rate": 7.260628774402322e-06, + "loss": 1.0022, + "step": 12720 + }, + { + "epoch": 0.383433734939759, + "grad_norm": 3.613800636085388, + "learning_rate": 7.256185426725691e-06, + "loss": 1.0318, + "step": 12730 + }, + { + "epoch": 0.38373493975903616, + "grad_norm": 3.2014664228395064, + "learning_rate": 7.25173984057987e-06, + "loss": 1.0667, + "step": 12740 + }, + { + "epoch": 0.38403614457831325, + "grad_norm": 1.9295027200797479, + "learning_rate": 7.247292020375537e-06, + "loss": 0.9882, + "step": 12750 + }, + { + "epoch": 0.38433734939759034, + "grad_norm": 3.2515691317224134, + "learning_rate": 7.242841970525587e-06, + "loss": 0.9574, + "step": 12760 + }, + { + "epoch": 0.3846385542168675, + "grad_norm": 2.0144366573518604, + "learning_rate": 7.238389695445127e-06, + "loss": 1.0104, + "step": 12770 + }, + { + "epoch": 0.3849397590361446, + "grad_norm": 1.9938680368796105, + "learning_rate": 7.23393519955147e-06, + "loss": 0.8391, + "step": 12780 + }, + { + "epoch": 0.38524096385542167, + "grad_norm": 3.0479896541727065, + "learning_rate": 7.229478487264134e-06, + "loss": 1.0009, + "step": 12790 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 7.5561014652032235, + "learning_rate": 7.225019563004839e-06, + "loss": 0.914, + "step": 12800 + }, + { + "epoch": 0.3858433734939759, + "grad_norm": 3.177648156582394, + "learning_rate": 7.220558431197493e-06, + "loss": 0.9036, + "step": 12810 + }, + { + "epoch": 0.386144578313253, + "grad_norm": 3.15907668264104, + "learning_rate": 7.216095096268199e-06, + "loss": 0.9943, + "step": 12820 + }, + { + "epoch": 0.38644578313253014, + "grad_norm": 2.9532276163876383, + "learning_rate": 7.2116295626452434e-06, + "loss": 1.0642, + "step": 12830 + }, + { + "epoch": 0.38674698795180723, + "grad_norm": 3.0071648243174125, + "learning_rate": 7.207161834759097e-06, + "loss": 0.9418, + "step": 12840 + }, + { + "epoch": 0.3870481927710843, + "grad_norm": 2.9061077102466104, + "learning_rate": 7.202691917042406e-06, + "loss": 1.0247, + "step": 12850 + }, + { + "epoch": 0.38734939759036147, + "grad_norm": 4.312299457555282, + "learning_rate": 7.198219813929985e-06, + "loss": 1.142, + "step": 12860 + }, + { + "epoch": 0.38765060240963856, + "grad_norm": 3.9434226113424566, + "learning_rate": 7.193745529858827e-06, + "loss": 1.0536, + "step": 12870 + }, + { + "epoch": 0.38795180722891565, + "grad_norm": 3.345993349978265, + "learning_rate": 7.189269069268077e-06, + "loss": 0.9771, + "step": 12880 + }, + { + "epoch": 0.3882530120481928, + "grad_norm": 3.1833847360222634, + "learning_rate": 7.184790436599047e-06, + "loss": 0.9317, + "step": 12890 + }, + { + "epoch": 0.3885542168674699, + "grad_norm": 1.9698557559954264, + "learning_rate": 7.1803096362952066e-06, + "loss": 0.9526, + "step": 12900 + }, + { + "epoch": 0.38885542168674697, + "grad_norm": 4.984760388967978, + "learning_rate": 7.175826672802163e-06, + "loss": 1.0319, + "step": 12910 + }, + { + "epoch": 0.3891566265060241, + "grad_norm": 3.751405052016957, + "learning_rate": 7.171341550567685e-06, + "loss": 1.0487, + "step": 12920 + }, + { + "epoch": 0.3894578313253012, + "grad_norm": 3.7328836878826506, + "learning_rate": 7.166854274041672e-06, + "loss": 0.9844, + "step": 12930 + }, + { + "epoch": 0.3897590361445783, + "grad_norm": 3.3695214026996143, + "learning_rate": 7.162364847676169e-06, + "loss": 0.9598, + "step": 12940 + }, + { + "epoch": 0.39006024096385544, + "grad_norm": 2.142318438489866, + "learning_rate": 7.157873275925347e-06, + "loss": 0.8418, + "step": 12950 + }, + { + "epoch": 0.39036144578313253, + "grad_norm": 1.8798550671654182, + "learning_rate": 7.1533795632455105e-06, + "loss": 0.9877, + "step": 12960 + }, + { + "epoch": 0.3906626506024096, + "grad_norm": 1.9725118116370421, + "learning_rate": 7.1488837140950865e-06, + "loss": 0.9163, + "step": 12970 + }, + { + "epoch": 0.39096385542168677, + "grad_norm": 4.642643795325702, + "learning_rate": 7.144385732934618e-06, + "loss": 1.037, + "step": 12980 + }, + { + "epoch": 0.39126506024096386, + "grad_norm": 3.4401784850680848, + "learning_rate": 7.13988562422677e-06, + "loss": 1.0857, + "step": 12990 + }, + { + "epoch": 0.39156626506024095, + "grad_norm": 4.453893132851004, + "learning_rate": 7.135383392436314e-06, + "loss": 1.0527, + "step": 13000 + }, + { + "epoch": 0.3918674698795181, + "grad_norm": 3.4785194656641596, + "learning_rate": 7.130879042030129e-06, + "loss": 0.9375, + "step": 13010 + }, + { + "epoch": 0.3921686746987952, + "grad_norm": 4.25105276360076, + "learning_rate": 7.126372577477196e-06, + "loss": 1.07, + "step": 13020 + }, + { + "epoch": 0.3924698795180723, + "grad_norm": 3.8572850912672454, + "learning_rate": 7.121864003248593e-06, + "loss": 1.0047, + "step": 13030 + }, + { + "epoch": 0.3927710843373494, + "grad_norm": 3.41181115848503, + "learning_rate": 7.117353323817491e-06, + "loss": 1.0365, + "step": 13040 + }, + { + "epoch": 0.3930722891566265, + "grad_norm": 3.953355678442684, + "learning_rate": 7.112840543659154e-06, + "loss": 0.9804, + "step": 13050 + }, + { + "epoch": 0.3933734939759036, + "grad_norm": 2.0763038861700864, + "learning_rate": 7.10832566725092e-06, + "loss": 0.9337, + "step": 13060 + }, + { + "epoch": 0.39367469879518074, + "grad_norm": 4.7423588529422025, + "learning_rate": 7.103808699072219e-06, + "loss": 1.0017, + "step": 13070 + }, + { + "epoch": 0.39397590361445783, + "grad_norm": 4.681127159543774, + "learning_rate": 7.099289643604549e-06, + "loss": 0.9772, + "step": 13080 + }, + { + "epoch": 0.3942771084337349, + "grad_norm": 2.075029851200195, + "learning_rate": 7.0947685053314785e-06, + "loss": 0.8014, + "step": 13090 + }, + { + "epoch": 0.39457831325301207, + "grad_norm": 8.081133957333527, + "learning_rate": 7.090245288738648e-06, + "loss": 0.9404, + "step": 13100 + }, + { + "epoch": 0.39487951807228916, + "grad_norm": 8.287050295038267, + "learning_rate": 7.085719998313752e-06, + "loss": 1.0125, + "step": 13110 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 5.2170357109722545, + "learning_rate": 7.081192638546551e-06, + "loss": 0.9959, + "step": 13120 + }, + { + "epoch": 0.3954819277108434, + "grad_norm": 3.4583333334809483, + "learning_rate": 7.076663213928852e-06, + "loss": 1.0156, + "step": 13130 + }, + { + "epoch": 0.3957831325301205, + "grad_norm": 5.98502170032457, + "learning_rate": 7.072131728954514e-06, + "loss": 0.982, + "step": 13140 + }, + { + "epoch": 0.3960843373493976, + "grad_norm": 3.8811049248944207, + "learning_rate": 7.0675981881194415e-06, + "loss": 0.9698, + "step": 13150 + }, + { + "epoch": 0.3963855421686747, + "grad_norm": 3.8526087519677605, + "learning_rate": 7.063062595921573e-06, + "loss": 0.9461, + "step": 13160 + }, + { + "epoch": 0.3966867469879518, + "grad_norm": 4.52942933395037, + "learning_rate": 7.0585249568608885e-06, + "loss": 1.0396, + "step": 13170 + }, + { + "epoch": 0.3969879518072289, + "grad_norm": 4.504220498757502, + "learning_rate": 7.053985275439395e-06, + "loss": 0.998, + "step": 13180 + }, + { + "epoch": 0.39728915662650605, + "grad_norm": 4.933275566569705, + "learning_rate": 7.049443556161127e-06, + "loss": 1.0248, + "step": 13190 + }, + { + "epoch": 0.39759036144578314, + "grad_norm": 7.435038837767327, + "learning_rate": 7.044899803532141e-06, + "loss": 1.0102, + "step": 13200 + }, + { + "epoch": 0.3978915662650602, + "grad_norm": 1.9475239073919097, + "learning_rate": 7.040354022060512e-06, + "loss": 0.9236, + "step": 13210 + }, + { + "epoch": 0.39819277108433737, + "grad_norm": 5.799189333567071, + "learning_rate": 7.035806216256326e-06, + "loss": 0.8302, + "step": 13220 + }, + { + "epoch": 0.39849397590361446, + "grad_norm": 3.474951174700604, + "learning_rate": 7.031256390631675e-06, + "loss": 1.0335, + "step": 13230 + }, + { + "epoch": 0.39879518072289155, + "grad_norm": 2.8426556346425205, + "learning_rate": 7.0267045497006626e-06, + "loss": 1.0369, + "step": 13240 + }, + { + "epoch": 0.3990963855421687, + "grad_norm": 1.9042082433939282, + "learning_rate": 7.022150697979385e-06, + "loss": 0.9762, + "step": 13250 + }, + { + "epoch": 0.3993975903614458, + "grad_norm": 3.0877693825088075, + "learning_rate": 7.017594839985937e-06, + "loss": 0.9054, + "step": 13260 + }, + { + "epoch": 0.3996987951807229, + "grad_norm": 1.8666653249253085, + "learning_rate": 7.013036980240401e-06, + "loss": 0.9911, + "step": 13270 + }, + { + "epoch": 0.4, + "grad_norm": 3.647999584328929, + "learning_rate": 7.008477123264849e-06, + "loss": 1.0216, + "step": 13280 + }, + { + "epoch": 0.4003012048192771, + "grad_norm": 2.7021011233799324, + "learning_rate": 7.00391527358333e-06, + "loss": 1.0102, + "step": 13290 + }, + { + "epoch": 0.4006024096385542, + "grad_norm": 1.861482437705702, + "learning_rate": 6.999351435721875e-06, + "loss": 0.959, + "step": 13300 + }, + { + "epoch": 0.40090361445783135, + "grad_norm": 2.969205698521233, + "learning_rate": 6.994785614208484e-06, + "loss": 1.023, + "step": 13310 + }, + { + "epoch": 0.40120481927710844, + "grad_norm": 3.2800693516097525, + "learning_rate": 6.990217813573126e-06, + "loss": 1.0551, + "step": 13320 + }, + { + "epoch": 0.4015060240963855, + "grad_norm": 3.307627142993508, + "learning_rate": 6.9856480383477355e-06, + "loss": 1.0215, + "step": 13330 + }, + { + "epoch": 0.4018072289156627, + "grad_norm": 2.916283279567481, + "learning_rate": 6.981076293066204e-06, + "loss": 0.9557, + "step": 13340 + }, + { + "epoch": 0.40210843373493976, + "grad_norm": 1.9335183091976946, + "learning_rate": 6.976502582264377e-06, + "loss": 0.8666, + "step": 13350 + }, + { + "epoch": 0.40240963855421685, + "grad_norm": 3.7893028571634293, + "learning_rate": 6.971926910480052e-06, + "loss": 1.0476, + "step": 13360 + }, + { + "epoch": 0.402710843373494, + "grad_norm": 6.308427673739527, + "learning_rate": 6.967349282252973e-06, + "loss": 0.8638, + "step": 13370 + }, + { + "epoch": 0.4030120481927711, + "grad_norm": 1.8787099051799305, + "learning_rate": 6.962769702124819e-06, + "loss": 0.8845, + "step": 13380 + }, + { + "epoch": 0.4033132530120482, + "grad_norm": 3.0708435846186646, + "learning_rate": 6.958188174639214e-06, + "loss": 1.0184, + "step": 13390 + }, + { + "epoch": 0.4036144578313253, + "grad_norm": 3.7886839727098587, + "learning_rate": 6.953604704341707e-06, + "loss": 0.9131, + "step": 13400 + }, + { + "epoch": 0.4039156626506024, + "grad_norm": 2.8716752329837347, + "learning_rate": 6.9490192957797795e-06, + "loss": 1.0776, + "step": 13410 + }, + { + "epoch": 0.4042168674698795, + "grad_norm": 2.15047239622301, + "learning_rate": 6.9444319535028345e-06, + "loss": 0.927, + "step": 13420 + }, + { + "epoch": 0.40451807228915665, + "grad_norm": 3.240249287432723, + "learning_rate": 6.939842682062191e-06, + "loss": 0.9992, + "step": 13430 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 1.7911534969340555, + "learning_rate": 6.9352514860110876e-06, + "loss": 0.9444, + "step": 13440 + }, + { + "epoch": 0.40512048192771083, + "grad_norm": 2.0337847761318852, + "learning_rate": 6.930658369904664e-06, + "loss": 0.8718, + "step": 13450 + }, + { + "epoch": 0.405421686746988, + "grad_norm": 3.8277504093569634, + "learning_rate": 6.926063338299972e-06, + "loss": 1.0422, + "step": 13460 + }, + { + "epoch": 0.40572289156626506, + "grad_norm": 2.037541183555164, + "learning_rate": 6.921466395755963e-06, + "loss": 0.8345, + "step": 13470 + }, + { + "epoch": 0.40602409638554215, + "grad_norm": 1.857758733614469, + "learning_rate": 6.916867546833481e-06, + "loss": 1.0182, + "step": 13480 + }, + { + "epoch": 0.4063253012048193, + "grad_norm": 3.687087016747306, + "learning_rate": 6.912266796095267e-06, + "loss": 0.8923, + "step": 13490 + }, + { + "epoch": 0.4066265060240964, + "grad_norm": 3.2284327166784608, + "learning_rate": 6.907664148105939e-06, + "loss": 0.9552, + "step": 13500 + }, + { + "epoch": 0.4069277108433735, + "grad_norm": 3.3924336900550047, + "learning_rate": 6.90305960743201e-06, + "loss": 1.038, + "step": 13510 + }, + { + "epoch": 0.4072289156626506, + "grad_norm": 3.284674519110202, + "learning_rate": 6.8984531786418615e-06, + "loss": 0.9603, + "step": 13520 + }, + { + "epoch": 0.4075301204819277, + "grad_norm": 1.8492992696305701, + "learning_rate": 6.893844866305751e-06, + "loss": 0.8277, + "step": 13530 + }, + { + "epoch": 0.4078313253012048, + "grad_norm": 3.952060181626697, + "learning_rate": 6.889234674995806e-06, + "loss": 1.0991, + "step": 13540 + }, + { + "epoch": 0.40813253012048195, + "grad_norm": 4.09744931047574, + "learning_rate": 6.8846226092860145e-06, + "loss": 0.9109, + "step": 13550 + }, + { + "epoch": 0.40843373493975904, + "grad_norm": 8.374704682984751, + "learning_rate": 6.880008673752231e-06, + "loss": 1.0218, + "step": 13560 + }, + { + "epoch": 0.40873493975903613, + "grad_norm": 3.760826936335024, + "learning_rate": 6.87539287297216e-06, + "loss": 0.9871, + "step": 13570 + }, + { + "epoch": 0.4090361445783133, + "grad_norm": 3.3006168338048543, + "learning_rate": 6.8707752115253556e-06, + "loss": 0.9671, + "step": 13580 + }, + { + "epoch": 0.40933734939759037, + "grad_norm": 3.7756222005743343, + "learning_rate": 6.866155693993222e-06, + "loss": 1.0525, + "step": 13590 + }, + { + "epoch": 0.40963855421686746, + "grad_norm": 3.3836339010150143, + "learning_rate": 6.861534324959e-06, + "loss": 0.8573, + "step": 13600 + }, + { + "epoch": 0.4099397590361446, + "grad_norm": 2.9025089389335976, + "learning_rate": 6.856911109007773e-06, + "loss": 1.0036, + "step": 13610 + }, + { + "epoch": 0.4102409638554217, + "grad_norm": 3.9996824449944297, + "learning_rate": 6.852286050726456e-06, + "loss": 1.0109, + "step": 13620 + }, + { + "epoch": 0.4105421686746988, + "grad_norm": 3.737338629163707, + "learning_rate": 6.847659154703785e-06, + "loss": 0.9516, + "step": 13630 + }, + { + "epoch": 0.4108433734939759, + "grad_norm": 4.409322556657848, + "learning_rate": 6.843030425530328e-06, + "loss": 0.9622, + "step": 13640 + }, + { + "epoch": 0.411144578313253, + "grad_norm": 2.079999565648441, + "learning_rate": 6.838399867798467e-06, + "loss": 0.9414, + "step": 13650 + }, + { + "epoch": 0.4114457831325301, + "grad_norm": 4.089245574983412, + "learning_rate": 6.8337674861023975e-06, + "loss": 0.9327, + "step": 13660 + }, + { + "epoch": 0.41174698795180725, + "grad_norm": 3.3804448567114176, + "learning_rate": 6.829133285038128e-06, + "loss": 0.9547, + "step": 13670 + }, + { + "epoch": 0.41204819277108434, + "grad_norm": 2.2023297085482554, + "learning_rate": 6.82449726920347e-06, + "loss": 0.9069, + "step": 13680 + }, + { + "epoch": 0.41234939759036143, + "grad_norm": 5.184320269609493, + "learning_rate": 6.819859443198038e-06, + "loss": 0.9814, + "step": 13690 + }, + { + "epoch": 0.4126506024096386, + "grad_norm": 7.209148464071631, + "learning_rate": 6.815219811623237e-06, + "loss": 0.8775, + "step": 13700 + }, + { + "epoch": 0.41295180722891567, + "grad_norm": 5.309689062332096, + "learning_rate": 6.810578379082269e-06, + "loss": 0.9626, + "step": 13710 + }, + { + "epoch": 0.41325301204819276, + "grad_norm": 3.3671534143446693, + "learning_rate": 6.80593515018012e-06, + "loss": 0.8824, + "step": 13720 + }, + { + "epoch": 0.4135542168674699, + "grad_norm": 5.926113381160186, + "learning_rate": 6.8012901295235575e-06, + "loss": 1.0967, + "step": 13730 + }, + { + "epoch": 0.413855421686747, + "grad_norm": 3.594663545123508, + "learning_rate": 6.796643321721133e-06, + "loss": 0.9188, + "step": 13740 + }, + { + "epoch": 0.4141566265060241, + "grad_norm": 1.9681201373343726, + "learning_rate": 6.7919947313831604e-06, + "loss": 0.9908, + "step": 13750 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 3.3151046083210063, + "learning_rate": 6.787344363121731e-06, + "loss": 1.009, + "step": 13760 + }, + { + "epoch": 0.4147590361445783, + "grad_norm": 3.8683175992246754, + "learning_rate": 6.782692221550695e-06, + "loss": 0.9059, + "step": 13770 + }, + { + "epoch": 0.4150602409638554, + "grad_norm": 3.5903375766684245, + "learning_rate": 6.778038311285666e-06, + "loss": 0.9906, + "step": 13780 + }, + { + "epoch": 0.41536144578313255, + "grad_norm": 4.143125510768994, + "learning_rate": 6.77338263694401e-06, + "loss": 1.0809, + "step": 13790 + }, + { + "epoch": 0.41566265060240964, + "grad_norm": 3.817530715680697, + "learning_rate": 6.768725203144843e-06, + "loss": 1.0001, + "step": 13800 + }, + { + "epoch": 0.41596385542168673, + "grad_norm": 3.932977811242661, + "learning_rate": 6.764066014509028e-06, + "loss": 1.0326, + "step": 13810 + }, + { + "epoch": 0.4162650602409639, + "grad_norm": 1.8882146891824592, + "learning_rate": 6.759405075659165e-06, + "loss": 0.8824, + "step": 13820 + }, + { + "epoch": 0.41656626506024097, + "grad_norm": 3.63069039946318, + "learning_rate": 6.7547423912196e-06, + "loss": 0.9037, + "step": 13830 + }, + { + "epoch": 0.41686746987951806, + "grad_norm": 4.064663161840107, + "learning_rate": 6.7500779658163996e-06, + "loss": 1.0934, + "step": 13840 + }, + { + "epoch": 0.4171686746987952, + "grad_norm": 2.0403234567177657, + "learning_rate": 6.745411804077365e-06, + "loss": 0.9486, + "step": 13850 + }, + { + "epoch": 0.4174698795180723, + "grad_norm": 26.80763155624957, + "learning_rate": 6.740743910632017e-06, + "loss": 1.0734, + "step": 13860 + }, + { + "epoch": 0.4177710843373494, + "grad_norm": 4.35823590882377, + "learning_rate": 6.736074290111596e-06, + "loss": 1.0046, + "step": 13870 + }, + { + "epoch": 0.41807228915662653, + "grad_norm": 3.5932599421189386, + "learning_rate": 6.731402947149053e-06, + "loss": 0.9644, + "step": 13880 + }, + { + "epoch": 0.4183734939759036, + "grad_norm": 4.97566959388549, + "learning_rate": 6.7267298863790545e-06, + "loss": 0.9288, + "step": 13890 + }, + { + "epoch": 0.4186746987951807, + "grad_norm": 2.1059425318909795, + "learning_rate": 6.722055112437963e-06, + "loss": 0.8103, + "step": 13900 + }, + { + "epoch": 0.41897590361445786, + "grad_norm": 1.8755405253775557, + "learning_rate": 6.717378629963847e-06, + "loss": 0.9401, + "step": 13910 + }, + { + "epoch": 0.41927710843373495, + "grad_norm": 4.448628855973795, + "learning_rate": 6.712700443596467e-06, + "loss": 0.9688, + "step": 13920 + }, + { + "epoch": 0.41957831325301204, + "grad_norm": 4.087419002340595, + "learning_rate": 6.708020557977274e-06, + "loss": 0.9583, + "step": 13930 + }, + { + "epoch": 0.4198795180722892, + "grad_norm": 3.9225320035818037, + "learning_rate": 6.703338977749408e-06, + "loss": 1.0504, + "step": 13940 + }, + { + "epoch": 0.42018072289156627, + "grad_norm": 3.8556715938444386, + "learning_rate": 6.698655707557686e-06, + "loss": 1.0062, + "step": 13950 + }, + { + "epoch": 0.42048192771084336, + "grad_norm": 3.6223976107972544, + "learning_rate": 6.6939707520486065e-06, + "loss": 1.105, + "step": 13960 + }, + { + "epoch": 0.4207831325301205, + "grad_norm": 4.7489882357330595, + "learning_rate": 6.689284115870334e-06, + "loss": 0.9605, + "step": 13970 + }, + { + "epoch": 0.4210843373493976, + "grad_norm": 5.3454934516173305, + "learning_rate": 6.684595803672705e-06, + "loss": 1.007, + "step": 13980 + }, + { + "epoch": 0.4213855421686747, + "grad_norm": 2.242825259302051, + "learning_rate": 6.679905820107217e-06, + "loss": 1.0303, + "step": 13990 + }, + { + "epoch": 0.42168674698795183, + "grad_norm": 3.867493692013836, + "learning_rate": 6.675214169827029e-06, + "loss": 1.0206, + "step": 14000 + }, + { + "epoch": 0.4219879518072289, + "grad_norm": 1.9022218152561652, + "learning_rate": 6.6705208574869504e-06, + "loss": 0.878, + "step": 14010 + }, + { + "epoch": 0.422289156626506, + "grad_norm": 7.351907608382645, + "learning_rate": 6.665825887743437e-06, + "loss": 0.9591, + "step": 14020 + }, + { + "epoch": 0.42259036144578316, + "grad_norm": 4.134218650532387, + "learning_rate": 6.6611292652545955e-06, + "loss": 0.9686, + "step": 14030 + }, + { + "epoch": 0.42289156626506025, + "grad_norm": 5.545679526427971, + "learning_rate": 6.656430994680168e-06, + "loss": 1.0809, + "step": 14040 + }, + { + "epoch": 0.42319277108433734, + "grad_norm": 5.354084179744823, + "learning_rate": 6.651731080681532e-06, + "loss": 0.9496, + "step": 14050 + }, + { + "epoch": 0.4234939759036145, + "grad_norm": 5.0075043787750895, + "learning_rate": 6.6470295279217e-06, + "loss": 0.895, + "step": 14060 + }, + { + "epoch": 0.4237951807228916, + "grad_norm": 4.7102636641490125, + "learning_rate": 6.6423263410653025e-06, + "loss": 0.8286, + "step": 14070 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 10.220190685371245, + "learning_rate": 6.637621524778597e-06, + "loss": 1.0195, + "step": 14080 + }, + { + "epoch": 0.4243975903614458, + "grad_norm": 4.055966764848486, + "learning_rate": 6.632915083729457e-06, + "loss": 1.0562, + "step": 14090 + }, + { + "epoch": 0.4246987951807229, + "grad_norm": 7.568283656818405, + "learning_rate": 6.628207022587367e-06, + "loss": 0.9617, + "step": 14100 + }, + { + "epoch": 0.425, + "grad_norm": 11.926916376967057, + "learning_rate": 6.6234973460234184e-06, + "loss": 0.9462, + "step": 14110 + }, + { + "epoch": 0.42530120481927713, + "grad_norm": 5.050840217737689, + "learning_rate": 6.618786058710306e-06, + "loss": 0.8864, + "step": 14120 + }, + { + "epoch": 0.4256024096385542, + "grad_norm": 5.256991626041585, + "learning_rate": 6.614073165322322e-06, + "loss": 0.98, + "step": 14130 + }, + { + "epoch": 0.4259036144578313, + "grad_norm": 3.9388850836709355, + "learning_rate": 6.6093586705353545e-06, + "loss": 1.0085, + "step": 14140 + }, + { + "epoch": 0.42620481927710846, + "grad_norm": 4.248981036001199, + "learning_rate": 6.604642579026878e-06, + "loss": 0.8712, + "step": 14150 + }, + { + "epoch": 0.42650602409638555, + "grad_norm": 4.710615070800449, + "learning_rate": 6.599924895475951e-06, + "loss": 1.0189, + "step": 14160 + }, + { + "epoch": 0.42680722891566264, + "grad_norm": 4.079255733779571, + "learning_rate": 6.595205624563214e-06, + "loss": 1.032, + "step": 14170 + }, + { + "epoch": 0.4271084337349398, + "grad_norm": 4.532412318187093, + "learning_rate": 6.59048477097088e-06, + "loss": 0.9976, + "step": 14180 + }, + { + "epoch": 0.4274096385542169, + "grad_norm": 2.2398021659582965, + "learning_rate": 6.585762339382732e-06, + "loss": 0.9661, + "step": 14190 + }, + { + "epoch": 0.42771084337349397, + "grad_norm": 5.315546724125617, + "learning_rate": 6.58103833448412e-06, + "loss": 1.0394, + "step": 14200 + }, + { + "epoch": 0.4280120481927711, + "grad_norm": 1.8260965219107057, + "learning_rate": 6.5763127609619556e-06, + "loss": 0.8876, + "step": 14210 + }, + { + "epoch": 0.4283132530120482, + "grad_norm": 1.9870117025007035, + "learning_rate": 6.571585623504707e-06, + "loss": 0.8464, + "step": 14220 + }, + { + "epoch": 0.4286144578313253, + "grad_norm": 1.9650537200759508, + "learning_rate": 6.56685692680239e-06, + "loss": 0.9696, + "step": 14230 + }, + { + "epoch": 0.42891566265060244, + "grad_norm": 2.2143800408467698, + "learning_rate": 6.562126675546571e-06, + "loss": 1.0792, + "step": 14240 + }, + { + "epoch": 0.4292168674698795, + "grad_norm": 4.6673063210425, + "learning_rate": 6.557394874430357e-06, + "loss": 0.9912, + "step": 14250 + }, + { + "epoch": 0.4295180722891566, + "grad_norm": 4.063286718378896, + "learning_rate": 6.552661528148399e-06, + "loss": 0.9883, + "step": 14260 + }, + { + "epoch": 0.42981927710843376, + "grad_norm": 5.224078068726723, + "learning_rate": 6.547926641396867e-06, + "loss": 0.9713, + "step": 14270 + }, + { + "epoch": 0.43012048192771085, + "grad_norm": 1.8485432679010896, + "learning_rate": 6.543190218873474e-06, + "loss": 0.9844, + "step": 14280 + }, + { + "epoch": 0.43042168674698794, + "grad_norm": 4.35081591142146, + "learning_rate": 6.538452265277448e-06, + "loss": 1.0075, + "step": 14290 + }, + { + "epoch": 0.4307228915662651, + "grad_norm": 2.046262765455691, + "learning_rate": 6.533712785309541e-06, + "loss": 0.8561, + "step": 14300 + }, + { + "epoch": 0.4310240963855422, + "grad_norm": 2.110252868904274, + "learning_rate": 6.528971783672018e-06, + "loss": 1.0265, + "step": 14310 + }, + { + "epoch": 0.43132530120481927, + "grad_norm": 1.9507659317012978, + "learning_rate": 6.524229265068649e-06, + "loss": 0.876, + "step": 14320 + }, + { + "epoch": 0.4316265060240964, + "grad_norm": 4.52830654049744, + "learning_rate": 6.5194852342047164e-06, + "loss": 0.9584, + "step": 14330 + }, + { + "epoch": 0.4319277108433735, + "grad_norm": 2.03685456762182, + "learning_rate": 6.5147396957869954e-06, + "loss": 1.0511, + "step": 14340 + }, + { + "epoch": 0.4322289156626506, + "grad_norm": 2.301853573936113, + "learning_rate": 6.5099926545237655e-06, + "loss": 0.9472, + "step": 14350 + }, + { + "epoch": 0.43253012048192774, + "grad_norm": 63.68246590306157, + "learning_rate": 6.505244115124792e-06, + "loss": 0.9098, + "step": 14360 + }, + { + "epoch": 0.43283132530120483, + "grad_norm": 1.9576463980977339, + "learning_rate": 6.500494082301325e-06, + "loss": 0.8991, + "step": 14370 + }, + { + "epoch": 0.4331325301204819, + "grad_norm": 18.835906175346544, + "learning_rate": 6.4957425607661e-06, + "loss": 0.9087, + "step": 14380 + }, + { + "epoch": 0.43343373493975906, + "grad_norm": 5.337406422650925, + "learning_rate": 6.490989555233328e-06, + "loss": 0.9445, + "step": 14390 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 4.558817840401622, + "learning_rate": 6.486235070418693e-06, + "loss": 1.0095, + "step": 14400 + }, + { + "epoch": 0.43403614457831324, + "grad_norm": 5.982268811359151, + "learning_rate": 6.481479111039346e-06, + "loss": 0.9943, + "step": 14410 + }, + { + "epoch": 0.4343373493975904, + "grad_norm": 4.837635700430477, + "learning_rate": 6.4767216818139025e-06, + "loss": 0.9011, + "step": 14420 + }, + { + "epoch": 0.4346385542168675, + "grad_norm": 1.9525336026031908, + "learning_rate": 6.4719627874624315e-06, + "loss": 0.9556, + "step": 14430 + }, + { + "epoch": 0.43493975903614457, + "grad_norm": 6.444628382446914, + "learning_rate": 6.467202432706462e-06, + "loss": 1.0309, + "step": 14440 + }, + { + "epoch": 0.4352409638554217, + "grad_norm": 8.989013397893679, + "learning_rate": 6.462440622268968e-06, + "loss": 0.9094, + "step": 14450 + }, + { + "epoch": 0.4355421686746988, + "grad_norm": 5.491630015275027, + "learning_rate": 6.457677360874369e-06, + "loss": 0.8831, + "step": 14460 + }, + { + "epoch": 0.4358433734939759, + "grad_norm": 5.207365934374556, + "learning_rate": 6.452912653248524e-06, + "loss": 0.8662, + "step": 14470 + }, + { + "epoch": 0.43614457831325304, + "grad_norm": 4.235258432845556, + "learning_rate": 6.448146504118727e-06, + "loss": 0.913, + "step": 14480 + }, + { + "epoch": 0.43644578313253013, + "grad_norm": 1.9979824771424923, + "learning_rate": 6.443378918213702e-06, + "loss": 0.954, + "step": 14490 + }, + { + "epoch": 0.4367469879518072, + "grad_norm": 2.0219849962315513, + "learning_rate": 6.438609900263597e-06, + "loss": 0.8747, + "step": 14500 + }, + { + "epoch": 0.43704819277108437, + "grad_norm": 4.536709248678018, + "learning_rate": 6.433839454999984e-06, + "loss": 1.0589, + "step": 14510 + }, + { + "epoch": 0.43734939759036146, + "grad_norm": 1.8603159043406443, + "learning_rate": 6.429067587155846e-06, + "loss": 0.9545, + "step": 14520 + }, + { + "epoch": 0.43765060240963854, + "grad_norm": 5.445589542955186, + "learning_rate": 6.424294301465587e-06, + "loss": 1.0, + "step": 14530 + }, + { + "epoch": 0.43795180722891563, + "grad_norm": 3.647002525829796, + "learning_rate": 6.419519602665005e-06, + "loss": 0.9889, + "step": 14540 + }, + { + "epoch": 0.4382530120481928, + "grad_norm": 3.8125231851329655, + "learning_rate": 6.414743495491309e-06, + "loss": 0.8806, + "step": 14550 + }, + { + "epoch": 0.43855421686746987, + "grad_norm": 11.404506189201793, + "learning_rate": 6.409965984683104e-06, + "loss": 1.0634, + "step": 14560 + }, + { + "epoch": 0.43885542168674696, + "grad_norm": 2.113446004546863, + "learning_rate": 6.4051870749803835e-06, + "loss": 1.0219, + "step": 14570 + }, + { + "epoch": 0.4391566265060241, + "grad_norm": 4.272081146701065, + "learning_rate": 6.4004067711245366e-06, + "loss": 0.8988, + "step": 14580 + }, + { + "epoch": 0.4394578313253012, + "grad_norm": 3.791666887388401, + "learning_rate": 6.395625077858324e-06, + "loss": 0.8926, + "step": 14590 + }, + { + "epoch": 0.4397590361445783, + "grad_norm": 5.541461091861432, + "learning_rate": 6.390841999925897e-06, + "loss": 1.0429, + "step": 14600 + }, + { + "epoch": 0.44006024096385543, + "grad_norm": 21.20771958021225, + "learning_rate": 6.386057542072775e-06, + "loss": 1.0525, + "step": 14610 + }, + { + "epoch": 0.4403614457831325, + "grad_norm": 5.5831870449008365, + "learning_rate": 6.381271709045844e-06, + "loss": 0.8978, + "step": 14620 + }, + { + "epoch": 0.4406626506024096, + "grad_norm": 6.211803700001257, + "learning_rate": 6.376484505593359e-06, + "loss": 0.9337, + "step": 14630 + }, + { + "epoch": 0.44096385542168676, + "grad_norm": 4.327191340451297, + "learning_rate": 6.371695936464932e-06, + "loss": 1.0032, + "step": 14640 + }, + { + "epoch": 0.44126506024096385, + "grad_norm": 4.017862110781731, + "learning_rate": 6.366906006411534e-06, + "loss": 1.0761, + "step": 14650 + }, + { + "epoch": 0.44156626506024094, + "grad_norm": 2.0834143928322355, + "learning_rate": 6.3621147201854795e-06, + "loss": 1.015, + "step": 14660 + }, + { + "epoch": 0.4418674698795181, + "grad_norm": 5.110735498556025, + "learning_rate": 6.357322082540434e-06, + "loss": 0.8988, + "step": 14670 + }, + { + "epoch": 0.44216867469879517, + "grad_norm": 6.354116123529262, + "learning_rate": 6.352528098231401e-06, + "loss": 0.9398, + "step": 14680 + }, + { + "epoch": 0.44246987951807226, + "grad_norm": 2.159530138316348, + "learning_rate": 6.347732772014721e-06, + "loss": 1.0521, + "step": 14690 + }, + { + "epoch": 0.4427710843373494, + "grad_norm": 1.9933712179887328, + "learning_rate": 6.342936108648066e-06, + "loss": 0.8788, + "step": 14700 + }, + { + "epoch": 0.4430722891566265, + "grad_norm": 1.7494841584547256, + "learning_rate": 6.338138112890434e-06, + "loss": 0.8607, + "step": 14710 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 9.023499609529305, + "learning_rate": 6.333338789502148e-06, + "loss": 0.8774, + "step": 14720 + }, + { + "epoch": 0.44367469879518073, + "grad_norm": 5.1989634372684215, + "learning_rate": 6.328538143244844e-06, + "loss": 0.9683, + "step": 14730 + }, + { + "epoch": 0.4439759036144578, + "grad_norm": 5.524593945218239, + "learning_rate": 6.32373617888147e-06, + "loss": 1.0263, + "step": 14740 + }, + { + "epoch": 0.4442771084337349, + "grad_norm": 1.8271075356600708, + "learning_rate": 6.318932901176288e-06, + "loss": 1.0184, + "step": 14750 + }, + { + "epoch": 0.44457831325301206, + "grad_norm": 4.9739667705548865, + "learning_rate": 6.314128314894856e-06, + "loss": 1.0007, + "step": 14760 + }, + { + "epoch": 0.44487951807228915, + "grad_norm": 7.536616587212736, + "learning_rate": 6.309322424804034e-06, + "loss": 0.9423, + "step": 14770 + }, + { + "epoch": 0.44518072289156624, + "grad_norm": 2.150751980409594, + "learning_rate": 6.3045152356719765e-06, + "loss": 0.9498, + "step": 14780 + }, + { + "epoch": 0.4454819277108434, + "grad_norm": 2.1021294109207216, + "learning_rate": 6.299706752268122e-06, + "loss": 0.9776, + "step": 14790 + }, + { + "epoch": 0.4457831325301205, + "grad_norm": 5.167209206488193, + "learning_rate": 6.294896979363199e-06, + "loss": 1.0821, + "step": 14800 + }, + { + "epoch": 0.44608433734939756, + "grad_norm": 1.8790346506081124, + "learning_rate": 6.290085921729211e-06, + "loss": 0.8127, + "step": 14810 + }, + { + "epoch": 0.4463855421686747, + "grad_norm": 3.6596766848390576, + "learning_rate": 6.285273584139437e-06, + "loss": 1.0506, + "step": 14820 + }, + { + "epoch": 0.4466867469879518, + "grad_norm": 3.4487164190170643, + "learning_rate": 6.280459971368431e-06, + "loss": 0.9859, + "step": 14830 + }, + { + "epoch": 0.4469879518072289, + "grad_norm": 1.8527260757228985, + "learning_rate": 6.275645088192003e-06, + "loss": 0.9228, + "step": 14840 + }, + { + "epoch": 0.44728915662650603, + "grad_norm": 3.2891709546322856, + "learning_rate": 6.270828939387232e-06, + "loss": 1.0175, + "step": 14850 + }, + { + "epoch": 0.4475903614457831, + "grad_norm": 4.9358873030850114, + "learning_rate": 6.266011529732445e-06, + "loss": 0.9466, + "step": 14860 + }, + { + "epoch": 0.4478915662650602, + "grad_norm": 4.0035091825160345, + "learning_rate": 6.261192864007226e-06, + "loss": 0.987, + "step": 14870 + }, + { + "epoch": 0.44819277108433736, + "grad_norm": 4.409290171950485, + "learning_rate": 6.256372946992405e-06, + "loss": 0.8875, + "step": 14880 + }, + { + "epoch": 0.44849397590361445, + "grad_norm": 3.753069019045162, + "learning_rate": 6.251551783470048e-06, + "loss": 0.9529, + "step": 14890 + }, + { + "epoch": 0.44879518072289154, + "grad_norm": 4.315050145797351, + "learning_rate": 6.246729378223465e-06, + "loss": 0.97, + "step": 14900 + }, + { + "epoch": 0.4490963855421687, + "grad_norm": 23.925305437786783, + "learning_rate": 6.2419057360371915e-06, + "loss": 1.0131, + "step": 14910 + }, + { + "epoch": 0.4493975903614458, + "grad_norm": 3.74052244546966, + "learning_rate": 6.237080861696994e-06, + "loss": 0.9892, + "step": 14920 + }, + { + "epoch": 0.44969879518072287, + "grad_norm": 3.4708257534340063, + "learning_rate": 6.232254759989861e-06, + "loss": 0.9847, + "step": 14930 + }, + { + "epoch": 0.45, + "grad_norm": 5.468240886606056, + "learning_rate": 6.227427435703997e-06, + "loss": 1.0243, + "step": 14940 + }, + { + "epoch": 0.4503012048192771, + "grad_norm": 2.0336268064530123, + "learning_rate": 6.222598893628822e-06, + "loss": 0.868, + "step": 14950 + }, + { + "epoch": 0.4506024096385542, + "grad_norm": 5.483948935970706, + "learning_rate": 6.2177691385549595e-06, + "loss": 0.9283, + "step": 14960 + }, + { + "epoch": 0.45090361445783134, + "grad_norm": 5.076421463812874, + "learning_rate": 6.212938175274246e-06, + "loss": 1.0003, + "step": 14970 + }, + { + "epoch": 0.4512048192771084, + "grad_norm": 2.0800752070054016, + "learning_rate": 6.2081060085797065e-06, + "loss": 0.8448, + "step": 14980 + }, + { + "epoch": 0.4515060240963855, + "grad_norm": 2.0948853592958185, + "learning_rate": 6.2032726432655655e-06, + "loss": 0.9675, + "step": 14990 + }, + { + "epoch": 0.45180722891566266, + "grad_norm": 1.9638910363683588, + "learning_rate": 6.198438084127234e-06, + "loss": 0.8365, + "step": 15000 + }, + { + "epoch": 0.45210843373493975, + "grad_norm": 4.629893900801348, + "learning_rate": 6.1936023359613095e-06, + "loss": 0.9763, + "step": 15010 + }, + { + "epoch": 0.45240963855421684, + "grad_norm": 2.1526912677468117, + "learning_rate": 6.188765403565569e-06, + "loss": 0.8948, + "step": 15020 + }, + { + "epoch": 0.452710843373494, + "grad_norm": 4.639081635265701, + "learning_rate": 6.183927291738963e-06, + "loss": 0.8947, + "step": 15030 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 8.3365809018225, + "learning_rate": 6.179088005281612e-06, + "loss": 0.9226, + "step": 15040 + }, + { + "epoch": 0.45331325301204817, + "grad_norm": 16.48978218950864, + "learning_rate": 6.174247548994805e-06, + "loss": 0.9166, + "step": 15050 + }, + { + "epoch": 0.4536144578313253, + "grad_norm": 4.254658787892794, + "learning_rate": 6.169405927680989e-06, + "loss": 1.0038, + "step": 15060 + }, + { + "epoch": 0.4539156626506024, + "grad_norm": 5.272768246680619, + "learning_rate": 6.164563146143767e-06, + "loss": 0.8653, + "step": 15070 + }, + { + "epoch": 0.4542168674698795, + "grad_norm": 13.789656716662838, + "learning_rate": 6.159719209187896e-06, + "loss": 0.9806, + "step": 15080 + }, + { + "epoch": 0.45451807228915664, + "grad_norm": 2.1193314572411652, + "learning_rate": 6.154874121619271e-06, + "loss": 0.9901, + "step": 15090 + }, + { + "epoch": 0.45481927710843373, + "grad_norm": 5.486995531643311, + "learning_rate": 6.1500278882449415e-06, + "loss": 1.0283, + "step": 15100 + }, + { + "epoch": 0.4551204819277108, + "grad_norm": 2.025029285646546, + "learning_rate": 6.145180513873083e-06, + "loss": 0.8871, + "step": 15110 + }, + { + "epoch": 0.45542168674698796, + "grad_norm": 1.8356724233464343, + "learning_rate": 6.140332003313006e-06, + "loss": 0.9598, + "step": 15120 + }, + { + "epoch": 0.45572289156626505, + "grad_norm": 1.9119626614528773, + "learning_rate": 6.135482361375152e-06, + "loss": 0.9334, + "step": 15130 + }, + { + "epoch": 0.45602409638554214, + "grad_norm": 1.9457899346466425, + "learning_rate": 6.13063159287108e-06, + "loss": 0.8897, + "step": 15140 + }, + { + "epoch": 0.4563253012048193, + "grad_norm": 6.346461425624317, + "learning_rate": 6.125779702613471e-06, + "loss": 0.9209, + "step": 15150 + }, + { + "epoch": 0.4566265060240964, + "grad_norm": 11.082672495907039, + "learning_rate": 6.120926695416114e-06, + "loss": 0.9767, + "step": 15160 + }, + { + "epoch": 0.45692771084337347, + "grad_norm": 1.9417234750492238, + "learning_rate": 6.1160725760939125e-06, + "loss": 0.9292, + "step": 15170 + }, + { + "epoch": 0.4572289156626506, + "grad_norm": 4.910654199287689, + "learning_rate": 6.111217349462864e-06, + "loss": 0.9339, + "step": 15180 + }, + { + "epoch": 0.4575301204819277, + "grad_norm": 4.090750023657973, + "learning_rate": 6.106361020340075e-06, + "loss": 0.9759, + "step": 15190 + }, + { + "epoch": 0.4578313253012048, + "grad_norm": 10.45340995527639, + "learning_rate": 6.101503593543742e-06, + "loss": 1.0505, + "step": 15200 + }, + { + "epoch": 0.45813253012048194, + "grad_norm": 4.393493650041436, + "learning_rate": 6.096645073893147e-06, + "loss": 1.0164, + "step": 15210 + }, + { + "epoch": 0.45843373493975903, + "grad_norm": 2.932003843284258, + "learning_rate": 6.091785466208662e-06, + "loss": 0.9914, + "step": 15220 + }, + { + "epoch": 0.4587349397590361, + "grad_norm": 5.511367512231397, + "learning_rate": 6.086924775311731e-06, + "loss": 0.9843, + "step": 15230 + }, + { + "epoch": 0.45903614457831327, + "grad_norm": 3.622507160094181, + "learning_rate": 6.082063006024883e-06, + "loss": 0.9724, + "step": 15240 + }, + { + "epoch": 0.45933734939759036, + "grad_norm": 4.42097782859806, + "learning_rate": 6.077200163171707e-06, + "loss": 1.0642, + "step": 15250 + }, + { + "epoch": 0.45963855421686745, + "grad_norm": 1.9842004474371322, + "learning_rate": 6.072336251576864e-06, + "loss": 0.8821, + "step": 15260 + }, + { + "epoch": 0.4599397590361446, + "grad_norm": 4.8517110679655095, + "learning_rate": 6.067471276066071e-06, + "loss": 0.9392, + "step": 15270 + }, + { + "epoch": 0.4602409638554217, + "grad_norm": 4.067247531850998, + "learning_rate": 6.062605241466102e-06, + "loss": 0.9131, + "step": 15280 + }, + { + "epoch": 0.46054216867469877, + "grad_norm": 3.628697422434493, + "learning_rate": 6.057738152604783e-06, + "loss": 0.9612, + "step": 15290 + }, + { + "epoch": 0.4608433734939759, + "grad_norm": 1.8238831228622823, + "learning_rate": 6.0528700143109856e-06, + "loss": 0.8898, + "step": 15300 + }, + { + "epoch": 0.461144578313253, + "grad_norm": 1.9506638259955418, + "learning_rate": 6.048000831414621e-06, + "loss": 0.9569, + "step": 15310 + }, + { + "epoch": 0.4614457831325301, + "grad_norm": 2.038623106823851, + "learning_rate": 6.043130608746638e-06, + "loss": 1.0339, + "step": 15320 + }, + { + "epoch": 0.46174698795180724, + "grad_norm": 4.081124055135755, + "learning_rate": 6.0382593511390175e-06, + "loss": 0.9014, + "step": 15330 + }, + { + "epoch": 0.46204819277108433, + "grad_norm": 4.2039738385113, + "learning_rate": 6.033387063424765e-06, + "loss": 1.0432, + "step": 15340 + }, + { + "epoch": 0.4623493975903614, + "grad_norm": 4.771012156477197, + "learning_rate": 6.0285137504379136e-06, + "loss": 0.8905, + "step": 15350 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 3.8698912276052826, + "learning_rate": 6.023639417013503e-06, + "loss": 0.9479, + "step": 15360 + }, + { + "epoch": 0.46295180722891566, + "grad_norm": 3.653400357953905, + "learning_rate": 6.018764067987597e-06, + "loss": 0.9512, + "step": 15370 + }, + { + "epoch": 0.46325301204819275, + "grad_norm": 4.353461797388446, + "learning_rate": 6.013887708197261e-06, + "loss": 1.0287, + "step": 15380 + }, + { + "epoch": 0.4635542168674699, + "grad_norm": 1.9962682766637123, + "learning_rate": 6.009010342480563e-06, + "loss": 0.9262, + "step": 15390 + }, + { + "epoch": 0.463855421686747, + "grad_norm": 3.393075109953769, + "learning_rate": 6.004131975676571e-06, + "loss": 0.9228, + "step": 15400 + }, + { + "epoch": 0.4641566265060241, + "grad_norm": 3.631344542832589, + "learning_rate": 5.999252612625345e-06, + "loss": 0.968, + "step": 15410 + }, + { + "epoch": 0.4644578313253012, + "grad_norm": 3.8199424808707336, + "learning_rate": 5.994372258167935e-06, + "loss": 0.9909, + "step": 15420 + }, + { + "epoch": 0.4647590361445783, + "grad_norm": 2.020212908749961, + "learning_rate": 5.9894909171463715e-06, + "loss": 0.907, + "step": 15430 + }, + { + "epoch": 0.4650602409638554, + "grad_norm": 4.435765129156293, + "learning_rate": 5.984608594403667e-06, + "loss": 0.9393, + "step": 15440 + }, + { + "epoch": 0.46536144578313254, + "grad_norm": 4.272774995977806, + "learning_rate": 5.979725294783808e-06, + "loss": 1.0173, + "step": 15450 + }, + { + "epoch": 0.46566265060240963, + "grad_norm": 3.4351057816326476, + "learning_rate": 5.974841023131745e-06, + "loss": 0.9682, + "step": 15460 + }, + { + "epoch": 0.4659638554216867, + "grad_norm": 3.343332344783758, + "learning_rate": 5.969955784293403e-06, + "loss": 0.9354, + "step": 15470 + }, + { + "epoch": 0.46626506024096387, + "grad_norm": 5.375685723600656, + "learning_rate": 5.965069583115655e-06, + "loss": 0.986, + "step": 15480 + }, + { + "epoch": 0.46656626506024096, + "grad_norm": 3.7740604287826423, + "learning_rate": 5.9601824244463354e-06, + "loss": 0.98, + "step": 15490 + }, + { + "epoch": 0.46686746987951805, + "grad_norm": 1.958611231889104, + "learning_rate": 5.95529431313423e-06, + "loss": 0.8225, + "step": 15500 + }, + { + "epoch": 0.4671686746987952, + "grad_norm": 3.752224755808922, + "learning_rate": 5.9504052540290655e-06, + "loss": 1.0359, + "step": 15510 + }, + { + "epoch": 0.4674698795180723, + "grad_norm": 1.9932450938425532, + "learning_rate": 5.945515251981511e-06, + "loss": 0.8257, + "step": 15520 + }, + { + "epoch": 0.4677710843373494, + "grad_norm": 6.141550273844298, + "learning_rate": 5.94062431184317e-06, + "loss": 0.9846, + "step": 15530 + }, + { + "epoch": 0.4680722891566265, + "grad_norm": 4.069407210066842, + "learning_rate": 5.935732438466578e-06, + "loss": 0.9558, + "step": 15540 + }, + { + "epoch": 0.4683734939759036, + "grad_norm": 3.8240966984099583, + "learning_rate": 5.930839636705196e-06, + "loss": 0.9485, + "step": 15550 + }, + { + "epoch": 0.4686746987951807, + "grad_norm": 2.1199639876192333, + "learning_rate": 5.925945911413405e-06, + "loss": 0.9409, + "step": 15560 + }, + { + "epoch": 0.46897590361445785, + "grad_norm": 4.253009603524578, + "learning_rate": 5.921051267446507e-06, + "loss": 1.066, + "step": 15570 + }, + { + "epoch": 0.46927710843373494, + "grad_norm": 4.834378079812853, + "learning_rate": 5.916155709660707e-06, + "loss": 0.9991, + "step": 15580 + }, + { + "epoch": 0.469578313253012, + "grad_norm": 2.1958271799561264, + "learning_rate": 5.911259242913124e-06, + "loss": 0.9344, + "step": 15590 + }, + { + "epoch": 0.46987951807228917, + "grad_norm": 4.251159210640614, + "learning_rate": 5.906361872061776e-06, + "loss": 0.8428, + "step": 15600 + }, + { + "epoch": 0.47018072289156626, + "grad_norm": 3.4579718127619654, + "learning_rate": 5.901463601965578e-06, + "loss": 1.0252, + "step": 15610 + }, + { + "epoch": 0.47048192771084335, + "grad_norm": 4.123233913463495, + "learning_rate": 5.896564437484338e-06, + "loss": 0.9784, + "step": 15620 + }, + { + "epoch": 0.4707831325301205, + "grad_norm": 3.805917111735305, + "learning_rate": 5.89166438347875e-06, + "loss": 0.9682, + "step": 15630 + }, + { + "epoch": 0.4710843373493976, + "grad_norm": 3.472449173403724, + "learning_rate": 5.886763444810391e-06, + "loss": 0.9287, + "step": 15640 + }, + { + "epoch": 0.4713855421686747, + "grad_norm": 3.505798023665673, + "learning_rate": 5.8818616263417175e-06, + "loss": 1.0114, + "step": 15650 + }, + { + "epoch": 0.4716867469879518, + "grad_norm": 2.2101309910312774, + "learning_rate": 5.876958932936056e-06, + "loss": 0.9078, + "step": 15660 + }, + { + "epoch": 0.4719879518072289, + "grad_norm": 5.024115659027036, + "learning_rate": 5.872055369457603e-06, + "loss": 0.9819, + "step": 15670 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 1.9388970415560967, + "learning_rate": 5.867150940771418e-06, + "loss": 0.972, + "step": 15680 + }, + { + "epoch": 0.47259036144578315, + "grad_norm": 1.923641231368815, + "learning_rate": 5.862245651743419e-06, + "loss": 0.9894, + "step": 15690 + }, + { + "epoch": 0.47289156626506024, + "grad_norm": 3.5761198625957924, + "learning_rate": 5.857339507240376e-06, + "loss": 1.0599, + "step": 15700 + }, + { + "epoch": 0.4731927710843373, + "grad_norm": 3.7466601545245815, + "learning_rate": 5.85243251212991e-06, + "loss": 1.0256, + "step": 15710 + }, + { + "epoch": 0.4734939759036145, + "grad_norm": 1.8344281968594405, + "learning_rate": 5.8475246712804845e-06, + "loss": 0.9542, + "step": 15720 + }, + { + "epoch": 0.47379518072289156, + "grad_norm": 2.821146548540811, + "learning_rate": 5.8426159895614025e-06, + "loss": 0.9607, + "step": 15730 + }, + { + "epoch": 0.47409638554216865, + "grad_norm": 2.0109465472597425, + "learning_rate": 5.837706471842802e-06, + "loss": 0.9255, + "step": 15740 + }, + { + "epoch": 0.4743975903614458, + "grad_norm": 3.793950849129281, + "learning_rate": 5.832796122995648e-06, + "loss": 0.9613, + "step": 15750 + }, + { + "epoch": 0.4746987951807229, + "grad_norm": 3.5195351231947214, + "learning_rate": 5.8278849478917345e-06, + "loss": 0.9551, + "step": 15760 + }, + { + "epoch": 0.475, + "grad_norm": 3.44390448833659, + "learning_rate": 5.82297295140367e-06, + "loss": 0.9127, + "step": 15770 + }, + { + "epoch": 0.4753012048192771, + "grad_norm": 2.064193688758877, + "learning_rate": 5.818060138404882e-06, + "loss": 0.9144, + "step": 15780 + }, + { + "epoch": 0.4756024096385542, + "grad_norm": 3.930204164602873, + "learning_rate": 5.813146513769607e-06, + "loss": 1.0549, + "step": 15790 + }, + { + "epoch": 0.4759036144578313, + "grad_norm": 1.8105852580327773, + "learning_rate": 5.808232082372884e-06, + "loss": 0.8884, + "step": 15800 + }, + { + "epoch": 0.47620481927710845, + "grad_norm": 3.401223624686546, + "learning_rate": 5.803316849090559e-06, + "loss": 0.8972, + "step": 15810 + }, + { + "epoch": 0.47650602409638554, + "grad_norm": 4.218879987468935, + "learning_rate": 5.798400818799266e-06, + "loss": 0.9108, + "step": 15820 + }, + { + "epoch": 0.47680722891566263, + "grad_norm": 5.303922998277883, + "learning_rate": 5.793483996376433e-06, + "loss": 0.9926, + "step": 15830 + }, + { + "epoch": 0.4771084337349398, + "grad_norm": 3.2250696460266854, + "learning_rate": 5.788566386700276e-06, + "loss": 0.9818, + "step": 15840 + }, + { + "epoch": 0.47740963855421686, + "grad_norm": 2.0605598980504443, + "learning_rate": 5.7836479946497905e-06, + "loss": 0.9629, + "step": 15850 + }, + { + "epoch": 0.47771084337349395, + "grad_norm": 1.990069257829727, + "learning_rate": 5.778728825104745e-06, + "loss": 0.9655, + "step": 15860 + }, + { + "epoch": 0.4780120481927711, + "grad_norm": 4.242590643192938, + "learning_rate": 5.773808882945684e-06, + "loss": 1.0566, + "step": 15870 + }, + { + "epoch": 0.4783132530120482, + "grad_norm": 3.201563490514386, + "learning_rate": 5.768888173053917e-06, + "loss": 0.9913, + "step": 15880 + }, + { + "epoch": 0.4786144578313253, + "grad_norm": 3.6375935532616106, + "learning_rate": 5.763966700311516e-06, + "loss": 1.0634, + "step": 15890 + }, + { + "epoch": 0.4789156626506024, + "grad_norm": 3.42465631670704, + "learning_rate": 5.759044469601307e-06, + "loss": 0.9021, + "step": 15900 + }, + { + "epoch": 0.4792168674698795, + "grad_norm": 3.5933105683523103, + "learning_rate": 5.7541214858068705e-06, + "loss": 0.9248, + "step": 15910 + }, + { + "epoch": 0.4795180722891566, + "grad_norm": 3.818194076790111, + "learning_rate": 5.749197753812533e-06, + "loss": 0.9539, + "step": 15920 + }, + { + "epoch": 0.47981927710843375, + "grad_norm": 2.9683401007502197, + "learning_rate": 5.744273278503365e-06, + "loss": 0.9131, + "step": 15930 + }, + { + "epoch": 0.48012048192771084, + "grad_norm": 3.976873236658985, + "learning_rate": 5.739348064765173e-06, + "loss": 0.8902, + "step": 15940 + }, + { + "epoch": 0.48042168674698793, + "grad_norm": 1.8917758808628107, + "learning_rate": 5.734422117484496e-06, + "loss": 0.9426, + "step": 15950 + }, + { + "epoch": 0.4807228915662651, + "grad_norm": 3.718175782520566, + "learning_rate": 5.729495441548603e-06, + "loss": 0.9244, + "step": 15960 + }, + { + "epoch": 0.48102409638554217, + "grad_norm": 4.92755103394381, + "learning_rate": 5.724568041845482e-06, + "loss": 1.0401, + "step": 15970 + }, + { + "epoch": 0.48132530120481926, + "grad_norm": 3.991297733931076, + "learning_rate": 5.719639923263841e-06, + "loss": 0.914, + "step": 15980 + }, + { + "epoch": 0.4816265060240964, + "grad_norm": 3.683504880134318, + "learning_rate": 5.714711090693105e-06, + "loss": 1.0589, + "step": 15990 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 2.103618704503937, + "learning_rate": 5.7097815490234e-06, + "loss": 1.0494, + "step": 16000 + }, + { + "epoch": 0.4822289156626506, + "grad_norm": 3.148525504100099, + "learning_rate": 5.704851303145562e-06, + "loss": 0.9384, + "step": 16010 + }, + { + "epoch": 0.4825301204819277, + "grad_norm": 3.7542549837016566, + "learning_rate": 5.699920357951122e-06, + "loss": 0.9955, + "step": 16020 + }, + { + "epoch": 0.4828313253012048, + "grad_norm": 4.263674016496408, + "learning_rate": 5.694988718332305e-06, + "loss": 1.0508, + "step": 16030 + }, + { + "epoch": 0.4831325301204819, + "grad_norm": 2.0369555452543433, + "learning_rate": 5.690056389182029e-06, + "loss": 0.9159, + "step": 16040 + }, + { + "epoch": 0.48343373493975905, + "grad_norm": 2.018208089452482, + "learning_rate": 5.68512337539389e-06, + "loss": 0.9311, + "step": 16050 + }, + { + "epoch": 0.48373493975903614, + "grad_norm": 3.6863006912289906, + "learning_rate": 5.680189681862169e-06, + "loss": 0.9928, + "step": 16060 + }, + { + "epoch": 0.48403614457831323, + "grad_norm": 3.883766819311878, + "learning_rate": 5.675255313481816e-06, + "loss": 1.011, + "step": 16070 + }, + { + "epoch": 0.4843373493975904, + "grad_norm": 4.022219241501254, + "learning_rate": 5.670320275148453e-06, + "loss": 1.0192, + "step": 16080 + }, + { + "epoch": 0.48463855421686747, + "grad_norm": 3.625969394490103, + "learning_rate": 5.66538457175837e-06, + "loss": 0.8921, + "step": 16090 + }, + { + "epoch": 0.48493975903614456, + "grad_norm": 3.934005534709987, + "learning_rate": 5.660448208208513e-06, + "loss": 1.0507, + "step": 16100 + }, + { + "epoch": 0.4852409638554217, + "grad_norm": 1.9434506662927051, + "learning_rate": 5.655511189396482e-06, + "loss": 0.8856, + "step": 16110 + }, + { + "epoch": 0.4855421686746988, + "grad_norm": 3.2593475606632722, + "learning_rate": 5.650573520220528e-06, + "loss": 0.8519, + "step": 16120 + }, + { + "epoch": 0.4858433734939759, + "grad_norm": 3.5373579746777466, + "learning_rate": 5.645635205579551e-06, + "loss": 0.9807, + "step": 16130 + }, + { + "epoch": 0.48614457831325303, + "grad_norm": 3.4058108606170596, + "learning_rate": 5.640696250373089e-06, + "loss": 0.9153, + "step": 16140 + }, + { + "epoch": 0.4864457831325301, + "grad_norm": 2.1061104208189803, + "learning_rate": 5.635756659501312e-06, + "loss": 1.0289, + "step": 16150 + }, + { + "epoch": 0.4867469879518072, + "grad_norm": 3.7143925075194324, + "learning_rate": 5.630816437865026e-06, + "loss": 0.9853, + "step": 16160 + }, + { + "epoch": 0.48704819277108435, + "grad_norm": 3.4675259291475284, + "learning_rate": 5.62587559036566e-06, + "loss": 0.9042, + "step": 16170 + }, + { + "epoch": 0.48734939759036144, + "grad_norm": 3.258157402874766, + "learning_rate": 5.6209341219052625e-06, + "loss": 1.0681, + "step": 16180 + }, + { + "epoch": 0.48765060240963853, + "grad_norm": 3.5750396020214454, + "learning_rate": 5.615992037386505e-06, + "loss": 0.9625, + "step": 16190 + }, + { + "epoch": 0.4879518072289157, + "grad_norm": 5.736958849615456, + "learning_rate": 5.61104934171266e-06, + "loss": 1.0082, + "step": 16200 + }, + { + "epoch": 0.48825301204819277, + "grad_norm": 3.370647439497055, + "learning_rate": 5.606106039787614e-06, + "loss": 0.9527, + "step": 16210 + }, + { + "epoch": 0.48855421686746986, + "grad_norm": 4.12517330758, + "learning_rate": 5.601162136515853e-06, + "loss": 0.96, + "step": 16220 + }, + { + "epoch": 0.488855421686747, + "grad_norm": 3.697771225010867, + "learning_rate": 5.596217636802459e-06, + "loss": 1.0006, + "step": 16230 + }, + { + "epoch": 0.4891566265060241, + "grad_norm": 3.281437853463002, + "learning_rate": 5.591272545553105e-06, + "loss": 0.9578, + "step": 16240 + }, + { + "epoch": 0.4894578313253012, + "grad_norm": 4.635097307896269, + "learning_rate": 5.586326867674052e-06, + "loss": 0.9059, + "step": 16250 + }, + { + "epoch": 0.48975903614457833, + "grad_norm": 14.395878411842967, + "learning_rate": 5.5813806080721465e-06, + "loss": 1.0381, + "step": 16260 + }, + { + "epoch": 0.4900602409638554, + "grad_norm": 3.193713434679751, + "learning_rate": 5.576433771654803e-06, + "loss": 0.8877, + "step": 16270 + }, + { + "epoch": 0.4903614457831325, + "grad_norm": 4.207042639185046, + "learning_rate": 5.571486363330019e-06, + "loss": 0.998, + "step": 16280 + }, + { + "epoch": 0.49066265060240966, + "grad_norm": 3.5938725066185464, + "learning_rate": 5.566538388006351e-06, + "loss": 0.9872, + "step": 16290 + }, + { + "epoch": 0.49096385542168675, + "grad_norm": 4.012319826138452, + "learning_rate": 5.56158985059292e-06, + "loss": 1.0337, + "step": 16300 + }, + { + "epoch": 0.49126506024096384, + "grad_norm": 2.113113287905368, + "learning_rate": 5.556640755999412e-06, + "loss": 0.9452, + "step": 16310 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 3.413853496438198, + "learning_rate": 5.551691109136053e-06, + "loss": 0.9903, + "step": 16320 + }, + { + "epoch": 0.49186746987951807, + "grad_norm": 5.3313796303409, + "learning_rate": 5.54674091491363e-06, + "loss": 0.968, + "step": 16330 + }, + { + "epoch": 0.49216867469879516, + "grad_norm": 7.5749167631122765, + "learning_rate": 5.5417901782434624e-06, + "loss": 0.918, + "step": 16340 + }, + { + "epoch": 0.4924698795180723, + "grad_norm": 3.886130707996784, + "learning_rate": 5.536838904037415e-06, + "loss": 1.0647, + "step": 16350 + }, + { + "epoch": 0.4927710843373494, + "grad_norm": 3.3551181609478564, + "learning_rate": 5.531887097207881e-06, + "loss": 0.9375, + "step": 16360 + }, + { + "epoch": 0.4930722891566265, + "grad_norm": 4.268081130671484, + "learning_rate": 5.526934762667783e-06, + "loss": 1.0252, + "step": 16370 + }, + { + "epoch": 0.49337349397590363, + "grad_norm": 2.0952551160850095, + "learning_rate": 5.521981905330572e-06, + "loss": 0.9003, + "step": 16380 + }, + { + "epoch": 0.4936746987951807, + "grad_norm": 4.735644970986755, + "learning_rate": 5.517028530110212e-06, + "loss": 1.0146, + "step": 16390 + }, + { + "epoch": 0.4939759036144578, + "grad_norm": 3.3887005523926694, + "learning_rate": 5.512074641921185e-06, + "loss": 0.9946, + "step": 16400 + }, + { + "epoch": 0.49427710843373496, + "grad_norm": 3.3361038650093424, + "learning_rate": 5.507120245678476e-06, + "loss": 1.0096, + "step": 16410 + }, + { + "epoch": 0.49457831325301205, + "grad_norm": 3.516599104133175, + "learning_rate": 5.5021653462975795e-06, + "loss": 0.9707, + "step": 16420 + }, + { + "epoch": 0.49487951807228914, + "grad_norm": 3.796649488608317, + "learning_rate": 5.49720994869449e-06, + "loss": 0.9433, + "step": 16430 + }, + { + "epoch": 0.4951807228915663, + "grad_norm": 2.161848452188897, + "learning_rate": 5.492254057785689e-06, + "loss": 1.0216, + "step": 16440 + }, + { + "epoch": 0.4954819277108434, + "grad_norm": 3.6082558748692275, + "learning_rate": 5.487297678488156e-06, + "loss": 1.0332, + "step": 16450 + }, + { + "epoch": 0.49578313253012046, + "grad_norm": 3.6447919883275954, + "learning_rate": 5.4823408157193516e-06, + "loss": 0.9219, + "step": 16460 + }, + { + "epoch": 0.4960843373493976, + "grad_norm": 3.8284824445728423, + "learning_rate": 5.477383474397213e-06, + "loss": 0.9927, + "step": 16470 + }, + { + "epoch": 0.4963855421686747, + "grad_norm": 3.8326747018520066, + "learning_rate": 5.472425659440157e-06, + "loss": 0.8893, + "step": 16480 + }, + { + "epoch": 0.4966867469879518, + "grad_norm": 1.911047180054358, + "learning_rate": 5.4674673757670685e-06, + "loss": 0.9214, + "step": 16490 + }, + { + "epoch": 0.49698795180722893, + "grad_norm": 1.88254732748915, + "learning_rate": 5.462508628297295e-06, + "loss": 0.9643, + "step": 16500 + }, + { + "epoch": 0.497289156626506, + "grad_norm": 2.082898839883877, + "learning_rate": 5.457549421950651e-06, + "loss": 0.9658, + "step": 16510 + }, + { + "epoch": 0.4975903614457831, + "grad_norm": 4.889244313390055, + "learning_rate": 5.4525897616473955e-06, + "loss": 0.9504, + "step": 16520 + }, + { + "epoch": 0.49789156626506026, + "grad_norm": 4.4994284167839105, + "learning_rate": 5.447629652308249e-06, + "loss": 0.9766, + "step": 16530 + }, + { + "epoch": 0.49819277108433735, + "grad_norm": 3.539972699302177, + "learning_rate": 5.442669098854371e-06, + "loss": 1.0098, + "step": 16540 + }, + { + "epoch": 0.49849397590361444, + "grad_norm": 3.095128366613158, + "learning_rate": 5.437708106207362e-06, + "loss": 0.9201, + "step": 16550 + }, + { + "epoch": 0.4987951807228916, + "grad_norm": 11.814963135069243, + "learning_rate": 5.432746679289263e-06, + "loss": 0.9717, + "step": 16560 + }, + { + "epoch": 0.4990963855421687, + "grad_norm": 3.868649908326447, + "learning_rate": 5.4277848230225375e-06, + "loss": 1.0115, + "step": 16570 + }, + { + "epoch": 0.49939759036144576, + "grad_norm": 8.5980024382822, + "learning_rate": 5.422822542330084e-06, + "loss": 0.952, + "step": 16580 + }, + { + "epoch": 0.4996987951807229, + "grad_norm": 3.3459203711087873, + "learning_rate": 5.417859842135214e-06, + "loss": 0.7811, + "step": 16590 + }, + { + "epoch": 0.5, + "grad_norm": 5.8414494077082875, + "learning_rate": 5.412896727361663e-06, + "loss": 0.865, + "step": 16600 + }, + { + "epoch": 0.5003012048192771, + "grad_norm": 2.096825972352622, + "learning_rate": 5.40793320293357e-06, + "loss": 0.8719, + "step": 16610 + }, + { + "epoch": 0.5006024096385542, + "grad_norm": 3.3793756859421085, + "learning_rate": 5.402969273775488e-06, + "loss": 1.0394, + "step": 16620 + }, + { + "epoch": 0.5009036144578313, + "grad_norm": 3.5366354603671377, + "learning_rate": 5.398004944812368e-06, + "loss": 0.9838, + "step": 16630 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 3.443967399642233, + "learning_rate": 5.393040220969553e-06, + "loss": 0.882, + "step": 16640 + }, + { + "epoch": 0.5015060240963856, + "grad_norm": 3.883316324592676, + "learning_rate": 5.388075107172788e-06, + "loss": 1.0068, + "step": 16650 + }, + { + "epoch": 0.5018072289156627, + "grad_norm": 1.9661484881701687, + "learning_rate": 5.383109608348195e-06, + "loss": 0.9525, + "step": 16660 + }, + { + "epoch": 0.5021084337349397, + "grad_norm": 4.397328415581478, + "learning_rate": 5.378143729422285e-06, + "loss": 0.8473, + "step": 16670 + }, + { + "epoch": 0.5024096385542168, + "grad_norm": 3.7685085101788474, + "learning_rate": 5.373177475321941e-06, + "loss": 0.8884, + "step": 16680 + }, + { + "epoch": 0.5027108433734939, + "grad_norm": 3.52299888030089, + "learning_rate": 5.3682108509744215e-06, + "loss": 0.952, + "step": 16690 + }, + { + "epoch": 0.5030120481927711, + "grad_norm": 3.1985604354976, + "learning_rate": 5.363243861307352e-06, + "loss": 0.9665, + "step": 16700 + }, + { + "epoch": 0.5033132530120482, + "grad_norm": 2.0732364114962256, + "learning_rate": 5.358276511248721e-06, + "loss": 0.8739, + "step": 16710 + }, + { + "epoch": 0.5036144578313253, + "grad_norm": 5.338086958389926, + "learning_rate": 5.3533088057268726e-06, + "loss": 1.0419, + "step": 16720 + }, + { + "epoch": 0.5039156626506024, + "grad_norm": 1.8328216310168854, + "learning_rate": 5.348340749670502e-06, + "loss": 0.8538, + "step": 16730 + }, + { + "epoch": 0.5042168674698795, + "grad_norm": 5.621038149996067, + "learning_rate": 5.343372348008657e-06, + "loss": 0.9176, + "step": 16740 + }, + { + "epoch": 0.5045180722891566, + "grad_norm": 6.444314752666458, + "learning_rate": 5.338403605670726e-06, + "loss": 1.0439, + "step": 16750 + }, + { + "epoch": 0.5048192771084338, + "grad_norm": 4.368052837782761, + "learning_rate": 5.333434527586435e-06, + "loss": 0.9521, + "step": 16760 + }, + { + "epoch": 0.5051204819277109, + "grad_norm": 4.8500848545911985, + "learning_rate": 5.3284651186858415e-06, + "loss": 1.0498, + "step": 16770 + }, + { + "epoch": 0.505421686746988, + "grad_norm": 3.3613658046037673, + "learning_rate": 5.323495383899336e-06, + "loss": 0.9349, + "step": 16780 + }, + { + "epoch": 0.505722891566265, + "grad_norm": 3.586912532561054, + "learning_rate": 5.318525328157629e-06, + "loss": 0.9039, + "step": 16790 + }, + { + "epoch": 0.5060240963855421, + "grad_norm": 4.141339792328036, + "learning_rate": 5.313554956391748e-06, + "loss": 0.9305, + "step": 16800 + }, + { + "epoch": 0.5063253012048192, + "grad_norm": 9.439351804482932, + "learning_rate": 5.308584273533038e-06, + "loss": 1.0657, + "step": 16810 + }, + { + "epoch": 0.5066265060240964, + "grad_norm": 3.882519429169925, + "learning_rate": 5.3036132845131485e-06, + "loss": 0.9061, + "step": 16820 + }, + { + "epoch": 0.5069277108433735, + "grad_norm": 5.082502222873879, + "learning_rate": 5.298641994264038e-06, + "loss": 0.9464, + "step": 16830 + }, + { + "epoch": 0.5072289156626506, + "grad_norm": 3.678118838009688, + "learning_rate": 5.293670407717957e-06, + "loss": 0.9193, + "step": 16840 + }, + { + "epoch": 0.5075301204819277, + "grad_norm": 1.928823152489833, + "learning_rate": 5.288698529807455e-06, + "loss": 0.9423, + "step": 16850 + }, + { + "epoch": 0.5078313253012048, + "grad_norm": 5.346641251646554, + "learning_rate": 5.2837263654653715e-06, + "loss": 0.8375, + "step": 16860 + }, + { + "epoch": 0.5081325301204819, + "grad_norm": 7.322236295268061, + "learning_rate": 5.278753919624824e-06, + "loss": 0.9388, + "step": 16870 + }, + { + "epoch": 0.5084337349397591, + "grad_norm": 1.8586260821774663, + "learning_rate": 5.273781197219217e-06, + "loss": 0.8868, + "step": 16880 + }, + { + "epoch": 0.5087349397590362, + "grad_norm": 5.532900617505604, + "learning_rate": 5.268808203182222e-06, + "loss": 1.0514, + "step": 16890 + }, + { + "epoch": 0.5090361445783133, + "grad_norm": 4.011795165947062, + "learning_rate": 5.263834942447787e-06, + "loss": 0.9166, + "step": 16900 + }, + { + "epoch": 0.5093373493975903, + "grad_norm": 3.7516058698720944, + "learning_rate": 5.25886141995012e-06, + "loss": 0.964, + "step": 16910 + }, + { + "epoch": 0.5096385542168674, + "grad_norm": 4.206936197114627, + "learning_rate": 5.253887640623688e-06, + "loss": 0.949, + "step": 16920 + }, + { + "epoch": 0.5099397590361445, + "grad_norm": 5.066907135959254, + "learning_rate": 5.248913609403218e-06, + "loss": 1.0073, + "step": 16930 + }, + { + "epoch": 0.5102409638554217, + "grad_norm": 3.3512241583371454, + "learning_rate": 5.243939331223681e-06, + "loss": 1.0165, + "step": 16940 + }, + { + "epoch": 0.5105421686746988, + "grad_norm": 3.927804930781103, + "learning_rate": 5.238964811020299e-06, + "loss": 1.0325, + "step": 16950 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 1.9899819718907672, + "learning_rate": 5.233990053728525e-06, + "loss": 0.9605, + "step": 16960 + }, + { + "epoch": 0.511144578313253, + "grad_norm": 3.3601632658216025, + "learning_rate": 5.229015064284057e-06, + "loss": 1.0048, + "step": 16970 + }, + { + "epoch": 0.5114457831325301, + "grad_norm": 4.047990702246386, + "learning_rate": 5.224039847622821e-06, + "loss": 1.0879, + "step": 16980 + }, + { + "epoch": 0.5117469879518072, + "grad_norm": 3.7722216904260333, + "learning_rate": 5.219064408680964e-06, + "loss": 1.0098, + "step": 16990 + }, + { + "epoch": 0.5120481927710844, + "grad_norm": 1.93704875953522, + "learning_rate": 5.214088752394857e-06, + "loss": 0.9124, + "step": 17000 + }, + { + "epoch": 0.5123493975903615, + "grad_norm": 7.500304067179252, + "learning_rate": 5.209112883701086e-06, + "loss": 1.0047, + "step": 17010 + }, + { + "epoch": 0.5126506024096386, + "grad_norm": 3.47159955480306, + "learning_rate": 5.204136807536446e-06, + "loss": 1.0444, + "step": 17020 + }, + { + "epoch": 0.5129518072289156, + "grad_norm": 3.5945005438563125, + "learning_rate": 5.1991605288379435e-06, + "loss": 0.9632, + "step": 17030 + }, + { + "epoch": 0.5132530120481927, + "grad_norm": 3.7044643169981386, + "learning_rate": 5.194184052542779e-06, + "loss": 0.844, + "step": 17040 + }, + { + "epoch": 0.5135542168674698, + "grad_norm": 4.019279915658249, + "learning_rate": 5.189207383588353e-06, + "loss": 0.9806, + "step": 17050 + }, + { + "epoch": 0.513855421686747, + "grad_norm": 3.7902114831517504, + "learning_rate": 5.184230526912255e-06, + "loss": 0.9074, + "step": 17060 + }, + { + "epoch": 0.5141566265060241, + "grad_norm": 3.3532788051860853, + "learning_rate": 5.1792534874522614e-06, + "loss": 1.0144, + "step": 17070 + }, + { + "epoch": 0.5144578313253012, + "grad_norm": 3.712019236827946, + "learning_rate": 5.174276270146335e-06, + "loss": 1.0634, + "step": 17080 + }, + { + "epoch": 0.5147590361445783, + "grad_norm": 1.9844335581390444, + "learning_rate": 5.169298879932605e-06, + "loss": 0.8989, + "step": 17090 + }, + { + "epoch": 0.5150602409638554, + "grad_norm": 5.182718294180661, + "learning_rate": 5.1643213217493815e-06, + "loss": 0.9546, + "step": 17100 + }, + { + "epoch": 0.5153614457831325, + "grad_norm": 3.5507685142494854, + "learning_rate": 5.1593436005351364e-06, + "loss": 1.0191, + "step": 17110 + }, + { + "epoch": 0.5156626506024097, + "grad_norm": 2.05952870693458, + "learning_rate": 5.154365721228502e-06, + "loss": 0.9325, + "step": 17120 + }, + { + "epoch": 0.5159638554216868, + "grad_norm": 3.634311089964099, + "learning_rate": 5.149387688768274e-06, + "loss": 0.9314, + "step": 17130 + }, + { + "epoch": 0.5162650602409639, + "grad_norm": 4.304533662235416, + "learning_rate": 5.144409508093391e-06, + "loss": 1.0373, + "step": 17140 + }, + { + "epoch": 0.516566265060241, + "grad_norm": 3.5427747846881616, + "learning_rate": 5.139431184142949e-06, + "loss": 0.947, + "step": 17150 + }, + { + "epoch": 0.516867469879518, + "grad_norm": 4.153543241500085, + "learning_rate": 5.1344527218561754e-06, + "loss": 0.9852, + "step": 17160 + }, + { + "epoch": 0.5171686746987951, + "grad_norm": 3.9695888760257345, + "learning_rate": 5.129474126172444e-06, + "loss": 0.9578, + "step": 17170 + }, + { + "epoch": 0.5174698795180723, + "grad_norm": 3.1424892784401597, + "learning_rate": 5.124495402031255e-06, + "loss": 0.9758, + "step": 17180 + }, + { + "epoch": 0.5177710843373494, + "grad_norm": 4.4720130227836865, + "learning_rate": 5.119516554372239e-06, + "loss": 0.8981, + "step": 17190 + }, + { + "epoch": 0.5180722891566265, + "grad_norm": 4.455717293784039, + "learning_rate": 5.114537588135149e-06, + "loss": 1.0093, + "step": 17200 + }, + { + "epoch": 0.5183734939759036, + "grad_norm": 1.8445645487163518, + "learning_rate": 5.109558508259852e-06, + "loss": 0.8968, + "step": 17210 + }, + { + "epoch": 0.5186746987951807, + "grad_norm": 3.4562959811878606, + "learning_rate": 5.104579319686335e-06, + "loss": 0.9239, + "step": 17220 + }, + { + "epoch": 0.5189759036144578, + "grad_norm": 3.4755156299950176, + "learning_rate": 5.0996000273546845e-06, + "loss": 0.9662, + "step": 17230 + }, + { + "epoch": 0.519277108433735, + "grad_norm": 4.054513837691121, + "learning_rate": 5.094620636205096e-06, + "loss": 0.9137, + "step": 17240 + }, + { + "epoch": 0.5195783132530121, + "grad_norm": 1.912461255336725, + "learning_rate": 5.089641151177861e-06, + "loss": 0.9686, + "step": 17250 + }, + { + "epoch": 0.5198795180722892, + "grad_norm": 1.8423595709599472, + "learning_rate": 5.084661577213363e-06, + "loss": 0.9428, + "step": 17260 + }, + { + "epoch": 0.5201807228915662, + "grad_norm": 1.8158581187420892, + "learning_rate": 5.079681919252076e-06, + "loss": 1.0003, + "step": 17270 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 4.787825750588715, + "learning_rate": 5.074702182234554e-06, + "loss": 1.0396, + "step": 17280 + }, + { + "epoch": 0.5207831325301204, + "grad_norm": 3.502506610642177, + "learning_rate": 5.0697223711014355e-06, + "loss": 0.9102, + "step": 17290 + }, + { + "epoch": 0.5210843373493976, + "grad_norm": 1.833976135044235, + "learning_rate": 5.0647424907934265e-06, + "loss": 0.9133, + "step": 17300 + }, + { + "epoch": 0.5213855421686747, + "grad_norm": 2.0346727920494883, + "learning_rate": 5.059762546251304e-06, + "loss": 1.0013, + "step": 17310 + }, + { + "epoch": 0.5216867469879518, + "grad_norm": 4.690418242534549, + "learning_rate": 5.054782542415908e-06, + "loss": 0.9793, + "step": 17320 + }, + { + "epoch": 0.5219879518072289, + "grad_norm": 4.718099340990269, + "learning_rate": 5.049802484228139e-06, + "loss": 0.9517, + "step": 17330 + }, + { + "epoch": 0.522289156626506, + "grad_norm": 1.774718518131299, + "learning_rate": 5.0448223766289485e-06, + "loss": 0.8892, + "step": 17340 + }, + { + "epoch": 0.5225903614457831, + "grad_norm": 3.3161270959024085, + "learning_rate": 5.03984222455934e-06, + "loss": 0.9927, + "step": 17350 + }, + { + "epoch": 0.5228915662650603, + "grad_norm": 3.6248939785901673, + "learning_rate": 5.03486203296036e-06, + "loss": 0.9208, + "step": 17360 + }, + { + "epoch": 0.5231927710843374, + "grad_norm": 2.1425751026089297, + "learning_rate": 5.029881806773096e-06, + "loss": 0.9274, + "step": 17370 + }, + { + "epoch": 0.5234939759036145, + "grad_norm": 4.163664848576764, + "learning_rate": 5.024901550938665e-06, + "loss": 0.987, + "step": 17380 + }, + { + "epoch": 0.5237951807228916, + "grad_norm": 6.647738532957064, + "learning_rate": 5.019921270398215e-06, + "loss": 1.0385, + "step": 17390 + }, + { + "epoch": 0.5240963855421686, + "grad_norm": 5.3044873464190045, + "learning_rate": 5.014940970092926e-06, + "loss": 0.9234, + "step": 17400 + }, + { + "epoch": 0.5243975903614457, + "grad_norm": 5.20998974799349, + "learning_rate": 5.009960654963985e-06, + "loss": 0.98, + "step": 17410 + }, + { + "epoch": 0.5246987951807229, + "grad_norm": 3.4715745569717775, + "learning_rate": 5.004980329952606e-06, + "loss": 0.9598, + "step": 17420 + }, + { + "epoch": 0.525, + "grad_norm": 5.251536276234192, + "learning_rate": 5e-06, + "loss": 1.0854, + "step": 17430 + }, + { + "epoch": 0.5253012048192771, + "grad_norm": 4.42136930557203, + "learning_rate": 4.995019670047397e-06, + "loss": 0.9094, + "step": 17440 + }, + { + "epoch": 0.5256024096385542, + "grad_norm": 4.001410215815066, + "learning_rate": 4.990039345036015e-06, + "loss": 0.9663, + "step": 17450 + }, + { + "epoch": 0.5259036144578313, + "grad_norm": 3.463306758529009, + "learning_rate": 4.985059029907075e-06, + "loss": 1.0536, + "step": 17460 + }, + { + "epoch": 0.5262048192771084, + "grad_norm": 3.522453864262608, + "learning_rate": 4.980078729601786e-06, + "loss": 0.9821, + "step": 17470 + }, + { + "epoch": 0.5265060240963856, + "grad_norm": 4.056603488992403, + "learning_rate": 4.9750984490613385e-06, + "loss": 0.9351, + "step": 17480 + }, + { + "epoch": 0.5268072289156627, + "grad_norm": 4.868537127841475, + "learning_rate": 4.970118193226906e-06, + "loss": 0.9887, + "step": 17490 + }, + { + "epoch": 0.5271084337349398, + "grad_norm": 1.9246294326348845, + "learning_rate": 4.965137967039641e-06, + "loss": 0.9296, + "step": 17500 + }, + { + "epoch": 0.5274096385542169, + "grad_norm": 2.225110752535346, + "learning_rate": 4.960157775440661e-06, + "loss": 0.9025, + "step": 17510 + }, + { + "epoch": 0.5277108433734939, + "grad_norm": 3.37465435685935, + "learning_rate": 4.955177623371052e-06, + "loss": 1.0208, + "step": 17520 + }, + { + "epoch": 0.528012048192771, + "grad_norm": 2.0000699800782207, + "learning_rate": 4.950197515771862e-06, + "loss": 0.9341, + "step": 17530 + }, + { + "epoch": 0.5283132530120482, + "grad_norm": 2.0211153004119358, + "learning_rate": 4.945217457584093e-06, + "loss": 0.9828, + "step": 17540 + }, + { + "epoch": 0.5286144578313253, + "grad_norm": 3.4947840076925014, + "learning_rate": 4.940237453748698e-06, + "loss": 0.9797, + "step": 17550 + }, + { + "epoch": 0.5289156626506024, + "grad_norm": 2.7483176433858088, + "learning_rate": 4.9352575092065735e-06, + "loss": 0.9642, + "step": 17560 + }, + { + "epoch": 0.5292168674698795, + "grad_norm": 1.8624396770427465, + "learning_rate": 4.930277628898565e-06, + "loss": 0.8357, + "step": 17570 + }, + { + "epoch": 0.5295180722891566, + "grad_norm": 2.142716430230594, + "learning_rate": 4.925297817765447e-06, + "loss": 0.9222, + "step": 17580 + }, + { + "epoch": 0.5298192771084337, + "grad_norm": 3.397721034908635, + "learning_rate": 4.920318080747926e-06, + "loss": 0.9685, + "step": 17590 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 3.7788232985839167, + "learning_rate": 4.915338422786639e-06, + "loss": 0.9021, + "step": 17600 + }, + { + "epoch": 0.530421686746988, + "grad_norm": 2.105592567465183, + "learning_rate": 4.910358848822141e-06, + "loss": 0.9367, + "step": 17610 + }, + { + "epoch": 0.5307228915662651, + "grad_norm": 3.861617065581075, + "learning_rate": 4.905379363794907e-06, + "loss": 1.0091, + "step": 17620 + }, + { + "epoch": 0.5310240963855422, + "grad_norm": 4.0934314679798245, + "learning_rate": 4.900399972645316e-06, + "loss": 0.9582, + "step": 17630 + }, + { + "epoch": 0.5313253012048192, + "grad_norm": 3.2743123976308115, + "learning_rate": 4.8954206803136675e-06, + "loss": 1.0131, + "step": 17640 + }, + { + "epoch": 0.5316265060240963, + "grad_norm": 3.91844832163504, + "learning_rate": 4.89044149174015e-06, + "loss": 1.015, + "step": 17650 + }, + { + "epoch": 0.5319277108433735, + "grad_norm": 3.220576979788481, + "learning_rate": 4.885462411864854e-06, + "loss": 0.9799, + "step": 17660 + }, + { + "epoch": 0.5322289156626506, + "grad_norm": 3.4419367482701557, + "learning_rate": 4.880483445627762e-06, + "loss": 0.9709, + "step": 17670 + }, + { + "epoch": 0.5325301204819277, + "grad_norm": 3.1698518987875444, + "learning_rate": 4.875504597968746e-06, + "loss": 0.885, + "step": 17680 + }, + { + "epoch": 0.5328313253012048, + "grad_norm": 5.532121312291464, + "learning_rate": 4.8705258738275575e-06, + "loss": 0.8918, + "step": 17690 + }, + { + "epoch": 0.5331325301204819, + "grad_norm": 4.087596559666748, + "learning_rate": 4.8655472781438245e-06, + "loss": 0.995, + "step": 17700 + }, + { + "epoch": 0.533433734939759, + "grad_norm": 2.041813382381951, + "learning_rate": 4.860568815857052e-06, + "loss": 1.04, + "step": 17710 + }, + { + "epoch": 0.5337349397590362, + "grad_norm": 4.325577814191232, + "learning_rate": 4.855590491906611e-06, + "loss": 1.0045, + "step": 17720 + }, + { + "epoch": 0.5340361445783133, + "grad_norm": 4.915812484305553, + "learning_rate": 4.850612311231729e-06, + "loss": 0.969, + "step": 17730 + }, + { + "epoch": 0.5343373493975904, + "grad_norm": 3.9105916475757168, + "learning_rate": 4.845634278771499e-06, + "loss": 0.8759, + "step": 17740 + }, + { + "epoch": 0.5346385542168675, + "grad_norm": 4.108462135369391, + "learning_rate": 4.840656399464866e-06, + "loss": 0.9976, + "step": 17750 + }, + { + "epoch": 0.5349397590361445, + "grad_norm": 1.9324226196798016, + "learning_rate": 4.83567867825062e-06, + "loss": 0.8654, + "step": 17760 + }, + { + "epoch": 0.5352409638554216, + "grad_norm": 3.580893486094657, + "learning_rate": 4.830701120067395e-06, + "loss": 0.9513, + "step": 17770 + }, + { + "epoch": 0.5355421686746988, + "grad_norm": 1.9065088849623395, + "learning_rate": 4.825723729853666e-06, + "loss": 0.9402, + "step": 17780 + }, + { + "epoch": 0.5358433734939759, + "grad_norm": 3.5789673775529853, + "learning_rate": 4.820746512547739e-06, + "loss": 0.9909, + "step": 17790 + }, + { + "epoch": 0.536144578313253, + "grad_norm": 1.797134767555618, + "learning_rate": 4.815769473087748e-06, + "loss": 0.9261, + "step": 17800 + }, + { + "epoch": 0.5364457831325301, + "grad_norm": 4.699788177369744, + "learning_rate": 4.81079261641165e-06, + "loss": 0.9696, + "step": 17810 + }, + { + "epoch": 0.5367469879518072, + "grad_norm": 4.0347822848961785, + "learning_rate": 4.805815947457223e-06, + "loss": 1.0048, + "step": 17820 + }, + { + "epoch": 0.5370481927710843, + "grad_norm": 1.8877736285879627, + "learning_rate": 4.800839471162058e-06, + "loss": 0.9339, + "step": 17830 + }, + { + "epoch": 0.5373493975903615, + "grad_norm": 4.061992285499995, + "learning_rate": 4.795863192463555e-06, + "loss": 0.9045, + "step": 17840 + }, + { + "epoch": 0.5376506024096386, + "grad_norm": 3.2819153245763713, + "learning_rate": 4.790887116298915e-06, + "loss": 1.0393, + "step": 17850 + }, + { + "epoch": 0.5379518072289157, + "grad_norm": 5.540904481275622, + "learning_rate": 4.785911247605144e-06, + "loss": 1.0325, + "step": 17860 + }, + { + "epoch": 0.5382530120481928, + "grad_norm": 3.9915915146618874, + "learning_rate": 4.780935591319038e-06, + "loss": 1.0207, + "step": 17870 + }, + { + "epoch": 0.5385542168674698, + "grad_norm": 3.3171207025582246, + "learning_rate": 4.7759601523771795e-06, + "loss": 0.9246, + "step": 17880 + }, + { + "epoch": 0.5388554216867469, + "grad_norm": 6.787166761029138, + "learning_rate": 4.770984935715943e-06, + "loss": 1.0477, + "step": 17890 + }, + { + "epoch": 0.5391566265060241, + "grad_norm": 1.9347569368374558, + "learning_rate": 4.766009946271477e-06, + "loss": 0.8728, + "step": 17900 + }, + { + "epoch": 0.5394578313253012, + "grad_norm": 3.3782462151839545, + "learning_rate": 4.7610351889797045e-06, + "loss": 0.9746, + "step": 17910 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 6.551315258730182, + "learning_rate": 4.75606066877632e-06, + "loss": 1.0123, + "step": 17920 + }, + { + "epoch": 0.5400602409638554, + "grad_norm": 4.852048589451806, + "learning_rate": 4.751086390596784e-06, + "loss": 0.9505, + "step": 17930 + }, + { + "epoch": 0.5403614457831325, + "grad_norm": 4.129868979814431, + "learning_rate": 4.7461123593763145e-06, + "loss": 0.9803, + "step": 17940 + }, + { + "epoch": 0.5406626506024096, + "grad_norm": 4.492534192590365, + "learning_rate": 4.741138580049881e-06, + "loss": 0.973, + "step": 17950 + }, + { + "epoch": 0.5409638554216868, + "grad_norm": 6.711979410474132, + "learning_rate": 4.736165057552214e-06, + "loss": 0.9755, + "step": 17960 + }, + { + "epoch": 0.5412650602409639, + "grad_norm": 2.143347947839009, + "learning_rate": 4.73119179681778e-06, + "loss": 0.9958, + "step": 17970 + }, + { + "epoch": 0.541566265060241, + "grad_norm": 2.050007419900864, + "learning_rate": 4.726218802780786e-06, + "loss": 0.9691, + "step": 17980 + }, + { + "epoch": 0.5418674698795181, + "grad_norm": 3.56161063519906, + "learning_rate": 4.721246080375177e-06, + "loss": 0.9714, + "step": 17990 + }, + { + "epoch": 0.5421686746987951, + "grad_norm": 3.5959178734003974, + "learning_rate": 4.71627363453463e-06, + "loss": 0.8454, + "step": 18000 + }, + { + "epoch": 0.5424698795180722, + "grad_norm": 4.890303472159129, + "learning_rate": 4.711301470192546e-06, + "loss": 0.9353, + "step": 18010 + }, + { + "epoch": 0.5427710843373494, + "grad_norm": 1.8706983160096207, + "learning_rate": 4.706329592282044e-06, + "loss": 0.9297, + "step": 18020 + }, + { + "epoch": 0.5430722891566265, + "grad_norm": 3.940535617761609, + "learning_rate": 4.701358005735963e-06, + "loss": 0.9424, + "step": 18030 + }, + { + "epoch": 0.5433734939759036, + "grad_norm": 6.2321792966622285, + "learning_rate": 4.696386715486852e-06, + "loss": 0.8667, + "step": 18040 + }, + { + "epoch": 0.5436746987951807, + "grad_norm": 2.0485982332257295, + "learning_rate": 4.691415726466964e-06, + "loss": 0.9719, + "step": 18050 + }, + { + "epoch": 0.5439759036144578, + "grad_norm": 2.1319835740570885, + "learning_rate": 4.686445043608252e-06, + "loss": 0.8765, + "step": 18060 + }, + { + "epoch": 0.5442771084337349, + "grad_norm": 3.869973375080356, + "learning_rate": 4.681474671842372e-06, + "loss": 1.0161, + "step": 18070 + }, + { + "epoch": 0.5445783132530121, + "grad_norm": 3.9407887892231024, + "learning_rate": 4.676504616100665e-06, + "loss": 0.9007, + "step": 18080 + }, + { + "epoch": 0.5448795180722892, + "grad_norm": 4.208148187578163, + "learning_rate": 4.671534881314159e-06, + "loss": 0.9004, + "step": 18090 + }, + { + "epoch": 0.5451807228915663, + "grad_norm": 6.110829514591843, + "learning_rate": 4.666565472413566e-06, + "loss": 0.9694, + "step": 18100 + }, + { + "epoch": 0.5454819277108434, + "grad_norm": 4.11019509226744, + "learning_rate": 4.6615963943292746e-06, + "loss": 0.9301, + "step": 18110 + }, + { + "epoch": 0.5457831325301205, + "grad_norm": 4.8827685283160625, + "learning_rate": 4.656627651991345e-06, + "loss": 0.9501, + "step": 18120 + }, + { + "epoch": 0.5460843373493975, + "grad_norm": 1.9276015787098681, + "learning_rate": 4.651659250329498e-06, + "loss": 0.9188, + "step": 18130 + }, + { + "epoch": 0.5463855421686747, + "grad_norm": 5.253274346868863, + "learning_rate": 4.64669119427313e-06, + "loss": 0.9809, + "step": 18140 + }, + { + "epoch": 0.5466867469879518, + "grad_norm": 2.0747414953348593, + "learning_rate": 4.6417234887512806e-06, + "loss": 0.84, + "step": 18150 + }, + { + "epoch": 0.5469879518072289, + "grad_norm": 4.081615293667149, + "learning_rate": 4.636756138692649e-06, + "loss": 0.9868, + "step": 18160 + }, + { + "epoch": 0.547289156626506, + "grad_norm": 1.9149772773466076, + "learning_rate": 4.631789149025579e-06, + "loss": 0.9231, + "step": 18170 + }, + { + "epoch": 0.5475903614457831, + "grad_norm": 1.9362830048849156, + "learning_rate": 4.62682252467806e-06, + "loss": 0.9863, + "step": 18180 + }, + { + "epoch": 0.5478915662650602, + "grad_norm": 1.8451057636135912, + "learning_rate": 4.6218562705777185e-06, + "loss": 0.7929, + "step": 18190 + }, + { + "epoch": 0.5481927710843374, + "grad_norm": 4.306535903769046, + "learning_rate": 4.616890391651806e-06, + "loss": 0.8187, + "step": 18200 + }, + { + "epoch": 0.5484939759036145, + "grad_norm": 6.2784166812719375, + "learning_rate": 4.611924892827214e-06, + "loss": 1.0032, + "step": 18210 + }, + { + "epoch": 0.5487951807228916, + "grad_norm": 5.140685994476185, + "learning_rate": 4.606959779030448e-06, + "loss": 0.9382, + "step": 18220 + }, + { + "epoch": 0.5490963855421687, + "grad_norm": 1.9687472287786298, + "learning_rate": 4.6019950551876344e-06, + "loss": 0.9879, + "step": 18230 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 6.239333787072131, + "learning_rate": 4.597030726224512e-06, + "loss": 0.8209, + "step": 18240 + }, + { + "epoch": 0.5496987951807228, + "grad_norm": 1.9971325593798464, + "learning_rate": 4.5920667970664305e-06, + "loss": 0.9806, + "step": 18250 + }, + { + "epoch": 0.55, + "grad_norm": 4.065120900266769, + "learning_rate": 4.587103272638339e-06, + "loss": 0.9718, + "step": 18260 + }, + { + "epoch": 0.5503012048192771, + "grad_norm": 1.8479007403994274, + "learning_rate": 4.582140157864786e-06, + "loss": 0.9457, + "step": 18270 + }, + { + "epoch": 0.5506024096385542, + "grad_norm": 5.232467134900734, + "learning_rate": 4.577177457669918e-06, + "loss": 0.9836, + "step": 18280 + }, + { + "epoch": 0.5509036144578313, + "grad_norm": 3.686243550061335, + "learning_rate": 4.572215176977465e-06, + "loss": 1.0281, + "step": 18290 + }, + { + "epoch": 0.5512048192771084, + "grad_norm": 1.9160010285583549, + "learning_rate": 4.56725332071074e-06, + "loss": 0.8726, + "step": 18300 + }, + { + "epoch": 0.5515060240963855, + "grad_norm": 3.9259430418582584, + "learning_rate": 4.562291893792639e-06, + "loss": 0.9226, + "step": 18310 + }, + { + "epoch": 0.5518072289156627, + "grad_norm": 1.958130002078843, + "learning_rate": 4.557330901145631e-06, + "loss": 0.9502, + "step": 18320 + }, + { + "epoch": 0.5521084337349398, + "grad_norm": 4.683451039328426, + "learning_rate": 4.552370347691752e-06, + "loss": 1.0428, + "step": 18330 + }, + { + "epoch": 0.5524096385542169, + "grad_norm": 7.304841244404751, + "learning_rate": 4.5474102383526045e-06, + "loss": 0.9516, + "step": 18340 + }, + { + "epoch": 0.552710843373494, + "grad_norm": 3.9484063329996713, + "learning_rate": 4.542450578049351e-06, + "loss": 0.9295, + "step": 18350 + }, + { + "epoch": 0.553012048192771, + "grad_norm": 1.9830861189302107, + "learning_rate": 4.5374913717027065e-06, + "loss": 0.9482, + "step": 18360 + }, + { + "epoch": 0.5533132530120481, + "grad_norm": 4.071109109673892, + "learning_rate": 4.532532624232934e-06, + "loss": 0.8806, + "step": 18370 + }, + { + "epoch": 0.5536144578313253, + "grad_norm": 3.877351017673144, + "learning_rate": 4.527574340559844e-06, + "loss": 0.9399, + "step": 18380 + }, + { + "epoch": 0.5539156626506024, + "grad_norm": 5.156381182005516, + "learning_rate": 4.522616525602789e-06, + "loss": 0.9347, + "step": 18390 + }, + { + "epoch": 0.5542168674698795, + "grad_norm": 1.9441536621399538, + "learning_rate": 4.517659184280651e-06, + "loss": 0.8091, + "step": 18400 + }, + { + "epoch": 0.5545180722891566, + "grad_norm": 3.68109023439261, + "learning_rate": 4.512702321511845e-06, + "loss": 0.9961, + "step": 18410 + }, + { + "epoch": 0.5548192771084337, + "grad_norm": 1.922771806218246, + "learning_rate": 4.507745942214311e-06, + "loss": 0.836, + "step": 18420 + }, + { + "epoch": 0.5551204819277108, + "grad_norm": 2.0451128315073124, + "learning_rate": 4.502790051305512e-06, + "loss": 0.8536, + "step": 18430 + }, + { + "epoch": 0.555421686746988, + "grad_norm": 2.0886689680083013, + "learning_rate": 4.497834653702422e-06, + "loss": 0.9008, + "step": 18440 + }, + { + "epoch": 0.5557228915662651, + "grad_norm": 4.115499394418487, + "learning_rate": 4.492879754321525e-06, + "loss": 0.9723, + "step": 18450 + }, + { + "epoch": 0.5560240963855422, + "grad_norm": 1.9508285900298126, + "learning_rate": 4.487925358078817e-06, + "loss": 0.8829, + "step": 18460 + }, + { + "epoch": 0.5563253012048193, + "grad_norm": 1.9166029511192284, + "learning_rate": 4.4829714698897885e-06, + "loss": 0.9262, + "step": 18470 + }, + { + "epoch": 0.5566265060240964, + "grad_norm": 4.334708285468265, + "learning_rate": 4.478018094669429e-06, + "loss": 1.0462, + "step": 18480 + }, + { + "epoch": 0.5569277108433734, + "grad_norm": 6.100242190534848, + "learning_rate": 4.473065237332217e-06, + "loss": 1.0086, + "step": 18490 + }, + { + "epoch": 0.5572289156626506, + "grad_norm": 1.8737237748603184, + "learning_rate": 4.468112902792121e-06, + "loss": 1.0117, + "step": 18500 + }, + { + "epoch": 0.5575301204819277, + "grad_norm": 1.9908048867633226, + "learning_rate": 4.4631610959625885e-06, + "loss": 1.01, + "step": 18510 + }, + { + "epoch": 0.5578313253012048, + "grad_norm": 2.067508599090194, + "learning_rate": 4.4582098217565375e-06, + "loss": 0.946, + "step": 18520 + }, + { + "epoch": 0.5581325301204819, + "grad_norm": 2.040770869396424, + "learning_rate": 4.453259085086371e-06, + "loss": 0.9013, + "step": 18530 + }, + { + "epoch": 0.558433734939759, + "grad_norm": 4.935813901130578, + "learning_rate": 4.4483088908639475e-06, + "loss": 0.8959, + "step": 18540 + }, + { + "epoch": 0.5587349397590361, + "grad_norm": 4.236655298417486, + "learning_rate": 4.44335924400059e-06, + "loss": 0.9883, + "step": 18550 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 4.283614854202966, + "learning_rate": 4.43841014940708e-06, + "loss": 1.0008, + "step": 18560 + }, + { + "epoch": 0.5593373493975904, + "grad_norm": 4.698810672546298, + "learning_rate": 4.4334616119936516e-06, + "loss": 1.0652, + "step": 18570 + }, + { + "epoch": 0.5596385542168675, + "grad_norm": 6.206177154210965, + "learning_rate": 4.4285136366699835e-06, + "loss": 0.8549, + "step": 18580 + }, + { + "epoch": 0.5599397590361446, + "grad_norm": 1.8802465834282318, + "learning_rate": 4.4235662283451975e-06, + "loss": 0.9175, + "step": 18590 + }, + { + "epoch": 0.5602409638554217, + "grad_norm": 3.9318789018894313, + "learning_rate": 4.418619391927854e-06, + "loss": 0.9142, + "step": 18600 + }, + { + "epoch": 0.5605421686746987, + "grad_norm": 3.9094542093330675, + "learning_rate": 4.4136731323259494e-06, + "loss": 0.86, + "step": 18610 + }, + { + "epoch": 0.560843373493976, + "grad_norm": 3.836422362284994, + "learning_rate": 4.408727454446897e-06, + "loss": 0.8367, + "step": 18620 + }, + { + "epoch": 0.561144578313253, + "grad_norm": 4.114832915900658, + "learning_rate": 4.403782363197543e-06, + "loss": 0.9804, + "step": 18630 + }, + { + "epoch": 0.5614457831325301, + "grad_norm": 5.596184527159677, + "learning_rate": 4.398837863484149e-06, + "loss": 0.95, + "step": 18640 + }, + { + "epoch": 0.5617469879518072, + "grad_norm": 1.9534028234204528, + "learning_rate": 4.393893960212387e-06, + "loss": 0.9503, + "step": 18650 + }, + { + "epoch": 0.5620481927710843, + "grad_norm": 3.7464535944102817, + "learning_rate": 4.388950658287343e-06, + "loss": 0.8979, + "step": 18660 + }, + { + "epoch": 0.5623493975903614, + "grad_norm": 2.111862104385048, + "learning_rate": 4.384007962613496e-06, + "loss": 0.9893, + "step": 18670 + }, + { + "epoch": 0.5626506024096386, + "grad_norm": 3.579984988879612, + "learning_rate": 4.379065878094738e-06, + "loss": 0.9144, + "step": 18680 + }, + { + "epoch": 0.5629518072289157, + "grad_norm": 3.8163492778300805, + "learning_rate": 4.374124409634342e-06, + "loss": 0.9661, + "step": 18690 + }, + { + "epoch": 0.5632530120481928, + "grad_norm": 3.7980937513766158, + "learning_rate": 4.369183562134975e-06, + "loss": 1.0123, + "step": 18700 + }, + { + "epoch": 0.5635542168674699, + "grad_norm": 4.803940294654779, + "learning_rate": 4.364243340498689e-06, + "loss": 1.025, + "step": 18710 + }, + { + "epoch": 0.563855421686747, + "grad_norm": 2.1392657146156475, + "learning_rate": 4.359303749626913e-06, + "loss": 0.9972, + "step": 18720 + }, + { + "epoch": 0.564156626506024, + "grad_norm": 4.637960149146857, + "learning_rate": 4.3543647944204495e-06, + "loss": 0.979, + "step": 18730 + }, + { + "epoch": 0.5644578313253013, + "grad_norm": 3.770254882330825, + "learning_rate": 4.349426479779472e-06, + "loss": 0.9557, + "step": 18740 + }, + { + "epoch": 0.5647590361445783, + "grad_norm": 4.127228391502067, + "learning_rate": 4.34448881060352e-06, + "loss": 0.9658, + "step": 18750 + }, + { + "epoch": 0.5650602409638554, + "grad_norm": 4.262381967244327, + "learning_rate": 4.33955179179149e-06, + "loss": 1.0109, + "step": 18760 + }, + { + "epoch": 0.5653614457831325, + "grad_norm": 3.4941324642398746, + "learning_rate": 4.334615428241629e-06, + "loss": 0.9235, + "step": 18770 + }, + { + "epoch": 0.5656626506024096, + "grad_norm": 5.75497366241385, + "learning_rate": 4.329679724851548e-06, + "loss": 1.0059, + "step": 18780 + }, + { + "epoch": 0.5659638554216867, + "grad_norm": 3.7544708516209657, + "learning_rate": 4.324744686518187e-06, + "loss": 0.8923, + "step": 18790 + }, + { + "epoch": 0.5662650602409639, + "grad_norm": 3.5842199961136165, + "learning_rate": 4.3198103181378335e-06, + "loss": 1.0152, + "step": 18800 + }, + { + "epoch": 0.566566265060241, + "grad_norm": 3.7537876702822475, + "learning_rate": 4.31487662460611e-06, + "loss": 1.0748, + "step": 18810 + }, + { + "epoch": 0.5668674698795181, + "grad_norm": 1.9374060537841604, + "learning_rate": 4.309943610817972e-06, + "loss": 0.9367, + "step": 18820 + }, + { + "epoch": 0.5671686746987952, + "grad_norm": 4.621492949057501, + "learning_rate": 4.305011281667697e-06, + "loss": 0.8945, + "step": 18830 + }, + { + "epoch": 0.5674698795180723, + "grad_norm": 3.9082665610282983, + "learning_rate": 4.300079642048878e-06, + "loss": 0.9052, + "step": 18840 + }, + { + "epoch": 0.5677710843373494, + "grad_norm": 4.924352913638754, + "learning_rate": 4.29514869685444e-06, + "loss": 0.9619, + "step": 18850 + }, + { + "epoch": 0.5680722891566266, + "grad_norm": 3.704516720818371, + "learning_rate": 4.290218450976602e-06, + "loss": 0.97, + "step": 18860 + }, + { + "epoch": 0.5683734939759036, + "grad_norm": 2.089249845955756, + "learning_rate": 4.285288909306898e-06, + "loss": 0.8702, + "step": 18870 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 20.96076288468202, + "learning_rate": 4.28036007673616e-06, + "loss": 0.951, + "step": 18880 + }, + { + "epoch": 0.5689759036144578, + "grad_norm": 4.633173693165347, + "learning_rate": 4.27543195815452e-06, + "loss": 0.9514, + "step": 18890 + }, + { + "epoch": 0.5692771084337349, + "grad_norm": 4.299784221202882, + "learning_rate": 4.270504558451399e-06, + "loss": 0.8395, + "step": 18900 + }, + { + "epoch": 0.569578313253012, + "grad_norm": 1.9314982054027046, + "learning_rate": 4.265577882515506e-06, + "loss": 0.8127, + "step": 18910 + }, + { + "epoch": 0.5698795180722892, + "grad_norm": 4.698112498219021, + "learning_rate": 4.2606519352348276e-06, + "loss": 0.948, + "step": 18920 + }, + { + "epoch": 0.5701807228915663, + "grad_norm": 6.041828060365895, + "learning_rate": 4.255726721496637e-06, + "loss": 0.9664, + "step": 18930 + }, + { + "epoch": 0.5704819277108434, + "grad_norm": 5.2524601599322605, + "learning_rate": 4.2508022461874695e-06, + "loss": 0.9306, + "step": 18940 + }, + { + "epoch": 0.5707831325301205, + "grad_norm": 1.9830774556820767, + "learning_rate": 4.245878514193131e-06, + "loss": 0.9587, + "step": 18950 + }, + { + "epoch": 0.5710843373493976, + "grad_norm": 4.384318913526011, + "learning_rate": 4.240955530398695e-06, + "loss": 0.8796, + "step": 18960 + }, + { + "epoch": 0.5713855421686747, + "grad_norm": 4.287595994126813, + "learning_rate": 4.236033299688486e-06, + "loss": 0.8963, + "step": 18970 + }, + { + "epoch": 0.5716867469879519, + "grad_norm": 7.136055525295403, + "learning_rate": 4.2311118269460845e-06, + "loss": 1.0356, + "step": 18980 + }, + { + "epoch": 0.571987951807229, + "grad_norm": 4.962219512409365, + "learning_rate": 4.226191117054316e-06, + "loss": 0.9524, + "step": 18990 + }, + { + "epoch": 0.572289156626506, + "grad_norm": 2.0172963730938362, + "learning_rate": 4.2212711748952575e-06, + "loss": 0.9047, + "step": 19000 + }, + { + "epoch": 0.5725903614457831, + "grad_norm": 2.077594005170364, + "learning_rate": 4.216352005350212e-06, + "loss": 0.9941, + "step": 19010 + }, + { + "epoch": 0.5728915662650602, + "grad_norm": 4.217654263507011, + "learning_rate": 4.211433613299725e-06, + "loss": 0.8907, + "step": 19020 + }, + { + "epoch": 0.5731927710843373, + "grad_norm": 6.467371706436814, + "learning_rate": 4.206516003623568e-06, + "loss": 1.0311, + "step": 19030 + }, + { + "epoch": 0.5734939759036145, + "grad_norm": 4.6591531707198905, + "learning_rate": 4.201599181200736e-06, + "loss": 0.8733, + "step": 19040 + }, + { + "epoch": 0.5737951807228916, + "grad_norm": 2.0132631749228063, + "learning_rate": 4.196683150909443e-06, + "loss": 0.9221, + "step": 19050 + }, + { + "epoch": 0.5740963855421687, + "grad_norm": 1.880468359992078, + "learning_rate": 4.191767917627115e-06, + "loss": 0.8268, + "step": 19060 + }, + { + "epoch": 0.5743975903614458, + "grad_norm": 5.970993317806257, + "learning_rate": 4.1868534862303935e-06, + "loss": 1.0026, + "step": 19070 + }, + { + "epoch": 0.5746987951807229, + "grad_norm": 1.850177492612431, + "learning_rate": 4.1819398615951205e-06, + "loss": 0.8882, + "step": 19080 + }, + { + "epoch": 0.575, + "grad_norm": 5.090666853571762, + "learning_rate": 4.17702704859633e-06, + "loss": 0.8254, + "step": 19090 + }, + { + "epoch": 0.5753012048192772, + "grad_norm": 4.473504116706102, + "learning_rate": 4.172115052108268e-06, + "loss": 0.9926, + "step": 19100 + }, + { + "epoch": 0.5756024096385542, + "grad_norm": 6.458336484469976, + "learning_rate": 4.167203877004353e-06, + "loss": 0.8647, + "step": 19110 + }, + { + "epoch": 0.5759036144578313, + "grad_norm": 3.926443743927527, + "learning_rate": 4.162293528157201e-06, + "loss": 0.9696, + "step": 19120 + }, + { + "epoch": 0.5762048192771084, + "grad_norm": 4.123870508627569, + "learning_rate": 4.157384010438598e-06, + "loss": 0.8732, + "step": 19130 + }, + { + "epoch": 0.5765060240963855, + "grad_norm": 5.650853444971126, + "learning_rate": 4.152475328719517e-06, + "loss": 0.9049, + "step": 19140 + }, + { + "epoch": 0.5768072289156626, + "grad_norm": 4.038437621321253, + "learning_rate": 4.147567487870092e-06, + "loss": 0.9814, + "step": 19150 + }, + { + "epoch": 0.5771084337349398, + "grad_norm": 2.095143797451166, + "learning_rate": 4.142660492759627e-06, + "loss": 0.8653, + "step": 19160 + }, + { + "epoch": 0.5774096385542169, + "grad_norm": 9.477101458372529, + "learning_rate": 4.137754348256582e-06, + "loss": 0.9629, + "step": 19170 + }, + { + "epoch": 0.577710843373494, + "grad_norm": 4.636146793154455, + "learning_rate": 4.1328490592285835e-06, + "loss": 0.9698, + "step": 19180 + }, + { + "epoch": 0.5780120481927711, + "grad_norm": 3.966840597990876, + "learning_rate": 4.127944630542399e-06, + "loss": 0.9285, + "step": 19190 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 3.870866783235835, + "learning_rate": 4.123041067063945e-06, + "loss": 0.982, + "step": 19200 + }, + { + "epoch": 0.5786144578313253, + "grad_norm": 3.9939869149013063, + "learning_rate": 4.118138373658283e-06, + "loss": 0.9778, + "step": 19210 + }, + { + "epoch": 0.5789156626506025, + "grad_norm": 4.736026530445284, + "learning_rate": 4.1132365551896104e-06, + "loss": 0.9062, + "step": 19220 + }, + { + "epoch": 0.5792168674698795, + "grad_norm": 4.063967782259461, + "learning_rate": 4.108335616521253e-06, + "loss": 1.0021, + "step": 19230 + }, + { + "epoch": 0.5795180722891566, + "grad_norm": 8.251032895747358, + "learning_rate": 4.103435562515662e-06, + "loss": 0.9911, + "step": 19240 + }, + { + "epoch": 0.5798192771084337, + "grad_norm": 4.472907905804807, + "learning_rate": 4.098536398034423e-06, + "loss": 0.9788, + "step": 19250 + }, + { + "epoch": 0.5801204819277108, + "grad_norm": 5.11668912041138, + "learning_rate": 4.093638127938225e-06, + "loss": 0.9856, + "step": 19260 + }, + { + "epoch": 0.5804216867469879, + "grad_norm": 2.0103905517811356, + "learning_rate": 4.088740757086877e-06, + "loss": 0.8786, + "step": 19270 + }, + { + "epoch": 0.5807228915662651, + "grad_norm": 6.724249971609902, + "learning_rate": 4.0838442903392936e-06, + "loss": 0.98, + "step": 19280 + }, + { + "epoch": 0.5810240963855422, + "grad_norm": 1.8756102514139061, + "learning_rate": 4.078948732553495e-06, + "loss": 0.8569, + "step": 19290 + }, + { + "epoch": 0.5813253012048193, + "grad_norm": 2.018430839661244, + "learning_rate": 4.074054088586596e-06, + "loss": 0.9565, + "step": 19300 + }, + { + "epoch": 0.5816265060240964, + "grad_norm": 2.063677566455654, + "learning_rate": 4.069160363294805e-06, + "loss": 1.0176, + "step": 19310 + }, + { + "epoch": 0.5819277108433735, + "grad_norm": 5.305992921836024, + "learning_rate": 4.064267561533423e-06, + "loss": 0.9794, + "step": 19320 + }, + { + "epoch": 0.5822289156626506, + "grad_norm": 5.892546927954209, + "learning_rate": 4.059375688156833e-06, + "loss": 0.8623, + "step": 19330 + }, + { + "epoch": 0.5825301204819278, + "grad_norm": 2.0344855813127674, + "learning_rate": 4.05448474801849e-06, + "loss": 0.8925, + "step": 19340 + }, + { + "epoch": 0.5828313253012049, + "grad_norm": 4.281680194569213, + "learning_rate": 4.049594745970935e-06, + "loss": 0.8764, + "step": 19350 + }, + { + "epoch": 0.5831325301204819, + "grad_norm": 1.984358877697064, + "learning_rate": 4.044705686865771e-06, + "loss": 0.957, + "step": 19360 + }, + { + "epoch": 0.583433734939759, + "grad_norm": 5.077279148504344, + "learning_rate": 4.039817575553665e-06, + "loss": 0.9986, + "step": 19370 + }, + { + "epoch": 0.5837349397590361, + "grad_norm": 3.8524178742577386, + "learning_rate": 4.034930416884346e-06, + "loss": 1.0043, + "step": 19380 + }, + { + "epoch": 0.5840361445783132, + "grad_norm": 5.127115601818105, + "learning_rate": 4.030044215706599e-06, + "loss": 0.9809, + "step": 19390 + }, + { + "epoch": 0.5843373493975904, + "grad_norm": 5.1176030232461125, + "learning_rate": 4.025158976868256e-06, + "loss": 0.9775, + "step": 19400 + }, + { + "epoch": 0.5846385542168675, + "grad_norm": 1.9987359372376363, + "learning_rate": 4.020274705216195e-06, + "loss": 0.9231, + "step": 19410 + }, + { + "epoch": 0.5849397590361446, + "grad_norm": 4.5078001613848535, + "learning_rate": 4.015391405596334e-06, + "loss": 0.9954, + "step": 19420 + }, + { + "epoch": 0.5852409638554217, + "grad_norm": 4.611302971350435, + "learning_rate": 4.010509082853629e-06, + "loss": 1.0505, + "step": 19430 + }, + { + "epoch": 0.5855421686746988, + "grad_norm": 5.544955522529904, + "learning_rate": 4.0056277418320675e-06, + "loss": 0.9018, + "step": 19440 + }, + { + "epoch": 0.5858433734939759, + "grad_norm": 2.0730805611868024, + "learning_rate": 4.000747387374655e-06, + "loss": 0.8096, + "step": 19450 + }, + { + "epoch": 0.5861445783132531, + "grad_norm": 3.2711034213875223, + "learning_rate": 3.9958680243234296e-06, + "loss": 0.9079, + "step": 19460 + }, + { + "epoch": 0.5864457831325302, + "grad_norm": 2.0374152947334303, + "learning_rate": 3.990989657519438e-06, + "loss": 0.9764, + "step": 19470 + }, + { + "epoch": 0.5867469879518072, + "grad_norm": 1.8549051826050869, + "learning_rate": 3.986112291802742e-06, + "loss": 0.9459, + "step": 19480 + }, + { + "epoch": 0.5870481927710843, + "grad_norm": 9.582038381432506, + "learning_rate": 3.9812359320124035e-06, + "loss": 0.8867, + "step": 19490 + }, + { + "epoch": 0.5873493975903614, + "grad_norm": 4.144057842139621, + "learning_rate": 3.976360582986498e-06, + "loss": 0.9371, + "step": 19500 + }, + { + "epoch": 0.5876506024096385, + "grad_norm": 4.375544782740267, + "learning_rate": 3.97148624956209e-06, + "loss": 0.8329, + "step": 19510 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 4.780092041284829, + "learning_rate": 3.966612936575235e-06, + "loss": 0.8714, + "step": 19520 + }, + { + "epoch": 0.5882530120481928, + "grad_norm": 4.38604775170126, + "learning_rate": 3.961740648860984e-06, + "loss": 0.9381, + "step": 19530 + }, + { + "epoch": 0.5885542168674699, + "grad_norm": 4.404034979998394, + "learning_rate": 3.9568693912533634e-06, + "loss": 0.8795, + "step": 19540 + }, + { + "epoch": 0.588855421686747, + "grad_norm": 3.8054401865690033, + "learning_rate": 3.951999168585382e-06, + "loss": 1.0025, + "step": 19550 + }, + { + "epoch": 0.5891566265060241, + "grad_norm": 4.564835777626075, + "learning_rate": 3.947129985689015e-06, + "loss": 0.8826, + "step": 19560 + }, + { + "epoch": 0.5894578313253012, + "grad_norm": 4.05307633524432, + "learning_rate": 3.942261847395219e-06, + "loss": 0.9938, + "step": 19570 + }, + { + "epoch": 0.5897590361445784, + "grad_norm": 3.9612524260890796, + "learning_rate": 3.9373947585339006e-06, + "loss": 0.8985, + "step": 19580 + }, + { + "epoch": 0.5900602409638555, + "grad_norm": 4.269101592949929, + "learning_rate": 3.932528723933931e-06, + "loss": 0.9395, + "step": 19590 + }, + { + "epoch": 0.5903614457831325, + "grad_norm": 2.091604360713433, + "learning_rate": 3.927663748423139e-06, + "loss": 0.9688, + "step": 19600 + }, + { + "epoch": 0.5906626506024096, + "grad_norm": 3.947046091444418, + "learning_rate": 3.922799836828295e-06, + "loss": 0.8924, + "step": 19610 + }, + { + "epoch": 0.5909638554216867, + "grad_norm": 2.0258762535317145, + "learning_rate": 3.917936993975119e-06, + "loss": 0.9684, + "step": 19620 + }, + { + "epoch": 0.5912650602409638, + "grad_norm": 6.170474460728932, + "learning_rate": 3.913075224688268e-06, + "loss": 1.0269, + "step": 19630 + }, + { + "epoch": 0.591566265060241, + "grad_norm": 3.4139801657324136, + "learning_rate": 3.90821453379134e-06, + "loss": 0.9085, + "step": 19640 + }, + { + "epoch": 0.5918674698795181, + "grad_norm": 2.0513524319317438, + "learning_rate": 3.903354926106855e-06, + "loss": 0.8842, + "step": 19650 + }, + { + "epoch": 0.5921686746987952, + "grad_norm": 3.4406615044779416, + "learning_rate": 3.8984964064562584e-06, + "loss": 0.8554, + "step": 19660 + }, + { + "epoch": 0.5924698795180723, + "grad_norm": 2.129923436331325, + "learning_rate": 3.893638979659925e-06, + "loss": 0.8639, + "step": 19670 + }, + { + "epoch": 0.5927710843373494, + "grad_norm": 4.864839862012951, + "learning_rate": 3.888782650537137e-06, + "loss": 1.0265, + "step": 19680 + }, + { + "epoch": 0.5930722891566265, + "grad_norm": 2.046001811984436, + "learning_rate": 3.883927423906091e-06, + "loss": 0.9386, + "step": 19690 + }, + { + "epoch": 0.5933734939759037, + "grad_norm": 4.2082964948951, + "learning_rate": 3.8790733045838865e-06, + "loss": 0.8413, + "step": 19700 + }, + { + "epoch": 0.5936746987951808, + "grad_norm": 1.875959934560138, + "learning_rate": 3.87422029738653e-06, + "loss": 0.835, + "step": 19710 + }, + { + "epoch": 0.5939759036144578, + "grad_norm": 4.572085365663675, + "learning_rate": 3.869368407128922e-06, + "loss": 0.9211, + "step": 19720 + }, + { + "epoch": 0.5942771084337349, + "grad_norm": 3.7668194982300194, + "learning_rate": 3.86451763862485e-06, + "loss": 1.0513, + "step": 19730 + }, + { + "epoch": 0.594578313253012, + "grad_norm": 4.738725122704846, + "learning_rate": 3.8596679966869946e-06, + "loss": 0.8918, + "step": 19740 + }, + { + "epoch": 0.5948795180722891, + "grad_norm": 2.096804706431497, + "learning_rate": 3.854819486126919e-06, + "loss": 0.9515, + "step": 19750 + }, + { + "epoch": 0.5951807228915663, + "grad_norm": 2.02920731379469, + "learning_rate": 3.84997211175506e-06, + "loss": 0.9241, + "step": 19760 + }, + { + "epoch": 0.5954819277108434, + "grad_norm": 1.9833211657330019, + "learning_rate": 3.8451258783807285e-06, + "loss": 0.8581, + "step": 19770 + }, + { + "epoch": 0.5957831325301205, + "grad_norm": 14.198194598438121, + "learning_rate": 3.840280790812107e-06, + "loss": 0.9752, + "step": 19780 + }, + { + "epoch": 0.5960843373493976, + "grad_norm": 5.687403900320029, + "learning_rate": 3.835436853856234e-06, + "loss": 0.9123, + "step": 19790 + }, + { + "epoch": 0.5963855421686747, + "grad_norm": 5.157563081151716, + "learning_rate": 3.830594072319014e-06, + "loss": 0.8172, + "step": 19800 + }, + { + "epoch": 0.5966867469879518, + "grad_norm": 5.654074952590619, + "learning_rate": 3.825752451005196e-06, + "loss": 0.9453, + "step": 19810 + }, + { + "epoch": 0.596987951807229, + "grad_norm": 3.956724339976459, + "learning_rate": 3.82091199471839e-06, + "loss": 0.8837, + "step": 19820 + }, + { + "epoch": 0.5972891566265061, + "grad_norm": 4.089607905283346, + "learning_rate": 3.816072708261041e-06, + "loss": 0.8458, + "step": 19830 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 6.045610661537695, + "learning_rate": 3.811234596434433e-06, + "loss": 0.9642, + "step": 19840 + }, + { + "epoch": 0.5978915662650602, + "grad_norm": 2.066210435504891, + "learning_rate": 3.8063976640386926e-06, + "loss": 0.9222, + "step": 19850 + }, + { + "epoch": 0.5981927710843373, + "grad_norm": 6.38183896078686, + "learning_rate": 3.801561915872768e-06, + "loss": 1.0151, + "step": 19860 + }, + { + "epoch": 0.5984939759036144, + "grad_norm": 4.832405016095391, + "learning_rate": 3.7967273567344374e-06, + "loss": 0.9098, + "step": 19870 + }, + { + "epoch": 0.5987951807228916, + "grad_norm": 3.857923639667753, + "learning_rate": 3.791893991420294e-06, + "loss": 0.9968, + "step": 19880 + }, + { + "epoch": 0.5990963855421687, + "grad_norm": 11.921100115865212, + "learning_rate": 3.7870618247257558e-06, + "loss": 0.9933, + "step": 19890 + }, + { + "epoch": 0.5993975903614458, + "grad_norm": 3.493248593589962, + "learning_rate": 3.782230861445041e-06, + "loss": 0.9846, + "step": 19900 + }, + { + "epoch": 0.5996987951807229, + "grad_norm": 2.015379592753621, + "learning_rate": 3.77740110637118e-06, + "loss": 0.8482, + "step": 19910 + }, + { + "epoch": 0.6, + "grad_norm": 2.000139425927677, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.8741, + "step": 19920 + }, + { + "epoch": 0.6003012048192771, + "grad_norm": 2.172752870145956, + "learning_rate": 3.7677452400101413e-06, + "loss": 0.9144, + "step": 19930 + }, + { + "epoch": 0.6006024096385543, + "grad_norm": 4.668322278976479, + "learning_rate": 3.762919138303008e-06, + "loss": 0.9732, + "step": 19940 + }, + { + "epoch": 0.6009036144578314, + "grad_norm": 5.782784828200442, + "learning_rate": 3.7580942639628093e-06, + "loss": 0.9736, + "step": 19950 + }, + { + "epoch": 0.6012048192771084, + "grad_norm": 5.371808547098476, + "learning_rate": 3.753270621776536e-06, + "loss": 0.8671, + "step": 19960 + }, + { + "epoch": 0.6015060240963855, + "grad_norm": 5.290640387085528, + "learning_rate": 3.748448216529953e-06, + "loss": 0.8577, + "step": 19970 + }, + { + "epoch": 0.6018072289156626, + "grad_norm": 3.353020582675196, + "learning_rate": 3.7436270530075973e-06, + "loss": 0.9605, + "step": 19980 + }, + { + "epoch": 0.6021084337349397, + "grad_norm": 1.9049160341811593, + "learning_rate": 3.7388071359927745e-06, + "loss": 0.9228, + "step": 19990 + }, + { + "epoch": 0.6024096385542169, + "grad_norm": 2.0013597522662403, + "learning_rate": 3.7339884702675566e-06, + "loss": 0.9478, + "step": 20000 + }, + { + "epoch": 0.602710843373494, + "grad_norm": 4.185587185278027, + "learning_rate": 3.7291710606127706e-06, + "loss": 1.0358, + "step": 20010 + }, + { + "epoch": 0.6030120481927711, + "grad_norm": 5.135797612131306, + "learning_rate": 3.7243549118079973e-06, + "loss": 0.8667, + "step": 20020 + }, + { + "epoch": 0.6033132530120482, + "grad_norm": 4.792552611251369, + "learning_rate": 3.71954002863157e-06, + "loss": 0.862, + "step": 20030 + }, + { + "epoch": 0.6036144578313253, + "grad_norm": 13.706315325819853, + "learning_rate": 3.714726415860564e-06, + "loss": 0.8559, + "step": 20040 + }, + { + "epoch": 0.6039156626506024, + "grad_norm": 1.9506257104378237, + "learning_rate": 3.7099140782707916e-06, + "loss": 0.7836, + "step": 20050 + }, + { + "epoch": 0.6042168674698796, + "grad_norm": 5.40062128101387, + "learning_rate": 3.7051030206368026e-06, + "loss": 0.9976, + "step": 20060 + }, + { + "epoch": 0.6045180722891567, + "grad_norm": 4.688678898598641, + "learning_rate": 3.70029324773188e-06, + "loss": 0.9433, + "step": 20070 + }, + { + "epoch": 0.6048192771084338, + "grad_norm": 4.427468895008978, + "learning_rate": 3.6954847643280256e-06, + "loss": 0.9371, + "step": 20080 + }, + { + "epoch": 0.6051204819277108, + "grad_norm": 4.190971723261509, + "learning_rate": 3.6906775751959667e-06, + "loss": 0.9586, + "step": 20090 + }, + { + "epoch": 0.6054216867469879, + "grad_norm": 4.474464589994147, + "learning_rate": 3.6858716851051446e-06, + "loss": 0.8448, + "step": 20100 + }, + { + "epoch": 0.605722891566265, + "grad_norm": 4.959125383185958, + "learning_rate": 3.6810670988237134e-06, + "loss": 1.0073, + "step": 20110 + }, + { + "epoch": 0.6060240963855422, + "grad_norm": 1.9839334817570982, + "learning_rate": 3.676263821118532e-06, + "loss": 0.9316, + "step": 20120 + }, + { + "epoch": 0.6063253012048193, + "grad_norm": 4.1999695178417955, + "learning_rate": 3.6714618567551574e-06, + "loss": 1.0083, + "step": 20130 + }, + { + "epoch": 0.6066265060240964, + "grad_norm": 4.495331797883539, + "learning_rate": 3.666661210497854e-06, + "loss": 0.94, + "step": 20140 + }, + { + "epoch": 0.6069277108433735, + "grad_norm": 1.9935464448559828, + "learning_rate": 3.6618618871095675e-06, + "loss": 0.9016, + "step": 20150 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 2.02391953136053, + "learning_rate": 3.657063891351935e-06, + "loss": 0.9214, + "step": 20160 + }, + { + "epoch": 0.6075301204819277, + "grad_norm": 2.0226333520760065, + "learning_rate": 3.652267227985281e-06, + "loss": 0.9001, + "step": 20170 + }, + { + "epoch": 0.6078313253012049, + "grad_norm": 5.376248438595198, + "learning_rate": 3.647471901768601e-06, + "loss": 0.894, + "step": 20180 + }, + { + "epoch": 0.608132530120482, + "grad_norm": 4.628581474920254, + "learning_rate": 3.6426779174595693e-06, + "loss": 1.0381, + "step": 20190 + }, + { + "epoch": 0.608433734939759, + "grad_norm": 13.161008381782125, + "learning_rate": 3.637885279814521e-06, + "loss": 0.7851, + "step": 20200 + }, + { + "epoch": 0.6087349397590361, + "grad_norm": 4.633153878072175, + "learning_rate": 3.633093993588468e-06, + "loss": 0.9597, + "step": 20210 + }, + { + "epoch": 0.6090361445783132, + "grad_norm": 5.207531496874266, + "learning_rate": 3.6283040635350687e-06, + "loss": 0.9143, + "step": 20220 + }, + { + "epoch": 0.6093373493975903, + "grad_norm": 4.992257214257305, + "learning_rate": 3.623515494406643e-06, + "loss": 0.9635, + "step": 20230 + }, + { + "epoch": 0.6096385542168675, + "grad_norm": 4.397151380303392, + "learning_rate": 3.6187282909541577e-06, + "loss": 0.9109, + "step": 20240 + }, + { + "epoch": 0.6099397590361446, + "grad_norm": 2.082415468363504, + "learning_rate": 3.613942457927227e-06, + "loss": 0.9128, + "step": 20250 + }, + { + "epoch": 0.6102409638554217, + "grad_norm": 5.463364776790733, + "learning_rate": 3.6091580000741037e-06, + "loss": 0.8842, + "step": 20260 + }, + { + "epoch": 0.6105421686746988, + "grad_norm": 1.9078278870053709, + "learning_rate": 3.6043749221416765e-06, + "loss": 0.8429, + "step": 20270 + }, + { + "epoch": 0.6108433734939759, + "grad_norm": 4.108902519580929, + "learning_rate": 3.5995932288754655e-06, + "loss": 0.9702, + "step": 20280 + }, + { + "epoch": 0.611144578313253, + "grad_norm": 5.556046897969918, + "learning_rate": 3.5948129250196178e-06, + "loss": 0.8819, + "step": 20290 + }, + { + "epoch": 0.6114457831325302, + "grad_norm": 5.095835017905952, + "learning_rate": 3.590034015316899e-06, + "loss": 0.9301, + "step": 20300 + }, + { + "epoch": 0.6117469879518073, + "grad_norm": 1.8649830024983534, + "learning_rate": 3.585256504508692e-06, + "loss": 0.9225, + "step": 20310 + }, + { + "epoch": 0.6120481927710844, + "grad_norm": 4.849726724922682, + "learning_rate": 3.5804803973349967e-06, + "loss": 0.9656, + "step": 20320 + }, + { + "epoch": 0.6123493975903614, + "grad_norm": 1.7924568659544633, + "learning_rate": 3.5757056985344156e-06, + "loss": 0.9468, + "step": 20330 + }, + { + "epoch": 0.6126506024096385, + "grad_norm": 3.975614826615879, + "learning_rate": 3.5709324128441537e-06, + "loss": 0.8459, + "step": 20340 + }, + { + "epoch": 0.6129518072289156, + "grad_norm": 7.2789878240228685, + "learning_rate": 3.5661605450000175e-06, + "loss": 0.9311, + "step": 20350 + }, + { + "epoch": 0.6132530120481928, + "grad_norm": 3.9529557637741175, + "learning_rate": 3.561390099736404e-06, + "loss": 0.907, + "step": 20360 + }, + { + "epoch": 0.6135542168674699, + "grad_norm": 4.465699181195703, + "learning_rate": 3.5566210817863008e-06, + "loss": 0.9014, + "step": 20370 + }, + { + "epoch": 0.613855421686747, + "grad_norm": 5.32725086027786, + "learning_rate": 3.5518534958812728e-06, + "loss": 0.9585, + "step": 20380 + }, + { + "epoch": 0.6141566265060241, + "grad_norm": 4.276732014918137, + "learning_rate": 3.5470873467514766e-06, + "loss": 0.9224, + "step": 20390 + }, + { + "epoch": 0.6144578313253012, + "grad_norm": 1.9450433772783566, + "learning_rate": 3.542322639125632e-06, + "loss": 0.8167, + "step": 20400 + }, + { + "epoch": 0.6147590361445783, + "grad_norm": 5.263741054792196, + "learning_rate": 3.5375593777310324e-06, + "loss": 0.9317, + "step": 20410 + }, + { + "epoch": 0.6150602409638555, + "grad_norm": 5.15699066847789, + "learning_rate": 3.532797567293539e-06, + "loss": 0.9665, + "step": 20420 + }, + { + "epoch": 0.6153614457831326, + "grad_norm": 4.8665988430664875, + "learning_rate": 3.5280372125375693e-06, + "loss": 0.8968, + "step": 20430 + }, + { + "epoch": 0.6156626506024097, + "grad_norm": 5.321997110097974, + "learning_rate": 3.523278318186101e-06, + "loss": 0.9159, + "step": 20440 + }, + { + "epoch": 0.6159638554216867, + "grad_norm": 4.8472300516266715, + "learning_rate": 3.5185208889606538e-06, + "loss": 1.0397, + "step": 20450 + }, + { + "epoch": 0.6162650602409638, + "grad_norm": 4.693231960247368, + "learning_rate": 3.513764929581308e-06, + "loss": 1.0035, + "step": 20460 + }, + { + "epoch": 0.6165662650602409, + "grad_norm": 5.936565705194099, + "learning_rate": 3.509010444766674e-06, + "loss": 0.9271, + "step": 20470 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 7.129494217818609, + "learning_rate": 3.5042574392339026e-06, + "loss": 0.9824, + "step": 20480 + }, + { + "epoch": 0.6171686746987952, + "grad_norm": 5.480968897885322, + "learning_rate": 3.4995059176986767e-06, + "loss": 1.0018, + "step": 20490 + }, + { + "epoch": 0.6174698795180723, + "grad_norm": 9.132938514513636, + "learning_rate": 3.4947558848752104e-06, + "loss": 1.0223, + "step": 20500 + }, + { + "epoch": 0.6177710843373494, + "grad_norm": 4.1959733837300055, + "learning_rate": 3.4900073454762357e-06, + "loss": 0.9116, + "step": 20510 + }, + { + "epoch": 0.6180722891566265, + "grad_norm": 5.362536928090458, + "learning_rate": 3.485260304213005e-06, + "loss": 0.8883, + "step": 20520 + }, + { + "epoch": 0.6183734939759036, + "grad_norm": 1.9062269351441563, + "learning_rate": 3.4805147657952852e-06, + "loss": 0.9426, + "step": 20530 + }, + { + "epoch": 0.6186746987951808, + "grad_norm": 4.679132122109383, + "learning_rate": 3.475770734931353e-06, + "loss": 0.9292, + "step": 20540 + }, + { + "epoch": 0.6189759036144579, + "grad_norm": 1.9220586476908237, + "learning_rate": 3.4710282163279845e-06, + "loss": 0.9043, + "step": 20550 + }, + { + "epoch": 0.619277108433735, + "grad_norm": 4.785419168173132, + "learning_rate": 3.466287214690459e-06, + "loss": 0.9772, + "step": 20560 + }, + { + "epoch": 0.619578313253012, + "grad_norm": 2.0911223943850508, + "learning_rate": 3.4615477347225525e-06, + "loss": 0.9037, + "step": 20570 + }, + { + "epoch": 0.6198795180722891, + "grad_norm": 4.823163971264218, + "learning_rate": 3.4568097811265276e-06, + "loss": 0.986, + "step": 20580 + }, + { + "epoch": 0.6201807228915662, + "grad_norm": 3.910411253177122, + "learning_rate": 3.452073358603134e-06, + "loss": 0.9997, + "step": 20590 + }, + { + "epoch": 0.6204819277108434, + "grad_norm": 7.497753390009346, + "learning_rate": 3.4473384718516034e-06, + "loss": 0.8518, + "step": 20600 + }, + { + "epoch": 0.6207831325301205, + "grad_norm": 5.469582698139495, + "learning_rate": 3.4426051255696434e-06, + "loss": 0.9357, + "step": 20610 + }, + { + "epoch": 0.6210843373493976, + "grad_norm": 4.964654287958929, + "learning_rate": 3.437873324453431e-06, + "loss": 0.9093, + "step": 20620 + }, + { + "epoch": 0.6213855421686747, + "grad_norm": 4.095211442335393, + "learning_rate": 3.4331430731976116e-06, + "loss": 1.0008, + "step": 20630 + }, + { + "epoch": 0.6216867469879518, + "grad_norm": 1.955445042840258, + "learning_rate": 3.428414376495295e-06, + "loss": 0.8561, + "step": 20640 + }, + { + "epoch": 0.6219879518072289, + "grad_norm": 4.372613284683938, + "learning_rate": 3.423687239038045e-06, + "loss": 0.896, + "step": 20650 + }, + { + "epoch": 0.6222891566265061, + "grad_norm": 5.321712441142346, + "learning_rate": 3.4189616655158803e-06, + "loss": 0.8856, + "step": 20660 + }, + { + "epoch": 0.6225903614457832, + "grad_norm": 4.741666056448641, + "learning_rate": 3.4142376606172694e-06, + "loss": 0.9079, + "step": 20670 + }, + { + "epoch": 0.6228915662650603, + "grad_norm": 2.084327553394434, + "learning_rate": 3.4095152290291216e-06, + "loss": 0.8534, + "step": 20680 + }, + { + "epoch": 0.6231927710843373, + "grad_norm": 5.713163657925427, + "learning_rate": 3.4047943754367887e-06, + "loss": 0.928, + "step": 20690 + }, + { + "epoch": 0.6234939759036144, + "grad_norm": 6.769462101164566, + "learning_rate": 3.400075104524049e-06, + "loss": 0.9881, + "step": 20700 + }, + { + "epoch": 0.6237951807228915, + "grad_norm": 4.803985239731172, + "learning_rate": 3.3953574209731234e-06, + "loss": 1.0066, + "step": 20710 + }, + { + "epoch": 0.6240963855421687, + "grad_norm": 1.9421694425476366, + "learning_rate": 3.3906413294646468e-06, + "loss": 0.885, + "step": 20720 + }, + { + "epoch": 0.6243975903614458, + "grad_norm": 4.404078424357399, + "learning_rate": 3.3859268346776795e-06, + "loss": 0.9244, + "step": 20730 + }, + { + "epoch": 0.6246987951807229, + "grad_norm": 2.092217539247877, + "learning_rate": 3.381213941289695e-06, + "loss": 0.9255, + "step": 20740 + }, + { + "epoch": 0.625, + "grad_norm": 5.637280669702821, + "learning_rate": 3.3765026539765832e-06, + "loss": 1.0102, + "step": 20750 + }, + { + "epoch": 0.6253012048192771, + "grad_norm": 4.940501786030459, + "learning_rate": 3.371792977412635e-06, + "loss": 0.8817, + "step": 20760 + }, + { + "epoch": 0.6256024096385542, + "grad_norm": 2.011743660124654, + "learning_rate": 3.3670849162705426e-06, + "loss": 0.9622, + "step": 20770 + }, + { + "epoch": 0.6259036144578313, + "grad_norm": 4.069794571218934, + "learning_rate": 3.362378475221404e-06, + "loss": 0.9492, + "step": 20780 + }, + { + "epoch": 0.6262048192771085, + "grad_norm": 5.497143811656474, + "learning_rate": 3.357673658934699e-06, + "loss": 0.9171, + "step": 20790 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 6.532696567953873, + "learning_rate": 3.352970472078302e-06, + "loss": 0.8539, + "step": 20800 + }, + { + "epoch": 0.6268072289156627, + "grad_norm": 6.8112701715789274, + "learning_rate": 3.3482689193184677e-06, + "loss": 0.894, + "step": 20810 + }, + { + "epoch": 0.6271084337349397, + "grad_norm": 6.67718122150574, + "learning_rate": 3.3435690053198335e-06, + "loss": 0.9361, + "step": 20820 + }, + { + "epoch": 0.6274096385542168, + "grad_norm": 9.138516553035258, + "learning_rate": 3.3388707347454058e-06, + "loss": 0.9582, + "step": 20830 + }, + { + "epoch": 0.6277108433734939, + "grad_norm": 3.9872707008689683, + "learning_rate": 3.3341741122565636e-06, + "loss": 0.9401, + "step": 20840 + }, + { + "epoch": 0.6280120481927711, + "grad_norm": 13.163429872336879, + "learning_rate": 3.3294791425130512e-06, + "loss": 0.9015, + "step": 20850 + }, + { + "epoch": 0.6283132530120482, + "grad_norm": 5.050196117407373, + "learning_rate": 3.3247858301729726e-06, + "loss": 0.997, + "step": 20860 + }, + { + "epoch": 0.6286144578313253, + "grad_norm": 6.989204899709363, + "learning_rate": 3.3200941798927845e-06, + "loss": 1.0165, + "step": 20870 + }, + { + "epoch": 0.6289156626506024, + "grad_norm": 5.340432081133796, + "learning_rate": 3.3154041963272965e-06, + "loss": 0.9836, + "step": 20880 + }, + { + "epoch": 0.6292168674698795, + "grad_norm": 4.391546715755655, + "learning_rate": 3.3107158841296684e-06, + "loss": 1.0053, + "step": 20890 + }, + { + "epoch": 0.6295180722891566, + "grad_norm": 5.811563634386749, + "learning_rate": 3.3060292479513955e-06, + "loss": 0.9144, + "step": 20900 + }, + { + "epoch": 0.6298192771084338, + "grad_norm": 4.471575062661988, + "learning_rate": 3.301344292442314e-06, + "loss": 0.9673, + "step": 20910 + }, + { + "epoch": 0.6301204819277109, + "grad_norm": 10.841630783650825, + "learning_rate": 3.2966610222505926e-06, + "loss": 0.8931, + "step": 20920 + }, + { + "epoch": 0.630421686746988, + "grad_norm": 2.026385924513245, + "learning_rate": 3.2919794420227273e-06, + "loss": 0.8836, + "step": 20930 + }, + { + "epoch": 0.630722891566265, + "grad_norm": 1.96031208345594, + "learning_rate": 3.2872995564035353e-06, + "loss": 0.8269, + "step": 20940 + }, + { + "epoch": 0.6310240963855421, + "grad_norm": 4.189081050732709, + "learning_rate": 3.2826213700361543e-06, + "loss": 0.9789, + "step": 20950 + }, + { + "epoch": 0.6313253012048192, + "grad_norm": 7.1258907113227385, + "learning_rate": 3.2779448875620384e-06, + "loss": 1.0044, + "step": 20960 + }, + { + "epoch": 0.6316265060240964, + "grad_norm": 5.258855136928314, + "learning_rate": 3.2732701136209475e-06, + "loss": 1.0181, + "step": 20970 + }, + { + "epoch": 0.6319277108433735, + "grad_norm": 1.997481998766399, + "learning_rate": 3.268597052850948e-06, + "loss": 0.9278, + "step": 20980 + }, + { + "epoch": 0.6322289156626506, + "grad_norm": 4.314780294049713, + "learning_rate": 3.263925709888406e-06, + "loss": 0.9017, + "step": 20990 + }, + { + "epoch": 0.6325301204819277, + "grad_norm": 2.0480808621873416, + "learning_rate": 3.2592560893679843e-06, + "loss": 1.0029, + "step": 21000 + }, + { + "epoch": 0.6328313253012048, + "grad_norm": 2.180648854142401, + "learning_rate": 3.254588195922637e-06, + "loss": 0.8996, + "step": 21010 + }, + { + "epoch": 0.6331325301204819, + "grad_norm": 3.762274851734408, + "learning_rate": 3.2499220341836e-06, + "loss": 0.9475, + "step": 21020 + }, + { + "epoch": 0.6334337349397591, + "grad_norm": 2.0875260721228552, + "learning_rate": 3.2452576087804016e-06, + "loss": 0.9259, + "step": 21030 + }, + { + "epoch": 0.6337349397590362, + "grad_norm": 5.2835387813964765, + "learning_rate": 3.240594924340835e-06, + "loss": 0.9248, + "step": 21040 + }, + { + "epoch": 0.6340361445783133, + "grad_norm": 5.87641539779945, + "learning_rate": 3.2359339854909743e-06, + "loss": 0.9547, + "step": 21050 + }, + { + "epoch": 0.6343373493975903, + "grad_norm": 1.9379416125510331, + "learning_rate": 3.2312747968551585e-06, + "loss": 0.9719, + "step": 21060 + }, + { + "epoch": 0.6346385542168674, + "grad_norm": 2.0093708815095805, + "learning_rate": 3.2266173630559916e-06, + "loss": 0.906, + "step": 21070 + }, + { + "epoch": 0.6349397590361445, + "grad_norm": 1.9094212823134054, + "learning_rate": 3.221961688714336e-06, + "loss": 0.9814, + "step": 21080 + }, + { + "epoch": 0.6352409638554217, + "grad_norm": 4.4177387430879484, + "learning_rate": 3.217307778449306e-06, + "loss": 0.9457, + "step": 21090 + }, + { + "epoch": 0.6355421686746988, + "grad_norm": 3.8700286203190046, + "learning_rate": 3.2126556368782713e-06, + "loss": 0.8604, + "step": 21100 + }, + { + "epoch": 0.6358433734939759, + "grad_norm": 4.233246673498202, + "learning_rate": 3.2080052686168417e-06, + "loss": 0.9447, + "step": 21110 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 4.449524022982166, + "learning_rate": 3.2033566782788694e-06, + "loss": 1.0633, + "step": 21120 + }, + { + "epoch": 0.6364457831325301, + "grad_norm": 1.8467476537699954, + "learning_rate": 3.1987098704764425e-06, + "loss": 0.8408, + "step": 21130 + }, + { + "epoch": 0.6367469879518072, + "grad_norm": 4.668357602821081, + "learning_rate": 3.1940648498198812e-06, + "loss": 1.0066, + "step": 21140 + }, + { + "epoch": 0.6370481927710844, + "grad_norm": 6.243675596567151, + "learning_rate": 3.1894216209177327e-06, + "loss": 1.0347, + "step": 21150 + }, + { + "epoch": 0.6373493975903615, + "grad_norm": 5.874779925360037, + "learning_rate": 3.1847801883767638e-06, + "loss": 1.0558, + "step": 21160 + }, + { + "epoch": 0.6376506024096386, + "grad_norm": 3.719219897415577, + "learning_rate": 3.180140556801963e-06, + "loss": 0.8836, + "step": 21170 + }, + { + "epoch": 0.6379518072289156, + "grad_norm": 5.218115830473539, + "learning_rate": 3.1755027307965314e-06, + "loss": 0.8676, + "step": 21180 + }, + { + "epoch": 0.6382530120481927, + "grad_norm": 5.262196822833833, + "learning_rate": 3.1708667149618743e-06, + "loss": 0.9798, + "step": 21190 + }, + { + "epoch": 0.6385542168674698, + "grad_norm": 4.635588946933596, + "learning_rate": 3.166232513897604e-06, + "loss": 1.0011, + "step": 21200 + }, + { + "epoch": 0.638855421686747, + "grad_norm": 4.0853151451849286, + "learning_rate": 3.1616001322015354e-06, + "loss": 0.9333, + "step": 21210 + }, + { + "epoch": 0.6391566265060241, + "grad_norm": 4.519658605776038, + "learning_rate": 3.156969574469674e-06, + "loss": 0.889, + "step": 21220 + }, + { + "epoch": 0.6394578313253012, + "grad_norm": 3.5410025112383807, + "learning_rate": 3.1523408452962156e-06, + "loss": 0.9852, + "step": 21230 + }, + { + "epoch": 0.6397590361445783, + "grad_norm": 6.185458822317108, + "learning_rate": 3.1477139492735455e-06, + "loss": 0.9616, + "step": 21240 + }, + { + "epoch": 0.6400602409638554, + "grad_norm": 8.127777230816926, + "learning_rate": 3.1430888909922273e-06, + "loss": 0.9867, + "step": 21250 + }, + { + "epoch": 0.6403614457831325, + "grad_norm": 6.7524889501059056, + "learning_rate": 3.1384656750410013e-06, + "loss": 1.0125, + "step": 21260 + }, + { + "epoch": 0.6406626506024097, + "grad_norm": 4.575187436704799, + "learning_rate": 3.13384430600678e-06, + "loss": 0.828, + "step": 21270 + }, + { + "epoch": 0.6409638554216868, + "grad_norm": 4.58634340419795, + "learning_rate": 3.129224788474646e-06, + "loss": 0.8931, + "step": 21280 + }, + { + "epoch": 0.6412650602409639, + "grad_norm": 6.398077486688091, + "learning_rate": 3.1246071270278422e-06, + "loss": 0.8867, + "step": 21290 + }, + { + "epoch": 0.641566265060241, + "grad_norm": 8.562850814957184, + "learning_rate": 3.1199913262477697e-06, + "loss": 0.891, + "step": 21300 + }, + { + "epoch": 0.641867469879518, + "grad_norm": 7.3977415208563855, + "learning_rate": 3.1153773907139855e-06, + "loss": 0.827, + "step": 21310 + }, + { + "epoch": 0.6421686746987951, + "grad_norm": 3.906194519864792, + "learning_rate": 3.1107653250041956e-06, + "loss": 0.9751, + "step": 21320 + }, + { + "epoch": 0.6424698795180723, + "grad_norm": 4.563963427294925, + "learning_rate": 3.1061551336942513e-06, + "loss": 0.9647, + "step": 21330 + }, + { + "epoch": 0.6427710843373494, + "grad_norm": 4.5838313654218785, + "learning_rate": 3.101546821358139e-06, + "loss": 0.9122, + "step": 21340 + }, + { + "epoch": 0.6430722891566265, + "grad_norm": 24.549443758048707, + "learning_rate": 3.096940392567991e-06, + "loss": 0.9398, + "step": 21350 + }, + { + "epoch": 0.6433734939759036, + "grad_norm": 3.2979766870583487, + "learning_rate": 3.0923358518940616e-06, + "loss": 1.0312, + "step": 21360 + }, + { + "epoch": 0.6436746987951807, + "grad_norm": 4.862204403795849, + "learning_rate": 3.0877332039047357e-06, + "loss": 1.0492, + "step": 21370 + }, + { + "epoch": 0.6439759036144578, + "grad_norm": 5.941711738298755, + "learning_rate": 3.083132453166519e-06, + "loss": 0.8433, + "step": 21380 + }, + { + "epoch": 0.644277108433735, + "grad_norm": 2.065313804808686, + "learning_rate": 3.0785336042440384e-06, + "loss": 0.9444, + "step": 21390 + }, + { + "epoch": 0.6445783132530121, + "grad_norm": 4.452525298541014, + "learning_rate": 3.073936661700029e-06, + "loss": 0.8828, + "step": 21400 + }, + { + "epoch": 0.6448795180722892, + "grad_norm": 2.2144776002648268, + "learning_rate": 3.069341630095337e-06, + "loss": 0.8824, + "step": 21410 + }, + { + "epoch": 0.6451807228915662, + "grad_norm": 5.615872580136008, + "learning_rate": 3.0647485139889145e-06, + "loss": 0.9733, + "step": 21420 + }, + { + "epoch": 0.6454819277108433, + "grad_norm": 1.9154614170494852, + "learning_rate": 3.0601573179378095e-06, + "loss": 0.834, + "step": 21430 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 4.325685577824708, + "learning_rate": 3.055568046497167e-06, + "loss": 0.8614, + "step": 21440 + }, + { + "epoch": 0.6460843373493976, + "grad_norm": 4.667964013476934, + "learning_rate": 3.0509807042202205e-06, + "loss": 0.9739, + "step": 21450 + }, + { + "epoch": 0.6463855421686747, + "grad_norm": 5.049570869182073, + "learning_rate": 3.046395295658293e-06, + "loss": 1.0064, + "step": 21460 + }, + { + "epoch": 0.6466867469879518, + "grad_norm": 4.754221646873815, + "learning_rate": 3.0418118253607876e-06, + "loss": 1.089, + "step": 21470 + }, + { + "epoch": 0.6469879518072289, + "grad_norm": 5.263135051321002, + "learning_rate": 3.037230297875181e-06, + "loss": 0.9869, + "step": 21480 + }, + { + "epoch": 0.647289156626506, + "grad_norm": 6.299277849284255, + "learning_rate": 3.032650717747028e-06, + "loss": 0.879, + "step": 21490 + }, + { + "epoch": 0.6475903614457831, + "grad_norm": 6.14985684922634, + "learning_rate": 3.0280730895199494e-06, + "loss": 0.9518, + "step": 21500 + }, + { + "epoch": 0.6478915662650603, + "grad_norm": 5.010786809719041, + "learning_rate": 3.0234974177356246e-06, + "loss": 0.9193, + "step": 21510 + }, + { + "epoch": 0.6481927710843374, + "grad_norm": 3.884190230743868, + "learning_rate": 3.0189237069337973e-06, + "loss": 0.986, + "step": 21520 + }, + { + "epoch": 0.6484939759036145, + "grad_norm": 1.9760447241771413, + "learning_rate": 3.0143519616522654e-06, + "loss": 0.779, + "step": 21530 + }, + { + "epoch": 0.6487951807228916, + "grad_norm": 4.191624279760544, + "learning_rate": 3.009782186426875e-06, + "loss": 0.9806, + "step": 21540 + }, + { + "epoch": 0.6490963855421686, + "grad_norm": 6.759338320255287, + "learning_rate": 3.0052143857915184e-06, + "loss": 0.8706, + "step": 21550 + }, + { + "epoch": 0.6493975903614457, + "grad_norm": 4.264958450597003, + "learning_rate": 3.0006485642781268e-06, + "loss": 0.9832, + "step": 21560 + }, + { + "epoch": 0.6496987951807229, + "grad_norm": 10.751978846120968, + "learning_rate": 2.9960847264166715e-06, + "loss": 0.9013, + "step": 21570 + }, + { + "epoch": 0.65, + "grad_norm": 2.0557452139786863, + "learning_rate": 2.991522876735154e-06, + "loss": 0.9567, + "step": 21580 + }, + { + "epoch": 0.6503012048192771, + "grad_norm": 5.519619416231213, + "learning_rate": 2.986963019759599e-06, + "loss": 0.9118, + "step": 21590 + }, + { + "epoch": 0.6506024096385542, + "grad_norm": 1.759742155520633, + "learning_rate": 2.9824051600140647e-06, + "loss": 0.9272, + "step": 21600 + }, + { + "epoch": 0.6509036144578313, + "grad_norm": 10.756528558162993, + "learning_rate": 2.9778493020206155e-06, + "loss": 0.9989, + "step": 21610 + }, + { + "epoch": 0.6512048192771084, + "grad_norm": 4.89241199410044, + "learning_rate": 2.9732954502993383e-06, + "loss": 0.9212, + "step": 21620 + }, + { + "epoch": 0.6515060240963856, + "grad_norm": 6.12006754836966, + "learning_rate": 2.9687436093683252e-06, + "loss": 0.8998, + "step": 21630 + }, + { + "epoch": 0.6518072289156627, + "grad_norm": 1.8227140458962348, + "learning_rate": 2.9641937837436762e-06, + "loss": 0.9195, + "step": 21640 + }, + { + "epoch": 0.6521084337349398, + "grad_norm": 8.056246962058308, + "learning_rate": 2.9596459779394902e-06, + "loss": 0.9162, + "step": 21650 + }, + { + "epoch": 0.6524096385542169, + "grad_norm": 1.8153106365669807, + "learning_rate": 2.955100196467859e-06, + "loss": 0.9189, + "step": 21660 + }, + { + "epoch": 0.6527108433734939, + "grad_norm": 4.19096140512222, + "learning_rate": 2.9505564438388746e-06, + "loss": 0.9901, + "step": 21670 + }, + { + "epoch": 0.653012048192771, + "grad_norm": 4.40998587864527, + "learning_rate": 2.946014724560607e-06, + "loss": 0.9241, + "step": 21680 + }, + { + "epoch": 0.6533132530120482, + "grad_norm": 2.08107272327245, + "learning_rate": 2.9414750431391136e-06, + "loss": 0.9554, + "step": 21690 + }, + { + "epoch": 0.6536144578313253, + "grad_norm": 1.9749393817026102, + "learning_rate": 2.936937404078428e-06, + "loss": 0.8746, + "step": 21700 + }, + { + "epoch": 0.6539156626506024, + "grad_norm": 6.080370488751193, + "learning_rate": 2.93240181188056e-06, + "loss": 0.8827, + "step": 21710 + }, + { + "epoch": 0.6542168674698795, + "grad_norm": 4.235770083389657, + "learning_rate": 2.9278682710454863e-06, + "loss": 0.9714, + "step": 21720 + }, + { + "epoch": 0.6545180722891566, + "grad_norm": 4.691061814383612, + "learning_rate": 2.9233367860711485e-06, + "loss": 0.9328, + "step": 21730 + }, + { + "epoch": 0.6548192771084337, + "grad_norm": 3.777238716083142, + "learning_rate": 2.91880736145345e-06, + "loss": 1.0006, + "step": 21740 + }, + { + "epoch": 0.6551204819277109, + "grad_norm": 4.1143117256133435, + "learning_rate": 2.9142800016862487e-06, + "loss": 0.9042, + "step": 21750 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 4.58042100265641, + "learning_rate": 2.9097547112613556e-06, + "loss": 0.9748, + "step": 21760 + }, + { + "epoch": 0.6557228915662651, + "grad_norm": 4.354875232053938, + "learning_rate": 2.9052314946685223e-06, + "loss": 0.9474, + "step": 21770 + }, + { + "epoch": 0.6560240963855422, + "grad_norm": 5.6314880479577285, + "learning_rate": 2.9007103563954525e-06, + "loss": 1.0376, + "step": 21780 + }, + { + "epoch": 0.6563253012048192, + "grad_norm": 1.9681158907671024, + "learning_rate": 2.896191300927782e-06, + "loss": 0.9036, + "step": 21790 + }, + { + "epoch": 0.6566265060240963, + "grad_norm": 2.1799316261529778, + "learning_rate": 2.89167433274908e-06, + "loss": 0.8854, + "step": 21800 + }, + { + "epoch": 0.6569277108433735, + "grad_norm": 3.6531469138663084, + "learning_rate": 2.887159456340849e-06, + "loss": 0.961, + "step": 21810 + }, + { + "epoch": 0.6572289156626506, + "grad_norm": 4.7381749172876075, + "learning_rate": 2.8826466761825102e-06, + "loss": 0.7997, + "step": 21820 + }, + { + "epoch": 0.6575301204819277, + "grad_norm": 10.760848820952543, + "learning_rate": 2.8781359967514094e-06, + "loss": 0.874, + "step": 21830 + }, + { + "epoch": 0.6578313253012048, + "grad_norm": 3.110054614216242, + "learning_rate": 2.8736274225228045e-06, + "loss": 0.9682, + "step": 21840 + }, + { + "epoch": 0.6581325301204819, + "grad_norm": 5.364596429239417, + "learning_rate": 2.869120957969872e-06, + "loss": 0.9245, + "step": 21850 + }, + { + "epoch": 0.658433734939759, + "grad_norm": 7.324574404699955, + "learning_rate": 2.864616607563689e-06, + "loss": 0.9278, + "step": 21860 + }, + { + "epoch": 0.6587349397590362, + "grad_norm": 4.062528090718364, + "learning_rate": 2.860114375773233e-06, + "loss": 0.8504, + "step": 21870 + }, + { + "epoch": 0.6590361445783133, + "grad_norm": 4.1432392143125885, + "learning_rate": 2.855614267065383e-06, + "loss": 0.9679, + "step": 21880 + }, + { + "epoch": 0.6593373493975904, + "grad_norm": 4.627951143207221, + "learning_rate": 2.8511162859049156e-06, + "loss": 0.8838, + "step": 21890 + }, + { + "epoch": 0.6596385542168675, + "grad_norm": 1.9882933520820298, + "learning_rate": 2.8466204367544904e-06, + "loss": 0.9857, + "step": 21900 + }, + { + "epoch": 0.6599397590361445, + "grad_norm": 13.916313659291012, + "learning_rate": 2.8421267240746535e-06, + "loss": 0.916, + "step": 21910 + }, + { + "epoch": 0.6602409638554216, + "grad_norm": 1.8548867782047582, + "learning_rate": 2.8376351523238323e-06, + "loss": 0.902, + "step": 21920 + }, + { + "epoch": 0.6605421686746988, + "grad_norm": 2.1290771136000566, + "learning_rate": 2.833145725958328e-06, + "loss": 0.9487, + "step": 21930 + }, + { + "epoch": 0.6608433734939759, + "grad_norm": 4.476812288039868, + "learning_rate": 2.828658449432317e-06, + "loss": 0.8224, + "step": 21940 + }, + { + "epoch": 0.661144578313253, + "grad_norm": 5.157041111022135, + "learning_rate": 2.8241733271978367e-06, + "loss": 0.9983, + "step": 21950 + }, + { + "epoch": 0.6614457831325301, + "grad_norm": 3.906482197609336, + "learning_rate": 2.819690363704797e-06, + "loss": 1.0521, + "step": 21960 + }, + { + "epoch": 0.6617469879518072, + "grad_norm": 5.18254525615854, + "learning_rate": 2.8152095634009534e-06, + "loss": 0.9504, + "step": 21970 + }, + { + "epoch": 0.6620481927710843, + "grad_norm": 3.485683403366198, + "learning_rate": 2.810730930731923e-06, + "loss": 0.8491, + "step": 21980 + }, + { + "epoch": 0.6623493975903615, + "grad_norm": 2.013843803590297, + "learning_rate": 2.806254470141174e-06, + "loss": 0.8204, + "step": 21990 + }, + { + "epoch": 0.6626506024096386, + "grad_norm": 4.72464974501394, + "learning_rate": 2.8017801860700144e-06, + "loss": 1.0248, + "step": 22000 + }, + { + "epoch": 0.6629518072289157, + "grad_norm": 4.382429131044873, + "learning_rate": 2.797308082957597e-06, + "loss": 0.9791, + "step": 22010 + }, + { + "epoch": 0.6632530120481928, + "grad_norm": 4.119000658522985, + "learning_rate": 2.792838165240903e-06, + "loss": 0.955, + "step": 22020 + }, + { + "epoch": 0.6635542168674698, + "grad_norm": 4.186191863720116, + "learning_rate": 2.788370437354757e-06, + "loss": 0.9253, + "step": 22030 + }, + { + "epoch": 0.6638554216867469, + "grad_norm": 2.129445625917607, + "learning_rate": 2.783904903731803e-06, + "loss": 0.9486, + "step": 22040 + }, + { + "epoch": 0.6641566265060241, + "grad_norm": 3.633194431817707, + "learning_rate": 2.779441568802509e-06, + "loss": 0.9825, + "step": 22050 + }, + { + "epoch": 0.6644578313253012, + "grad_norm": 1.9479286988967859, + "learning_rate": 2.7749804369951628e-06, + "loss": 0.9261, + "step": 22060 + }, + { + "epoch": 0.6647590361445783, + "grad_norm": 2.0032089183615884, + "learning_rate": 2.770521512735867e-06, + "loss": 0.8994, + "step": 22070 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 5.779242393722818, + "learning_rate": 2.7660648004485326e-06, + "loss": 0.8372, + "step": 22080 + }, + { + "epoch": 0.6653614457831325, + "grad_norm": 1.8492622598829354, + "learning_rate": 2.7616103045548736e-06, + "loss": 0.7804, + "step": 22090 + }, + { + "epoch": 0.6656626506024096, + "grad_norm": 2.101195866679531, + "learning_rate": 2.757158029474414e-06, + "loss": 0.9847, + "step": 22100 + }, + { + "epoch": 0.6659638554216868, + "grad_norm": 3.7557309817886506, + "learning_rate": 2.7527079796244655e-06, + "loss": 0.8384, + "step": 22110 + }, + { + "epoch": 0.6662650602409639, + "grad_norm": 2.035807673876726, + "learning_rate": 2.748260159420133e-06, + "loss": 0.87, + "step": 22120 + }, + { + "epoch": 0.666566265060241, + "grad_norm": 4.009242037465268, + "learning_rate": 2.743814573274309e-06, + "loss": 0.8758, + "step": 22130 + }, + { + "epoch": 0.6668674698795181, + "grad_norm": 1.8449185982373857, + "learning_rate": 2.739371225597679e-06, + "loss": 0.9592, + "step": 22140 + }, + { + "epoch": 0.6671686746987951, + "grad_norm": 5.088642472988054, + "learning_rate": 2.734930120798695e-06, + "loss": 0.9281, + "step": 22150 + }, + { + "epoch": 0.6674698795180722, + "grad_norm": 3.9790770984468335, + "learning_rate": 2.73049126328359e-06, + "loss": 0.973, + "step": 22160 + }, + { + "epoch": 0.6677710843373494, + "grad_norm": 3.747401373086219, + "learning_rate": 2.7260546574563667e-06, + "loss": 1.0107, + "step": 22170 + }, + { + "epoch": 0.6680722891566265, + "grad_norm": 4.065737050583372, + "learning_rate": 2.721620307718793e-06, + "loss": 0.9498, + "step": 22180 + }, + { + "epoch": 0.6683734939759036, + "grad_norm": 1.9647076713611802, + "learning_rate": 2.7171882184703983e-06, + "loss": 0.8652, + "step": 22190 + }, + { + "epoch": 0.6686746987951807, + "grad_norm": 3.8641790642423692, + "learning_rate": 2.712758394108471e-06, + "loss": 0.961, + "step": 22200 + }, + { + "epoch": 0.6689759036144578, + "grad_norm": 5.289873751951875, + "learning_rate": 2.7083308390280505e-06, + "loss": 0.9675, + "step": 22210 + }, + { + "epoch": 0.6692771084337349, + "grad_norm": 2.178191652381719, + "learning_rate": 2.703905557621925e-06, + "loss": 1.0099, + "step": 22220 + }, + { + "epoch": 0.6695783132530121, + "grad_norm": 3.4198526034186187, + "learning_rate": 2.6994825542806256e-06, + "loss": 0.9222, + "step": 22230 + }, + { + "epoch": 0.6698795180722892, + "grad_norm": 4.575566807041047, + "learning_rate": 2.6950618333924293e-06, + "loss": 0.9811, + "step": 22240 + }, + { + "epoch": 0.6701807228915663, + "grad_norm": 4.2939082694984, + "learning_rate": 2.690643399343341e-06, + "loss": 0.9601, + "step": 22250 + }, + { + "epoch": 0.6704819277108434, + "grad_norm": 2.108921527222617, + "learning_rate": 2.686227256517104e-06, + "loss": 0.9413, + "step": 22260 + }, + { + "epoch": 0.6707831325301205, + "grad_norm": 2.0112532897038213, + "learning_rate": 2.6818134092951763e-06, + "loss": 0.9312, + "step": 22270 + }, + { + "epoch": 0.6710843373493975, + "grad_norm": 4.732203064573461, + "learning_rate": 2.677401862056753e-06, + "loss": 0.8101, + "step": 22280 + }, + { + "epoch": 0.6713855421686747, + "grad_norm": 4.006396254824637, + "learning_rate": 2.6729926191787393e-06, + "loss": 0.9421, + "step": 22290 + }, + { + "epoch": 0.6716867469879518, + "grad_norm": 3.878257768154987, + "learning_rate": 2.668585685035754e-06, + "loss": 0.9401, + "step": 22300 + }, + { + "epoch": 0.6719879518072289, + "grad_norm": 5.608380025610696, + "learning_rate": 2.664181064000127e-06, + "loss": 0.867, + "step": 22310 + }, + { + "epoch": 0.672289156626506, + "grad_norm": 5.272795359162881, + "learning_rate": 2.6597787604418932e-06, + "loss": 1.0358, + "step": 22320 + }, + { + "epoch": 0.6725903614457831, + "grad_norm": 4.007619309517507, + "learning_rate": 2.6553787787287887e-06, + "loss": 0.9925, + "step": 22330 + }, + { + "epoch": 0.6728915662650602, + "grad_norm": 1.901190551834578, + "learning_rate": 2.650981123226243e-06, + "loss": 1.0218, + "step": 22340 + }, + { + "epoch": 0.6731927710843374, + "grad_norm": 1.9356280137402926, + "learning_rate": 2.646585798297384e-06, + "loss": 0.9435, + "step": 22350 + }, + { + "epoch": 0.6734939759036145, + "grad_norm": 8.077916312214645, + "learning_rate": 2.64219280830302e-06, + "loss": 0.9134, + "step": 22360 + }, + { + "epoch": 0.6737951807228916, + "grad_norm": 4.015693029221075, + "learning_rate": 2.6378021576016467e-06, + "loss": 0.8836, + "step": 22370 + }, + { + "epoch": 0.6740963855421687, + "grad_norm": 4.858974586780675, + "learning_rate": 2.6334138505494357e-06, + "loss": 0.8455, + "step": 22380 + }, + { + "epoch": 0.6743975903614458, + "grad_norm": 4.446218214935879, + "learning_rate": 2.6290278915002397e-06, + "loss": 0.9219, + "step": 22390 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 2.024909669609107, + "learning_rate": 2.6246442848055774e-06, + "loss": 0.8926, + "step": 22400 + }, + { + "epoch": 0.675, + "grad_norm": 4.101509972320014, + "learning_rate": 2.6202630348146323e-06, + "loss": 0.9962, + "step": 22410 + }, + { + "epoch": 0.6753012048192771, + "grad_norm": 6.324377242940589, + "learning_rate": 2.615884145874253e-06, + "loss": 0.8651, + "step": 22420 + }, + { + "epoch": 0.6756024096385542, + "grad_norm": 1.9698858673636552, + "learning_rate": 2.6115076223289425e-06, + "loss": 0.9822, + "step": 22430 + }, + { + "epoch": 0.6759036144578313, + "grad_norm": 4.18087979642135, + "learning_rate": 2.6071334685208603e-06, + "loss": 0.9398, + "step": 22440 + }, + { + "epoch": 0.6762048192771084, + "grad_norm": 4.399047417508821, + "learning_rate": 2.6027616887898115e-06, + "loss": 0.9018, + "step": 22450 + }, + { + "epoch": 0.6765060240963855, + "grad_norm": 4.4678499320129355, + "learning_rate": 2.598392287473248e-06, + "loss": 1.0286, + "step": 22460 + }, + { + "epoch": 0.6768072289156627, + "grad_norm": 4.733618726120682, + "learning_rate": 2.5940252689062624e-06, + "loss": 0.8244, + "step": 22470 + }, + { + "epoch": 0.6771084337349398, + "grad_norm": 3.536548697342829, + "learning_rate": 2.5896606374215782e-06, + "loss": 0.9445, + "step": 22480 + }, + { + "epoch": 0.6774096385542169, + "grad_norm": 5.336909603721039, + "learning_rate": 2.5852983973495595e-06, + "loss": 0.924, + "step": 22490 + }, + { + "epoch": 0.677710843373494, + "grad_norm": 10.108833754127733, + "learning_rate": 2.580938553018193e-06, + "loss": 0.8747, + "step": 22500 + }, + { + "epoch": 0.678012048192771, + "grad_norm": 5.081763839435943, + "learning_rate": 2.5765811087530836e-06, + "loss": 0.9274, + "step": 22510 + }, + { + "epoch": 0.6783132530120481, + "grad_norm": 4.37994774438734, + "learning_rate": 2.5722260688774603e-06, + "loss": 0.9863, + "step": 22520 + }, + { + "epoch": 0.6786144578313253, + "grad_norm": 1.9454870534852542, + "learning_rate": 2.567873437712169e-06, + "loss": 0.8846, + "step": 22530 + }, + { + "epoch": 0.6789156626506024, + "grad_norm": 5.497447563037934, + "learning_rate": 2.563523219575661e-06, + "loss": 0.9687, + "step": 22540 + }, + { + "epoch": 0.6792168674698795, + "grad_norm": 4.765911217095411, + "learning_rate": 2.5591754187839963e-06, + "loss": 1.0395, + "step": 22550 + }, + { + "epoch": 0.6795180722891566, + "grad_norm": 5.84346943692407, + "learning_rate": 2.554830039650834e-06, + "loss": 0.9722, + "step": 22560 + }, + { + "epoch": 0.6798192771084337, + "grad_norm": 4.435380094093738, + "learning_rate": 2.550487086487433e-06, + "loss": 0.9452, + "step": 22570 + }, + { + "epoch": 0.6801204819277108, + "grad_norm": 1.9225631411474826, + "learning_rate": 2.5461465636026437e-06, + "loss": 0.893, + "step": 22580 + }, + { + "epoch": 0.680421686746988, + "grad_norm": 3.8590481171700857, + "learning_rate": 2.541808475302905e-06, + "loss": 0.836, + "step": 22590 + }, + { + "epoch": 0.6807228915662651, + "grad_norm": 3.360641772594274, + "learning_rate": 2.5374728258922453e-06, + "loss": 0.9933, + "step": 22600 + }, + { + "epoch": 0.6810240963855422, + "grad_norm": 6.8797070632404385, + "learning_rate": 2.5331396196722657e-06, + "loss": 0.9276, + "step": 22610 + }, + { + "epoch": 0.6813253012048193, + "grad_norm": 2.06110467254301, + "learning_rate": 2.5288088609421465e-06, + "loss": 0.8647, + "step": 22620 + }, + { + "epoch": 0.6816265060240964, + "grad_norm": 7.585795875612082, + "learning_rate": 2.5244805539986394e-06, + "loss": 0.9727, + "step": 22630 + }, + { + "epoch": 0.6819277108433734, + "grad_norm": 4.816359568779887, + "learning_rate": 2.5201547031360675e-06, + "loss": 0.9718, + "step": 22640 + }, + { + "epoch": 0.6822289156626506, + "grad_norm": 6.680003480617296, + "learning_rate": 2.5158313126463143e-06, + "loss": 0.9246, + "step": 22650 + }, + { + "epoch": 0.6825301204819277, + "grad_norm": 4.603594325945119, + "learning_rate": 2.511510386818815e-06, + "loss": 0.992, + "step": 22660 + }, + { + "epoch": 0.6828313253012048, + "grad_norm": 7.142321913370482, + "learning_rate": 2.5071919299405727e-06, + "loss": 0.9281, + "step": 22670 + }, + { + "epoch": 0.6831325301204819, + "grad_norm": 7.449322790098273, + "learning_rate": 2.502875946296133e-06, + "loss": 0.9674, + "step": 22680 + }, + { + "epoch": 0.683433734939759, + "grad_norm": 3.7241719379183857, + "learning_rate": 2.4985624401675882e-06, + "loss": 0.9732, + "step": 22690 + }, + { + "epoch": 0.6837349397590361, + "grad_norm": 6.4546878872351465, + "learning_rate": 2.494251415834575e-06, + "loss": 0.9987, + "step": 22700 + }, + { + "epoch": 0.6840361445783133, + "grad_norm": 3.806961100254641, + "learning_rate": 2.4899428775742647e-06, + "loss": 0.9286, + "step": 22710 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.8950758895547435, + "learning_rate": 2.4856368296613642e-06, + "loss": 0.9196, + "step": 22720 + }, + { + "epoch": 0.6846385542168675, + "grad_norm": 4.920706839616375, + "learning_rate": 2.481333276368107e-06, + "loss": 0.8401, + "step": 22730 + }, + { + "epoch": 0.6849397590361446, + "grad_norm": 2.137533867682102, + "learning_rate": 2.4770322219642577e-06, + "loss": 0.9533, + "step": 22740 + }, + { + "epoch": 0.6852409638554217, + "grad_norm": 3.5879285046816864, + "learning_rate": 2.4727336707170973e-06, + "loss": 0.9342, + "step": 22750 + }, + { + "epoch": 0.6855421686746987, + "grad_norm": 4.667930268627476, + "learning_rate": 2.4684376268914203e-06, + "loss": 0.9658, + "step": 22760 + }, + { + "epoch": 0.685843373493976, + "grad_norm": 5.974689819002652, + "learning_rate": 2.464144094749536e-06, + "loss": 1.0093, + "step": 22770 + }, + { + "epoch": 0.686144578313253, + "grad_norm": 3.794590653212425, + "learning_rate": 2.4598530785512666e-06, + "loss": 0.9778, + "step": 22780 + }, + { + "epoch": 0.6864457831325301, + "grad_norm": 3.727741198701043, + "learning_rate": 2.455564582553932e-06, + "loss": 0.9811, + "step": 22790 + }, + { + "epoch": 0.6867469879518072, + "grad_norm": 2.065284952343428, + "learning_rate": 2.4512786110123537e-06, + "loss": 0.8216, + "step": 22800 + }, + { + "epoch": 0.6870481927710843, + "grad_norm": 3.6275863401766553, + "learning_rate": 2.4469951681788483e-06, + "loss": 0.9555, + "step": 22810 + }, + { + "epoch": 0.6873493975903614, + "grad_norm": 7.553084797736424, + "learning_rate": 2.4427142583032244e-06, + "loss": 1.0507, + "step": 22820 + }, + { + "epoch": 0.6876506024096386, + "grad_norm": 4.238734301726904, + "learning_rate": 2.4384358856327762e-06, + "loss": 0.9991, + "step": 22830 + }, + { + "epoch": 0.6879518072289157, + "grad_norm": 3.7476143896509155, + "learning_rate": 2.434160054412282e-06, + "loss": 0.9174, + "step": 22840 + }, + { + "epoch": 0.6882530120481928, + "grad_norm": 11.350813759651311, + "learning_rate": 2.4298867688839967e-06, + "loss": 0.9428, + "step": 22850 + }, + { + "epoch": 0.6885542168674699, + "grad_norm": 6.179310566217496, + "learning_rate": 2.425616033287652e-06, + "loss": 0.8853, + "step": 22860 + }, + { + "epoch": 0.688855421686747, + "grad_norm": 2.030623479241787, + "learning_rate": 2.4213478518604483e-06, + "loss": 0.9189, + "step": 22870 + }, + { + "epoch": 0.689156626506024, + "grad_norm": 4.5070303401318705, + "learning_rate": 2.4170822288370494e-06, + "loss": 1.0062, + "step": 22880 + }, + { + "epoch": 0.6894578313253013, + "grad_norm": 4.86138478304233, + "learning_rate": 2.412819168449588e-06, + "loss": 0.8871, + "step": 22890 + }, + { + "epoch": 0.6897590361445783, + "grad_norm": 2.049620763160923, + "learning_rate": 2.4085586749276484e-06, + "loss": 0.8572, + "step": 22900 + }, + { + "epoch": 0.6900602409638554, + "grad_norm": 4.422205159863266, + "learning_rate": 2.4043007524982648e-06, + "loss": 0.9579, + "step": 22910 + }, + { + "epoch": 0.6903614457831325, + "grad_norm": 118.33740839028648, + "learning_rate": 2.4000454053859312e-06, + "loss": 0.8765, + "step": 22920 + }, + { + "epoch": 0.6906626506024096, + "grad_norm": 5.252863704536765, + "learning_rate": 2.395792637812577e-06, + "loss": 0.9471, + "step": 22930 + }, + { + "epoch": 0.6909638554216867, + "grad_norm": 4.163202570653524, + "learning_rate": 2.391542453997578e-06, + "loss": 0.9042, + "step": 22940 + }, + { + "epoch": 0.6912650602409639, + "grad_norm": 4.409153591249788, + "learning_rate": 2.3872948581577427e-06, + "loss": 0.9913, + "step": 22950 + }, + { + "epoch": 0.691566265060241, + "grad_norm": 4.796527835663719, + "learning_rate": 2.383049854507314e-06, + "loss": 1.0036, + "step": 22960 + }, + { + "epoch": 0.6918674698795181, + "grad_norm": 1.9767573256592306, + "learning_rate": 2.3788074472579618e-06, + "loss": 0.9704, + "step": 22970 + }, + { + "epoch": 0.6921686746987952, + "grad_norm": 3.6699769998441423, + "learning_rate": 2.37456764061878e-06, + "loss": 0.8877, + "step": 22980 + }, + { + "epoch": 0.6924698795180723, + "grad_norm": 2.0295044424929753, + "learning_rate": 2.370330438796288e-06, + "loss": 0.9752, + "step": 22990 + }, + { + "epoch": 0.6927710843373494, + "grad_norm": 2.06139463612227, + "learning_rate": 2.366095845994411e-06, + "loss": 0.9895, + "step": 23000 + }, + { + "epoch": 0.6930722891566266, + "grad_norm": 6.780188527433399, + "learning_rate": 2.361863866414491e-06, + "loss": 0.8674, + "step": 23010 + }, + { + "epoch": 0.6933734939759036, + "grad_norm": 3.97929879716764, + "learning_rate": 2.357634504255276e-06, + "loss": 0.971, + "step": 23020 + }, + { + "epoch": 0.6936746987951807, + "grad_norm": 4.50396029068378, + "learning_rate": 2.3534077637129223e-06, + "loss": 1.0326, + "step": 23030 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 5.325511959961041, + "learning_rate": 2.349183648980977e-06, + "loss": 0.9477, + "step": 23040 + }, + { + "epoch": 0.6942771084337349, + "grad_norm": 5.396004785947532, + "learning_rate": 2.344962164250387e-06, + "loss": 0.9319, + "step": 23050 + }, + { + "epoch": 0.694578313253012, + "grad_norm": 2.1516068683771143, + "learning_rate": 2.3407433137094894e-06, + "loss": 0.9495, + "step": 23060 + }, + { + "epoch": 0.6948795180722892, + "grad_norm": 6.29960294901912, + "learning_rate": 2.3365271015440056e-06, + "loss": 0.9331, + "step": 23070 + }, + { + "epoch": 0.6951807228915663, + "grad_norm": 7.340386043870026, + "learning_rate": 2.3323135319370414e-06, + "loss": 0.9396, + "step": 23080 + }, + { + "epoch": 0.6954819277108434, + "grad_norm": 5.744500811222904, + "learning_rate": 2.3281026090690804e-06, + "loss": 0.8906, + "step": 23090 + }, + { + "epoch": 0.6957831325301205, + "grad_norm": 5.180073843839168, + "learning_rate": 2.3238943371179807e-06, + "loss": 0.8364, + "step": 23100 + }, + { + "epoch": 0.6960843373493976, + "grad_norm": 6.6853397673151695, + "learning_rate": 2.3196887202589686e-06, + "loss": 1.015, + "step": 23110 + }, + { + "epoch": 0.6963855421686747, + "grad_norm": 4.10735060104868, + "learning_rate": 2.315485762664637e-06, + "loss": 0.934, + "step": 23120 + }, + { + "epoch": 0.6966867469879519, + "grad_norm": 7.867614441488752, + "learning_rate": 2.3112854685049397e-06, + "loss": 0.8877, + "step": 23130 + }, + { + "epoch": 0.696987951807229, + "grad_norm": 7.277405452113129, + "learning_rate": 2.307087841947192e-06, + "loss": 1.0054, + "step": 23140 + }, + { + "epoch": 0.697289156626506, + "grad_norm": 4.4566872438858205, + "learning_rate": 2.3028928871560596e-06, + "loss": 0.9394, + "step": 23150 + }, + { + "epoch": 0.6975903614457831, + "grad_norm": 5.153560737661477, + "learning_rate": 2.298700608293552e-06, + "loss": 0.8738, + "step": 23160 + }, + { + "epoch": 0.6978915662650602, + "grad_norm": 1.8817787680846003, + "learning_rate": 2.294511009519034e-06, + "loss": 0.9579, + "step": 23170 + }, + { + "epoch": 0.6981927710843373, + "grad_norm": 1.827764606945967, + "learning_rate": 2.290324094989205e-06, + "loss": 0.9579, + "step": 23180 + }, + { + "epoch": 0.6984939759036145, + "grad_norm": 5.123900870063656, + "learning_rate": 2.286139868858103e-06, + "loss": 0.9587, + "step": 23190 + }, + { + "epoch": 0.6987951807228916, + "grad_norm": 5.61876482017539, + "learning_rate": 2.281958335277098e-06, + "loss": 0.9038, + "step": 23200 + }, + { + "epoch": 0.6990963855421687, + "grad_norm": 8.317484335056166, + "learning_rate": 2.277779498394889e-06, + "loss": 1.075, + "step": 23210 + }, + { + "epoch": 0.6993975903614458, + "grad_norm": 3.896827643658223, + "learning_rate": 2.273603362357498e-06, + "loss": 1.0394, + "step": 23220 + }, + { + "epoch": 0.6996987951807229, + "grad_norm": 1.8037348674155738, + "learning_rate": 2.2694299313082693e-06, + "loss": 0.8951, + "step": 23230 + }, + { + "epoch": 0.7, + "grad_norm": 4.3173602814544605, + "learning_rate": 2.265259209387867e-06, + "loss": 0.9407, + "step": 23240 + }, + { + "epoch": 0.7003012048192772, + "grad_norm": 3.9829492743839823, + "learning_rate": 2.261091200734258e-06, + "loss": 0.9511, + "step": 23250 + }, + { + "epoch": 0.7006024096385542, + "grad_norm": 4.057457437562215, + "learning_rate": 2.2569259094827243e-06, + "loss": 0.9642, + "step": 23260 + }, + { + "epoch": 0.7009036144578313, + "grad_norm": 2.0068270999768654, + "learning_rate": 2.252763339765849e-06, + "loss": 0.9944, + "step": 23270 + }, + { + "epoch": 0.7012048192771084, + "grad_norm": 4.6468511646938815, + "learning_rate": 2.248603495713518e-06, + "loss": 0.9326, + "step": 23280 + }, + { + "epoch": 0.7015060240963855, + "grad_norm": 5.193256640454499, + "learning_rate": 2.2444463814529115e-06, + "loss": 1.0054, + "step": 23290 + }, + { + "epoch": 0.7018072289156626, + "grad_norm": 5.146488303300006, + "learning_rate": 2.2402920011084995e-06, + "loss": 0.8454, + "step": 23300 + }, + { + "epoch": 0.7021084337349398, + "grad_norm": 8.582656638559628, + "learning_rate": 2.2361403588020404e-06, + "loss": 0.9063, + "step": 23310 + }, + { + "epoch": 0.7024096385542169, + "grad_norm": 6.552475722104356, + "learning_rate": 2.2319914586525776e-06, + "loss": 0.9258, + "step": 23320 + }, + { + "epoch": 0.702710843373494, + "grad_norm": 4.684625547165652, + "learning_rate": 2.2278453047764333e-06, + "loss": 0.8923, + "step": 23330 + }, + { + "epoch": 0.7030120481927711, + "grad_norm": 4.419180083014369, + "learning_rate": 2.223701901287203e-06, + "loss": 0.9415, + "step": 23340 + }, + { + "epoch": 0.7033132530120482, + "grad_norm": 19.834028959074853, + "learning_rate": 2.2195612522957555e-06, + "loss": 0.9937, + "step": 23350 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 4.22237959378623, + "learning_rate": 2.2154233619102266e-06, + "loss": 1.0087, + "step": 23360 + }, + { + "epoch": 0.7039156626506025, + "grad_norm": 2.1830823978849496, + "learning_rate": 2.2112882342360154e-06, + "loss": 0.8925, + "step": 23370 + }, + { + "epoch": 0.7042168674698795, + "grad_norm": 1.9840030888992062, + "learning_rate": 2.2071558733757757e-06, + "loss": 0.8569, + "step": 23380 + }, + { + "epoch": 0.7045180722891566, + "grad_norm": 3.9195650077889987, + "learning_rate": 2.2030262834294267e-06, + "loss": 0.9396, + "step": 23390 + }, + { + "epoch": 0.7048192771084337, + "grad_norm": 4.2134851341482795, + "learning_rate": 2.1988994684941256e-06, + "loss": 0.9038, + "step": 23400 + }, + { + "epoch": 0.7051204819277108, + "grad_norm": 6.404384021273373, + "learning_rate": 2.194775432664282e-06, + "loss": 0.8647, + "step": 23410 + }, + { + "epoch": 0.7054216867469879, + "grad_norm": 3.9554394561372503, + "learning_rate": 2.1906541800315535e-06, + "loss": 0.9661, + "step": 23420 + }, + { + "epoch": 0.7057228915662651, + "grad_norm": 5.5619589086390055, + "learning_rate": 2.1865357146848286e-06, + "loss": 0.9434, + "step": 23430 + }, + { + "epoch": 0.7060240963855422, + "grad_norm": 1.8748269889229263, + "learning_rate": 2.1824200407102332e-06, + "loss": 0.9844, + "step": 23440 + }, + { + "epoch": 0.7063253012048193, + "grad_norm": 5.4605865906090685, + "learning_rate": 2.178307162191124e-06, + "loss": 0.8661, + "step": 23450 + }, + { + "epoch": 0.7066265060240964, + "grad_norm": 3.641901518181943, + "learning_rate": 2.1741970832080846e-06, + "loss": 0.9045, + "step": 23460 + }, + { + "epoch": 0.7069277108433735, + "grad_norm": 1.9161795197656988, + "learning_rate": 2.170089807838919e-06, + "loss": 0.8912, + "step": 23470 + }, + { + "epoch": 0.7072289156626506, + "grad_norm": 5.085086927965741, + "learning_rate": 2.16598534015865e-06, + "loss": 0.9936, + "step": 23480 + }, + { + "epoch": 0.7075301204819278, + "grad_norm": 6.773889369343873, + "learning_rate": 2.161883684239522e-06, + "loss": 0.9811, + "step": 23490 + }, + { + "epoch": 0.7078313253012049, + "grad_norm": 3.6246561338536094, + "learning_rate": 2.1577848441509764e-06, + "loss": 0.9314, + "step": 23500 + }, + { + "epoch": 0.7081325301204819, + "grad_norm": 7.518998824039743, + "learning_rate": 2.1536888239596714e-06, + "loss": 0.8642, + "step": 23510 + }, + { + "epoch": 0.708433734939759, + "grad_norm": 1.8952967771917464, + "learning_rate": 2.149595627729461e-06, + "loss": 0.8739, + "step": 23520 + }, + { + "epoch": 0.7087349397590361, + "grad_norm": 5.191648142393608, + "learning_rate": 2.145505259521405e-06, + "loss": 0.9417, + "step": 23530 + }, + { + "epoch": 0.7090361445783132, + "grad_norm": 3.640175408075149, + "learning_rate": 2.141417723393752e-06, + "loss": 1.0436, + "step": 23540 + }, + { + "epoch": 0.7093373493975904, + "grad_norm": 5.168422152662087, + "learning_rate": 2.1373330234019374e-06, + "loss": 0.9235, + "step": 23550 + }, + { + "epoch": 0.7096385542168675, + "grad_norm": 5.069873338886798, + "learning_rate": 2.133251163598591e-06, + "loss": 0.9272, + "step": 23560 + }, + { + "epoch": 0.7099397590361446, + "grad_norm": 3.787466833212854, + "learning_rate": 2.129172148033519e-06, + "loss": 0.936, + "step": 23570 + }, + { + "epoch": 0.7102409638554217, + "grad_norm": 2.207250153121542, + "learning_rate": 2.125095980753708e-06, + "loss": 0.8954, + "step": 23580 + }, + { + "epoch": 0.7105421686746988, + "grad_norm": 3.9108531345414344, + "learning_rate": 2.121022665803317e-06, + "loss": 0.9488, + "step": 23590 + }, + { + "epoch": 0.7108433734939759, + "grad_norm": 5.025478982342228, + "learning_rate": 2.1169522072236774e-06, + "loss": 0.8871, + "step": 23600 + }, + { + "epoch": 0.7111445783132531, + "grad_norm": 3.8721950596206653, + "learning_rate": 2.1128846090532852e-06, + "loss": 1.0189, + "step": 23610 + }, + { + "epoch": 0.7114457831325302, + "grad_norm": 4.262500698458053, + "learning_rate": 2.1088198753277973e-06, + "loss": 0.9333, + "step": 23620 + }, + { + "epoch": 0.7117469879518072, + "grad_norm": 3.796946909369294, + "learning_rate": 2.1047580100800286e-06, + "loss": 0.8375, + "step": 23630 + }, + { + "epoch": 0.7120481927710843, + "grad_norm": 3.8388330766887484, + "learning_rate": 2.1006990173399556e-06, + "loss": 0.8744, + "step": 23640 + }, + { + "epoch": 0.7123493975903614, + "grad_norm": 1.9531315204735107, + "learning_rate": 2.0966429011346923e-06, + "loss": 0.855, + "step": 23650 + }, + { + "epoch": 0.7126506024096385, + "grad_norm": 5.284646124851012, + "learning_rate": 2.0925896654885057e-06, + "loss": 1.0146, + "step": 23660 + }, + { + "epoch": 0.7129518072289157, + "grad_norm": 1.9618156904959965, + "learning_rate": 2.0885393144228076e-06, + "loss": 0.8855, + "step": 23670 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 4.726929580568592, + "learning_rate": 2.0844918519561425e-06, + "loss": 0.9278, + "step": 23680 + }, + { + "epoch": 0.7135542168674699, + "grad_norm": 4.034876126551746, + "learning_rate": 2.0804472821041933e-06, + "loss": 1.0064, + "step": 23690 + }, + { + "epoch": 0.713855421686747, + "grad_norm": 4.2548237651244545, + "learning_rate": 2.0764056088797646e-06, + "loss": 0.9422, + "step": 23700 + }, + { + "epoch": 0.7141566265060241, + "grad_norm": 4.6760236051599735, + "learning_rate": 2.0723668362927996e-06, + "loss": 0.9429, + "step": 23710 + }, + { + "epoch": 0.7144578313253012, + "grad_norm": 5.734079540702262, + "learning_rate": 2.0683309683503543e-06, + "loss": 0.8584, + "step": 23720 + }, + { + "epoch": 0.7147590361445784, + "grad_norm": 4.416688870319067, + "learning_rate": 2.0642980090566065e-06, + "loss": 1.0068, + "step": 23730 + }, + { + "epoch": 0.7150602409638555, + "grad_norm": 4.93105304401678, + "learning_rate": 2.0602679624128473e-06, + "loss": 1.0497, + "step": 23740 + }, + { + "epoch": 0.7153614457831325, + "grad_norm": 6.316933040636142, + "learning_rate": 2.0562408324174786e-06, + "loss": 0.935, + "step": 23750 + }, + { + "epoch": 0.7156626506024096, + "grad_norm": 4.090206799769014, + "learning_rate": 2.0522166230660074e-06, + "loss": 0.9591, + "step": 23760 + }, + { + "epoch": 0.7159638554216867, + "grad_norm": 3.768301742741461, + "learning_rate": 2.0481953383510426e-06, + "loss": 0.8223, + "step": 23770 + }, + { + "epoch": 0.7162650602409638, + "grad_norm": 4.6501395199172, + "learning_rate": 2.0441769822622964e-06, + "loss": 0.8744, + "step": 23780 + }, + { + "epoch": 0.716566265060241, + "grad_norm": 4.157839214519675, + "learning_rate": 2.040161558786571e-06, + "loss": 0.9164, + "step": 23790 + }, + { + "epoch": 0.7168674698795181, + "grad_norm": 6.003499162384984, + "learning_rate": 2.036149071907754e-06, + "loss": 0.964, + "step": 23800 + }, + { + "epoch": 0.7171686746987952, + "grad_norm": 5.252391559160354, + "learning_rate": 2.032139525606831e-06, + "loss": 1.0058, + "step": 23810 + }, + { + "epoch": 0.7174698795180723, + "grad_norm": 5.43251284612108, + "learning_rate": 2.0281329238618617e-06, + "loss": 0.8611, + "step": 23820 + }, + { + "epoch": 0.7177710843373494, + "grad_norm": 5.133765137376551, + "learning_rate": 2.024129270647988e-06, + "loss": 1.0552, + "step": 23830 + }, + { + "epoch": 0.7180722891566265, + "grad_norm": 4.602515463311826, + "learning_rate": 2.020128569937424e-06, + "loss": 0.9639, + "step": 23840 + }, + { + "epoch": 0.7183734939759037, + "grad_norm": 5.153338287802595, + "learning_rate": 2.0161308256994565e-06, + "loss": 0.9653, + "step": 23850 + }, + { + "epoch": 0.7186746987951808, + "grad_norm": 4.945162207016891, + "learning_rate": 2.0121360419004383e-06, + "loss": 0.9646, + "step": 23860 + }, + { + "epoch": 0.7189759036144578, + "grad_norm": 5.3412116960756375, + "learning_rate": 2.0081442225037847e-06, + "loss": 0.9442, + "step": 23870 + }, + { + "epoch": 0.7192771084337349, + "grad_norm": 3.438474027398295, + "learning_rate": 2.0041553714699697e-06, + "loss": 0.8855, + "step": 23880 + }, + { + "epoch": 0.719578313253012, + "grad_norm": 4.528565600324454, + "learning_rate": 2.000169492756523e-06, + "loss": 0.8812, + "step": 23890 + }, + { + "epoch": 0.7198795180722891, + "grad_norm": 5.096291955102272, + "learning_rate": 1.9961865903180256e-06, + "loss": 0.8498, + "step": 23900 + }, + { + "epoch": 0.7201807228915663, + "grad_norm": 4.798050354142986, + "learning_rate": 1.9922066681061025e-06, + "loss": 1.003, + "step": 23910 + }, + { + "epoch": 0.7204819277108434, + "grad_norm": 2.0863242738648755, + "learning_rate": 1.9882297300694283e-06, + "loss": 0.9505, + "step": 23920 + }, + { + "epoch": 0.7207831325301205, + "grad_norm": 2.012477642800008, + "learning_rate": 1.984255780153711e-06, + "loss": 0.9697, + "step": 23930 + }, + { + "epoch": 0.7210843373493976, + "grad_norm": 4.348988317452686, + "learning_rate": 1.9802848223016976e-06, + "loss": 0.9739, + "step": 23940 + }, + { + "epoch": 0.7213855421686747, + "grad_norm": 4.641332928099159, + "learning_rate": 1.9763168604531594e-06, + "loss": 0.9557, + "step": 23950 + }, + { + "epoch": 0.7216867469879518, + "grad_norm": 1.886539282374377, + "learning_rate": 1.972351898544907e-06, + "loss": 0.9418, + "step": 23960 + }, + { + "epoch": 0.721987951807229, + "grad_norm": 2.065348782109405, + "learning_rate": 1.9683899405107647e-06, + "loss": 0.8994, + "step": 23970 + }, + { + "epoch": 0.7222891566265061, + "grad_norm": 8.660343030699208, + "learning_rate": 1.964430990281582e-06, + "loss": 0.9721, + "step": 23980 + }, + { + "epoch": 0.7225903614457831, + "grad_norm": 9.122400281515935, + "learning_rate": 1.9604750517852206e-06, + "loss": 0.8195, + "step": 23990 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 1.9745064009367765, + "learning_rate": 1.956522128946557e-06, + "loss": 0.811, + "step": 24000 + }, + { + "epoch": 0.7231927710843373, + "grad_norm": 4.9300901012920235, + "learning_rate": 1.9525722256874736e-06, + "loss": 0.9386, + "step": 24010 + }, + { + "epoch": 0.7234939759036144, + "grad_norm": 2.05432062396776, + "learning_rate": 1.9486253459268563e-06, + "loss": 0.8124, + "step": 24020 + }, + { + "epoch": 0.7237951807228916, + "grad_norm": 4.115157880050282, + "learning_rate": 1.9446814935805986e-06, + "loss": 1.0251, + "step": 24030 + }, + { + "epoch": 0.7240963855421687, + "grad_norm": 5.215746241768255, + "learning_rate": 1.940740672561579e-06, + "loss": 1.0002, + "step": 24040 + }, + { + "epoch": 0.7243975903614458, + "grad_norm": 1.8961637352206742, + "learning_rate": 1.936802886779674e-06, + "loss": 0.8946, + "step": 24050 + }, + { + "epoch": 0.7246987951807229, + "grad_norm": 4.0003651505033355, + "learning_rate": 1.9328681401417516e-06, + "loss": 0.947, + "step": 24060 + }, + { + "epoch": 0.725, + "grad_norm": 2.071378430624273, + "learning_rate": 1.928936436551661e-06, + "loss": 0.9388, + "step": 24070 + }, + { + "epoch": 0.7253012048192771, + "grad_norm": 1.9878794103311674, + "learning_rate": 1.9250077799102323e-06, + "loss": 0.8782, + "step": 24080 + }, + { + "epoch": 0.7256024096385543, + "grad_norm": 4.456132427939708, + "learning_rate": 1.921082174115273e-06, + "loss": 0.8974, + "step": 24090 + }, + { + "epoch": 0.7259036144578314, + "grad_norm": 5.232572618042077, + "learning_rate": 1.9171596230615647e-06, + "loss": 0.9636, + "step": 24100 + }, + { + "epoch": 0.7262048192771084, + "grad_norm": 3.935718031287158, + "learning_rate": 1.913240130640856e-06, + "loss": 1.0512, + "step": 24110 + }, + { + "epoch": 0.7265060240963855, + "grad_norm": 3.922542714074301, + "learning_rate": 1.909323700741862e-06, + "loss": 0.9966, + "step": 24120 + }, + { + "epoch": 0.7268072289156626, + "grad_norm": 5.855878513334449, + "learning_rate": 1.905410337250264e-06, + "loss": 0.8188, + "step": 24130 + }, + { + "epoch": 0.7271084337349397, + "grad_norm": 5.932471616130165, + "learning_rate": 1.901500044048692e-06, + "loss": 0.9092, + "step": 24140 + }, + { + "epoch": 0.7274096385542169, + "grad_norm": 1.848641242811829, + "learning_rate": 1.8975928250167352e-06, + "loss": 0.9093, + "step": 24150 + }, + { + "epoch": 0.727710843373494, + "grad_norm": 5.0570936729903115, + "learning_rate": 1.8936886840309309e-06, + "loss": 0.8805, + "step": 24160 + }, + { + "epoch": 0.7280120481927711, + "grad_norm": 1.8956793833929124, + "learning_rate": 1.8897876249647672e-06, + "loss": 0.9153, + "step": 24170 + }, + { + "epoch": 0.7283132530120482, + "grad_norm": 5.456779690160975, + "learning_rate": 1.885889651688671e-06, + "loss": 0.952, + "step": 24180 + }, + { + "epoch": 0.7286144578313253, + "grad_norm": 8.698279686646263, + "learning_rate": 1.8819947680700035e-06, + "loss": 1.0162, + "step": 24190 + }, + { + "epoch": 0.7289156626506024, + "grad_norm": 4.846038386260658, + "learning_rate": 1.8781029779730646e-06, + "loss": 0.915, + "step": 24200 + }, + { + "epoch": 0.7292168674698796, + "grad_norm": 4.973815362553223, + "learning_rate": 1.874214285259089e-06, + "loss": 0.9525, + "step": 24210 + }, + { + "epoch": 0.7295180722891567, + "grad_norm": 5.31951673220082, + "learning_rate": 1.870328693786232e-06, + "loss": 0.9442, + "step": 24220 + }, + { + "epoch": 0.7298192771084338, + "grad_norm": 4.182075503567281, + "learning_rate": 1.8664462074095746e-06, + "loss": 0.9526, + "step": 24230 + }, + { + "epoch": 0.7301204819277108, + "grad_norm": 1.9328226925727665, + "learning_rate": 1.8625668299811162e-06, + "loss": 0.8531, + "step": 24240 + }, + { + "epoch": 0.7304216867469879, + "grad_norm": 2.1175901768808028, + "learning_rate": 1.8586905653497722e-06, + "loss": 0.8759, + "step": 24250 + }, + { + "epoch": 0.730722891566265, + "grad_norm": 4.939943011416337, + "learning_rate": 1.8548174173613703e-06, + "loss": 1.0196, + "step": 24260 + }, + { + "epoch": 0.7310240963855422, + "grad_norm": 4.539405539248213, + "learning_rate": 1.8509473898586432e-06, + "loss": 0.9586, + "step": 24270 + }, + { + "epoch": 0.7313253012048193, + "grad_norm": 6.273212072370957, + "learning_rate": 1.8470804866812354e-06, + "loss": 0.8979, + "step": 24280 + }, + { + "epoch": 0.7316265060240964, + "grad_norm": 5.963814795774179, + "learning_rate": 1.8432167116656802e-06, + "loss": 0.9229, + "step": 24290 + }, + { + "epoch": 0.7319277108433735, + "grad_norm": 2.0443692619301235, + "learning_rate": 1.8393560686454137e-06, + "loss": 0.9121, + "step": 24300 + }, + { + "epoch": 0.7322289156626506, + "grad_norm": 2.008271172152894, + "learning_rate": 1.835498561450767e-06, + "loss": 0.9958, + "step": 24310 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 5.858191431963845, + "learning_rate": 1.8316441939089552e-06, + "loss": 0.9358, + "step": 24320 + }, + { + "epoch": 0.7328313253012049, + "grad_norm": 2.0139009342278364, + "learning_rate": 1.8277929698440812e-06, + "loss": 0.9076, + "step": 24330 + }, + { + "epoch": 0.733132530120482, + "grad_norm": 4.292322829070784, + "learning_rate": 1.823944893077127e-06, + "loss": 0.8848, + "step": 24340 + }, + { + "epoch": 0.733433734939759, + "grad_norm": 21.60801004264943, + "learning_rate": 1.8200999674259533e-06, + "loss": 0.984, + "step": 24350 + }, + { + "epoch": 0.7337349397590361, + "grad_norm": 4.7668084802540935, + "learning_rate": 1.8162581967052945e-06, + "loss": 0.9263, + "step": 24360 + }, + { + "epoch": 0.7340361445783132, + "grad_norm": 7.522471792672586, + "learning_rate": 1.8124195847267533e-06, + "loss": 0.9809, + "step": 24370 + }, + { + "epoch": 0.7343373493975903, + "grad_norm": 5.633887550884112, + "learning_rate": 1.8085841352988043e-06, + "loss": 0.8985, + "step": 24380 + }, + { + "epoch": 0.7346385542168675, + "grad_norm": 3.317601174174788, + "learning_rate": 1.8047518522267748e-06, + "loss": 0.8532, + "step": 24390 + }, + { + "epoch": 0.7349397590361446, + "grad_norm": 3.786751412267653, + "learning_rate": 1.800922739312857e-06, + "loss": 0.9554, + "step": 24400 + }, + { + "epoch": 0.7352409638554217, + "grad_norm": 4.1541404138077445, + "learning_rate": 1.7970968003560958e-06, + "loss": 0.9615, + "step": 24410 + }, + { + "epoch": 0.7355421686746988, + "grad_norm": 6.0467211461924775, + "learning_rate": 1.7932740391523901e-06, + "loss": 0.9148, + "step": 24420 + }, + { + "epoch": 0.7358433734939759, + "grad_norm": 2.051997275765705, + "learning_rate": 1.7894544594944845e-06, + "loss": 0.8672, + "step": 24430 + }, + { + "epoch": 0.736144578313253, + "grad_norm": 6.513789781450807, + "learning_rate": 1.7856380651719623e-06, + "loss": 0.9379, + "step": 24440 + }, + { + "epoch": 0.7364457831325302, + "grad_norm": 4.272217606125275, + "learning_rate": 1.78182485997125e-06, + "loss": 1.0536, + "step": 24450 + }, + { + "epoch": 0.7367469879518073, + "grad_norm": 4.4105061160893015, + "learning_rate": 1.7780148476756148e-06, + "loss": 0.8337, + "step": 24460 + }, + { + "epoch": 0.7370481927710844, + "grad_norm": 2.100337684473239, + "learning_rate": 1.7742080320651496e-06, + "loss": 0.9263, + "step": 24470 + }, + { + "epoch": 0.7373493975903614, + "grad_norm": 25.125198507261835, + "learning_rate": 1.7704044169167772e-06, + "loss": 1.0516, + "step": 24480 + }, + { + "epoch": 0.7376506024096385, + "grad_norm": 4.191105317121492, + "learning_rate": 1.7666040060042456e-06, + "loss": 0.8798, + "step": 24490 + }, + { + "epoch": 0.7379518072289156, + "grad_norm": 2.2092416042783145, + "learning_rate": 1.7628068030981244e-06, + "loss": 0.9417, + "step": 24500 + }, + { + "epoch": 0.7382530120481928, + "grad_norm": 2.0755191122301753, + "learning_rate": 1.7590128119657995e-06, + "loss": 0.8016, + "step": 24510 + }, + { + "epoch": 0.7385542168674699, + "grad_norm": 7.201579467178426, + "learning_rate": 1.7552220363714689e-06, + "loss": 1.0353, + "step": 24520 + }, + { + "epoch": 0.738855421686747, + "grad_norm": 1.8705647557631861, + "learning_rate": 1.7514344800761474e-06, + "loss": 0.9524, + "step": 24530 + }, + { + "epoch": 0.7391566265060241, + "grad_norm": 4.779697056317637, + "learning_rate": 1.7476501468376438e-06, + "loss": 0.781, + "step": 24540 + }, + { + "epoch": 0.7394578313253012, + "grad_norm": 4.847883618628411, + "learning_rate": 1.7438690404105769e-06, + "loss": 0.9694, + "step": 24550 + }, + { + "epoch": 0.7397590361445783, + "grad_norm": 5.072446795267843, + "learning_rate": 1.740091164546366e-06, + "loss": 0.9741, + "step": 24560 + }, + { + "epoch": 0.7400602409638555, + "grad_norm": 6.690733857850204, + "learning_rate": 1.7363165229932194e-06, + "loss": 0.8466, + "step": 24570 + }, + { + "epoch": 0.7403614457831326, + "grad_norm": 4.6258476710645065, + "learning_rate": 1.7325451194961413e-06, + "loss": 0.8819, + "step": 24580 + }, + { + "epoch": 0.7406626506024097, + "grad_norm": 1.9028741288540398, + "learning_rate": 1.7287769577969166e-06, + "loss": 0.8972, + "step": 24590 + }, + { + "epoch": 0.7409638554216867, + "grad_norm": 5.927244604032882, + "learning_rate": 1.7250120416341227e-06, + "loss": 1.0344, + "step": 24600 + }, + { + "epoch": 0.7412650602409638, + "grad_norm": 3.7833930409397754, + "learning_rate": 1.7212503747431104e-06, + "loss": 0.9957, + "step": 24610 + }, + { + "epoch": 0.7415662650602409, + "grad_norm": 5.744623205380776, + "learning_rate": 1.7174919608560097e-06, + "loss": 0.9835, + "step": 24620 + }, + { + "epoch": 0.7418674698795181, + "grad_norm": 4.4215341591308865, + "learning_rate": 1.7137368037017215e-06, + "loss": 1.0573, + "step": 24630 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 7.040927591742778, + "learning_rate": 1.7099849070059177e-06, + "loss": 0.9191, + "step": 24640 + }, + { + "epoch": 0.7424698795180723, + "grad_norm": 2.1104550722668503, + "learning_rate": 1.7062362744910321e-06, + "loss": 0.9583, + "step": 24650 + }, + { + "epoch": 0.7427710843373494, + "grad_norm": 4.692163455292239, + "learning_rate": 1.702490909876262e-06, + "loss": 0.9226, + "step": 24660 + }, + { + "epoch": 0.7430722891566265, + "grad_norm": 1.8573874755026485, + "learning_rate": 1.6987488168775644e-06, + "loss": 0.7367, + "step": 24670 + }, + { + "epoch": 0.7433734939759036, + "grad_norm": 2.0192474808514405, + "learning_rate": 1.6950099992076485e-06, + "loss": 0.8619, + "step": 24680 + }, + { + "epoch": 0.7436746987951808, + "grad_norm": 7.776734395100246, + "learning_rate": 1.6912744605759707e-06, + "loss": 0.9502, + "step": 24690 + }, + { + "epoch": 0.7439759036144579, + "grad_norm": 1.9551977449250517, + "learning_rate": 1.6875422046887368e-06, + "loss": 0.7732, + "step": 24700 + }, + { + "epoch": 0.744277108433735, + "grad_norm": 10.390101692580956, + "learning_rate": 1.6838132352488995e-06, + "loss": 0.8319, + "step": 24710 + }, + { + "epoch": 0.744578313253012, + "grad_norm": 4.101030765300637, + "learning_rate": 1.680087555956146e-06, + "loss": 0.9334, + "step": 24720 + }, + { + "epoch": 0.7448795180722891, + "grad_norm": 6.581037482993133, + "learning_rate": 1.676365170506899e-06, + "loss": 0.8971, + "step": 24730 + }, + { + "epoch": 0.7451807228915662, + "grad_norm": 4.655724688667062, + "learning_rate": 1.6726460825943158e-06, + "loss": 0.9744, + "step": 24740 + }, + { + "epoch": 0.7454819277108434, + "grad_norm": 4.121847130103446, + "learning_rate": 1.6689302959082803e-06, + "loss": 0.988, + "step": 24750 + }, + { + "epoch": 0.7457831325301205, + "grad_norm": 3.818761445007322, + "learning_rate": 1.665217814135403e-06, + "loss": 0.9183, + "step": 24760 + }, + { + "epoch": 0.7460843373493976, + "grad_norm": 1.9836031457443304, + "learning_rate": 1.6615086409590125e-06, + "loss": 0.9151, + "step": 24770 + }, + { + "epoch": 0.7463855421686747, + "grad_norm": 5.138352253816458, + "learning_rate": 1.6578027800591578e-06, + "loss": 0.9541, + "step": 24780 + }, + { + "epoch": 0.7466867469879518, + "grad_norm": 3.8045675529228618, + "learning_rate": 1.6541002351125996e-06, + "loss": 0.8808, + "step": 24790 + }, + { + "epoch": 0.7469879518072289, + "grad_norm": 1.9958088955738258, + "learning_rate": 1.6504010097928074e-06, + "loss": 0.9771, + "step": 24800 + }, + { + "epoch": 0.7472891566265061, + "grad_norm": 4.234970270474972, + "learning_rate": 1.6467051077699632e-06, + "loss": 0.9119, + "step": 24810 + }, + { + "epoch": 0.7475903614457832, + "grad_norm": 2.0204691596181665, + "learning_rate": 1.6430125327109458e-06, + "loss": 0.9406, + "step": 24820 + }, + { + "epoch": 0.7478915662650603, + "grad_norm": 4.04979919903914, + "learning_rate": 1.6393232882793363e-06, + "loss": 0.9047, + "step": 24830 + }, + { + "epoch": 0.7481927710843373, + "grad_norm": 5.428702237379441, + "learning_rate": 1.6356373781354058e-06, + "loss": 0.9933, + "step": 24840 + }, + { + "epoch": 0.7484939759036144, + "grad_norm": 1.981495260166855, + "learning_rate": 1.6319548059361256e-06, + "loss": 0.8876, + "step": 24850 + }, + { + "epoch": 0.7487951807228915, + "grad_norm": 6.288176359296529, + "learning_rate": 1.6282755753351504e-06, + "loss": 0.8966, + "step": 24860 + }, + { + "epoch": 0.7490963855421687, + "grad_norm": 4.264526207466045, + "learning_rate": 1.6245996899828203e-06, + "loss": 0.8744, + "step": 24870 + }, + { + "epoch": 0.7493975903614458, + "grad_norm": 4.085038290569103, + "learning_rate": 1.6209271535261567e-06, + "loss": 0.9458, + "step": 24880 + }, + { + "epoch": 0.7496987951807229, + "grad_norm": 3.91845735237114, + "learning_rate": 1.6172579696088575e-06, + "loss": 0.9562, + "step": 24890 + }, + { + "epoch": 0.75, + "grad_norm": 5.83514146567771, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.9126, + "step": 24900 + }, + { + "epoch": 0.7503012048192771, + "grad_norm": 3.788367410339539, + "learning_rate": 1.6099296739505116e-06, + "loss": 0.9895, + "step": 24910 + }, + { + "epoch": 0.7506024096385542, + "grad_norm": 5.607228355530344, + "learning_rate": 1.6062705694802199e-06, + "loss": 0.9194, + "step": 24920 + }, + { + "epoch": 0.7509036144578313, + "grad_norm": 1.9758269214360387, + "learning_rate": 1.602614832090788e-06, + "loss": 0.9182, + "step": 24930 + }, + { + "epoch": 0.7512048192771085, + "grad_norm": 1.9166091401241117, + "learning_rate": 1.5989624654092477e-06, + "loss": 0.9763, + "step": 24940 + }, + { + "epoch": 0.7515060240963856, + "grad_norm": 4.8752466540305655, + "learning_rate": 1.5953134730592862e-06, + "loss": 0.9461, + "step": 24950 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 5.198409199290026, + "learning_rate": 1.591667858661245e-06, + "loss": 0.9829, + "step": 24960 + }, + { + "epoch": 0.7521084337349397, + "grad_norm": 4.5765780169884005, + "learning_rate": 1.5880256258321107e-06, + "loss": 0.936, + "step": 24970 + }, + { + "epoch": 0.7524096385542168, + "grad_norm": 3.933693088792404, + "learning_rate": 1.584386778185517e-06, + "loss": 0.9314, + "step": 24980 + }, + { + "epoch": 0.7527108433734939, + "grad_norm": 4.366482613012169, + "learning_rate": 1.580751319331737e-06, + "loss": 0.878, + "step": 24990 + }, + { + "epoch": 0.7530120481927711, + "grad_norm": 1.9704952849185235, + "learning_rate": 1.5771192528776835e-06, + "loss": 0.911, + "step": 25000 + }, + { + "epoch": 0.7533132530120482, + "grad_norm": 4.153164024254792, + "learning_rate": 1.5734905824269031e-06, + "loss": 0.8928, + "step": 25010 + }, + { + "epoch": 0.7536144578313253, + "grad_norm": 2.011400930365946, + "learning_rate": 1.569865311579572e-06, + "loss": 0.832, + "step": 25020 + }, + { + "epoch": 0.7539156626506024, + "grad_norm": 6.482812341396677, + "learning_rate": 1.566243443932496e-06, + "loss": 1.0043, + "step": 25030 + }, + { + "epoch": 0.7542168674698795, + "grad_norm": 4.972915906899383, + "learning_rate": 1.5626249830791008e-06, + "loss": 0.886, + "step": 25040 + }, + { + "epoch": 0.7545180722891566, + "grad_norm": 4.700232307689609, + "learning_rate": 1.5590099326094333e-06, + "loss": 1.011, + "step": 25050 + }, + { + "epoch": 0.7548192771084338, + "grad_norm": 5.37168699961588, + "learning_rate": 1.555398296110161e-06, + "loss": 1.0653, + "step": 25060 + }, + { + "epoch": 0.7551204819277109, + "grad_norm": 1.9595477990699068, + "learning_rate": 1.5517900771645605e-06, + "loss": 0.855, + "step": 25070 + }, + { + "epoch": 0.755421686746988, + "grad_norm": 3.7930480459430465, + "learning_rate": 1.5481852793525143e-06, + "loss": 0.8334, + "step": 25080 + }, + { + "epoch": 0.755722891566265, + "grad_norm": 4.34052251223654, + "learning_rate": 1.5445839062505151e-06, + "loss": 0.8419, + "step": 25090 + }, + { + "epoch": 0.7560240963855421, + "grad_norm": 13.245431403369835, + "learning_rate": 1.5409859614316596e-06, + "loss": 0.8426, + "step": 25100 + }, + { + "epoch": 0.7563253012048192, + "grad_norm": 5.004260104855831, + "learning_rate": 1.5373914484656394e-06, + "loss": 0.8687, + "step": 25110 + }, + { + "epoch": 0.7566265060240964, + "grad_norm": 2.1527268242741924, + "learning_rate": 1.5338003709187416e-06, + "loss": 0.8651, + "step": 25120 + }, + { + "epoch": 0.7569277108433735, + "grad_norm": 3.7745336458304855, + "learning_rate": 1.5302127323538463e-06, + "loss": 0.9284, + "step": 25130 + }, + { + "epoch": 0.7572289156626506, + "grad_norm": 6.693207735348509, + "learning_rate": 1.52662853633042e-06, + "loss": 0.8987, + "step": 25140 + }, + { + "epoch": 0.7575301204819277, + "grad_norm": 6.261899342116549, + "learning_rate": 1.5230477864045152e-06, + "loss": 0.9968, + "step": 25150 + }, + { + "epoch": 0.7578313253012048, + "grad_norm": 2.022033652286476, + "learning_rate": 1.5194704861287629e-06, + "loss": 0.8978, + "step": 25160 + }, + { + "epoch": 0.7581325301204819, + "grad_norm": 2.136605032190541, + "learning_rate": 1.5158966390523783e-06, + "loss": 0.9367, + "step": 25170 + }, + { + "epoch": 0.7584337349397591, + "grad_norm": 5.391587866635553, + "learning_rate": 1.5123262487211404e-06, + "loss": 0.954, + "step": 25180 + }, + { + "epoch": 0.7587349397590362, + "grad_norm": 1.9421043730029601, + "learning_rate": 1.5087593186774052e-06, + "loss": 0.88, + "step": 25190 + }, + { + "epoch": 0.7590361445783133, + "grad_norm": 4.471435294641672, + "learning_rate": 1.5051958524600935e-06, + "loss": 0.9018, + "step": 25200 + }, + { + "epoch": 0.7593373493975903, + "grad_norm": 5.533919834547196, + "learning_rate": 1.5016358536046927e-06, + "loss": 0.9512, + "step": 25210 + }, + { + "epoch": 0.7596385542168674, + "grad_norm": 3.849065864864674, + "learning_rate": 1.4980793256432474e-06, + "loss": 0.8885, + "step": 25220 + }, + { + "epoch": 0.7599397590361445, + "grad_norm": 6.609822498760474, + "learning_rate": 1.4945262721043547e-06, + "loss": 0.8584, + "step": 25230 + }, + { + "epoch": 0.7602409638554217, + "grad_norm": 3.490521918344758, + "learning_rate": 1.4909766965131717e-06, + "loss": 0.9345, + "step": 25240 + }, + { + "epoch": 0.7605421686746988, + "grad_norm": 5.146801124625636, + "learning_rate": 1.4874306023914014e-06, + "loss": 0.9167, + "step": 25250 + }, + { + "epoch": 0.7608433734939759, + "grad_norm": 10.898026703823326, + "learning_rate": 1.4838879932572925e-06, + "loss": 0.9373, + "step": 25260 + }, + { + "epoch": 0.761144578313253, + "grad_norm": 4.133079212933276, + "learning_rate": 1.4803488726256366e-06, + "loss": 0.9397, + "step": 25270 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 6.394389690149214, + "learning_rate": 1.4768132440077643e-06, + "loss": 0.9996, + "step": 25280 + }, + { + "epoch": 0.7617469879518072, + "grad_norm": 6.617472460746821, + "learning_rate": 1.4732811109115409e-06, + "loss": 1.0381, + "step": 25290 + }, + { + "epoch": 0.7620481927710844, + "grad_norm": 7.540817796003784, + "learning_rate": 1.469752476841363e-06, + "loss": 0.9503, + "step": 25300 + }, + { + "epoch": 0.7623493975903615, + "grad_norm": 4.566984221797826, + "learning_rate": 1.4662273452981596e-06, + "loss": 0.9915, + "step": 25310 + }, + { + "epoch": 0.7626506024096386, + "grad_norm": 4.309362832475164, + "learning_rate": 1.462705719779382e-06, + "loss": 0.8111, + "step": 25320 + }, + { + "epoch": 0.7629518072289156, + "grad_norm": 2.075270544694661, + "learning_rate": 1.4591876037789999e-06, + "loss": 0.9168, + "step": 25330 + }, + { + "epoch": 0.7632530120481927, + "grad_norm": 4.078201471715807, + "learning_rate": 1.4556730007875037e-06, + "loss": 0.9053, + "step": 25340 + }, + { + "epoch": 0.7635542168674698, + "grad_norm": 5.59988233509037, + "learning_rate": 1.4521619142919025e-06, + "loss": 0.8567, + "step": 25350 + }, + { + "epoch": 0.763855421686747, + "grad_norm": 5.27547099207277, + "learning_rate": 1.4486543477757097e-06, + "loss": 0.9761, + "step": 25360 + }, + { + "epoch": 0.7641566265060241, + "grad_norm": 4.081857627172848, + "learning_rate": 1.4451503047189513e-06, + "loss": 0.8441, + "step": 25370 + }, + { + "epoch": 0.7644578313253012, + "grad_norm": 1.821742254320572, + "learning_rate": 1.441649788598154e-06, + "loss": 0.9515, + "step": 25380 + }, + { + "epoch": 0.7647590361445783, + "grad_norm": 4.395702922868068, + "learning_rate": 1.438152802886348e-06, + "loss": 0.9061, + "step": 25390 + }, + { + "epoch": 0.7650602409638554, + "grad_norm": 4.236608318380971, + "learning_rate": 1.4346593510530582e-06, + "loss": 0.9274, + "step": 25400 + }, + { + "epoch": 0.7653614457831325, + "grad_norm": 5.825112972513538, + "learning_rate": 1.4311694365643048e-06, + "loss": 0.9536, + "step": 25410 + }, + { + "epoch": 0.7656626506024097, + "grad_norm": 1.9536025996663187, + "learning_rate": 1.4276830628826023e-06, + "loss": 0.9233, + "step": 25420 + }, + { + "epoch": 0.7659638554216868, + "grad_norm": 3.983628338422071, + "learning_rate": 1.4242002334669436e-06, + "loss": 0.9305, + "step": 25430 + }, + { + "epoch": 0.7662650602409639, + "grad_norm": 3.5290911915940693, + "learning_rate": 1.4207209517728099e-06, + "loss": 0.9015, + "step": 25440 + }, + { + "epoch": 0.766566265060241, + "grad_norm": 4.276417649329165, + "learning_rate": 1.4172452212521664e-06, + "loss": 0.9532, + "step": 25450 + }, + { + "epoch": 0.766867469879518, + "grad_norm": 1.9858692635874682, + "learning_rate": 1.4137730453534487e-06, + "loss": 0.9327, + "step": 25460 + }, + { + "epoch": 0.7671686746987951, + "grad_norm": 4.63601285056426, + "learning_rate": 1.41030442752157e-06, + "loss": 0.9592, + "step": 25470 + }, + { + "epoch": 0.7674698795180723, + "grad_norm": 6.333195246507176, + "learning_rate": 1.4068393711979074e-06, + "loss": 0.878, + "step": 25480 + }, + { + "epoch": 0.7677710843373494, + "grad_norm": 5.1602068957904486, + "learning_rate": 1.403377879820312e-06, + "loss": 0.9782, + "step": 25490 + }, + { + "epoch": 0.7680722891566265, + "grad_norm": 5.632738739078735, + "learning_rate": 1.3999199568230942e-06, + "loss": 0.8566, + "step": 25500 + }, + { + "epoch": 0.7683734939759036, + "grad_norm": 6.291188487112138, + "learning_rate": 1.396465605637024e-06, + "loss": 1.0043, + "step": 25510 + }, + { + "epoch": 0.7686746987951807, + "grad_norm": 3.9444677940487876, + "learning_rate": 1.3930148296893276e-06, + "loss": 1.0077, + "step": 25520 + }, + { + "epoch": 0.7689759036144578, + "grad_norm": 4.184600759754945, + "learning_rate": 1.3895676324036844e-06, + "loss": 0.9654, + "step": 25530 + }, + { + "epoch": 0.769277108433735, + "grad_norm": 2.1370891402903993, + "learning_rate": 1.3861240172002243e-06, + "loss": 0.8956, + "step": 25540 + }, + { + "epoch": 0.7695783132530121, + "grad_norm": 3.8578609209664854, + "learning_rate": 1.3826839874955195e-06, + "loss": 0.8371, + "step": 25550 + }, + { + "epoch": 0.7698795180722892, + "grad_norm": 4.3944620783098465, + "learning_rate": 1.3792475467025911e-06, + "loss": 0.8816, + "step": 25560 + }, + { + "epoch": 0.7701807228915662, + "grad_norm": 6.249051779071241, + "learning_rate": 1.3758146982308963e-06, + "loss": 0.9595, + "step": 25570 + }, + { + "epoch": 0.7704819277108433, + "grad_norm": 1.7535816912474849, + "learning_rate": 1.372385445486324e-06, + "loss": 0.9491, + "step": 25580 + }, + { + "epoch": 0.7707831325301204, + "grad_norm": 5.212520780688571, + "learning_rate": 1.3689597918712e-06, + "loss": 0.8841, + "step": 25590 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 1.9867911549451127, + "learning_rate": 1.3655377407842813e-06, + "loss": 0.8906, + "step": 25600 + }, + { + "epoch": 0.7713855421686747, + "grad_norm": 4.224056730000401, + "learning_rate": 1.3621192956207473e-06, + "loss": 0.9832, + "step": 25610 + }, + { + "epoch": 0.7716867469879518, + "grad_norm": 2.0984283815411566, + "learning_rate": 1.3587044597721993e-06, + "loss": 0.9103, + "step": 25620 + }, + { + "epoch": 0.7719879518072289, + "grad_norm": 4.466657918298324, + "learning_rate": 1.3552932366266602e-06, + "loss": 0.9708, + "step": 25630 + }, + { + "epoch": 0.772289156626506, + "grad_norm": 4.419096139977014, + "learning_rate": 1.3518856295685662e-06, + "loss": 0.901, + "step": 25640 + }, + { + "epoch": 0.7725903614457831, + "grad_norm": 6.365450128341391, + "learning_rate": 1.3484816419787672e-06, + "loss": 0.9741, + "step": 25650 + }, + { + "epoch": 0.7728915662650603, + "grad_norm": 4.394197331045439, + "learning_rate": 1.3450812772345218e-06, + "loss": 0.887, + "step": 25660 + }, + { + "epoch": 0.7731927710843374, + "grad_norm": 2.023354498009975, + "learning_rate": 1.3416845387094935e-06, + "loss": 0.9366, + "step": 25670 + }, + { + "epoch": 0.7734939759036145, + "grad_norm": 4.904535265001817, + "learning_rate": 1.3382914297737492e-06, + "loss": 0.8131, + "step": 25680 + }, + { + "epoch": 0.7737951807228916, + "grad_norm": 7.310231445392361, + "learning_rate": 1.3349019537937524e-06, + "loss": 0.957, + "step": 25690 + }, + { + "epoch": 0.7740963855421686, + "grad_norm": 6.724978090106085, + "learning_rate": 1.3315161141323668e-06, + "loss": 0.909, + "step": 25700 + }, + { + "epoch": 0.7743975903614457, + "grad_norm": 2.0153041638234757, + "learning_rate": 1.3281339141488447e-06, + "loss": 0.9193, + "step": 25710 + }, + { + "epoch": 0.7746987951807229, + "grad_norm": 1.9314462357714723, + "learning_rate": 1.3247553571988287e-06, + "loss": 0.9334, + "step": 25720 + }, + { + "epoch": 0.775, + "grad_norm": 4.223220828327241, + "learning_rate": 1.321380446634342e-06, + "loss": 0.9743, + "step": 25730 + }, + { + "epoch": 0.7753012048192771, + "grad_norm": 3.9906748037488002, + "learning_rate": 1.3180091858037992e-06, + "loss": 0.8912, + "step": 25740 + }, + { + "epoch": 0.7756024096385542, + "grad_norm": 2.0000721620101864, + "learning_rate": 1.3146415780519866e-06, + "loss": 0.8386, + "step": 25750 + }, + { + "epoch": 0.7759036144578313, + "grad_norm": 4.821778037362633, + "learning_rate": 1.3112776267200695e-06, + "loss": 0.9574, + "step": 25760 + }, + { + "epoch": 0.7762048192771084, + "grad_norm": 4.030402565410141, + "learning_rate": 1.3079173351455843e-06, + "loss": 0.947, + "step": 25770 + }, + { + "epoch": 0.7765060240963856, + "grad_norm": 5.159710010514664, + "learning_rate": 1.304560706662436e-06, + "loss": 0.8636, + "step": 25780 + }, + { + "epoch": 0.7768072289156627, + "grad_norm": 5.878278774009832, + "learning_rate": 1.3012077446008969e-06, + "loss": 0.9339, + "step": 25790 + }, + { + "epoch": 0.7771084337349398, + "grad_norm": 4.866088555096551, + "learning_rate": 1.2978584522875981e-06, + "loss": 0.925, + "step": 25800 + }, + { + "epoch": 0.7774096385542169, + "grad_norm": 6.796986463528478, + "learning_rate": 1.294512833045537e-06, + "loss": 0.9362, + "step": 25810 + }, + { + "epoch": 0.7777108433734939, + "grad_norm": 6.357639347649964, + "learning_rate": 1.2911708901940572e-06, + "loss": 0.9411, + "step": 25820 + }, + { + "epoch": 0.778012048192771, + "grad_norm": 5.6679978434498155, + "learning_rate": 1.2878326270488622e-06, + "loss": 0.8914, + "step": 25830 + }, + { + "epoch": 0.7783132530120482, + "grad_norm": 2.0276944468996456, + "learning_rate": 1.2844980469220003e-06, + "loss": 0.9039, + "step": 25840 + }, + { + "epoch": 0.7786144578313253, + "grad_norm": 4.731582787629073, + "learning_rate": 1.28116715312187e-06, + "loss": 0.8664, + "step": 25850 + }, + { + "epoch": 0.7789156626506024, + "grad_norm": 4.425328161933482, + "learning_rate": 1.2778399489532085e-06, + "loss": 0.9914, + "step": 25860 + }, + { + "epoch": 0.7792168674698795, + "grad_norm": 2.01502210466282, + "learning_rate": 1.2745164377170937e-06, + "loss": 0.9123, + "step": 25870 + }, + { + "epoch": 0.7795180722891566, + "grad_norm": 3.658112612814461, + "learning_rate": 1.2711966227109402e-06, + "loss": 0.9839, + "step": 25880 + }, + { + "epoch": 0.7798192771084337, + "grad_norm": 1.9038679834427659, + "learning_rate": 1.2678805072284944e-06, + "loss": 0.8812, + "step": 25890 + }, + { + "epoch": 0.7801204819277109, + "grad_norm": 6.253973569990244, + "learning_rate": 1.2645680945598327e-06, + "loss": 0.9659, + "step": 25900 + }, + { + "epoch": 0.780421686746988, + "grad_norm": 3.9352510119647124, + "learning_rate": 1.261259387991358e-06, + "loss": 1.0112, + "step": 25910 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 3.68180517325047, + "learning_rate": 1.2579543908057956e-06, + "loss": 0.7992, + "step": 25920 + }, + { + "epoch": 0.7810240963855422, + "grad_norm": 4.6948677189212455, + "learning_rate": 1.254653106282191e-06, + "loss": 0.8736, + "step": 25930 + }, + { + "epoch": 0.7813253012048192, + "grad_norm": 4.42954429704751, + "learning_rate": 1.2513555376959047e-06, + "loss": 0.9852, + "step": 25940 + }, + { + "epoch": 0.7816265060240963, + "grad_norm": 2.137342924937682, + "learning_rate": 1.248061688318614e-06, + "loss": 0.8774, + "step": 25950 + }, + { + "epoch": 0.7819277108433735, + "grad_norm": 1.975982299553347, + "learning_rate": 1.2447715614183053e-06, + "loss": 0.9876, + "step": 25960 + }, + { + "epoch": 0.7822289156626506, + "grad_norm": 6.550414400072866, + "learning_rate": 1.2414851602592665e-06, + "loss": 0.9902, + "step": 25970 + }, + { + "epoch": 0.7825301204819277, + "grad_norm": 6.126236084567978, + "learning_rate": 1.2382024881020937e-06, + "loss": 0.988, + "step": 25980 + }, + { + "epoch": 0.7828313253012048, + "grad_norm": 2.105531679291249, + "learning_rate": 1.2349235482036849e-06, + "loss": 0.9077, + "step": 25990 + }, + { + "epoch": 0.7831325301204819, + "grad_norm": 2.021805743567685, + "learning_rate": 1.231648343817231e-06, + "loss": 0.898, + "step": 26000 + }, + { + "epoch": 0.783433734939759, + "grad_norm": 4.846061086067915, + "learning_rate": 1.2283768781922195e-06, + "loss": 0.9338, + "step": 26010 + }, + { + "epoch": 0.7837349397590362, + "grad_norm": 4.12075668655565, + "learning_rate": 1.2251091545744265e-06, + "loss": 0.9795, + "step": 26020 + }, + { + "epoch": 0.7840361445783133, + "grad_norm": 4.2233381745493785, + "learning_rate": 1.2218451762059174e-06, + "loss": 0.8972, + "step": 26030 + }, + { + "epoch": 0.7843373493975904, + "grad_norm": 4.236742207240406, + "learning_rate": 1.2185849463250405e-06, + "loss": 0.9595, + "step": 26040 + }, + { + "epoch": 0.7846385542168675, + "grad_norm": 4.45407343943746, + "learning_rate": 1.2153284681664235e-06, + "loss": 1.0062, + "step": 26050 + }, + { + "epoch": 0.7849397590361445, + "grad_norm": 6.0861661086256085, + "learning_rate": 1.212075744960979e-06, + "loss": 1.0037, + "step": 26060 + }, + { + "epoch": 0.7852409638554216, + "grad_norm": 3.70085945934955, + "learning_rate": 1.2088267799358833e-06, + "loss": 0.9244, + "step": 26070 + }, + { + "epoch": 0.7855421686746988, + "grad_norm": 3.9099942360199713, + "learning_rate": 1.2055815763145923e-06, + "loss": 0.9985, + "step": 26080 + }, + { + "epoch": 0.7858433734939759, + "grad_norm": 1.7590069586623232, + "learning_rate": 1.2023401373168248e-06, + "loss": 0.8263, + "step": 26090 + }, + { + "epoch": 0.786144578313253, + "grad_norm": 4.4828406707090735, + "learning_rate": 1.1991024661585704e-06, + "loss": 0.9566, + "step": 26100 + }, + { + "epoch": 0.7864457831325301, + "grad_norm": 1.8357938637751958, + "learning_rate": 1.1958685660520775e-06, + "loss": 0.8148, + "step": 26110 + }, + { + "epoch": 0.7867469879518072, + "grad_norm": 3.6533553251729853, + "learning_rate": 1.1926384402058478e-06, + "loss": 0.9665, + "step": 26120 + }, + { + "epoch": 0.7870481927710843, + "grad_norm": 4.057183995762749, + "learning_rate": 1.189412091824647e-06, + "loss": 0.972, + "step": 26130 + }, + { + "epoch": 0.7873493975903615, + "grad_norm": 4.130984563490733, + "learning_rate": 1.1861895241094895e-06, + "loss": 0.9189, + "step": 26140 + }, + { + "epoch": 0.7876506024096386, + "grad_norm": 4.488646157668425, + "learning_rate": 1.1829707402576375e-06, + "loss": 0.9066, + "step": 26150 + }, + { + "epoch": 0.7879518072289157, + "grad_norm": 8.370888803521982, + "learning_rate": 1.1797557434625994e-06, + "loss": 0.9105, + "step": 26160 + }, + { + "epoch": 0.7882530120481928, + "grad_norm": 2.1094400794436625, + "learning_rate": 1.1765445369141276e-06, + "loss": 0.7758, + "step": 26170 + }, + { + "epoch": 0.7885542168674698, + "grad_norm": 2.0947795704711227, + "learning_rate": 1.1733371237982133e-06, + "loss": 0.915, + "step": 26180 + }, + { + "epoch": 0.7888554216867469, + "grad_norm": 2.0468458622487566, + "learning_rate": 1.1701335072970825e-06, + "loss": 0.9879, + "step": 26190 + }, + { + "epoch": 0.7891566265060241, + "grad_norm": 2.0626807496444, + "learning_rate": 1.1669336905891987e-06, + "loss": 0.9199, + "step": 26200 + }, + { + "epoch": 0.7894578313253012, + "grad_norm": 4.506999692296973, + "learning_rate": 1.1637376768492524e-06, + "loss": 0.9, + "step": 26210 + }, + { + "epoch": 0.7897590361445783, + "grad_norm": 5.957970252927702, + "learning_rate": 1.16054546924816e-06, + "loss": 0.9151, + "step": 26220 + }, + { + "epoch": 0.7900602409638554, + "grad_norm": 5.405482708816962, + "learning_rate": 1.1573570709530608e-06, + "loss": 0.946, + "step": 26230 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 5.447065296117892, + "learning_rate": 1.1541724851273213e-06, + "loss": 0.9401, + "step": 26240 + }, + { + "epoch": 0.7906626506024096, + "grad_norm": 5.157807215195378, + "learning_rate": 1.1509917149305183e-06, + "loss": 0.9993, + "step": 26250 + }, + { + "epoch": 0.7909638554216868, + "grad_norm": 1.8861021646220957, + "learning_rate": 1.1478147635184482e-06, + "loss": 0.9656, + "step": 26260 + }, + { + "epoch": 0.7912650602409639, + "grad_norm": 5.129408790893935, + "learning_rate": 1.144641634043111e-06, + "loss": 1.0337, + "step": 26270 + }, + { + "epoch": 0.791566265060241, + "grad_norm": 4.1282872652533, + "learning_rate": 1.1414723296527247e-06, + "loss": 0.8983, + "step": 26280 + }, + { + "epoch": 0.7918674698795181, + "grad_norm": 1.8705992631642177, + "learning_rate": 1.1383068534917057e-06, + "loss": 0.913, + "step": 26290 + }, + { + "epoch": 0.7921686746987951, + "grad_norm": 4.052602527318772, + "learning_rate": 1.1351452087006737e-06, + "loss": 0.9466, + "step": 26300 + }, + { + "epoch": 0.7924698795180722, + "grad_norm": 1.822214358987177, + "learning_rate": 1.1319873984164475e-06, + "loss": 0.8877, + "step": 26310 + }, + { + "epoch": 0.7927710843373494, + "grad_norm": 4.221300445407804, + "learning_rate": 1.1288334257720412e-06, + "loss": 0.9885, + "step": 26320 + }, + { + "epoch": 0.7930722891566265, + "grad_norm": 2.197146428680317, + "learning_rate": 1.1256832938966622e-06, + "loss": 1.0005, + "step": 26330 + }, + { + "epoch": 0.7933734939759036, + "grad_norm": 4.5410982316680695, + "learning_rate": 1.1225370059157042e-06, + "loss": 0.8014, + "step": 26340 + }, + { + "epoch": 0.7936746987951807, + "grad_norm": 5.395947773614742, + "learning_rate": 1.1193945649507526e-06, + "loss": 0.9843, + "step": 26350 + }, + { + "epoch": 0.7939759036144578, + "grad_norm": 4.461932971950932, + "learning_rate": 1.1162559741195733e-06, + "loss": 0.8463, + "step": 26360 + }, + { + "epoch": 0.7942771084337349, + "grad_norm": 6.6869036841805025, + "learning_rate": 1.1131212365361083e-06, + "loss": 0.8218, + "step": 26370 + }, + { + "epoch": 0.7945783132530121, + "grad_norm": 4.854899512075135, + "learning_rate": 1.1099903553104835e-06, + "loss": 0.9564, + "step": 26380 + }, + { + "epoch": 0.7948795180722892, + "grad_norm": 3.688571325764524, + "learning_rate": 1.1068633335489943e-06, + "loss": 0.886, + "step": 26390 + }, + { + "epoch": 0.7951807228915663, + "grad_norm": 5.898503386801963, + "learning_rate": 1.1037401743541082e-06, + "loss": 0.9585, + "step": 26400 + }, + { + "epoch": 0.7954819277108434, + "grad_norm": 5.451504270955381, + "learning_rate": 1.1006208808244612e-06, + "loss": 0.9333, + "step": 26410 + }, + { + "epoch": 0.7957831325301205, + "grad_norm": 4.581186674401167, + "learning_rate": 1.0975054560548521e-06, + "loss": 0.9347, + "step": 26420 + }, + { + "epoch": 0.7960843373493975, + "grad_norm": 4.238600543352366, + "learning_rate": 1.0943939031362432e-06, + "loss": 0.9473, + "step": 26430 + }, + { + "epoch": 0.7963855421686747, + "grad_norm": 4.2327094364943365, + "learning_rate": 1.0912862251557532e-06, + "loss": 0.8378, + "step": 26440 + }, + { + "epoch": 0.7966867469879518, + "grad_norm": 2.094152982256059, + "learning_rate": 1.0881824251966593e-06, + "loss": 0.8707, + "step": 26450 + }, + { + "epoch": 0.7969879518072289, + "grad_norm": 12.966820603992703, + "learning_rate": 1.0850825063383912e-06, + "loss": 0.8267, + "step": 26460 + }, + { + "epoch": 0.797289156626506, + "grad_norm": 5.9430399670813285, + "learning_rate": 1.0819864716565227e-06, + "loss": 0.8869, + "step": 26470 + }, + { + "epoch": 0.7975903614457831, + "grad_norm": 6.82874328676258, + "learning_rate": 1.0788943242227768e-06, + "loss": 0.8684, + "step": 26480 + }, + { + "epoch": 0.7978915662650602, + "grad_norm": 1.9055393103703155, + "learning_rate": 1.0758060671050247e-06, + "loss": 0.8856, + "step": 26490 + }, + { + "epoch": 0.7981927710843374, + "grad_norm": 4.630105577091288, + "learning_rate": 1.072721703367271e-06, + "loss": 1.0358, + "step": 26500 + }, + { + "epoch": 0.7984939759036145, + "grad_norm": 4.965017021441382, + "learning_rate": 1.0696412360696617e-06, + "loss": 1.0116, + "step": 26510 + }, + { + "epoch": 0.7987951807228916, + "grad_norm": 1.9918649889001563, + "learning_rate": 1.0665646682684722e-06, + "loss": 0.8275, + "step": 26520 + }, + { + "epoch": 0.7990963855421687, + "grad_norm": 2.012178059055671, + "learning_rate": 1.063492003016115e-06, + "loss": 0.8861, + "step": 26530 + }, + { + "epoch": 0.7993975903614458, + "grad_norm": 6.9462012180711215, + "learning_rate": 1.0604232433611272e-06, + "loss": 0.9874, + "step": 26540 + }, + { + "epoch": 0.7996987951807228, + "grad_norm": 4.515389197051747, + "learning_rate": 1.057358392348171e-06, + "loss": 1.0043, + "step": 26550 + }, + { + "epoch": 0.8, + "grad_norm": 5.028675444764515, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.8569, + "step": 26560 + }, + { + "epoch": 0.8003012048192771, + "grad_norm": 2.098557752811368, + "learning_rate": 1.051240428407615e-06, + "loss": 0.8994, + "step": 26570 + }, + { + "epoch": 0.8006024096385542, + "grad_norm": 4.527629327335188, + "learning_rate": 1.0481873215499377e-06, + "loss": 0.9069, + "step": 26580 + }, + { + "epoch": 0.8009036144578313, + "grad_norm": 4.538578866914151, + "learning_rate": 1.0451381354741336e-06, + "loss": 0.9278, + "step": 26590 + }, + { + "epoch": 0.8012048192771084, + "grad_norm": 3.337709672653328, + "learning_rate": 1.042092873205447e-06, + "loss": 0.8074, + "step": 26600 + }, + { + "epoch": 0.8015060240963855, + "grad_norm": 10.619111251584942, + "learning_rate": 1.0390515377652288e-06, + "loss": 0.9584, + "step": 26610 + }, + { + "epoch": 0.8018072289156627, + "grad_norm": 4.81985411250438, + "learning_rate": 1.0360141321709288e-06, + "loss": 0.9831, + "step": 26620 + }, + { + "epoch": 0.8021084337349398, + "grad_norm": 3.3686071971015323, + "learning_rate": 1.0329806594361059e-06, + "loss": 0.8553, + "step": 26630 + }, + { + "epoch": 0.8024096385542169, + "grad_norm": 4.826253370488143, + "learning_rate": 1.0299511225704113e-06, + "loss": 0.9601, + "step": 26640 + }, + { + "epoch": 0.802710843373494, + "grad_norm": 4.7909237230489214, + "learning_rate": 1.0269255245795938e-06, + "loss": 0.9377, + "step": 26650 + }, + { + "epoch": 0.803012048192771, + "grad_norm": 3.985511976147391, + "learning_rate": 1.0239038684654928e-06, + "loss": 1.0171, + "step": 26660 + }, + { + "epoch": 0.8033132530120481, + "grad_norm": 4.668711471278927, + "learning_rate": 1.020886157226038e-06, + "loss": 0.9964, + "step": 26670 + }, + { + "epoch": 0.8036144578313253, + "grad_norm": 1.923426713562242, + "learning_rate": 1.0178723938552449e-06, + "loss": 0.9291, + "step": 26680 + }, + { + "epoch": 0.8039156626506024, + "grad_norm": 4.4701170492883815, + "learning_rate": 1.01486258134321e-06, + "loss": 0.8607, + "step": 26690 + }, + { + "epoch": 0.8042168674698795, + "grad_norm": 4.72559958319008, + "learning_rate": 1.011856722676116e-06, + "loss": 0.9948, + "step": 26700 + }, + { + "epoch": 0.8045180722891566, + "grad_norm": 6.178153048399851, + "learning_rate": 1.0088548208362149e-06, + "loss": 0.978, + "step": 26710 + }, + { + "epoch": 0.8048192771084337, + "grad_norm": 3.9557340854033525, + "learning_rate": 1.0058568788018374e-06, + "loss": 0.9143, + "step": 26720 + }, + { + "epoch": 0.8051204819277108, + "grad_norm": 6.397975028572285, + "learning_rate": 1.0028628995473844e-06, + "loss": 0.955, + "step": 26730 + }, + { + "epoch": 0.805421686746988, + "grad_norm": 3.949178383445514, + "learning_rate": 9.998728860433277e-07, + "loss": 0.9726, + "step": 26740 + }, + { + "epoch": 0.8057228915662651, + "grad_norm": 5.8382587758648, + "learning_rate": 9.968868412562012e-07, + "loss": 0.9155, + "step": 26750 + }, + { + "epoch": 0.8060240963855422, + "grad_norm": 1.7851946741763713, + "learning_rate": 9.939047681486018e-07, + "loss": 0.905, + "step": 26760 + }, + { + "epoch": 0.8063253012048193, + "grad_norm": 3.9588179541706343, + "learning_rate": 9.90926669679187e-07, + "loss": 0.8882, + "step": 26770 + }, + { + "epoch": 0.8066265060240964, + "grad_norm": 5.512485201013972, + "learning_rate": 9.879525488026704e-07, + "loss": 0.9073, + "step": 26780 + }, + { + "epoch": 0.8069277108433734, + "grad_norm": 4.473173363507019, + "learning_rate": 9.84982408469819e-07, + "loss": 0.8525, + "step": 26790 + }, + { + "epoch": 0.8072289156626506, + "grad_norm": 4.623229976299066, + "learning_rate": 9.820162516274512e-07, + "loss": 0.9826, + "step": 26800 + }, + { + "epoch": 0.8075301204819277, + "grad_norm": 2.0461130174625115, + "learning_rate": 9.790540812184319e-07, + "loss": 0.908, + "step": 26810 + }, + { + "epoch": 0.8078313253012048, + "grad_norm": 1.821179105196857, + "learning_rate": 9.760959001816723e-07, + "loss": 0.8484, + "step": 26820 + }, + { + "epoch": 0.8081325301204819, + "grad_norm": 2.122677229149528, + "learning_rate": 9.731417114521246e-07, + "loss": 0.8311, + "step": 26830 + }, + { + "epoch": 0.808433734939759, + "grad_norm": 1.9949072899409344, + "learning_rate": 9.70191517960779e-07, + "loss": 0.9034, + "step": 26840 + }, + { + "epoch": 0.8087349397590361, + "grad_norm": 4.051532263239917, + "learning_rate": 9.672453226346673e-07, + "loss": 0.8529, + "step": 26850 + }, + { + "epoch": 0.8090361445783133, + "grad_norm": 5.001389372697416, + "learning_rate": 9.643031283968467e-07, + "loss": 0.9398, + "step": 26860 + }, + { + "epoch": 0.8093373493975904, + "grad_norm": 1.995055674959236, + "learning_rate": 9.61364938166408e-07, + "loss": 0.9159, + "step": 26870 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 7.349946295979108, + "learning_rate": 9.584307548584725e-07, + "loss": 0.876, + "step": 26880 + }, + { + "epoch": 0.8099397590361446, + "grad_norm": 1.9244608147555216, + "learning_rate": 9.555005813841827e-07, + "loss": 0.8988, + "step": 26890 + }, + { + "epoch": 0.8102409638554217, + "grad_norm": 7.079759318224367, + "learning_rate": 9.525744206507032e-07, + "loss": 0.8887, + "step": 26900 + }, + { + "epoch": 0.8105421686746987, + "grad_norm": 4.764109926375551, + "learning_rate": 9.496522755612175e-07, + "loss": 0.9922, + "step": 26910 + }, + { + "epoch": 0.810843373493976, + "grad_norm": 3.475587394695104, + "learning_rate": 9.467341490149251e-07, + "loss": 0.9398, + "step": 26920 + }, + { + "epoch": 0.811144578313253, + "grad_norm": 3.433999525350799, + "learning_rate": 9.438200439070388e-07, + "loss": 0.8413, + "step": 26930 + }, + { + "epoch": 0.8114457831325301, + "grad_norm": 6.004915672093086, + "learning_rate": 9.409099631287799e-07, + "loss": 0.9537, + "step": 26940 + }, + { + "epoch": 0.8117469879518072, + "grad_norm": 5.08164683715821, + "learning_rate": 9.380039095673821e-07, + "loss": 0.8516, + "step": 26950 + }, + { + "epoch": 0.8120481927710843, + "grad_norm": 1.9924714282201934, + "learning_rate": 9.351018861060762e-07, + "loss": 0.8413, + "step": 26960 + }, + { + "epoch": 0.8123493975903614, + "grad_norm": 3.730570459866916, + "learning_rate": 9.322038956240992e-07, + "loss": 0.8713, + "step": 26970 + }, + { + "epoch": 0.8126506024096386, + "grad_norm": 4.125942399348648, + "learning_rate": 9.293099409966844e-07, + "loss": 0.9627, + "step": 26980 + }, + { + "epoch": 0.8129518072289157, + "grad_norm": 4.636791732008932, + "learning_rate": 9.264200250950645e-07, + "loss": 0.841, + "step": 26990 + }, + { + "epoch": 0.8132530120481928, + "grad_norm": 5.350309927339916, + "learning_rate": 9.235341507864637e-07, + "loss": 1.0298, + "step": 27000 + }, + { + "epoch": 0.8135542168674699, + "grad_norm": 4.087683638110185, + "learning_rate": 9.206523209340912e-07, + "loss": 0.9649, + "step": 27010 + }, + { + "epoch": 0.813855421686747, + "grad_norm": 4.769646376143996, + "learning_rate": 9.177745383971515e-07, + "loss": 0.9672, + "step": 27020 + }, + { + "epoch": 0.814156626506024, + "grad_norm": 5.657734836139984, + "learning_rate": 9.14900806030829e-07, + "loss": 0.898, + "step": 27030 + }, + { + "epoch": 0.8144578313253013, + "grad_norm": 4.37602630907073, + "learning_rate": 9.120311266862908e-07, + "loss": 0.9639, + "step": 27040 + }, + { + "epoch": 0.8147590361445783, + "grad_norm": 3.7446863659705256, + "learning_rate": 9.091655032106822e-07, + "loss": 1.0256, + "step": 27050 + }, + { + "epoch": 0.8150602409638554, + "grad_norm": 5.369127240316652, + "learning_rate": 9.063039384471256e-07, + "loss": 1.0176, + "step": 27060 + }, + { + "epoch": 0.8153614457831325, + "grad_norm": 4.056586251695432, + "learning_rate": 9.034464352347156e-07, + "loss": 0.8408, + "step": 27070 + }, + { + "epoch": 0.8156626506024096, + "grad_norm": 6.484866556665476, + "learning_rate": 9.005929964085169e-07, + "loss": 0.9512, + "step": 27080 + }, + { + "epoch": 0.8159638554216867, + "grad_norm": 2.0939348844131223, + "learning_rate": 8.977436247995619e-07, + "loss": 0.935, + "step": 27090 + }, + { + "epoch": 0.8162650602409639, + "grad_norm": 4.1015451369900395, + "learning_rate": 8.948983232348518e-07, + "loss": 0.9712, + "step": 27100 + }, + { + "epoch": 0.816566265060241, + "grad_norm": 12.325101995524495, + "learning_rate": 8.920570945373414e-07, + "loss": 0.8931, + "step": 27110 + }, + { + "epoch": 0.8168674698795181, + "grad_norm": 7.337973386375015, + "learning_rate": 8.892199415259501e-07, + "loss": 0.8685, + "step": 27120 + }, + { + "epoch": 0.8171686746987952, + "grad_norm": 1.9946831973745822, + "learning_rate": 8.863868670155545e-07, + "loss": 0.8989, + "step": 27130 + }, + { + "epoch": 0.8174698795180723, + "grad_norm": 4.4044825487754045, + "learning_rate": 8.835578738169825e-07, + "loss": 0.9525, + "step": 27140 + }, + { + "epoch": 0.8177710843373494, + "grad_norm": 5.400466321171892, + "learning_rate": 8.807329647370139e-07, + "loss": 0.8243, + "step": 27150 + }, + { + "epoch": 0.8180722891566266, + "grad_norm": 5.935745729566731, + "learning_rate": 8.779121425783721e-07, + "loss": 0.9381, + "step": 27160 + }, + { + "epoch": 0.8183734939759036, + "grad_norm": 4.614854388393017, + "learning_rate": 8.750954101397335e-07, + "loss": 1.0315, + "step": 27170 + }, + { + "epoch": 0.8186746987951807, + "grad_norm": 1.9215296013732723, + "learning_rate": 8.722827702157105e-07, + "loss": 1.0177, + "step": 27180 + }, + { + "epoch": 0.8189759036144578, + "grad_norm": 1.9749572471131391, + "learning_rate": 8.694742255968586e-07, + "loss": 0.9112, + "step": 27190 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 2.0190811152014208, + "learning_rate": 8.666697790696682e-07, + "loss": 0.8902, + "step": 27200 + }, + { + "epoch": 0.819578313253012, + "grad_norm": 4.08313914158281, + "learning_rate": 8.638694334165642e-07, + "loss": 0.9238, + "step": 27210 + }, + { + "epoch": 0.8198795180722892, + "grad_norm": 4.244208090872529, + "learning_rate": 8.610731914159037e-07, + "loss": 0.9479, + "step": 27220 + }, + { + "epoch": 0.8201807228915663, + "grad_norm": 4.531661928586746, + "learning_rate": 8.582810558419708e-07, + "loss": 0.9322, + "step": 27230 + }, + { + "epoch": 0.8204819277108434, + "grad_norm": 4.523393454805247, + "learning_rate": 8.554930294649777e-07, + "loss": 0.9203, + "step": 27240 + }, + { + "epoch": 0.8207831325301205, + "grad_norm": 1.8254250854960814, + "learning_rate": 8.527091150510592e-07, + "loss": 0.9018, + "step": 27250 + }, + { + "epoch": 0.8210843373493976, + "grad_norm": 2.010002656092585, + "learning_rate": 8.499293153622657e-07, + "loss": 0.8595, + "step": 27260 + }, + { + "epoch": 0.8213855421686747, + "grad_norm": 2.0340982050325973, + "learning_rate": 8.471536331565733e-07, + "loss": 0.9179, + "step": 27270 + }, + { + "epoch": 0.8216867469879519, + "grad_norm": 1.9881953867347788, + "learning_rate": 8.443820711878664e-07, + "loss": 0.8816, + "step": 27280 + }, + { + "epoch": 0.821987951807229, + "grad_norm": 4.854072714959214, + "learning_rate": 8.416146322059449e-07, + "loss": 0.8966, + "step": 27290 + }, + { + "epoch": 0.822289156626506, + "grad_norm": 22.399579502033067, + "learning_rate": 8.388513189565161e-07, + "loss": 0.9374, + "step": 27300 + }, + { + "epoch": 0.8225903614457831, + "grad_norm": 14.495352721079653, + "learning_rate": 8.360921341811956e-07, + "loss": 0.9245, + "step": 27310 + }, + { + "epoch": 0.8228915662650602, + "grad_norm": 7.079842567759796, + "learning_rate": 8.333370806175017e-07, + "loss": 0.8785, + "step": 27320 + }, + { + "epoch": 0.8231927710843373, + "grad_norm": 6.054767404580933, + "learning_rate": 8.305861609988553e-07, + "loss": 0.8306, + "step": 27330 + }, + { + "epoch": 0.8234939759036145, + "grad_norm": 5.258622117514524, + "learning_rate": 8.278393780545746e-07, + "loss": 0.9699, + "step": 27340 + }, + { + "epoch": 0.8237951807228916, + "grad_norm": 2.036278786840511, + "learning_rate": 8.250967345098731e-07, + "loss": 0.9187, + "step": 27350 + }, + { + "epoch": 0.8240963855421687, + "grad_norm": 2.0687201110763773, + "learning_rate": 8.223582330858599e-07, + "loss": 0.9041, + "step": 27360 + }, + { + "epoch": 0.8243975903614458, + "grad_norm": 6.540868711120688, + "learning_rate": 8.196238764995301e-07, + "loss": 0.9113, + "step": 27370 + }, + { + "epoch": 0.8246987951807229, + "grad_norm": 4.674074221348497, + "learning_rate": 8.168936674637728e-07, + "loss": 0.9354, + "step": 27380 + }, + { + "epoch": 0.825, + "grad_norm": 5.2923946551416226, + "learning_rate": 8.141676086873574e-07, + "loss": 0.8405, + "step": 27390 + }, + { + "epoch": 0.8253012048192772, + "grad_norm": 17.16648235120451, + "learning_rate": 8.114457028749373e-07, + "loss": 0.8569, + "step": 27400 + }, + { + "epoch": 0.8256024096385542, + "grad_norm": 5.5318367812979465, + "learning_rate": 8.08727952727042e-07, + "loss": 0.9101, + "step": 27410 + }, + { + "epoch": 0.8259036144578313, + "grad_norm": 11.042453676325039, + "learning_rate": 8.060143609400845e-07, + "loss": 0.9282, + "step": 27420 + }, + { + "epoch": 0.8262048192771084, + "grad_norm": 6.320479980702559, + "learning_rate": 8.033049302063478e-07, + "loss": 0.9145, + "step": 27430 + }, + { + "epoch": 0.8265060240963855, + "grad_norm": 5.461256521021767, + "learning_rate": 8.005996632139867e-07, + "loss": 0.9507, + "step": 27440 + }, + { + "epoch": 0.8268072289156626, + "grad_norm": 5.4490153905479, + "learning_rate": 7.978985626470264e-07, + "loss": 0.932, + "step": 27450 + }, + { + "epoch": 0.8271084337349398, + "grad_norm": 33.65867301668931, + "learning_rate": 7.952016311853572e-07, + "loss": 0.922, + "step": 27460 + }, + { + "epoch": 0.8274096385542169, + "grad_norm": 9.655256469529428, + "learning_rate": 7.925088715047341e-07, + "loss": 0.8934, + "step": 27470 + }, + { + "epoch": 0.827710843373494, + "grad_norm": 5.215517515809109, + "learning_rate": 7.898202862767701e-07, + "loss": 0.9111, + "step": 27480 + }, + { + "epoch": 0.8280120481927711, + "grad_norm": 3.9926902043041985, + "learning_rate": 7.871358781689431e-07, + "loss": 0.869, + "step": 27490 + }, + { + "epoch": 0.8283132530120482, + "grad_norm": 6.770181487323086, + "learning_rate": 7.844556498445788e-07, + "loss": 1.0588, + "step": 27500 + }, + { + "epoch": 0.8286144578313253, + "grad_norm": 4.003342302838501, + "learning_rate": 7.8177960396286e-07, + "loss": 0.9382, + "step": 27510 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 6.312607703702879, + "learning_rate": 7.791077431788207e-07, + "loss": 1.025, + "step": 27520 + }, + { + "epoch": 0.8292168674698795, + "grad_norm": 3.9398134718276556, + "learning_rate": 7.764400701433417e-07, + "loss": 0.9401, + "step": 27530 + }, + { + "epoch": 0.8295180722891566, + "grad_norm": 5.226230967539029, + "learning_rate": 7.737765875031477e-07, + "loss": 0.9035, + "step": 27540 + }, + { + "epoch": 0.8298192771084337, + "grad_norm": 5.003291406080699, + "learning_rate": 7.711172979008069e-07, + "loss": 0.8738, + "step": 27550 + }, + { + "epoch": 0.8301204819277108, + "grad_norm": 6.972945541490178, + "learning_rate": 7.684622039747281e-07, + "loss": 0.9869, + "step": 27560 + }, + { + "epoch": 0.8304216867469879, + "grad_norm": 4.674914719889957, + "learning_rate": 7.658113083591556e-07, + "loss": 0.946, + "step": 27570 + }, + { + "epoch": 0.8307228915662651, + "grad_norm": 4.091640041552626, + "learning_rate": 7.631646136841702e-07, + "loss": 0.8646, + "step": 27580 + }, + { + "epoch": 0.8310240963855422, + "grad_norm": 6.11366045652225, + "learning_rate": 7.605221225756837e-07, + "loss": 0.8583, + "step": 27590 + }, + { + "epoch": 0.8313253012048193, + "grad_norm": 4.702228869067837, + "learning_rate": 7.578838376554376e-07, + "loss": 0.9626, + "step": 27600 + }, + { + "epoch": 0.8316265060240964, + "grad_norm": 2.0034731218658015, + "learning_rate": 7.552497615410003e-07, + "loss": 0.8524, + "step": 27610 + }, + { + "epoch": 0.8319277108433735, + "grad_norm": 4.105676082020187, + "learning_rate": 7.526198968457632e-07, + "loss": 0.8579, + "step": 27620 + }, + { + "epoch": 0.8322289156626506, + "grad_norm": 8.181732148696257, + "learning_rate": 7.499942461789423e-07, + "loss": 0.9798, + "step": 27630 + }, + { + "epoch": 0.8325301204819278, + "grad_norm": 5.061684993884733, + "learning_rate": 7.473728121455703e-07, + "loss": 0.9056, + "step": 27640 + }, + { + "epoch": 0.8328313253012049, + "grad_norm": 1.90854313938708, + "learning_rate": 7.447555973464988e-07, + "loss": 0.8684, + "step": 27650 + }, + { + "epoch": 0.8331325301204819, + "grad_norm": 4.554426097884475, + "learning_rate": 7.421426043783864e-07, + "loss": 0.9668, + "step": 27660 + }, + { + "epoch": 0.833433734939759, + "grad_norm": 5.653339492385188, + "learning_rate": 7.395338358337123e-07, + "loss": 0.92, + "step": 27670 + }, + { + "epoch": 0.8337349397590361, + "grad_norm": 1.9014041837363123, + "learning_rate": 7.369292943007589e-07, + "loss": 0.9226, + "step": 27680 + }, + { + "epoch": 0.8340361445783132, + "grad_norm": 4.548615361967034, + "learning_rate": 7.343289823636168e-07, + "loss": 0.8829, + "step": 27690 + }, + { + "epoch": 0.8343373493975904, + "grad_norm": 3.809847836225357, + "learning_rate": 7.317329026021791e-07, + "loss": 0.8267, + "step": 27700 + }, + { + "epoch": 0.8346385542168675, + "grad_norm": 2.142259832202635, + "learning_rate": 7.291410575921399e-07, + "loss": 0.8336, + "step": 27710 + }, + { + "epoch": 0.8349397590361446, + "grad_norm": 6.038088531105821, + "learning_rate": 7.265534499049925e-07, + "loss": 1.0077, + "step": 27720 + }, + { + "epoch": 0.8352409638554217, + "grad_norm": 10.756075274716798, + "learning_rate": 7.239700821080254e-07, + "loss": 0.913, + "step": 27730 + }, + { + "epoch": 0.8355421686746988, + "grad_norm": 11.948788612904204, + "learning_rate": 7.213909567643235e-07, + "loss": 0.9393, + "step": 27740 + }, + { + "epoch": 0.8358433734939759, + "grad_norm": 5.679665390212351, + "learning_rate": 7.188160764327573e-07, + "loss": 0.926, + "step": 27750 + }, + { + "epoch": 0.8361445783132531, + "grad_norm": 3.898848576546342, + "learning_rate": 7.162454436679883e-07, + "loss": 0.8605, + "step": 27760 + }, + { + "epoch": 0.8364457831325302, + "grad_norm": 1.938386863051616, + "learning_rate": 7.136790610204663e-07, + "loss": 0.941, + "step": 27770 + }, + { + "epoch": 0.8367469879518072, + "grad_norm": 3.751209077060178, + "learning_rate": 7.111169310364203e-07, + "loss": 1.0116, + "step": 27780 + }, + { + "epoch": 0.8370481927710843, + "grad_norm": 9.978767089668626, + "learning_rate": 7.08559056257862e-07, + "loss": 0.9203, + "step": 27790 + }, + { + "epoch": 0.8373493975903614, + "grad_norm": 2.106694785336448, + "learning_rate": 7.060054392225812e-07, + "loss": 0.7004, + "step": 27800 + }, + { + "epoch": 0.8376506024096385, + "grad_norm": 2.0191945671021276, + "learning_rate": 7.034560824641434e-07, + "loss": 0.9192, + "step": 27810 + }, + { + "epoch": 0.8379518072289157, + "grad_norm": 2.0488468271818965, + "learning_rate": 7.009109885118859e-07, + "loss": 0.7888, + "step": 27820 + }, + { + "epoch": 0.8382530120481928, + "grad_norm": 4.571375666175543, + "learning_rate": 6.983701598909192e-07, + "loss": 0.9304, + "step": 27830 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 1.8653745309472625, + "learning_rate": 6.958335991221194e-07, + "loss": 0.857, + "step": 27840 + }, + { + "epoch": 0.838855421686747, + "grad_norm": 4.377028856709774, + "learning_rate": 6.933013087221302e-07, + "loss": 0.9011, + "step": 27850 + }, + { + "epoch": 0.8391566265060241, + "grad_norm": 4.817173377502368, + "learning_rate": 6.907732912033577e-07, + "loss": 0.8631, + "step": 27860 + }, + { + "epoch": 0.8394578313253012, + "grad_norm": 1.9625969406495993, + "learning_rate": 6.882495490739671e-07, + "loss": 0.9104, + "step": 27870 + }, + { + "epoch": 0.8397590361445784, + "grad_norm": 9.30273923683444, + "learning_rate": 6.857300848378857e-07, + "loss": 0.9997, + "step": 27880 + }, + { + "epoch": 0.8400602409638555, + "grad_norm": 6.362259796832259, + "learning_rate": 6.832149009947942e-07, + "loss": 0.8664, + "step": 27890 + }, + { + "epoch": 0.8403614457831325, + "grad_norm": 2.0409815446761757, + "learning_rate": 6.807040000401244e-07, + "loss": 0.8709, + "step": 27900 + }, + { + "epoch": 0.8406626506024096, + "grad_norm": 2.18058333288868, + "learning_rate": 6.781973844650614e-07, + "loss": 0.8885, + "step": 27910 + }, + { + "epoch": 0.8409638554216867, + "grad_norm": 4.384988353873599, + "learning_rate": 6.75695056756539e-07, + "loss": 0.993, + "step": 27920 + }, + { + "epoch": 0.8412650602409638, + "grad_norm": 4.6712190495818815, + "learning_rate": 6.73197019397236e-07, + "loss": 0.9277, + "step": 27930 + }, + { + "epoch": 0.841566265060241, + "grad_norm": 5.043727698460975, + "learning_rate": 6.70703274865574e-07, + "loss": 0.9338, + "step": 27940 + }, + { + "epoch": 0.8418674698795181, + "grad_norm": 6.531805156073384, + "learning_rate": 6.682138256357157e-07, + "loss": 0.8899, + "step": 27950 + }, + { + "epoch": 0.8421686746987952, + "grad_norm": 2.1279970719393453, + "learning_rate": 6.657286741775626e-07, + "loss": 0.8106, + "step": 27960 + }, + { + "epoch": 0.8424698795180723, + "grad_norm": 3.9752962599390713, + "learning_rate": 6.632478229567524e-07, + "loss": 0.8501, + "step": 27970 + }, + { + "epoch": 0.8427710843373494, + "grad_norm": 1.9133696843401249, + "learning_rate": 6.60771274434654e-07, + "loss": 0.8012, + "step": 27980 + }, + { + "epoch": 0.8430722891566265, + "grad_norm": 2.0254618726877114, + "learning_rate": 6.582990310683729e-07, + "loss": 0.8161, + "step": 27990 + }, + { + "epoch": 0.8433734939759037, + "grad_norm": 4.752619948600181, + "learning_rate": 6.558310953107372e-07, + "loss": 0.9181, + "step": 28000 + }, + { + "epoch": 0.8436746987951808, + "grad_norm": 9.54033400244362, + "learning_rate": 6.533674696103026e-07, + "loss": 0.9582, + "step": 28010 + }, + { + "epoch": 0.8439759036144578, + "grad_norm": 8.0533671144653, + "learning_rate": 6.509081564113529e-07, + "loss": 0.9613, + "step": 28020 + }, + { + "epoch": 0.8442771084337349, + "grad_norm": 11.676701993064908, + "learning_rate": 6.484531581538878e-07, + "loss": 0.9921, + "step": 28030 + }, + { + "epoch": 0.844578313253012, + "grad_norm": 6.1489331001793275, + "learning_rate": 6.460024772736301e-07, + "loss": 0.9244, + "step": 28040 + }, + { + "epoch": 0.8448795180722891, + "grad_norm": 6.338948370998788, + "learning_rate": 6.435561162020138e-07, + "loss": 0.9592, + "step": 28050 + }, + { + "epoch": 0.8451807228915663, + "grad_norm": 5.08435345343287, + "learning_rate": 6.411140773661933e-07, + "loss": 0.8618, + "step": 28060 + }, + { + "epoch": 0.8454819277108434, + "grad_norm": 4.90263337957172, + "learning_rate": 6.386763631890313e-07, + "loss": 0.856, + "step": 28070 + }, + { + "epoch": 0.8457831325301205, + "grad_norm": 7.501212199113993, + "learning_rate": 6.362429760890998e-07, + "loss": 0.9388, + "step": 28080 + }, + { + "epoch": 0.8460843373493976, + "grad_norm": 4.989775120116956, + "learning_rate": 6.338139184806791e-07, + "loss": 0.972, + "step": 28090 + }, + { + "epoch": 0.8463855421686747, + "grad_norm": 4.313002046257058, + "learning_rate": 6.31389192773752e-07, + "loss": 0.957, + "step": 28100 + }, + { + "epoch": 0.8466867469879518, + "grad_norm": 13.806894022089965, + "learning_rate": 6.289688013740047e-07, + "loss": 0.9736, + "step": 28110 + }, + { + "epoch": 0.846987951807229, + "grad_norm": 5.688792724738393, + "learning_rate": 6.265527466828231e-07, + "loss": 0.9773, + "step": 28120 + }, + { + "epoch": 0.8472891566265061, + "grad_norm": 5.432975160320383, + "learning_rate": 6.241410310972906e-07, + "loss": 0.8773, + "step": 28130 + }, + { + "epoch": 0.8475903614457831, + "grad_norm": 4.854691723593697, + "learning_rate": 6.217336570101873e-07, + "loss": 0.9537, + "step": 28140 + }, + { + "epoch": 0.8478915662650602, + "grad_norm": 8.78639374796019, + "learning_rate": 6.193306268099802e-07, + "loss": 0.8473, + "step": 28150 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 9.266631210306816, + "learning_rate": 6.169319428808307e-07, + "loss": 0.9589, + "step": 28160 + }, + { + "epoch": 0.8484939759036144, + "grad_norm": 4.322136497921134, + "learning_rate": 6.145376076025895e-07, + "loss": 0.9879, + "step": 28170 + }, + { + "epoch": 0.8487951807228916, + "grad_norm": 1.8729514199890531, + "learning_rate": 6.1214762335079e-07, + "loss": 0.7766, + "step": 28180 + }, + { + "epoch": 0.8490963855421687, + "grad_norm": 1.9564038691269208, + "learning_rate": 6.097619924966497e-07, + "loss": 0.9058, + "step": 28190 + }, + { + "epoch": 0.8493975903614458, + "grad_norm": 5.975231215790924, + "learning_rate": 6.073807174070667e-07, + "loss": 0.9175, + "step": 28200 + }, + { + "epoch": 0.8496987951807229, + "grad_norm": 4.1131203058007975, + "learning_rate": 6.050038004446179e-07, + "loss": 0.8922, + "step": 28210 + }, + { + "epoch": 0.85, + "grad_norm": 6.169024954256811, + "learning_rate": 6.026312439675553e-07, + "loss": 1.0015, + "step": 28220 + }, + { + "epoch": 0.8503012048192771, + "grad_norm": 5.934489393567222, + "learning_rate": 6.002630503298057e-07, + "loss": 0.9796, + "step": 28230 + }, + { + "epoch": 0.8506024096385543, + "grad_norm": 2.0530468074302832, + "learning_rate": 5.978992218809665e-07, + "loss": 0.7166, + "step": 28240 + }, + { + "epoch": 0.8509036144578314, + "grad_norm": 5.537437951695425, + "learning_rate": 5.955397609663061e-07, + "loss": 0.755, + "step": 28250 + }, + { + "epoch": 0.8512048192771084, + "grad_norm": 5.2229317713025845, + "learning_rate": 5.931846699267558e-07, + "loss": 0.9379, + "step": 28260 + }, + { + "epoch": 0.8515060240963855, + "grad_norm": 4.330571942184469, + "learning_rate": 5.908339510989158e-07, + "loss": 0.9476, + "step": 28270 + }, + { + "epoch": 0.8518072289156626, + "grad_norm": 4.870343631628832, + "learning_rate": 5.884876068150463e-07, + "loss": 0.958, + "step": 28280 + }, + { + "epoch": 0.8521084337349397, + "grad_norm": 6.372409493185241, + "learning_rate": 5.861456394030679e-07, + "loss": 0.9806, + "step": 28290 + }, + { + "epoch": 0.8524096385542169, + "grad_norm": 5.715537889441745, + "learning_rate": 5.838080511865557e-07, + "loss": 0.9923, + "step": 28300 + }, + { + "epoch": 0.852710843373494, + "grad_norm": 1.9619118766202412, + "learning_rate": 5.81474844484744e-07, + "loss": 0.83, + "step": 28310 + }, + { + "epoch": 0.8530120481927711, + "grad_norm": 4.227665926617872, + "learning_rate": 5.791460216125194e-07, + "loss": 0.9914, + "step": 28320 + }, + { + "epoch": 0.8533132530120482, + "grad_norm": 4.853282412993819, + "learning_rate": 5.768215848804165e-07, + "loss": 0.9985, + "step": 28330 + }, + { + "epoch": 0.8536144578313253, + "grad_norm": 5.066157065378188, + "learning_rate": 5.745015365946211e-07, + "loss": 0.7837, + "step": 28340 + }, + { + "epoch": 0.8539156626506024, + "grad_norm": 5.156033438228298, + "learning_rate": 5.721858790569634e-07, + "loss": 0.9316, + "step": 28350 + }, + { + "epoch": 0.8542168674698796, + "grad_norm": 4.299751966843597, + "learning_rate": 5.69874614564917e-07, + "loss": 0.9433, + "step": 28360 + }, + { + "epoch": 0.8545180722891567, + "grad_norm": 1.981874435981955, + "learning_rate": 5.675677454115979e-07, + "loss": 0.9611, + "step": 28370 + }, + { + "epoch": 0.8548192771084338, + "grad_norm": 4.902718733660948, + "learning_rate": 5.652652738857628e-07, + "loss": 0.9413, + "step": 28380 + }, + { + "epoch": 0.8551204819277108, + "grad_norm": 2.1582210019316275, + "learning_rate": 5.629672022718013e-07, + "loss": 0.9425, + "step": 28390 + }, + { + "epoch": 0.8554216867469879, + "grad_norm": 5.880947894711182, + "learning_rate": 5.606735328497409e-07, + "loss": 0.9013, + "step": 28400 + }, + { + "epoch": 0.855722891566265, + "grad_norm": 5.2299504726403026, + "learning_rate": 5.583842678952389e-07, + "loss": 0.9574, + "step": 28410 + }, + { + "epoch": 0.8560240963855422, + "grad_norm": 10.1635832698442, + "learning_rate": 5.560994096795869e-07, + "loss": 0.9176, + "step": 28420 + }, + { + "epoch": 0.8563253012048193, + "grad_norm": 4.693638440759502, + "learning_rate": 5.538189604697003e-07, + "loss": 0.9092, + "step": 28430 + }, + { + "epoch": 0.8566265060240964, + "grad_norm": 2.0933814621392255, + "learning_rate": 5.515429225281222e-07, + "loss": 0.9523, + "step": 28440 + }, + { + "epoch": 0.8569277108433735, + "grad_norm": 7.34226923750672, + "learning_rate": 5.492712981130171e-07, + "loss": 0.9428, + "step": 28450 + }, + { + "epoch": 0.8572289156626506, + "grad_norm": 5.167497352716724, + "learning_rate": 5.470040894781731e-07, + "loss": 0.9119, + "step": 28460 + }, + { + "epoch": 0.8575301204819277, + "grad_norm": 4.948847085140676, + "learning_rate": 5.447412988729961e-07, + "loss": 0.8916, + "step": 28470 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 8.995865512218028, + "learning_rate": 5.424829285425077e-07, + "loss": 0.9015, + "step": 28480 + }, + { + "epoch": 0.858132530120482, + "grad_norm": 2.1015400470155883, + "learning_rate": 5.402289807273453e-07, + "loss": 0.8613, + "step": 28490 + }, + { + "epoch": 0.858433734939759, + "grad_norm": 1.9559923309080423, + "learning_rate": 5.379794576637581e-07, + "loss": 0.8829, + "step": 28500 + }, + { + "epoch": 0.8587349397590361, + "grad_norm": 4.912852581162225, + "learning_rate": 5.357343615836041e-07, + "loss": 0.9039, + "step": 28510 + }, + { + "epoch": 0.8590361445783132, + "grad_norm": 4.7333957609298, + "learning_rate": 5.334936947143526e-07, + "loss": 0.9241, + "step": 28520 + }, + { + "epoch": 0.8593373493975903, + "grad_norm": 5.678929972574859, + "learning_rate": 5.312574592790759e-07, + "loss": 0.9969, + "step": 28530 + }, + { + "epoch": 0.8596385542168675, + "grad_norm": 2.150067626238408, + "learning_rate": 5.290256574964481e-07, + "loss": 0.7996, + "step": 28540 + }, + { + "epoch": 0.8599397590361446, + "grad_norm": 1.9698321735428013, + "learning_rate": 5.267982915807451e-07, + "loss": 0.9553, + "step": 28550 + }, + { + "epoch": 0.8602409638554217, + "grad_norm": 2.024910226438677, + "learning_rate": 5.245753637418461e-07, + "loss": 0.8779, + "step": 28560 + }, + { + "epoch": 0.8605421686746988, + "grad_norm": 9.891051431340566, + "learning_rate": 5.223568761852227e-07, + "loss": 0.9883, + "step": 28570 + }, + { + "epoch": 0.8608433734939759, + "grad_norm": 6.249959715005973, + "learning_rate": 5.201428311119416e-07, + "loss": 0.9391, + "step": 28580 + }, + { + "epoch": 0.861144578313253, + "grad_norm": 1.9420678070551143, + "learning_rate": 5.17933230718663e-07, + "loss": 0.8541, + "step": 28590 + }, + { + "epoch": 0.8614457831325302, + "grad_norm": 5.106089369429489, + "learning_rate": 5.157280771976364e-07, + "loss": 0.8666, + "step": 28600 + }, + { + "epoch": 0.8617469879518073, + "grad_norm": 4.352358368164967, + "learning_rate": 5.135273727367007e-07, + "loss": 0.922, + "step": 28610 + }, + { + "epoch": 0.8620481927710844, + "grad_norm": 14.17160551898297, + "learning_rate": 5.113311195192777e-07, + "loss": 0.9217, + "step": 28620 + }, + { + "epoch": 0.8623493975903614, + "grad_norm": 4.038349320253583, + "learning_rate": 5.091393197243788e-07, + "loss": 0.9002, + "step": 28630 + }, + { + "epoch": 0.8626506024096385, + "grad_norm": 5.02470867896239, + "learning_rate": 5.0695197552659e-07, + "loss": 0.9273, + "step": 28640 + }, + { + "epoch": 0.8629518072289156, + "grad_norm": 5.043995569523077, + "learning_rate": 5.047690890960804e-07, + "loss": 0.8193, + "step": 28650 + }, + { + "epoch": 0.8632530120481928, + "grad_norm": 8.273084337120943, + "learning_rate": 5.025906625985949e-07, + "loss": 0.8695, + "step": 28660 + }, + { + "epoch": 0.8635542168674699, + "grad_norm": 1.8102228845810138, + "learning_rate": 5.004166981954567e-07, + "loss": 0.849, + "step": 28670 + }, + { + "epoch": 0.863855421686747, + "grad_norm": 4.412327049290363, + "learning_rate": 4.982471980435594e-07, + "loss": 0.9532, + "step": 28680 + }, + { + "epoch": 0.8641566265060241, + "grad_norm": 5.895678021778057, + "learning_rate": 4.96082164295365e-07, + "loss": 0.8382, + "step": 28690 + }, + { + "epoch": 0.8644578313253012, + "grad_norm": 1.701427289067495, + "learning_rate": 4.939215990989088e-07, + "loss": 0.8398, + "step": 28700 + }, + { + "epoch": 0.8647590361445783, + "grad_norm": 6.346153380798089, + "learning_rate": 4.917655045977898e-07, + "loss": 0.9549, + "step": 28710 + }, + { + "epoch": 0.8650602409638555, + "grad_norm": 4.536059544543669, + "learning_rate": 4.896138829311731e-07, + "loss": 0.9226, + "step": 28720 + }, + { + "epoch": 0.8653614457831326, + "grad_norm": 5.853451445807957, + "learning_rate": 4.874667362337843e-07, + "loss": 0.9585, + "step": 28730 + }, + { + "epoch": 0.8656626506024097, + "grad_norm": 7.489524281315221, + "learning_rate": 4.853240666359105e-07, + "loss": 0.906, + "step": 28740 + }, + { + "epoch": 0.8659638554216867, + "grad_norm": 6.235676165574543, + "learning_rate": 4.831858762633967e-07, + "loss": 0.7827, + "step": 28750 + }, + { + "epoch": 0.8662650602409638, + "grad_norm": 4.931312234611484, + "learning_rate": 4.810521672376417e-07, + "loss": 0.895, + "step": 28760 + }, + { + "epoch": 0.8665662650602409, + "grad_norm": 5.895538762613074, + "learning_rate": 4.789229416756025e-07, + "loss": 0.9286, + "step": 28770 + }, + { + "epoch": 0.8668674698795181, + "grad_norm": 2.128250819595505, + "learning_rate": 4.7679820168978543e-07, + "loss": 0.8672, + "step": 28780 + }, + { + "epoch": 0.8671686746987952, + "grad_norm": 2.1421179135103907, + "learning_rate": 4.746779493882442e-07, + "loss": 0.9731, + "step": 28790 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 2.056383986798822, + "learning_rate": 4.725621868745828e-07, + "loss": 0.8649, + "step": 28800 + }, + { + "epoch": 0.8677710843373494, + "grad_norm": 4.537467977290869, + "learning_rate": 4.7045091624795145e-07, + "loss": 0.9488, + "step": 28810 + }, + { + "epoch": 0.8680722891566265, + "grad_norm": 1.8406715038440107, + "learning_rate": 4.6834413960304135e-07, + "loss": 0.8982, + "step": 28820 + }, + { + "epoch": 0.8683734939759036, + "grad_norm": 6.047113860764686, + "learning_rate": 4.6624185903008713e-07, + "loss": 0.8888, + "step": 28830 + }, + { + "epoch": 0.8686746987951808, + "grad_norm": 2.0991278994213616, + "learning_rate": 4.6414407661486025e-07, + "loss": 0.8784, + "step": 28840 + }, + { + "epoch": 0.8689759036144579, + "grad_norm": 11.595911442501958, + "learning_rate": 4.6205079443867207e-07, + "loss": 0.956, + "step": 28850 + }, + { + "epoch": 0.869277108433735, + "grad_norm": 4.874018227772749, + "learning_rate": 4.5996201457836675e-07, + "loss": 0.9858, + "step": 28860 + }, + { + "epoch": 0.869578313253012, + "grad_norm": 4.402909274266867, + "learning_rate": 4.5787773910632214e-07, + "loss": 0.9021, + "step": 28870 + }, + { + "epoch": 0.8698795180722891, + "grad_norm": 2.0683943509907445, + "learning_rate": 4.5579797009044913e-07, + "loss": 0.8625, + "step": 28880 + }, + { + "epoch": 0.8701807228915662, + "grad_norm": 4.791022091052091, + "learning_rate": 4.537227095941843e-07, + "loss": 0.9933, + "step": 28890 + }, + { + "epoch": 0.8704819277108434, + "grad_norm": 4.39227937163775, + "learning_rate": 4.516519596764929e-07, + "loss": 0.9145, + "step": 28900 + }, + { + "epoch": 0.8707831325301205, + "grad_norm": 5.743297542284759, + "learning_rate": 4.4958572239186374e-07, + "loss": 0.9188, + "step": 28910 + }, + { + "epoch": 0.8710843373493976, + "grad_norm": 5.555509186765791, + "learning_rate": 4.475239997903108e-07, + "loss": 0.903, + "step": 28920 + }, + { + "epoch": 0.8713855421686747, + "grad_norm": 4.98994266768898, + "learning_rate": 4.454667939173685e-07, + "loss": 0.8964, + "step": 28930 + }, + { + "epoch": 0.8716867469879518, + "grad_norm": 6.261471622766507, + "learning_rate": 4.4341410681408516e-07, + "loss": 0.9495, + "step": 28940 + }, + { + "epoch": 0.8719879518072289, + "grad_norm": 3.62450816266926, + "learning_rate": 4.4136594051703286e-07, + "loss": 0.979, + "step": 28950 + }, + { + "epoch": 0.8722891566265061, + "grad_norm": 2.067644154191991, + "learning_rate": 4.393222970582939e-07, + "loss": 0.9022, + "step": 28960 + }, + { + "epoch": 0.8725903614457832, + "grad_norm": 2.1342799963472414, + "learning_rate": 4.372831784654652e-07, + "loss": 0.9252, + "step": 28970 + }, + { + "epoch": 0.8728915662650603, + "grad_norm": 14.530165084636979, + "learning_rate": 4.352485867616524e-07, + "loss": 0.8677, + "step": 28980 + }, + { + "epoch": 0.8731927710843373, + "grad_norm": 2.0120063513613693, + "learning_rate": 4.332185239654718e-07, + "loss": 0.8509, + "step": 28990 + }, + { + "epoch": 0.8734939759036144, + "grad_norm": 4.865687883027641, + "learning_rate": 4.311929920910451e-07, + "loss": 0.935, + "step": 29000 + }, + { + "epoch": 0.8737951807228915, + "grad_norm": 2.0245036414742166, + "learning_rate": 4.2917199314799873e-07, + "loss": 0.9141, + "step": 29010 + }, + { + "epoch": 0.8740963855421687, + "grad_norm": 5.76091144579355, + "learning_rate": 4.271555291414636e-07, + "loss": 0.9055, + "step": 29020 + }, + { + "epoch": 0.8743975903614458, + "grad_norm": 4.315560309100055, + "learning_rate": 4.2514360207206953e-07, + "loss": 0.8906, + "step": 29030 + }, + { + "epoch": 0.8746987951807229, + "grad_norm": 8.076654083839747, + "learning_rate": 4.231362139359441e-07, + "loss": 0.832, + "step": 29040 + }, + { + "epoch": 0.875, + "grad_norm": 4.456687958244754, + "learning_rate": 4.211333667247125e-07, + "loss": 0.9492, + "step": 29050 + }, + { + "epoch": 0.8753012048192771, + "grad_norm": 3.612573791932306, + "learning_rate": 4.191350624254975e-07, + "loss": 0.877, + "step": 29060 + }, + { + "epoch": 0.8756024096385542, + "grad_norm": 8.883093819274588, + "learning_rate": 4.171413030209098e-07, + "loss": 0.9089, + "step": 29070 + }, + { + "epoch": 0.8759036144578313, + "grad_norm": 6.558894267229141, + "learning_rate": 4.1515209048905434e-07, + "loss": 0.9195, + "step": 29080 + }, + { + "epoch": 0.8762048192771085, + "grad_norm": 9.114461342653444, + "learning_rate": 4.1316742680352406e-07, + "loss": 0.9902, + "step": 29090 + }, + { + "epoch": 0.8765060240963856, + "grad_norm": 4.721154027582496, + "learning_rate": 4.111873139333972e-07, + "loss": 0.9006, + "step": 29100 + }, + { + "epoch": 0.8768072289156627, + "grad_norm": 4.562759120003542, + "learning_rate": 4.092117538432394e-07, + "loss": 0.9693, + "step": 29110 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 1.9824518715510289, + "learning_rate": 4.072407484930968e-07, + "loss": 0.7591, + "step": 29120 + }, + { + "epoch": 0.8774096385542168, + "grad_norm": 3.6681972154747706, + "learning_rate": 4.052742998384995e-07, + "loss": 0.9263, + "step": 29130 + }, + { + "epoch": 0.8777108433734939, + "grad_norm": 5.787985110484834, + "learning_rate": 4.0331240983045363e-07, + "loss": 0.8554, + "step": 29140 + }, + { + "epoch": 0.8780120481927711, + "grad_norm": 4.563737633826767, + "learning_rate": 4.0135508041544423e-07, + "loss": 0.9257, + "step": 29150 + }, + { + "epoch": 0.8783132530120482, + "grad_norm": 4.423986763627893, + "learning_rate": 3.9940231353543077e-07, + "loss": 0.8615, + "step": 29160 + }, + { + "epoch": 0.8786144578313253, + "grad_norm": 4.18139800162496, + "learning_rate": 3.974541111278474e-07, + "loss": 0.9298, + "step": 29170 + }, + { + "epoch": 0.8789156626506024, + "grad_norm": 5.813227606398191, + "learning_rate": 3.955104751255995e-07, + "loss": 0.9526, + "step": 29180 + }, + { + "epoch": 0.8792168674698795, + "grad_norm": 5.430429125831835, + "learning_rate": 3.935714074570579e-07, + "loss": 0.85, + "step": 29190 + }, + { + "epoch": 0.8795180722891566, + "grad_norm": 4.656406443588307, + "learning_rate": 3.916369100460665e-07, + "loss": 0.8522, + "step": 29200 + }, + { + "epoch": 0.8798192771084338, + "grad_norm": 5.18714013040448, + "learning_rate": 3.8970698481193225e-07, + "loss": 0.8949, + "step": 29210 + }, + { + "epoch": 0.8801204819277109, + "grad_norm": 1.9378784598863399, + "learning_rate": 3.877816336694262e-07, + "loss": 0.8809, + "step": 29220 + }, + { + "epoch": 0.880421686746988, + "grad_norm": 4.198225000624405, + "learning_rate": 3.858608585287799e-07, + "loss": 1.0119, + "step": 29230 + }, + { + "epoch": 0.880722891566265, + "grad_norm": 4.546057128649085, + "learning_rate": 3.839446612956871e-07, + "loss": 0.9377, + "step": 29240 + }, + { + "epoch": 0.8810240963855421, + "grad_norm": 1.9557703770719894, + "learning_rate": 3.8203304387129823e-07, + "loss": 0.8512, + "step": 29250 + }, + { + "epoch": 0.8813253012048192, + "grad_norm": 2.1599891644315354, + "learning_rate": 3.80126008152219e-07, + "loss": 0.9077, + "step": 29260 + }, + { + "epoch": 0.8816265060240964, + "grad_norm": 2.1197042372122463, + "learning_rate": 3.782235560305131e-07, + "loss": 0.934, + "step": 29270 + }, + { + "epoch": 0.8819277108433735, + "grad_norm": 4.0269587095371735, + "learning_rate": 3.763256893936917e-07, + "loss": 0.9689, + "step": 29280 + }, + { + "epoch": 0.8822289156626506, + "grad_norm": 3.7807803120359527, + "learning_rate": 3.744324101247199e-07, + "loss": 1.0031, + "step": 29290 + }, + { + "epoch": 0.8825301204819277, + "grad_norm": 1.9451313314176517, + "learning_rate": 3.725437201020088e-07, + "loss": 0.9282, + "step": 29300 + }, + { + "epoch": 0.8828313253012048, + "grad_norm": 11.240100733399817, + "learning_rate": 3.706596211994201e-07, + "loss": 0.9891, + "step": 29310 + }, + { + "epoch": 0.8831325301204819, + "grad_norm": 7.8575070793060675, + "learning_rate": 3.687801152862569e-07, + "loss": 0.8873, + "step": 29320 + }, + { + "epoch": 0.8834337349397591, + "grad_norm": 4.446012880321386, + "learning_rate": 3.669052042272675e-07, + "loss": 0.8067, + "step": 29330 + }, + { + "epoch": 0.8837349397590362, + "grad_norm": 4.7464167208914825, + "learning_rate": 3.6503488988264e-07, + "loss": 0.8927, + "step": 29340 + }, + { + "epoch": 0.8840361445783133, + "grad_norm": 2.1024173526693457, + "learning_rate": 3.631691741080023e-07, + "loss": 0.8519, + "step": 29350 + }, + { + "epoch": 0.8843373493975903, + "grad_norm": 7.569368125402966, + "learning_rate": 3.61308058754421e-07, + "loss": 1.0047, + "step": 29360 + }, + { + "epoch": 0.8846385542168674, + "grad_norm": 5.215962429635607, + "learning_rate": 3.5945154566839613e-07, + "loss": 0.8994, + "step": 29370 + }, + { + "epoch": 0.8849397590361445, + "grad_norm": 5.006863733803054, + "learning_rate": 3.575996366918638e-07, + "loss": 0.9277, + "step": 29380 + }, + { + "epoch": 0.8852409638554217, + "grad_norm": 5.158016403608478, + "learning_rate": 3.5575233366219044e-07, + "loss": 0.893, + "step": 29390 + }, + { + "epoch": 0.8855421686746988, + "grad_norm": 5.014009912667711, + "learning_rate": 3.539096384121743e-07, + "loss": 0.9196, + "step": 29400 + }, + { + "epoch": 0.8858433734939759, + "grad_norm": 4.33250483163687, + "learning_rate": 3.520715527700402e-07, + "loss": 0.97, + "step": 29410 + }, + { + "epoch": 0.886144578313253, + "grad_norm": 6.012638414468784, + "learning_rate": 3.502380785594428e-07, + "loss": 0.9374, + "step": 29420 + }, + { + "epoch": 0.8864457831325301, + "grad_norm": 4.251775072235473, + "learning_rate": 3.4840921759945745e-07, + "loss": 0.9737, + "step": 29430 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 4.518795440404182, + "learning_rate": 3.465849717045838e-07, + "loss": 0.9384, + "step": 29440 + }, + { + "epoch": 0.8870481927710844, + "grad_norm": 2.081210093857427, + "learning_rate": 3.447653426847447e-07, + "loss": 0.9253, + "step": 29450 + }, + { + "epoch": 0.8873493975903615, + "grad_norm": 5.565653379563694, + "learning_rate": 3.42950332345281e-07, + "loss": 0.9628, + "step": 29460 + }, + { + "epoch": 0.8876506024096386, + "grad_norm": 4.017165584844056, + "learning_rate": 3.411399424869499e-07, + "loss": 0.9577, + "step": 29470 + }, + { + "epoch": 0.8879518072289156, + "grad_norm": 3.984888910738686, + "learning_rate": 3.393341749059259e-07, + "loss": 0.9496, + "step": 29480 + }, + { + "epoch": 0.8882530120481927, + "grad_norm": 2.073737194733733, + "learning_rate": 3.37533031393798e-07, + "loss": 0.8238, + "step": 29490 + }, + { + "epoch": 0.8885542168674698, + "grad_norm": 10.737941249040539, + "learning_rate": 3.3573651373756513e-07, + "loss": 0.8978, + "step": 29500 + }, + { + "epoch": 0.888855421686747, + "grad_norm": 2.022991667976808, + "learning_rate": 3.3394462371963864e-07, + "loss": 0.928, + "step": 29510 + }, + { + "epoch": 0.8891566265060241, + "grad_norm": 7.719472513286403, + "learning_rate": 3.3215736311784044e-07, + "loss": 1.0267, + "step": 29520 + }, + { + "epoch": 0.8894578313253012, + "grad_norm": 5.256950506393739, + "learning_rate": 3.303747337053936e-07, + "loss": 0.7333, + "step": 29530 + }, + { + "epoch": 0.8897590361445783, + "grad_norm": 2.0995705984504593, + "learning_rate": 3.2859673725093133e-07, + "loss": 0.8656, + "step": 29540 + }, + { + "epoch": 0.8900602409638554, + "grad_norm": 3.7102400130303046, + "learning_rate": 3.2682337551848795e-07, + "loss": 0.9209, + "step": 29550 + }, + { + "epoch": 0.8903614457831325, + "grad_norm": 4.1883773578173855, + "learning_rate": 3.2505465026750115e-07, + "loss": 0.8555, + "step": 29560 + }, + { + "epoch": 0.8906626506024097, + "grad_norm": 1.9198249481048593, + "learning_rate": 3.2329056325280885e-07, + "loss": 0.8714, + "step": 29570 + }, + { + "epoch": 0.8909638554216868, + "grad_norm": 5.923383194381662, + "learning_rate": 3.2153111622464263e-07, + "loss": 0.9234, + "step": 29580 + }, + { + "epoch": 0.8912650602409639, + "grad_norm": 5.567215823246787, + "learning_rate": 3.1977631092863613e-07, + "loss": 0.9195, + "step": 29590 + }, + { + "epoch": 0.891566265060241, + "grad_norm": 7.9148940355828605, + "learning_rate": 3.1802614910581506e-07, + "loss": 0.8443, + "step": 29600 + }, + { + "epoch": 0.891867469879518, + "grad_norm": 2.050219279673027, + "learning_rate": 3.162806324925988e-07, + "loss": 0.728, + "step": 29610 + }, + { + "epoch": 0.8921686746987951, + "grad_norm": 6.271753398148368, + "learning_rate": 3.1453976282079636e-07, + "loss": 0.9619, + "step": 29620 + }, + { + "epoch": 0.8924698795180723, + "grad_norm": 4.354315334600484, + "learning_rate": 3.1280354181760896e-07, + "loss": 0.9477, + "step": 29630 + }, + { + "epoch": 0.8927710843373494, + "grad_norm": 5.223722917723963, + "learning_rate": 3.1107197120562317e-07, + "loss": 0.9797, + "step": 29640 + }, + { + "epoch": 0.8930722891566265, + "grad_norm": 4.558827839557073, + "learning_rate": 3.09345052702813e-07, + "loss": 0.8947, + "step": 29650 + }, + { + "epoch": 0.8933734939759036, + "grad_norm": 5.37366913024313, + "learning_rate": 3.0762278802253743e-07, + "loss": 0.8967, + "step": 29660 + }, + { + "epoch": 0.8936746987951807, + "grad_norm": 3.9853627599869226, + "learning_rate": 3.059051788735379e-07, + "loss": 0.9019, + "step": 29670 + }, + { + "epoch": 0.8939759036144578, + "grad_norm": 5.107432456134144, + "learning_rate": 3.0419222695993455e-07, + "loss": 0.9377, + "step": 29680 + }, + { + "epoch": 0.894277108433735, + "grad_norm": 2.1163388036344823, + "learning_rate": 3.024839339812291e-07, + "loss": 0.8239, + "step": 29690 + }, + { + "epoch": 0.8945783132530121, + "grad_norm": 1.822929408467778, + "learning_rate": 3.007803016323013e-07, + "loss": 0.9157, + "step": 29700 + }, + { + "epoch": 0.8948795180722892, + "grad_norm": 4.682403873330276, + "learning_rate": 2.990813316034064e-07, + "loss": 0.918, + "step": 29710 + }, + { + "epoch": 0.8951807228915662, + "grad_norm": 5.200497158377645, + "learning_rate": 2.9738702558017305e-07, + "loss": 0.8555, + "step": 29720 + }, + { + "epoch": 0.8954819277108433, + "grad_norm": 6.504588539674624, + "learning_rate": 2.9569738524360236e-07, + "loss": 1.0323, + "step": 29730 + }, + { + "epoch": 0.8957831325301204, + "grad_norm": 4.495593669503393, + "learning_rate": 2.940124122700677e-07, + "loss": 1.0411, + "step": 29740 + }, + { + "epoch": 0.8960843373493976, + "grad_norm": 4.287366824328198, + "learning_rate": 2.923321083313113e-07, + "loss": 0.8035, + "step": 29750 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 5.086546250823018, + "learning_rate": 2.9065647509444296e-07, + "loss": 0.9774, + "step": 29760 + }, + { + "epoch": 0.8966867469879518, + "grad_norm": 2.155168225757024, + "learning_rate": 2.8898551422193833e-07, + "loss": 0.9169, + "step": 29770 + }, + { + "epoch": 0.8969879518072289, + "grad_norm": 16.435698828928746, + "learning_rate": 2.873192273716369e-07, + "loss": 0.8862, + "step": 29780 + }, + { + "epoch": 0.897289156626506, + "grad_norm": 4.641324427557074, + "learning_rate": 2.856576161967417e-07, + "loss": 0.9304, + "step": 29790 + }, + { + "epoch": 0.8975903614457831, + "grad_norm": 1.940484722335563, + "learning_rate": 2.840006823458158e-07, + "loss": 0.8366, + "step": 29800 + }, + { + "epoch": 0.8978915662650603, + "grad_norm": 7.506862616845636, + "learning_rate": 2.823484274627836e-07, + "loss": 0.8632, + "step": 29810 + }, + { + "epoch": 0.8981927710843374, + "grad_norm": 4.803067142316807, + "learning_rate": 2.8070085318692597e-07, + "loss": 0.9566, + "step": 29820 + }, + { + "epoch": 0.8984939759036145, + "grad_norm": 4.802931629665585, + "learning_rate": 2.7905796115287767e-07, + "loss": 0.9485, + "step": 29830 + }, + { + "epoch": 0.8987951807228916, + "grad_norm": 6.143256075314575, + "learning_rate": 2.774197529906325e-07, + "loss": 0.7594, + "step": 29840 + }, + { + "epoch": 0.8990963855421686, + "grad_norm": 5.319450338673451, + "learning_rate": 2.7578623032553487e-07, + "loss": 0.8765, + "step": 29850 + }, + { + "epoch": 0.8993975903614457, + "grad_norm": 5.465696219193865, + "learning_rate": 2.741573947782794e-07, + "loss": 0.9911, + "step": 29860 + }, + { + "epoch": 0.8996987951807229, + "grad_norm": 2.0878824258379707, + "learning_rate": 2.725332479649123e-07, + "loss": 0.8584, + "step": 29870 + }, + { + "epoch": 0.9, + "grad_norm": 5.149453599378429, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.9967, + "step": 29880 + }, + { + "epoch": 0.9003012048192771, + "grad_norm": 1.9416211530643281, + "learning_rate": 2.6929902698076327e-07, + "loss": 0.9359, + "step": 29890 + }, + { + "epoch": 0.9006024096385542, + "grad_norm": 4.35994852925526, + "learning_rate": 2.676889560188051e-07, + "loss": 0.9473, + "step": 29900 + }, + { + "epoch": 0.9009036144578313, + "grad_norm": 4.730471412138855, + "learning_rate": 2.660835802083833e-07, + "loss": 0.855, + "step": 29910 + }, + { + "epoch": 0.9012048192771084, + "grad_norm": 4.648347910220079, + "learning_rate": 2.6448290114226616e-07, + "loss": 0.9593, + "step": 29920 + }, + { + "epoch": 0.9015060240963856, + "grad_norm": 1.9949640203058063, + "learning_rate": 2.628869204085638e-07, + "loss": 0.7891, + "step": 29930 + }, + { + "epoch": 0.9018072289156627, + "grad_norm": 7.036764321044006, + "learning_rate": 2.612956395907235e-07, + "loss": 0.9608, + "step": 29940 + }, + { + "epoch": 0.9021084337349398, + "grad_norm": 10.185960314989945, + "learning_rate": 2.597090602675334e-07, + "loss": 0.9936, + "step": 29950 + }, + { + "epoch": 0.9024096385542169, + "grad_norm": 1.957574016901448, + "learning_rate": 2.5812718401311334e-07, + "loss": 0.9282, + "step": 29960 + }, + { + "epoch": 0.9027108433734939, + "grad_norm": 5.031416696909351, + "learning_rate": 2.5655001239691836e-07, + "loss": 0.9147, + "step": 29970 + }, + { + "epoch": 0.903012048192771, + "grad_norm": 4.117105535052751, + "learning_rate": 2.5497754698373455e-07, + "loss": 0.8457, + "step": 29980 + }, + { + "epoch": 0.9033132530120482, + "grad_norm": 1.9777963266760097, + "learning_rate": 2.5340978933368177e-07, + "loss": 0.8008, + "step": 29990 + }, + { + "epoch": 0.9036144578313253, + "grad_norm": 4.484332589137819, + "learning_rate": 2.518467410022063e-07, + "loss": 0.9254, + "step": 30000 + }, + { + "epoch": 0.9039156626506024, + "grad_norm": 4.427670215416986, + "learning_rate": 2.5028840354008264e-07, + "loss": 0.8791, + "step": 30010 + }, + { + "epoch": 0.9042168674698795, + "grad_norm": 8.377019267464451, + "learning_rate": 2.487347784934119e-07, + "loss": 0.8925, + "step": 30020 + }, + { + "epoch": 0.9045180722891566, + "grad_norm": 7.2365708481771485, + "learning_rate": 2.4718586740361985e-07, + "loss": 0.8452, + "step": 30030 + }, + { + "epoch": 0.9048192771084337, + "grad_norm": 2.0534488121332792, + "learning_rate": 2.45641671807455e-07, + "loss": 0.8459, + "step": 30040 + }, + { + "epoch": 0.9051204819277109, + "grad_norm": 4.301532552562159, + "learning_rate": 2.4410219323698636e-07, + "loss": 0.9414, + "step": 30050 + }, + { + "epoch": 0.905421686746988, + "grad_norm": 2.038146180435159, + "learning_rate": 2.42567433219606e-07, + "loss": 0.8977, + "step": 30060 + }, + { + "epoch": 0.9057228915662651, + "grad_norm": 6.0234597980613485, + "learning_rate": 2.410373932780219e-07, + "loss": 1.0074, + "step": 30070 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 4.470062653453608, + "learning_rate": 2.3951207493025876e-07, + "loss": 0.9697, + "step": 30080 + }, + { + "epoch": 0.9063253012048192, + "grad_norm": 4.061143895409584, + "learning_rate": 2.379914796896582e-07, + "loss": 0.9099, + "step": 30090 + }, + { + "epoch": 0.9066265060240963, + "grad_norm": 4.789426982661688, + "learning_rate": 2.364756090648762e-07, + "loss": 0.9098, + "step": 30100 + }, + { + "epoch": 0.9069277108433735, + "grad_norm": 2.078045479960634, + "learning_rate": 2.349644645598792e-07, + "loss": 0.9128, + "step": 30110 + }, + { + "epoch": 0.9072289156626506, + "grad_norm": 4.79462034368817, + "learning_rate": 2.3345804767394564e-07, + "loss": 0.8735, + "step": 30120 + }, + { + "epoch": 0.9075301204819277, + "grad_norm": 10.25970322701426, + "learning_rate": 2.3195635990166442e-07, + "loss": 0.9362, + "step": 30130 + }, + { + "epoch": 0.9078313253012048, + "grad_norm": 51.63692180692184, + "learning_rate": 2.3045940273293154e-07, + "loss": 0.9505, + "step": 30140 + }, + { + "epoch": 0.9081325301204819, + "grad_norm": 6.726779326874728, + "learning_rate": 2.2896717765294784e-07, + "loss": 0.8549, + "step": 30150 + }, + { + "epoch": 0.908433734939759, + "grad_norm": 11.326967021423837, + "learning_rate": 2.274796861422246e-07, + "loss": 0.8707, + "step": 30160 + }, + { + "epoch": 0.9087349397590362, + "grad_norm": 5.071628059826979, + "learning_rate": 2.2599692967656962e-07, + "loss": 0.9701, + "step": 30170 + }, + { + "epoch": 0.9090361445783133, + "grad_norm": 6.955451787569282, + "learning_rate": 2.2451890972709778e-07, + "loss": 0.9932, + "step": 30180 + }, + { + "epoch": 0.9093373493975904, + "grad_norm": 2.2274587409892757, + "learning_rate": 2.2304562776022276e-07, + "loss": 0.8634, + "step": 30190 + }, + { + "epoch": 0.9096385542168675, + "grad_norm": 1.998660494421673, + "learning_rate": 2.2157708523765864e-07, + "loss": 0.8311, + "step": 30200 + }, + { + "epoch": 0.9099397590361445, + "grad_norm": 60.562075316733505, + "learning_rate": 2.2011328361641605e-07, + "loss": 0.8249, + "step": 30210 + }, + { + "epoch": 0.9102409638554216, + "grad_norm": 1.8539763405018266, + "learning_rate": 2.1865422434880267e-07, + "loss": 0.9395, + "step": 30220 + }, + { + "epoch": 0.9105421686746988, + "grad_norm": 5.549849941885362, + "learning_rate": 2.171999088824195e-07, + "loss": 1.0061, + "step": 30230 + }, + { + "epoch": 0.9108433734939759, + "grad_norm": 5.079218708837629, + "learning_rate": 2.1575033866016447e-07, + "loss": 0.9713, + "step": 30240 + }, + { + "epoch": 0.911144578313253, + "grad_norm": 1.9301915959028308, + "learning_rate": 2.1430551512022336e-07, + "loss": 0.9012, + "step": 30250 + }, + { + "epoch": 0.9114457831325301, + "grad_norm": 4.992085712734793, + "learning_rate": 2.1286543969607608e-07, + "loss": 0.9912, + "step": 30260 + }, + { + "epoch": 0.9117469879518072, + "grad_norm": 4.811049837802725, + "learning_rate": 2.1143011381648925e-07, + "loss": 0.819, + "step": 30270 + }, + { + "epoch": 0.9120481927710843, + "grad_norm": 5.748533215147977, + "learning_rate": 2.0999953890551816e-07, + "loss": 0.9567, + "step": 30280 + }, + { + "epoch": 0.9123493975903615, + "grad_norm": 4.694246260071003, + "learning_rate": 2.0857371638250467e-07, + "loss": 1.0229, + "step": 30290 + }, + { + "epoch": 0.9126506024096386, + "grad_norm": 5.091136449551038, + "learning_rate": 2.0715264766207443e-07, + "loss": 0.9766, + "step": 30300 + }, + { + "epoch": 0.9129518072289157, + "grad_norm": 5.033870702189704, + "learning_rate": 2.0573633415413962e-07, + "loss": 0.9008, + "step": 30310 + }, + { + "epoch": 0.9132530120481928, + "grad_norm": 7.73187832623893, + "learning_rate": 2.0432477726389122e-07, + "loss": 0.8716, + "step": 30320 + }, + { + "epoch": 0.9135542168674698, + "grad_norm": 4.657926851190474, + "learning_rate": 2.029179783918006e-07, + "loss": 0.9224, + "step": 30330 + }, + { + "epoch": 0.9138554216867469, + "grad_norm": 3.823697448591747, + "learning_rate": 2.0151593893362243e-07, + "loss": 0.9542, + "step": 30340 + }, + { + "epoch": 0.9141566265060241, + "grad_norm": 2.043246204959972, + "learning_rate": 2.0011866028038617e-07, + "loss": 0.8999, + "step": 30350 + }, + { + "epoch": 0.9144578313253012, + "grad_norm": 4.866599488871076, + "learning_rate": 1.987261438183985e-07, + "loss": 0.9034, + "step": 30360 + }, + { + "epoch": 0.9147590361445783, + "grad_norm": 6.132504023016106, + "learning_rate": 1.9733839092924088e-07, + "loss": 0.9367, + "step": 30370 + }, + { + "epoch": 0.9150602409638554, + "grad_norm": 6.151309864962023, + "learning_rate": 1.9595540298977035e-07, + "loss": 0.9902, + "step": 30380 + }, + { + "epoch": 0.9153614457831325, + "grad_norm": 12.372261174562896, + "learning_rate": 1.9457718137211423e-07, + "loss": 0.9355, + "step": 30390 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 4.628814514104804, + "learning_rate": 1.9320372744367155e-07, + "loss": 0.986, + "step": 30400 + }, + { + "epoch": 0.9159638554216868, + "grad_norm": 4.252012260236093, + "learning_rate": 1.9183504256711393e-07, + "loss": 0.8832, + "step": 30410 + }, + { + "epoch": 0.9162650602409639, + "grad_norm": 1.9118522641793776, + "learning_rate": 1.9047112810037626e-07, + "loss": 0.9056, + "step": 30420 + }, + { + "epoch": 0.916566265060241, + "grad_norm": 8.01935915873801, + "learning_rate": 1.8911198539666387e-07, + "loss": 0.8082, + "step": 30430 + }, + { + "epoch": 0.9168674698795181, + "grad_norm": 4.406163984505128, + "learning_rate": 1.877576158044464e-07, + "loss": 0.9766, + "step": 30440 + }, + { + "epoch": 0.9171686746987951, + "grad_norm": 1.882210522818602, + "learning_rate": 1.8640802066746067e-07, + "loss": 0.9741, + "step": 30450 + }, + { + "epoch": 0.9174698795180722, + "grad_norm": 6.823712374912623, + "learning_rate": 1.8506320132470334e-07, + "loss": 0.9299, + "step": 30460 + }, + { + "epoch": 0.9177710843373494, + "grad_norm": 6.084795298150425, + "learning_rate": 1.8372315911043325e-07, + "loss": 0.8878, + "step": 30470 + }, + { + "epoch": 0.9180722891566265, + "grad_norm": 4.503662702585053, + "learning_rate": 1.8238789535416967e-07, + "loss": 0.9787, + "step": 30480 + }, + { + "epoch": 0.9183734939759036, + "grad_norm": 1.9354436821973109, + "learning_rate": 1.8105741138069288e-07, + "loss": 0.901, + "step": 30490 + }, + { + "epoch": 0.9186746987951807, + "grad_norm": 3.836766585626372, + "learning_rate": 1.7973170851003918e-07, + "loss": 0.8324, + "step": 30500 + }, + { + "epoch": 0.9189759036144578, + "grad_norm": 4.612246788895821, + "learning_rate": 1.7841078805750146e-07, + "loss": 0.9591, + "step": 30510 + }, + { + "epoch": 0.9192771084337349, + "grad_norm": 5.449970799952609, + "learning_rate": 1.7709465133362803e-07, + "loss": 0.8628, + "step": 30520 + }, + { + "epoch": 0.9195783132530121, + "grad_norm": 6.4991130263966985, + "learning_rate": 1.7578329964422047e-07, + "loss": 0.9341, + "step": 30530 + }, + { + "epoch": 0.9198795180722892, + "grad_norm": 5.400668146662582, + "learning_rate": 1.7447673429033361e-07, + "loss": 0.7959, + "step": 30540 + }, + { + "epoch": 0.9201807228915663, + "grad_norm": 1.9159164496951613, + "learning_rate": 1.7317495656827278e-07, + "loss": 0.845, + "step": 30550 + }, + { + "epoch": 0.9204819277108434, + "grad_norm": 4.772949695590476, + "learning_rate": 1.7187796776959586e-07, + "loss": 0.9239, + "step": 30560 + }, + { + "epoch": 0.9207831325301205, + "grad_norm": 5.5642641650127995, + "learning_rate": 1.705857691811047e-07, + "loss": 0.8805, + "step": 30570 + }, + { + "epoch": 0.9210843373493975, + "grad_norm": 5.006196830374669, + "learning_rate": 1.6929836208485206e-07, + "loss": 0.881, + "step": 30580 + }, + { + "epoch": 0.9213855421686747, + "grad_norm": 4.193056881037043, + "learning_rate": 1.6801574775813623e-07, + "loss": 0.9169, + "step": 30590 + }, + { + "epoch": 0.9216867469879518, + "grad_norm": 4.17810379679463, + "learning_rate": 1.6673792747349983e-07, + "loss": 0.9344, + "step": 30600 + }, + { + "epoch": 0.9219879518072289, + "grad_norm": 1.9398758124726878, + "learning_rate": 1.6546490249873037e-07, + "loss": 0.8758, + "step": 30610 + }, + { + "epoch": 0.922289156626506, + "grad_norm": 5.275194518414215, + "learning_rate": 1.6419667409685425e-07, + "loss": 0.8999, + "step": 30620 + }, + { + "epoch": 0.9225903614457831, + "grad_norm": 5.263900088497955, + "learning_rate": 1.6293324352614325e-07, + "loss": 0.873, + "step": 30630 + }, + { + "epoch": 0.9228915662650602, + "grad_norm": 2.0493681426677814, + "learning_rate": 1.6167461204010692e-07, + "loss": 0.9109, + "step": 30640 + }, + { + "epoch": 0.9231927710843374, + "grad_norm": 4.9495360231559955, + "learning_rate": 1.6042078088749357e-07, + "loss": 0.8935, + "step": 30650 + }, + { + "epoch": 0.9234939759036145, + "grad_norm": 4.5307800067679365, + "learning_rate": 1.5917175131228812e-07, + "loss": 0.8849, + "step": 30660 + }, + { + "epoch": 0.9237951807228916, + "grad_norm": 5.417541286798659, + "learning_rate": 1.579275245537132e-07, + "loss": 0.9285, + "step": 30670 + }, + { + "epoch": 0.9240963855421687, + "grad_norm": 4.290994336416196, + "learning_rate": 1.5668810184622517e-07, + "loss": 0.9537, + "step": 30680 + }, + { + "epoch": 0.9243975903614458, + "grad_norm": 6.093207043891591, + "learning_rate": 1.5545348441951437e-07, + "loss": 0.9438, + "step": 30690 + }, + { + "epoch": 0.9246987951807228, + "grad_norm": 2.210913932320992, + "learning_rate": 1.5422367349850364e-07, + "loss": 0.9114, + "step": 30700 + }, + { + "epoch": 0.925, + "grad_norm": 5.106178602573226, + "learning_rate": 1.5299867030334815e-07, + "loss": 0.8904, + "step": 30710 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 2.162176062789831, + "learning_rate": 1.5177847604943062e-07, + "loss": 0.9336, + "step": 30720 + }, + { + "epoch": 0.9256024096385542, + "grad_norm": 3.841243784591798, + "learning_rate": 1.5056309194736385e-07, + "loss": 0.8639, + "step": 30730 + }, + { + "epoch": 0.9259036144578313, + "grad_norm": 7.206173562157358, + "learning_rate": 1.493525192029899e-07, + "loss": 0.9642, + "step": 30740 + }, + { + "epoch": 0.9262048192771084, + "grad_norm": 4.856945621486708, + "learning_rate": 1.4814675901737408e-07, + "loss": 1.0129, + "step": 30750 + }, + { + "epoch": 0.9265060240963855, + "grad_norm": 2.023495186035137, + "learning_rate": 1.469458125868095e-07, + "loss": 0.949, + "step": 30760 + }, + { + "epoch": 0.9268072289156627, + "grad_norm": 7.040211268226052, + "learning_rate": 1.4574968110281251e-07, + "loss": 0.812, + "step": 30770 + }, + { + "epoch": 0.9271084337349398, + "grad_norm": 1.973717105095577, + "learning_rate": 1.4455836575212157e-07, + "loss": 0.8822, + "step": 30780 + }, + { + "epoch": 0.9274096385542169, + "grad_norm": 6.261733297612659, + "learning_rate": 1.4337186771669841e-07, + "loss": 0.8421, + "step": 30790 + }, + { + "epoch": 0.927710843373494, + "grad_norm": 5.860160504921429, + "learning_rate": 1.421901881737231e-07, + "loss": 0.8834, + "step": 30800 + }, + { + "epoch": 0.928012048192771, + "grad_norm": 5.592990358537241, + "learning_rate": 1.410133282955972e-07, + "loss": 0.9817, + "step": 30810 + }, + { + "epoch": 0.9283132530120481, + "grad_norm": 6.692170712749857, + "learning_rate": 1.3984128924993955e-07, + "loss": 0.9008, + "step": 30820 + }, + { + "epoch": 0.9286144578313253, + "grad_norm": 4.387551000030284, + "learning_rate": 1.3867407219958496e-07, + "loss": 0.8981, + "step": 30830 + }, + { + "epoch": 0.9289156626506024, + "grad_norm": 5.003253856961021, + "learning_rate": 1.3751167830258594e-07, + "loss": 0.9477, + "step": 30840 + }, + { + "epoch": 0.9292168674698795, + "grad_norm": 4.538494758761143, + "learning_rate": 1.3635410871220944e-07, + "loss": 0.8519, + "step": 30850 + }, + { + "epoch": 0.9295180722891566, + "grad_norm": 7.638884039798874, + "learning_rate": 1.3520136457693512e-07, + "loss": 0.9269, + "step": 30860 + }, + { + "epoch": 0.9298192771084337, + "grad_norm": 5.134368648792981, + "learning_rate": 1.3405344704045477e-07, + "loss": 0.9676, + "step": 30870 + }, + { + "epoch": 0.9301204819277108, + "grad_norm": 5.596770449556262, + "learning_rate": 1.3291035724167233e-07, + "loss": 0.9257, + "step": 30880 + }, + { + "epoch": 0.930421686746988, + "grad_norm": 4.989679447843252, + "learning_rate": 1.3177209631470233e-07, + "loss": 0.9817, + "step": 30890 + }, + { + "epoch": 0.9307228915662651, + "grad_norm": 3.975918294128282, + "learning_rate": 1.306386653888675e-07, + "loss": 0.8391, + "step": 30900 + }, + { + "epoch": 0.9310240963855422, + "grad_norm": 5.902556027724682, + "learning_rate": 1.2951006558869883e-07, + "loss": 0.9168, + "step": 30910 + }, + { + "epoch": 0.9313253012048193, + "grad_norm": 2.0539439601061664, + "learning_rate": 1.2838629803393343e-07, + "loss": 0.9179, + "step": 30920 + }, + { + "epoch": 0.9316265060240964, + "grad_norm": 4.9912392983740554, + "learning_rate": 1.27267363839515e-07, + "loss": 0.8725, + "step": 30930 + }, + { + "epoch": 0.9319277108433734, + "grad_norm": 4.393343174533207, + "learning_rate": 1.2615326411559102e-07, + "loss": 0.9116, + "step": 30940 + }, + { + "epoch": 0.9322289156626506, + "grad_norm": 5.64237266061262, + "learning_rate": 1.2504399996751458e-07, + "loss": 0.9591, + "step": 30950 + }, + { + "epoch": 0.9325301204819277, + "grad_norm": 4.303841741723094, + "learning_rate": 1.2393957249583865e-07, + "loss": 0.9469, + "step": 30960 + }, + { + "epoch": 0.9328313253012048, + "grad_norm": 4.9202804625223715, + "learning_rate": 1.2283998279631782e-07, + "loss": 0.8744, + "step": 30970 + }, + { + "epoch": 0.9331325301204819, + "grad_norm": 4.117656483874047, + "learning_rate": 1.2174523195990774e-07, + "loss": 0.9997, + "step": 30980 + }, + { + "epoch": 0.933433734939759, + "grad_norm": 5.926551823705494, + "learning_rate": 1.2065532107276346e-07, + "loss": 0.8809, + "step": 30990 + }, + { + "epoch": 0.9337349397590361, + "grad_norm": 2.1586242117054866, + "learning_rate": 1.1957025121623723e-07, + "loss": 0.8989, + "step": 31000 + }, + { + "epoch": 0.9340361445783133, + "grad_norm": 2.0099367131032566, + "learning_rate": 1.1849002346687843e-07, + "loss": 0.8628, + "step": 31010 + }, + { + "epoch": 0.9343373493975904, + "grad_norm": 6.407564840941291, + "learning_rate": 1.1741463889643312e-07, + "loss": 0.8549, + "step": 31020 + }, + { + "epoch": 0.9346385542168675, + "grad_norm": 2.203204610823805, + "learning_rate": 1.1634409857184115e-07, + "loss": 0.9774, + "step": 31030 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 2.0892029281619084, + "learning_rate": 1.1527840355523678e-07, + "loss": 0.8511, + "step": 31040 + }, + { + "epoch": 0.9352409638554217, + "grad_norm": 6.670492149767532, + "learning_rate": 1.1421755490394703e-07, + "loss": 0.9171, + "step": 31050 + }, + { + "epoch": 0.9355421686746987, + "grad_norm": 3.9126140592894263, + "learning_rate": 1.1316155367048997e-07, + "loss": 0.9736, + "step": 31060 + }, + { + "epoch": 0.935843373493976, + "grad_norm": 5.508606439584242, + "learning_rate": 1.121104009025753e-07, + "loss": 0.9986, + "step": 31070 + }, + { + "epoch": 0.936144578313253, + "grad_norm": 4.513596381840643, + "learning_rate": 1.1106409764310099e-07, + "loss": 0.9629, + "step": 31080 + }, + { + "epoch": 0.9364457831325301, + "grad_norm": 4.458743380851925, + "learning_rate": 1.1002264493015557e-07, + "loss": 0.9075, + "step": 31090 + }, + { + "epoch": 0.9367469879518072, + "grad_norm": 2.0394683884665663, + "learning_rate": 1.089860437970136e-07, + "loss": 0.9574, + "step": 31100 + }, + { + "epoch": 0.9370481927710843, + "grad_norm": 4.490584890742716, + "learning_rate": 1.0795429527213685e-07, + "loss": 0.8513, + "step": 31110 + }, + { + "epoch": 0.9373493975903614, + "grad_norm": 4.560890693225779, + "learning_rate": 1.0692740037917038e-07, + "loss": 0.8868, + "step": 31120 + }, + { + "epoch": 0.9376506024096386, + "grad_norm": 5.304866129428448, + "learning_rate": 1.0590536013694808e-07, + "loss": 0.951, + "step": 31130 + }, + { + "epoch": 0.9379518072289157, + "grad_norm": 4.785267266198701, + "learning_rate": 1.0488817555948328e-07, + "loss": 0.9663, + "step": 31140 + }, + { + "epoch": 0.9382530120481928, + "grad_norm": 4.757059489339679, + "learning_rate": 1.0387584765597425e-07, + "loss": 0.9207, + "step": 31150 + }, + { + "epoch": 0.9385542168674699, + "grad_norm": 6.21566281231201, + "learning_rate": 1.0286837743079869e-07, + "loss": 0.878, + "step": 31160 + }, + { + "epoch": 0.938855421686747, + "grad_norm": 7.000515722109719, + "learning_rate": 1.0186576588351704e-07, + "loss": 0.9788, + "step": 31170 + }, + { + "epoch": 0.939156626506024, + "grad_norm": 4.50859646721263, + "learning_rate": 1.0086801400886748e-07, + "loss": 0.9873, + "step": 31180 + }, + { + "epoch": 0.9394578313253013, + "grad_norm": 4.815024079717224, + "learning_rate": 9.987512279676648e-08, + "loss": 0.9023, + "step": 31190 + }, + { + "epoch": 0.9397590361445783, + "grad_norm": 5.786073200387487, + "learning_rate": 9.88870932323105e-08, + "loss": 0.9268, + "step": 31200 + }, + { + "epoch": 0.9400602409638554, + "grad_norm": 5.513886971916418, + "learning_rate": 9.790392629576928e-08, + "loss": 0.9748, + "step": 31210 + }, + { + "epoch": 0.9403614457831325, + "grad_norm": 6.165981363990004, + "learning_rate": 9.692562296259034e-08, + "loss": 0.9337, + "step": 31220 + }, + { + "epoch": 0.9406626506024096, + "grad_norm": 5.622474764916678, + "learning_rate": 9.595218420339502e-08, + "loss": 0.9343, + "step": 31230 + }, + { + "epoch": 0.9409638554216867, + "grad_norm": 4.29689025272055, + "learning_rate": 9.498361098397857e-08, + "loss": 0.8573, + "step": 31240 + }, + { + "epoch": 0.9412650602409639, + "grad_norm": 2.1228911427173878, + "learning_rate": 9.401990426530783e-08, + "loss": 0.8563, + "step": 31250 + }, + { + "epoch": 0.941566265060241, + "grad_norm": 5.556232352565769, + "learning_rate": 9.306106500352352e-08, + "loss": 0.8452, + "step": 31260 + }, + { + "epoch": 0.9418674698795181, + "grad_norm": 9.77660883553847, + "learning_rate": 9.210709414993468e-08, + "loss": 0.9925, + "step": 31270 + }, + { + "epoch": 0.9421686746987952, + "grad_norm": 7.768326396804313, + "learning_rate": 9.115799265102144e-08, + "loss": 0.9125, + "step": 31280 + }, + { + "epoch": 0.9424698795180723, + "grad_norm": 6.1472972464912425, + "learning_rate": 9.021376144843331e-08, + "loss": 0.944, + "step": 31290 + }, + { + "epoch": 0.9427710843373494, + "grad_norm": 4.65595788334406, + "learning_rate": 8.927440147898703e-08, + "loss": 0.9154, + "step": 31300 + }, + { + "epoch": 0.9430722891566266, + "grad_norm": 7.772291502764226, + "learning_rate": 8.833991367466543e-08, + "loss": 0.9027, + "step": 31310 + }, + { + "epoch": 0.9433734939759036, + "grad_norm": 6.089237439721673, + "learning_rate": 8.741029896261965e-08, + "loss": 0.9101, + "step": 31320 + }, + { + "epoch": 0.9436746987951807, + "grad_norm": 1.9917472325210748, + "learning_rate": 8.648555826516358e-08, + "loss": 0.8349, + "step": 31330 + }, + { + "epoch": 0.9439759036144578, + "grad_norm": 1.8828427949970825, + "learning_rate": 8.556569249977776e-08, + "loss": 0.8208, + "step": 31340 + }, + { + "epoch": 0.9442771084337349, + "grad_norm": 1.9236935981975551, + "learning_rate": 8.465070257910545e-08, + "loss": 0.9346, + "step": 31350 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 6.512249637191026, + "learning_rate": 8.374058941094943e-08, + "loss": 0.9793, + "step": 31360 + }, + { + "epoch": 0.9448795180722892, + "grad_norm": 5.268319202207796, + "learning_rate": 8.283535389827791e-08, + "loss": 0.8733, + "step": 31370 + }, + { + "epoch": 0.9451807228915663, + "grad_norm": 4.2062384269533, + "learning_rate": 8.193499693921858e-08, + "loss": 0.9278, + "step": 31380 + }, + { + "epoch": 0.9454819277108434, + "grad_norm": 2.2600880356395043, + "learning_rate": 8.103951942705746e-08, + "loss": 0.7669, + "step": 31390 + }, + { + "epoch": 0.9457831325301205, + "grad_norm": 2.011035092391403, + "learning_rate": 8.014892225024162e-08, + "loss": 0.9416, + "step": 31400 + }, + { + "epoch": 0.9460843373493976, + "grad_norm": 4.914858948626751, + "learning_rate": 7.926320629237372e-08, + "loss": 0.8178, + "step": 31410 + }, + { + "epoch": 0.9463855421686747, + "grad_norm": 4.724209174896651, + "learning_rate": 7.838237243221636e-08, + "loss": 0.8016, + "step": 31420 + }, + { + "epoch": 0.9466867469879519, + "grad_norm": 5.234814754925279, + "learning_rate": 7.750642154368549e-08, + "loss": 0.8757, + "step": 31430 + }, + { + "epoch": 0.946987951807229, + "grad_norm": 2.009043731039072, + "learning_rate": 7.663535449585368e-08, + "loss": 0.8948, + "step": 31440 + }, + { + "epoch": 0.947289156626506, + "grad_norm": 5.637252015383081, + "learning_rate": 7.57691721529502e-08, + "loss": 0.9159, + "step": 31450 + }, + { + "epoch": 0.9475903614457831, + "grad_norm": 7.47274754525009, + "learning_rate": 7.490787537435318e-08, + "loss": 1.01, + "step": 31460 + }, + { + "epoch": 0.9478915662650602, + "grad_norm": 5.614873453119678, + "learning_rate": 7.405146501459737e-08, + "loss": 0.8489, + "step": 31470 + }, + { + "epoch": 0.9481927710843373, + "grad_norm": 5.174851348793695, + "learning_rate": 7.319994192336922e-08, + "loss": 0.8795, + "step": 31480 + }, + { + "epoch": 0.9484939759036145, + "grad_norm": 1.9722186899695042, + "learning_rate": 7.235330694550402e-08, + "loss": 0.9215, + "step": 31490 + }, + { + "epoch": 0.9487951807228916, + "grad_norm": 4.67144163926662, + "learning_rate": 7.151156092098987e-08, + "loss": 0.9562, + "step": 31500 + }, + { + "epoch": 0.9490963855421687, + "grad_norm": 6.409503643827968, + "learning_rate": 7.067470468496207e-08, + "loss": 0.8968, + "step": 31510 + }, + { + "epoch": 0.9493975903614458, + "grad_norm": 5.000686619576055, + "learning_rate": 6.984273906770645e-08, + "loss": 0.7606, + "step": 31520 + }, + { + "epoch": 0.9496987951807229, + "grad_norm": 5.353730688646645, + "learning_rate": 6.901566489465494e-08, + "loss": 0.9252, + "step": 31530 + }, + { + "epoch": 0.95, + "grad_norm": 10.48665505738073, + "learning_rate": 6.819348298638839e-08, + "loss": 0.9965, + "step": 31540 + }, + { + "epoch": 0.9503012048192772, + "grad_norm": 5.496642102580685, + "learning_rate": 6.73761941586315e-08, + "loss": 0.9488, + "step": 31550 + }, + { + "epoch": 0.9506024096385542, + "grad_norm": 1.8154120551670836, + "learning_rate": 6.656379922225676e-08, + "loss": 0.8733, + "step": 31560 + }, + { + "epoch": 0.9509036144578313, + "grad_norm": 21.46219883385068, + "learning_rate": 6.575629898327884e-08, + "loss": 0.979, + "step": 31570 + }, + { + "epoch": 0.9512048192771084, + "grad_norm": 2.0278059526542354, + "learning_rate": 6.495369424285746e-08, + "loss": 0.9338, + "step": 31580 + }, + { + "epoch": 0.9515060240963855, + "grad_norm": 20.890020765453254, + "learning_rate": 6.415598579729566e-08, + "loss": 0.8763, + "step": 31590 + }, + { + "epoch": 0.9518072289156626, + "grad_norm": 5.253818125619825, + "learning_rate": 6.336317443803808e-08, + "loss": 0.9124, + "step": 31600 + }, + { + "epoch": 0.9521084337349398, + "grad_norm": 2.0196564665635317, + "learning_rate": 6.257526095166999e-08, + "loss": 0.7798, + "step": 31610 + }, + { + "epoch": 0.9524096385542169, + "grad_norm": 4.460652751428492, + "learning_rate": 6.179224611991774e-08, + "loss": 0.873, + "step": 31620 + }, + { + "epoch": 0.952710843373494, + "grad_norm": 4.5934344920094015, + "learning_rate": 6.101413071964879e-08, + "loss": 0.8855, + "step": 31630 + }, + { + "epoch": 0.9530120481927711, + "grad_norm": 5.692243288757897, + "learning_rate": 6.024091552286781e-08, + "loss": 0.9729, + "step": 31640 + }, + { + "epoch": 0.9533132530120482, + "grad_norm": 4.2118553333912745, + "learning_rate": 5.9472601296718945e-08, + "loss": 0.9201, + "step": 31650 + }, + { + "epoch": 0.9536144578313253, + "grad_norm": 5.00798606357242, + "learning_rate": 5.8709188803482973e-08, + "loss": 0.9454, + "step": 31660 + }, + { + "epoch": 0.9539156626506025, + "grad_norm": 5.478380218887263, + "learning_rate": 5.795067880057792e-08, + "loss": 0.8821, + "step": 31670 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 5.5916796675069484, + "learning_rate": 5.7197072040557356e-08, + "loss": 1.0141, + "step": 31680 + }, + { + "epoch": 0.9545180722891566, + "grad_norm": 2.082708147343465, + "learning_rate": 5.644836927111152e-08, + "loss": 0.8511, + "step": 31690 + }, + { + "epoch": 0.9548192771084337, + "grad_norm": 1.9356641257286196, + "learning_rate": 5.57045712350629e-08, + "loss": 0.9663, + "step": 31700 + }, + { + "epoch": 0.9551204819277108, + "grad_norm": 10.966520500300694, + "learning_rate": 5.496567867037006e-08, + "loss": 0.9417, + "step": 31710 + }, + { + "epoch": 0.9554216867469879, + "grad_norm": 4.067956182307647, + "learning_rate": 5.423169231012271e-08, + "loss": 0.9089, + "step": 31720 + }, + { + "epoch": 0.9557228915662651, + "grad_norm": 4.452756075880683, + "learning_rate": 5.350261288254499e-08, + "loss": 0.9967, + "step": 31730 + }, + { + "epoch": 0.9560240963855422, + "grad_norm": 5.582902630953903, + "learning_rate": 5.277844111098995e-08, + "loss": 0.9021, + "step": 31740 + }, + { + "epoch": 0.9563253012048193, + "grad_norm": 6.445885955031362, + "learning_rate": 5.205917771394398e-08, + "loss": 0.9789, + "step": 31750 + }, + { + "epoch": 0.9566265060240964, + "grad_norm": 4.251170884605689, + "learning_rate": 5.134482340502178e-08, + "loss": 1.0117, + "step": 31760 + }, + { + "epoch": 0.9569277108433735, + "grad_norm": 1.9909931612164995, + "learning_rate": 5.063537889296921e-08, + "loss": 0.9339, + "step": 31770 + }, + { + "epoch": 0.9572289156626506, + "grad_norm": 4.586901197703373, + "learning_rate": 4.993084488165933e-08, + "loss": 0.9644, + "step": 31780 + }, + { + "epoch": 0.9575301204819278, + "grad_norm": 4.2002467135643835, + "learning_rate": 4.923122207009412e-08, + "loss": 0.983, + "step": 31790 + }, + { + "epoch": 0.9578313253012049, + "grad_norm": 1.9544588383018127, + "learning_rate": 4.8536511152402766e-08, + "loss": 0.9102, + "step": 31800 + }, + { + "epoch": 0.9581325301204819, + "grad_norm": 4.585237811146019, + "learning_rate": 4.784671281784115e-08, + "loss": 0.7348, + "step": 31810 + }, + { + "epoch": 0.958433734939759, + "grad_norm": 2.065119496477985, + "learning_rate": 4.7161827750791254e-08, + "loss": 0.9061, + "step": 31820 + }, + { + "epoch": 0.9587349397590361, + "grad_norm": 4.734611038207768, + "learning_rate": 4.648185663075955e-08, + "loss": 0.9375, + "step": 31830 + }, + { + "epoch": 0.9590361445783132, + "grad_norm": 4.172196493271511, + "learning_rate": 4.580680013237915e-08, + "loss": 0.8531, + "step": 31840 + }, + { + "epoch": 0.9593373493975904, + "grad_norm": 2.1088456606552666, + "learning_rate": 4.513665892540486e-08, + "loss": 0.8898, + "step": 31850 + }, + { + "epoch": 0.9596385542168675, + "grad_norm": 2.1762155217981154, + "learning_rate": 4.44714336747154e-08, + "loss": 0.9557, + "step": 31860 + }, + { + "epoch": 0.9599397590361446, + "grad_norm": 4.08967158877884, + "learning_rate": 4.381112504031337e-08, + "loss": 0.8883, + "step": 31870 + }, + { + "epoch": 0.9602409638554217, + "grad_norm": 4.736864621834937, + "learning_rate": 4.3155733677321397e-08, + "loss": 0.9858, + "step": 31880 + }, + { + "epoch": 0.9605421686746988, + "grad_norm": 5.065952788450292, + "learning_rate": 4.2505260235986e-08, + "loss": 0.8857, + "step": 31890 + }, + { + "epoch": 0.9608433734939759, + "grad_norm": 4.572014718952944, + "learning_rate": 4.18597053616715e-08, + "loss": 0.9817, + "step": 31900 + }, + { + "epoch": 0.9611445783132531, + "grad_norm": 6.998044500990425, + "learning_rate": 4.121906969486444e-08, + "loss": 0.9341, + "step": 31910 + }, + { + "epoch": 0.9614457831325302, + "grad_norm": 5.199759990994423, + "learning_rate": 4.058335387117029e-08, + "loss": 0.8573, + "step": 31920 + }, + { + "epoch": 0.9617469879518072, + "grad_norm": 1.9173809977988288, + "learning_rate": 3.9952558521311746e-08, + "loss": 0.8957, + "step": 31930 + }, + { + "epoch": 0.9620481927710843, + "grad_norm": 1.9320919093829663, + "learning_rate": 3.932668427113262e-08, + "loss": 0.9187, + "step": 31940 + }, + { + "epoch": 0.9623493975903614, + "grad_norm": 8.590303173871973, + "learning_rate": 3.87057317415912e-08, + "loss": 0.8842, + "step": 31950 + }, + { + "epoch": 0.9626506024096385, + "grad_norm": 6.829054113107583, + "learning_rate": 3.8089701548765237e-08, + "loss": 0.9649, + "step": 31960 + }, + { + "epoch": 0.9629518072289157, + "grad_norm": 5.002661137745544, + "learning_rate": 3.747859430384637e-08, + "loss": 0.9588, + "step": 31970 + }, + { + "epoch": 0.9632530120481928, + "grad_norm": 1.833743228665942, + "learning_rate": 3.687241061314462e-08, + "loss": 0.8163, + "step": 31980 + }, + { + "epoch": 0.9635542168674699, + "grad_norm": 3.858606939722687, + "learning_rate": 3.627115107808332e-08, + "loss": 0.9026, + "step": 31990 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 5.733580986982268, + "learning_rate": 3.567481629520086e-08, + "loss": 0.9104, + "step": 32000 + }, + { + "epoch": 0.9641566265060241, + "grad_norm": 1.916862277391112, + "learning_rate": 3.508340685614842e-08, + "loss": 0.8529, + "step": 32010 + }, + { + "epoch": 0.9644578313253012, + "grad_norm": 2.081678156309162, + "learning_rate": 3.449692334769217e-08, + "loss": 0.9556, + "step": 32020 + }, + { + "epoch": 0.9647590361445784, + "grad_norm": 6.252452685789378, + "learning_rate": 3.391536635171055e-08, + "loss": 0.8811, + "step": 32030 + }, + { + "epoch": 0.9650602409638555, + "grad_norm": 14.46103305347846, + "learning_rate": 3.333873644519425e-08, + "loss": 0.9338, + "step": 32040 + }, + { + "epoch": 0.9653614457831325, + "grad_norm": 5.190856685469573, + "learning_rate": 3.2767034200245076e-08, + "loss": 0.9446, + "step": 32050 + }, + { + "epoch": 0.9656626506024096, + "grad_norm": 4.403507020006852, + "learning_rate": 3.220026018407541e-08, + "loss": 0.9359, + "step": 32060 + }, + { + "epoch": 0.9659638554216867, + "grad_norm": 4.787761078065262, + "learning_rate": 3.163841495900988e-08, + "loss": 0.9338, + "step": 32070 + }, + { + "epoch": 0.9662650602409638, + "grad_norm": 1.9184515601386845, + "learning_rate": 3.108149908248093e-08, + "loss": 0.9012, + "step": 32080 + }, + { + "epoch": 0.966566265060241, + "grad_norm": 2.0031446249235, + "learning_rate": 3.052951310703212e-08, + "loss": 0.8868, + "step": 32090 + }, + { + "epoch": 0.9668674698795181, + "grad_norm": 4.95976830218123, + "learning_rate": 2.998245758031482e-08, + "loss": 0.9571, + "step": 32100 + }, + { + "epoch": 0.9671686746987952, + "grad_norm": 5.087747737811243, + "learning_rate": 2.9440333045088753e-08, + "loss": 0.9274, + "step": 32110 + }, + { + "epoch": 0.9674698795180723, + "grad_norm": 1.9009154478365193, + "learning_rate": 2.8903140039222012e-08, + "loss": 0.8484, + "step": 32120 + }, + { + "epoch": 0.9677710843373494, + "grad_norm": 8.640822327018284, + "learning_rate": 2.8370879095688807e-08, + "loss": 0.9061, + "step": 32130 + }, + { + "epoch": 0.9680722891566265, + "grad_norm": 4.714580331834582, + "learning_rate": 2.7843550742570614e-08, + "loss": 0.9258, + "step": 32140 + }, + { + "epoch": 0.9683734939759037, + "grad_norm": 5.033035821592232, + "learning_rate": 2.7321155503055584e-08, + "loss": 0.9246, + "step": 32150 + }, + { + "epoch": 0.9686746987951808, + "grad_norm": 5.628529182506936, + "learning_rate": 2.680369389543691e-08, + "loss": 0.9865, + "step": 32160 + }, + { + "epoch": 0.9689759036144578, + "grad_norm": 10.175220114494271, + "learning_rate": 2.629116643311225e-08, + "loss": 0.9374, + "step": 32170 + }, + { + "epoch": 0.9692771084337349, + "grad_norm": 6.520144800887773, + "learning_rate": 2.578357362458539e-08, + "loss": 0.8914, + "step": 32180 + }, + { + "epoch": 0.969578313253012, + "grad_norm": 1.9414490245596985, + "learning_rate": 2.5280915973462385e-08, + "loss": 0.8121, + "step": 32190 + }, + { + "epoch": 0.9698795180722891, + "grad_norm": 5.199655962184366, + "learning_rate": 2.4783193978454857e-08, + "loss": 0.9901, + "step": 32200 + }, + { + "epoch": 0.9701807228915663, + "grad_norm": 7.304053814674291, + "learning_rate": 2.429040813337613e-08, + "loss": 0.947, + "step": 32210 + }, + { + "epoch": 0.9704819277108434, + "grad_norm": 4.89995129035935, + "learning_rate": 2.380255892714234e-08, + "loss": 0.8451, + "step": 32220 + }, + { + "epoch": 0.9707831325301205, + "grad_norm": 6.366477750083791, + "learning_rate": 2.3319646843772416e-08, + "loss": 0.9329, + "step": 32230 + }, + { + "epoch": 0.9710843373493976, + "grad_norm": 7.3058390489962886, + "learning_rate": 2.2841672362386435e-08, + "loss": 0.9899, + "step": 32240 + }, + { + "epoch": 0.9713855421686747, + "grad_norm": 7.049171259139614, + "learning_rate": 2.236863595720562e-08, + "loss": 0.9625, + "step": 32250 + }, + { + "epoch": 0.9716867469879518, + "grad_norm": 7.964210915652266, + "learning_rate": 2.1900538097551217e-08, + "loss": 0.9222, + "step": 32260 + }, + { + "epoch": 0.971987951807229, + "grad_norm": 6.530625776104598, + "learning_rate": 2.143737924784617e-08, + "loss": 0.913, + "step": 32270 + }, + { + "epoch": 0.9722891566265061, + "grad_norm": 5.324411150574649, + "learning_rate": 2.097915986761234e-08, + "loss": 0.91, + "step": 32280 + }, + { + "epoch": 0.9725903614457831, + "grad_norm": 9.723645218471404, + "learning_rate": 2.052588041147108e-08, + "loss": 0.9928, + "step": 32290 + }, + { + "epoch": 0.9728915662650602, + "grad_norm": 5.943995608444509, + "learning_rate": 2.0077541329142636e-08, + "loss": 0.9234, + "step": 32300 + }, + { + "epoch": 0.9731927710843373, + "grad_norm": 1.851216045599393, + "learning_rate": 1.9634143065445645e-08, + "loss": 0.9536, + "step": 32310 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 5.6125492081688, + "learning_rate": 1.9195686060295982e-08, + "loss": 0.9597, + "step": 32320 + }, + { + "epoch": 0.9737951807228916, + "grad_norm": 4.996823292904083, + "learning_rate": 1.8762170748708454e-08, + "loss": 0.8892, + "step": 32330 + }, + { + "epoch": 0.9740963855421687, + "grad_norm": 5.104708268626902, + "learning_rate": 1.833359756079456e-08, + "loss": 1.0204, + "step": 32340 + }, + { + "epoch": 0.9743975903614458, + "grad_norm": 7.326232584172267, + "learning_rate": 1.790996692176139e-08, + "loss": 0.8854, + "step": 32350 + }, + { + "epoch": 0.9746987951807229, + "grad_norm": 1.8982627522393387, + "learning_rate": 1.7491279251913297e-08, + "loss": 0.926, + "step": 32360 + }, + { + "epoch": 0.975, + "grad_norm": 1.961582971142859, + "learning_rate": 1.7077534966650767e-08, + "loss": 0.9338, + "step": 32370 + }, + { + "epoch": 0.9753012048192771, + "grad_norm": 7.204296466118696, + "learning_rate": 1.6668734476468775e-08, + "loss": 0.9141, + "step": 32380 + }, + { + "epoch": 0.9756024096385543, + "grad_norm": 4.445754732635552, + "learning_rate": 1.6264878186958434e-08, + "loss": 0.9337, + "step": 32390 + }, + { + "epoch": 0.9759036144578314, + "grad_norm": 4.664311706562546, + "learning_rate": 1.5865966498803674e-08, + "loss": 0.9858, + "step": 32400 + }, + { + "epoch": 0.9762048192771084, + "grad_norm": 5.723175316978166, + "learning_rate": 1.5471999807784578e-08, + "loss": 0.8587, + "step": 32410 + }, + { + "epoch": 0.9765060240963855, + "grad_norm": 6.969543067650618, + "learning_rate": 1.508297850477458e-08, + "loss": 0.9664, + "step": 32420 + }, + { + "epoch": 0.9768072289156626, + "grad_norm": 1.7843877358981983, + "learning_rate": 1.4698902975739948e-08, + "loss": 0.9528, + "step": 32430 + }, + { + "epoch": 0.9771084337349397, + "grad_norm": 1.9878703513714073, + "learning_rate": 1.431977360173975e-08, + "loss": 0.8372, + "step": 32440 + }, + { + "epoch": 0.9774096385542169, + "grad_norm": 5.725179353496761, + "learning_rate": 1.3945590758927541e-08, + "loss": 0.888, + "step": 32450 + }, + { + "epoch": 0.977710843373494, + "grad_norm": 5.246160467460025, + "learning_rate": 1.357635481854691e-08, + "loss": 0.9842, + "step": 32460 + }, + { + "epoch": 0.9780120481927711, + "grad_norm": 6.26476584244363, + "learning_rate": 1.3212066146934267e-08, + "loss": 0.9102, + "step": 32470 + }, + { + "epoch": 0.9783132530120482, + "grad_norm": 3.957553293059344, + "learning_rate": 1.2852725105519382e-08, + "loss": 0.9382, + "step": 32480 + }, + { + "epoch": 0.9786144578313253, + "grad_norm": 2.018288924985573, + "learning_rate": 1.2498332050820406e-08, + "loss": 0.9859, + "step": 32490 + }, + { + "epoch": 0.9789156626506024, + "grad_norm": 5.731505873250158, + "learning_rate": 1.214888733444719e-08, + "loss": 0.9666, + "step": 32500 + }, + { + "epoch": 0.9792168674698796, + "grad_norm": 4.205672805287298, + "learning_rate": 1.1804391303101292e-08, + "loss": 0.9027, + "step": 32510 + }, + { + "epoch": 0.9795180722891567, + "grad_norm": 1.9724752924746713, + "learning_rate": 1.1464844298573752e-08, + "loss": 0.8686, + "step": 32520 + }, + { + "epoch": 0.9798192771084338, + "grad_norm": 2.04575420580311, + "learning_rate": 1.1130246657745092e-08, + "loss": 0.7719, + "step": 32530 + }, + { + "epoch": 0.9801204819277108, + "grad_norm": 6.5790252772524465, + "learning_rate": 1.080059871258532e-08, + "loss": 0.8899, + "step": 32540 + }, + { + "epoch": 0.9804216867469879, + "grad_norm": 1.9387316441227798, + "learning_rate": 1.047590079015448e-08, + "loss": 0.8163, + "step": 32550 + }, + { + "epoch": 0.980722891566265, + "grad_norm": 1.9019712434002312, + "learning_rate": 1.015615321260044e-08, + "loss": 0.9109, + "step": 32560 + }, + { + "epoch": 0.9810240963855422, + "grad_norm": 5.366533735264594, + "learning_rate": 9.841356297159988e-09, + "loss": 0.8855, + "step": 32570 + }, + { + "epoch": 0.9813253012048193, + "grad_norm": 6.614995026563534, + "learning_rate": 9.531510356158291e-09, + "loss": 0.8818, + "step": 32580 + }, + { + "epoch": 0.9816265060240964, + "grad_norm": 5.51945953883837, + "learning_rate": 9.226615697008335e-09, + "loss": 0.9751, + "step": 32590 + }, + { + "epoch": 0.9819277108433735, + "grad_norm": 6.390542965586899, + "learning_rate": 8.926672622210365e-09, + "loss": 0.9374, + "step": 32600 + }, + { + "epoch": 0.9822289156626506, + "grad_norm": 7.353054114327854, + "learning_rate": 8.631681429353e-09, + "loss": 0.8522, + "step": 32610 + }, + { + "epoch": 0.9825301204819277, + "grad_norm": 16.68028728382817, + "learning_rate": 8.341642411109907e-09, + "loss": 0.878, + "step": 32620 + }, + { + "epoch": 0.9828313253012049, + "grad_norm": 7.066433488752781, + "learning_rate": 8.056555855243675e-09, + "loss": 0.9412, + "step": 32630 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 4.135364704257834, + "learning_rate": 7.77642204460194e-09, + "loss": 0.9071, + "step": 32640 + }, + { + "epoch": 0.983433734939759, + "grad_norm": 2.1166164025756977, + "learning_rate": 7.501241257118485e-09, + "loss": 0.9534, + "step": 32650 + }, + { + "epoch": 0.9837349397590361, + "grad_norm": 1.8835170239698753, + "learning_rate": 7.2310137658138104e-09, + "loss": 0.8746, + "step": 32660 + }, + { + "epoch": 0.9840361445783132, + "grad_norm": 5.390526178714944, + "learning_rate": 6.9657398387928954e-09, + "loss": 0.7723, + "step": 32670 + }, + { + "epoch": 0.9843373493975903, + "grad_norm": 4.587666920735826, + "learning_rate": 6.705419739246876e-09, + "loss": 0.9205, + "step": 32680 + }, + { + "epoch": 0.9846385542168675, + "grad_norm": 2.038842973204118, + "learning_rate": 6.450053725452488e-09, + "loss": 0.8732, + "step": 32690 + }, + { + "epoch": 0.9849397590361446, + "grad_norm": 1.9878965511900648, + "learning_rate": 6.199642050769283e-09, + "loss": 0.9325, + "step": 32700 + }, + { + "epoch": 0.9852409638554217, + "grad_norm": 17.11659900685862, + "learning_rate": 5.954184963644083e-09, + "loss": 0.7917, + "step": 32710 + }, + { + "epoch": 0.9855421686746988, + "grad_norm": 8.021908437912327, + "learning_rate": 5.713682707604862e-09, + "loss": 0.9266, + "step": 32720 + }, + { + "epoch": 0.9858433734939759, + "grad_norm": 6.410494503733885, + "learning_rate": 5.478135521266859e-09, + "loss": 1.022, + "step": 32730 + }, + { + "epoch": 0.986144578313253, + "grad_norm": 2.0533458036334253, + "learning_rate": 5.247543638327024e-09, + "loss": 0.8769, + "step": 32740 + }, + { + "epoch": 0.9864457831325302, + "grad_norm": 6.70943462401911, + "learning_rate": 5.021907287566241e-09, + "loss": 0.9252, + "step": 32750 + }, + { + "epoch": 0.9867469879518073, + "grad_norm": 4.3351846787495845, + "learning_rate": 4.801226692849881e-09, + "loss": 0.9755, + "step": 32760 + }, + { + "epoch": 0.9870481927710844, + "grad_norm": 5.899850504536801, + "learning_rate": 4.585502073125026e-09, + "loss": 0.8919, + "step": 32770 + }, + { + "epoch": 0.9873493975903614, + "grad_norm": 4.026783699486491, + "learning_rate": 4.37473364242269e-09, + "loss": 0.8622, + "step": 32780 + }, + { + "epoch": 0.9876506024096385, + "grad_norm": 2.059837098189508, + "learning_rate": 4.168921609856158e-09, + "loss": 0.922, + "step": 32790 + }, + { + "epoch": 0.9879518072289156, + "grad_norm": 5.903112831821014, + "learning_rate": 3.968066179621533e-09, + "loss": 0.9151, + "step": 32800 + }, + { + "epoch": 0.9882530120481928, + "grad_norm": 6.380133537258821, + "learning_rate": 3.7721675509971855e-09, + "loss": 0.9358, + "step": 32810 + }, + { + "epoch": 0.9885542168674699, + "grad_norm": 4.691649478423261, + "learning_rate": 3.5812259183426457e-09, + "loss": 0.8861, + "step": 32820 + }, + { + "epoch": 0.988855421686747, + "grad_norm": 2.020453275973914, + "learning_rate": 3.3952414711013737e-09, + "loss": 0.9482, + "step": 32830 + }, + { + "epoch": 0.9891566265060241, + "grad_norm": 4.887013192306234, + "learning_rate": 3.2142143937968774e-09, + "loss": 1.0159, + "step": 32840 + }, + { + "epoch": 0.9894578313253012, + "grad_norm": 4.478210861833627, + "learning_rate": 3.0381448660343758e-09, + "loss": 0.9274, + "step": 32850 + }, + { + "epoch": 0.9897590361445783, + "grad_norm": 5.917669075335433, + "learning_rate": 2.8670330625019117e-09, + "loss": 0.7962, + "step": 32860 + }, + { + "epoch": 0.9900602409638555, + "grad_norm": 6.769969376664644, + "learning_rate": 2.7008791529664624e-09, + "loss": 0.9538, + "step": 32870 + }, + { + "epoch": 0.9903614457831326, + "grad_norm": 5.406044559926571, + "learning_rate": 2.539683302277829e-09, + "loss": 0.9612, + "step": 32880 + }, + { + "epoch": 0.9906626506024097, + "grad_norm": 1.8584573720910964, + "learning_rate": 2.3834456703658583e-09, + "loss": 0.8362, + "step": 32890 + }, + { + "epoch": 0.9909638554216867, + "grad_norm": 8.331550664394987, + "learning_rate": 2.232166412241554e-09, + "loss": 0.8863, + "step": 32900 + }, + { + "epoch": 0.9912650602409638, + "grad_norm": 4.381243677666166, + "learning_rate": 2.085845677996523e-09, + "loss": 0.9229, + "step": 32910 + }, + { + "epoch": 0.9915662650602409, + "grad_norm": 7.120700168343828, + "learning_rate": 1.944483612802417e-09, + "loss": 0.9992, + "step": 32920 + }, + { + "epoch": 0.9918674698795181, + "grad_norm": 2.1293013436985726, + "learning_rate": 1.8080803569109351e-09, + "loss": 0.8772, + "step": 32930 + }, + { + "epoch": 0.9921686746987952, + "grad_norm": 2.0645965470437555, + "learning_rate": 1.6766360456549335e-09, + "loss": 0.9249, + "step": 32940 + }, + { + "epoch": 0.9924698795180723, + "grad_norm": 2.0943255183398923, + "learning_rate": 1.5501508094456496e-09, + "loss": 0.8664, + "step": 32950 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 5.175432571590235, + "learning_rate": 1.4286247737754776e-09, + "loss": 0.8166, + "step": 32960 + }, + { + "epoch": 0.9930722891566265, + "grad_norm": 6.522363733828929, + "learning_rate": 1.3120580592168586e-09, + "loss": 0.897, + "step": 32970 + }, + { + "epoch": 0.9933734939759036, + "grad_norm": 1.9844911119178683, + "learning_rate": 1.2004507814200595e-09, + "loss": 0.945, + "step": 32980 + }, + { + "epoch": 0.9936746987951808, + "grad_norm": 5.505876329498268, + "learning_rate": 1.0938030511170595e-09, + "loss": 0.8917, + "step": 32990 + }, + { + "epoch": 0.9939759036144579, + "grad_norm": 24.282183839792587, + "learning_rate": 9.92114974117664e-10, + "loss": 0.9364, + "step": 33000 + }, + { + "epoch": 0.994277108433735, + "grad_norm": 9.23266439400529, + "learning_rate": 8.953866513111698e-10, + "loss": 0.9321, + "step": 33010 + }, + { + "epoch": 0.994578313253012, + "grad_norm": 4.571343950740616, + "learning_rate": 8.036181786669206e-10, + "loss": 0.916, + "step": 33020 + }, + { + "epoch": 0.9948795180722891, + "grad_norm": 9.543958338609684, + "learning_rate": 7.168096472326413e-10, + "loss": 0.9559, + "step": 33030 + }, + { + "epoch": 0.9951807228915662, + "grad_norm": 3.926562177985658, + "learning_rate": 6.349611431349934e-10, + "loss": 0.9291, + "step": 33040 + }, + { + "epoch": 0.9954819277108434, + "grad_norm": 5.010326811827543, + "learning_rate": 5.580727475795744e-10, + "loss": 1.0484, + "step": 33050 + }, + { + "epoch": 0.9957831325301205, + "grad_norm": 7.221217099561627, + "learning_rate": 4.861445368514739e-10, + "loss": 0.9055, + "step": 33060 + }, + { + "epoch": 0.9960843373493976, + "grad_norm": 5.002598418152769, + "learning_rate": 4.1917658231416247e-10, + "loss": 0.9872, + "step": 33070 + }, + { + "epoch": 0.9963855421686747, + "grad_norm": 6.469774961883145, + "learning_rate": 3.571689504100473e-10, + "loss": 0.8994, + "step": 33080 + }, + { + "epoch": 0.9966867469879518, + "grad_norm": 2.218287374798166, + "learning_rate": 3.0012170265880656e-10, + "loss": 0.8879, + "step": 33090 + }, + { + "epoch": 0.9969879518072289, + "grad_norm": 3.8477052272546644, + "learning_rate": 2.4803489566016526e-10, + "loss": 0.8326, + "step": 33100 + }, + { + "epoch": 0.9972891566265061, + "grad_norm": 27.923816696036397, + "learning_rate": 2.0090858109278465e-10, + "loss": 0.9737, + "step": 33110 + }, + { + "epoch": 0.9975903614457832, + "grad_norm": 4.693803475151824, + "learning_rate": 1.5874280571148704e-10, + "loss": 0.9478, + "step": 33120 + }, + { + "epoch": 0.9978915662650603, + "grad_norm": 4.523075656820095, + "learning_rate": 1.215376113522515e-10, + "loss": 1.0148, + "step": 33130 + }, + { + "epoch": 0.9981927710843373, + "grad_norm": 17.090115553209188, + "learning_rate": 8.929303492721808e-11, + "loss": 0.8852, + "step": 33140 + }, + { + "epoch": 0.9984939759036144, + "grad_norm": 8.636003886826249, + "learning_rate": 6.200910842801833e-11, + "loss": 0.8452, + "step": 33150 + }, + { + "epoch": 0.9987951807228915, + "grad_norm": 4.263752612525351, + "learning_rate": 3.968585892466514e-11, + "loss": 0.7981, + "step": 33160 + }, + { + "epoch": 0.9990963855421687, + "grad_norm": 5.731074372657009, + "learning_rate": 2.232330856499765e-11, + "loss": 0.9671, + "step": 33170 + }, + { + "epoch": 0.9993975903614458, + "grad_norm": 2.1559076434496554, + "learning_rate": 9.921474574681179e-12, + "loss": 0.9056, + "step": 33180 + }, + { + "epoch": 0.9996987951807229, + "grad_norm": 6.4165937373810955, + "learning_rate": 2.480369258872628e-12, + "loss": 0.9025, + "step": 33190 + }, + { + "epoch": 1.0, + "grad_norm": 5.918614350791673, + "learning_rate": 0.0, + "loss": 1.0485, + "step": 33200 + }, + { + "epoch": 1.0, + "step": 33200, + "total_flos": 1.6067564005752832e+16, + "train_loss": 0.0, + "train_runtime": 5.1234, + "train_samples_per_second": 829462.23, + "train_steps_per_second": 6480.076 + } + ], + "logging_steps": 10, + "max_steps": 33200, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3320, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6067564005752832e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}