{
"epoch": 0.9991671471586905,
"global_step": 1114,
"max_steps": 1114,
"logging_steps": 1,
"eval_steps": 50,
"save_steps": 50,
"train_batch_size": 8,
"num_train_epochs": 1,
"num_input_tokens_seen": 0,
"total_flos": 6.811715592467251e+17,
"log_history": [
{
"loss": 440.6308,
"grad_norm": 98.61355590820312,
"learning_rate": 0.0004999990058793643,
"epoch": 0.0008969184444871549,
"step": 1
},
{
"loss": 515.3978,
"grad_norm": 1112.0234375,
"learning_rate": 0.0004999960235253631,
"epoch": 0.0017938368889743098,
"step": 2
},
{
"loss": 477.4767,
"grad_norm": 392.8102722167969,
"learning_rate": 0.0004999910529617153,
"epoch": 0.0026907553334614646,
"step": 3
},
{
"loss": 457.2771,
"grad_norm": 292.9400939941406,
"learning_rate": 0.0004999840942279514,
"epoch": 0.0035876737779486196,
"step": 4
},
{
"loss": 444.411,
"grad_norm": 166.66598510742188,
"learning_rate": 0.000499975147379414,
"epoch": 0.004484592222435775,
"step": 5
},
{
"loss": 438.7729,
"grad_norm": 132.8984375,
"learning_rate": 0.000499964212487257,
"epoch": 0.005381510666922929,
"step": 6
},
{
"loss": 434.4058,
"grad_norm": 102.88407135009766,
"learning_rate": 0.0004999512896384454,
"epoch": 0.006278429111410084,
"step": 7
},
{
"loss": 431.3428,
"grad_norm": 109.61495971679688,
"learning_rate": 0.0004999363789357541,
"epoch": 0.007175347555897239,
"step": 8
},
{
"loss": 430.6904,
"grad_norm": 94.442626953125,
"learning_rate": 0.0004999194804977674,
"epoch": 0.008072266000384394,
"step": 9
},
{
"loss": 427.7128,
"grad_norm": 79.10123443603516,
"learning_rate": 0.0004999005944588778,
"epoch": 0.00896918444487155,
"step": 10
},
{
"loss": 430.6295,
"grad_norm": 81.77398681640625,
"learning_rate": 0.0004998797209692856,
"epoch": 0.009866102889358703,
"step": 11
},
{
"loss": 422.6068,
"grad_norm": 67.85909271240234,
"learning_rate": 0.0004998568601949967,
"epoch": 0.010763021333845858,
"step": 12
},
{
"loss": 422.3798,
"grad_norm": 81.51007843017578,
"learning_rate": 0.0004998320123178223,
"epoch": 0.011659939778333014,
"step": 13
},
{
"loss": 423.3609,
"grad_norm": 70.7045669555664,
"learning_rate": 0.0004998051775353763,
"epoch": 0.012556858222820167,
"step": 14
},
{
"loss": 423.232,
"grad_norm": 75.2995834350586,
"learning_rate": 0.0004997763560610752,
"epoch": 0.013453776667307323,
"step": 15
},
{
"loss": 414.7621,
"grad_norm": 63.627197265625,
"learning_rate": 0.000499745548124135,
"epoch": 0.014350695111794478,
"step": 16
},
{
"loss": 419.0351,
"grad_norm": 73.96087646484375,
"learning_rate": 0.0004997127539695701,
"epoch": 0.015247613556281632,
"step": 17
},
{
"loss": 418.1977,
"grad_norm": 70.3633804321289,
"learning_rate": 0.0004996779738581913,
"epoch": 0.016144532000768787,
"step": 18
},
{
"loss": 416.1606,
"grad_norm": 74.2279052734375,
"learning_rate": 0.0004996412080666037,
"epoch": 0.017041450445255943,
"step": 19
},
{
"loss": 417.2284,
"grad_norm": 63.311676025390625,
"learning_rate": 0.0004996024568872042,
"epoch": 0.0179383688897431,
"step": 20
},
{
"loss": 409.5278,
"grad_norm": 63.21588897705078,
"learning_rate": 0.0004995617206281797,
"epoch": 0.01883528733423025,
"step": 21
},
{
"loss": 414.1958,
"grad_norm": 61.4863395690918,
"learning_rate": 0.0004995189996135042,
"epoch": 0.019732205778717406,
"step": 22
},
{
"loss": 419.7891,
"grad_norm": 61.297481536865234,
"learning_rate": 0.0004994742941829364,
"epoch": 0.02062912422320456,
"step": 23
},
{
"loss": 414.3831,
"grad_norm": 68.20845031738281,
"learning_rate": 0.0004994276046920171,
"epoch": 0.021526042667691717,
"step": 24
},
{
"loss": 415.8848,
"grad_norm": 59.016239166259766,
"learning_rate": 0.0004993789315120662,
"epoch": 0.022422961112178872,
"step": 25
},
{
"loss": 417.4357,
"grad_norm": 55.90328598022461,
"learning_rate": 0.0004993282750301799,
"epoch": 0.023319879556666027,
"step": 26
},
{
"loss": 411.6564,
"grad_norm": 59.52859115600586,
"learning_rate": 0.000499275635649227,
"epoch": 0.02421679800115318,
"step": 27
},
{
"loss": 412.2451,
"grad_norm": 59.61384963989258,
"learning_rate": 0.0004992210137878472,
"epoch": 0.025113716445640335,
"step": 28
},
{
"loss": 416.412,
"grad_norm": 60.00177001953125,
"learning_rate": 0.000499164409880446,
"epoch": 0.02601063489012749,
"step": 29
},
{
"loss": 405.7923,
"grad_norm": 59.08831024169922,
"learning_rate": 0.0004991058243771922,
"epoch": 0.026907553334614646,
"step": 30
},
{
"loss": 411.6278,
"grad_norm": 58.00886154174805,
"learning_rate": 0.0004990452577440143,
"epoch": 0.0278044717791018,
"step": 31
},
{
"loss": 406.3222,
"grad_norm": 57.3386116027832,
"learning_rate": 0.0004989827104625969,
"epoch": 0.028701390223588957,
"step": 32
},
{
"loss": 404.9872,
"grad_norm": 56.013816833496094,
"learning_rate": 0.000498918183030376,
"epoch": 0.02959830866807611,
"step": 33
},
{
"loss": 406.4626,
"grad_norm": 57.787132263183594,
"learning_rate": 0.0004988516759605363,
"epoch": 0.030495227112563264,
"step": 34
},
{
"loss": 405.2309,
"grad_norm": 54.9903678894043,
"learning_rate": 0.0004987831897820059,
"epoch": 0.03139214555705042,
"step": 35
},
{
"loss": 415.0021,
"grad_norm": 55.86436462402344,
"learning_rate": 0.0004987127250394532,
"epoch": 0.032289064001537575,
"step": 36
},
{
"loss": 402.1766,
"grad_norm": 53.72284698486328,
"learning_rate": 0.0004986402822932818,
"epoch": 0.03318598244602473,
"step": 37
},
{
"loss": 409.7162,
"grad_norm": 56.52421569824219,
"learning_rate": 0.0004985658621196263,
"epoch": 0.034082900890511886,
"step": 38
},
{
"loss": 406.8592,
"grad_norm": 63.26171875,
"learning_rate": 0.0004984894651103478,
"epoch": 0.03497981933499904,
"step": 39
},
{
"loss": 401.9672,
"grad_norm": 52.98197937011719,
"learning_rate": 0.0004984110918730289,
"epoch": 0.0358767377794862,
"step": 40
},
{
"loss": 402.0731,
"grad_norm": 61.255733489990234,
"learning_rate": 0.0004983307430309695,
"epoch": 0.03677365622397335,
"step": 41
},
{
"loss": 405.9777,
"grad_norm": 62.212188720703125,
"learning_rate": 0.0004982484192231808,
"epoch": 0.0376705746684605,
"step": 42
},
{
"loss": 409.4884,
"grad_norm": 60.04124450683594,
"learning_rate": 0.0004981641211043813,
"epoch": 0.03856749311294766,
"step": 43
},
{
"loss": 402.7691,
"grad_norm": 58.80691909790039,
"learning_rate": 0.0004980778493449912,
"epoch": 0.03946441155743481,
"step": 44
},
{
"loss": 406.07,
"grad_norm": 58.074493408203125,
"learning_rate": 0.0004979896046311265,
"epoch": 0.04036133000192197,
"step": 45
},
{
"loss": 406.7423,
"grad_norm": 62.749534606933594,
"learning_rate": 0.0004978993876645944,
"epoch": 0.04125824844640912,
"step": 46
},
{
"loss": 403.2931,
"grad_norm": 58.47712707519531,
"learning_rate": 0.0004978071991628874,
"epoch": 0.04215516689089628,
"step": 47
},
{
"loss": 402.5574,
"grad_norm": 64.82901000976562,
"learning_rate": 0.0004977130398591775,
"epoch": 0.04305208533538343,
"step": 48
},
{
"loss": 405.5097,
"grad_norm": 56.95109939575195,
"learning_rate": 0.00049761691050231,
"epoch": 0.043949003779870585,
"step": 49
},
{
"loss": 408.4274,
"grad_norm": 60.67522048950195,
"learning_rate": 0.0004975188118567987,
"epoch": 0.044845922224357744,
"step": 50
},
{
"eval_loss": 1.7932980060577393,
"eval_runtime": 41.7475,
"eval_samples_per_second": 49.057,
"eval_steps_per_second": 3.066,
"epoch": 0.044845922224357744,
"step": 50
},
{
"loss": 405.2191,
"grad_norm": 61.441951751708984,
"learning_rate": 0.0004974187447028184,
"epoch": 0.045742840668844896,
"step": 51
},
{
"loss": 402.9874,
"grad_norm": 56.64131546020508,
"learning_rate": 0.0004973167098361999,
"epoch": 0.046639759113332055,
"step": 52
},
{
"loss": 403.7462,
"grad_norm": 58.905479431152344,
"learning_rate": 0.0004972127080684228,
"epoch": 0.04753667755781921,
"step": 53
},
{
"loss": 402.2606,
"grad_norm": 60.9106559753418,
"learning_rate": 0.0004971067402266096,
"epoch": 0.04843359600230636,
"step": 54
},
{
"loss": 397.4493,
"grad_norm": 55.347869873046875,
"learning_rate": 0.0004969988071535188,
"epoch": 0.04933051444679352,
"step": 55
},
{
"loss": 398.7716,
"grad_norm": 56.816104888916016,
"learning_rate": 0.0004968889097075385,
"epoch": 0.05022743289128067,
"step": 56
},
{
"loss": 399.2036,
"grad_norm": 63.388851165771484,
"learning_rate": 0.0004967770487626791,
"epoch": 0.05112435133576783,
"step": 57
},
{
"loss": 402.6399,
"grad_norm": 58.803466796875,
"learning_rate": 0.0004966632252085668,
"epoch": 0.05202126978025498,
"step": 58
},
{
"loss": 401.2329,
"grad_norm": 61.42218780517578,
"learning_rate": 0.0004965474399504364,
"epoch": 0.05291818822474213,
"step": 59
},
{
"loss": 394.491,
"grad_norm": 54.581748962402344,
"learning_rate": 0.000496429693909124,
"epoch": 0.05381510666922929,
"step": 60
},
{
"loss": 402.2176,
"grad_norm": 60.348812103271484,
"learning_rate": 0.0004963099880210597,
"epoch": 0.05471202511371644,
"step": 61
},
{
"loss": 401.5288,
"grad_norm": 58.51568603515625,
"learning_rate": 0.0004961883232382603,
"epoch": 0.0556089435582036,
"step": 62
},
{
"loss": 402.1975,
"grad_norm": 53.891822814941406,
"learning_rate": 0.0004960647005283217,
"epoch": 0.056505862002690754,
"step": 63
},
{
"loss": 402.8554,
"grad_norm": 54.66781234741211,
"learning_rate": 0.0004959391208744108,
"epoch": 0.05740278044717791,
"step": 64
},
{
"loss": 397.2245,
"grad_norm": 57.83986282348633,
"learning_rate": 0.0004958115852752582,
"epoch": 0.058299698891665065,
"step": 65
},
{
"loss": 398.295,
"grad_norm": 56.6056022644043,
"learning_rate": 0.0004956820947451502,
"epoch": 0.05919661733615222,
"step": 66
},
{
"loss": 398.1401,
"grad_norm": 58.830711364746094,
"learning_rate": 0.0004955506503139204,
"epoch": 0.060093535780639376,
"step": 67
},
{
"loss": 401.4149,
"grad_norm": 54.770755767822266,
"learning_rate": 0.0004954172530269418,
"epoch": 0.06099045422512653,
"step": 68
},
{
"loss": 399.5218,
"grad_norm": 59.45661926269531,
"learning_rate": 0.0004952819039451183,
"epoch": 0.06188737266961369,
"step": 69
},
{
"loss": 396.4537,
"grad_norm": 53.4246826171875,
"learning_rate": 0.0004951446041448765,
"epoch": 0.06278429111410085,
"step": 70
},
{
"loss": 401.2764,
"grad_norm": 55.125919342041016,
"learning_rate": 0.0004950053547181568,
"epoch": 0.063681209558588,
"step": 71
},
{
"loss": 400.9092,
"grad_norm": 63.59549331665039,
"learning_rate": 0.0004948641567724053,
"epoch": 0.06457812800307515,
"step": 72
},
{
"loss": 397.1968,
"grad_norm": 58.40228271484375,
"learning_rate": 0.0004947210114305639,
"epoch": 0.0654750464475623,
"step": 73
},
{
"loss": 398.0598,
"grad_norm": 62.7151985168457,
"learning_rate": 0.0004945759198310629,
"epoch": 0.06637196489204945,
"step": 74
},
{
"loss": 398.7396,
"grad_norm": 59.287742614746094,
"learning_rate": 0.0004944288831278106,
"epoch": 0.06726888333653662,
"step": 75
},
{
"loss": 391.3397,
"grad_norm": 59.052059173583984,
"learning_rate": 0.0004942799024901846,
"epoch": 0.06816580178102377,
"step": 76
},
{
"loss": 394.1899,
"grad_norm": 54.65058135986328,
"learning_rate": 0.0004941289791030229,
"epoch": 0.06906272022551092,
"step": 77
},
{
"loss": 393.8536,
"grad_norm": 51.59941101074219,
"learning_rate": 0.0004939761141666139,
"epoch": 0.06995963866999808,
"step": 78
},
{
"loss": 396.7059,
"grad_norm": 55.84555435180664,
"learning_rate": 0.0004938213088966872,
"epoch": 0.07085655711448523,
"step": 79
},
{
"loss": 392.0196,
"grad_norm": 55.808250427246094,
"learning_rate": 0.0004936645645244033,
"epoch": 0.0717534755589724,
"step": 80
},
{
"loss": 395.5785,
"grad_norm": 53.83452224731445,
"learning_rate": 0.0004935058822963453,
"epoch": 0.07265039400345955,
"step": 81
},
{
"loss": 398.3966,
"grad_norm": 61.950626373291016,
"learning_rate": 0.000493345263474507,
"epoch": 0.0735473124479467,
"step": 82
},
{
"loss": 399.4866,
"grad_norm": 65.6949462890625,
"learning_rate": 0.0004931827093362844,
"epoch": 0.07444423089243385,
"step": 83
},
{
"loss": 393.8017,
"grad_norm": 54.928836822509766,
"learning_rate": 0.0004930182211744649,
"epoch": 0.075341149336921,
"step": 84
},
{
"loss": 398.1347,
"grad_norm": 59.81849670410156,
"learning_rate": 0.0004928518002972172,
"epoch": 0.07623806778140817,
"step": 85
},
{
"loss": 392.8837,
"grad_norm": 57.970462799072266,
"learning_rate": 0.0004926834480280805,
"epoch": 0.07713498622589532,
"step": 86
},
{
"loss": 394.3792,
"grad_norm": 57.43026351928711,
"learning_rate": 0.0004925131657059547,
"epoch": 0.07803190467038247,
"step": 87
},
{
"loss": 395.7612,
"grad_norm": 57.73651123046875,
"learning_rate": 0.0004923409546850891,
"epoch": 0.07892882311486962,
"step": 88
},
{
"loss": 396.5627,
"grad_norm": 58.27775573730469,
"learning_rate": 0.000492166816335072,
"epoch": 0.07982574155935677,
"step": 89
},
{
"loss": 398.5615,
"grad_norm": 53.49543762207031,
"learning_rate": 0.0004919907520408196,
"epoch": 0.08072266000384394,
"step": 90
},
{
"loss": 398.6497,
"grad_norm": 57.175514221191406,
"learning_rate": 0.000491812763202565,
"epoch": 0.08161957844833109,
"step": 91
},
{
"loss": 392.5616,
"grad_norm": 58.206119537353516,
"learning_rate": 0.0004916328512358472,
"epoch": 0.08251649689281824,
"step": 92
},
{
"loss": 390.17,
"grad_norm": 56.978179931640625,
"learning_rate": 0.0004914510175714999,
"epoch": 0.0834134153373054,
"step": 93
},
{
"loss": 391.477,
"grad_norm": 59.842369079589844,
"learning_rate": 0.0004912672636556397,
"epoch": 0.08431033378179256,
"step": 94
},
{
"loss": 394.4383,
"grad_norm": 52.20112609863281,
"learning_rate": 0.0004910815909496555,
"epoch": 0.08520725222627971,
"step": 95
},
{
"loss": 390.8443,
"grad_norm": 61.12334060668945,
"learning_rate": 0.0004908940009301954,
"epoch": 0.08610417067076687,
"step": 96
},
{
"loss": 395.9276,
"grad_norm": 55.49872589111328,
"learning_rate": 0.0004907044950891565,
"epoch": 0.08700108911525402,
"step": 97
},
{
"loss": 394.7866,
"grad_norm": 59.71890640258789,
"learning_rate": 0.000490513074933672,
"epoch": 0.08789800755974117,
"step": 98
},
{
"loss": 388.5464,
"grad_norm": 55.72919845581055,
"learning_rate": 0.0004903197419860999,
"epoch": 0.08879492600422834,
"step": 99
},
{
"loss": 392.9969,
"grad_norm": 61.6799430847168,
"learning_rate": 0.0004901244977840103,
"epoch": 0.08969184444871549,
"step": 100
},
{
"eval_loss": 1.7485355138778687,
"eval_runtime": 49.5113,
"eval_samples_per_second": 41.364,
"eval_steps_per_second": 2.585,
"epoch": 0.08969184444871549,
"step": 100
},
{
"loss": 393.0805,
"grad_norm": 58.71113204956055,
"learning_rate": 0.0004899273438801734,
"epoch": 0.09058876289320264,
"step": 101
},
{
"loss": 391.5116,
"grad_norm": 54.11758804321289,
"learning_rate": 0.0004897282818425474,
"epoch": 0.09148568133768979,
"step": 102
},
{
"loss": 394.4952,
"grad_norm": 53.54176712036133,
"learning_rate": 0.0004895273132542658,
"epoch": 0.09238259978217694,
"step": 103
},
{
"loss": 392.5484,
"grad_norm": 51.26163101196289,
"learning_rate": 0.0004893244397136246,
"epoch": 0.09327951822666411,
"step": 104
},
{
"loss": 392.7574,
"grad_norm": 57.158973693847656,
"learning_rate": 0.0004891196628340703,
"epoch": 0.09417643667115126,
"step": 105
},
{
"loss": 392.1094,
"grad_norm": 51.87057113647461,
"learning_rate": 0.0004889129842441859,
"epoch": 0.09507335511563841,
"step": 106
},
{
"loss": 391.9873,
"grad_norm": 62.71110534667969,
"learning_rate": 0.0004887044055876793,
"epoch": 0.09597027356012557,
"step": 107
},
{
"loss": 393.0227,
"grad_norm": 61.41956329345703,
"learning_rate": 0.0004884939285233691,
"epoch": 0.09686719200461272,
"step": 108
},
{
"loss": 389.2371,
"grad_norm": 59.030765533447266,
"learning_rate": 0.0004882815547251721,
"epoch": 0.09776411044909988,
"step": 109
},
{
"loss": 394.932,
"grad_norm": 60.926448822021484,
"learning_rate": 0.00048806728588208966,
"epoch": 0.09866102889358704,
"step": 110
},
{
"loss": 389.2965,
"grad_norm": 59.546268463134766,
"learning_rate": 0.0004878511236981945,
"epoch": 0.09955794733807419,
"step": 111
},
{
"loss": 389.0897,
"grad_norm": 56.25603103637695,
"learning_rate": 0.0004876330698926169,
"epoch": 0.10045486578256134,
"step": 112
},
{
"loss": 391.7546,
"grad_norm": 63.1163444519043,
"learning_rate": 0.00048741312619953104,
"epoch": 0.10135178422704849,
"step": 113
},
{
"loss": 392.0137,
"grad_norm": 70.23162078857422,
"learning_rate": 0.00048719129436814156,
"epoch": 0.10224870267153566,
"step": 114
},
{
"loss": 390.5738,
"grad_norm": 60.9749755859375,
"learning_rate": 0.00048696757616266927,
"epoch": 0.10314562111602281,
"step": 115
},
{
"loss": 387.7592,
"grad_norm": 60.2146110534668,
"learning_rate": 0.0004867419733623372,
"epoch": 0.10404253956050996,
"step": 116
},
{
"loss": 390.6403,
"grad_norm": 59.26010513305664,
"learning_rate": 0.00048651448776135654,
"epoch": 0.10493945800499711,
"step": 117
},
{
"loss": 391.4545,
"grad_norm": 55.02613067626953,
"learning_rate": 0.00048628512116891234,
"epoch": 0.10583637644948427,
"step": 118
},
{
"loss": 388.2937,
"grad_norm": 56.28743362426758,
"learning_rate": 0.00048605387540914916,
"epoch": 0.10673329489397143,
"step": 119
},
{
"loss": 389.2755,
"grad_norm": 55.22878646850586,
"learning_rate": 0.0004858207523211563,
"epoch": 0.10763021333845858,
"step": 120
},
{
"loss": 392.9062,
"grad_norm": 55.45512771606445,
"learning_rate": 0.00048558575375895377,
"epoch": 0.10852713178294573,
"step": 121
},
{
"loss": 388.4548,
"grad_norm": 58.8115119934082,
"learning_rate": 0.0004853488815914767,
"epoch": 0.10942405022743289,
"step": 122
},
{
"loss": 390.1011,
"grad_norm": 55.49444580078125,
"learning_rate": 0.00048511013770256134,
"epoch": 0.11032096867192005,
"step": 123
},
{
"loss": 388.7439,
"grad_norm": 54.36104202270508,
"learning_rate": 0.00048486952399092945,
"epoch": 0.1112178871164072,
"step": 124
},
{
"loss": 391.1307,
"grad_norm": 52.75822067260742,
"learning_rate": 0.0004846270423701734,
"epoch": 0.11211480556089436,
"step": 125
},
{
"loss": 388.8095,
"grad_norm": 55.67084884643555,
"learning_rate": 0.0004843826947687411,
"epoch": 0.11301172400538151,
"step": 126
},
{
"loss": 388.7104,
"grad_norm": 58.483211517333984,
"learning_rate": 0.0004841364831299206,
"epoch": 0.11390864244986866,
"step": 127
},
{
"loss": 392.5351,
"grad_norm": 54.69878387451172,
"learning_rate": 0.00048388840941182435,
"epoch": 0.11480556089435583,
"step": 128
},
{
"loss": 389.9329,
"grad_norm": 56.85935974121094,
"learning_rate": 0.00048363847558737395,
"epoch": 0.11570247933884298,
"step": 129
},
{
"loss": 389.8976,
"grad_norm": 55.818260192871094,
"learning_rate": 0.0004833866836442844,
"epoch": 0.11659939778333013,
"step": 130
},
{
"loss": 389.0714,
"grad_norm": 69.33192443847656,
"learning_rate": 0.0004831330355850483,
"epoch": 0.11749631622781728,
"step": 131
},
{
"loss": 387.675,
"grad_norm": 59.69966506958008,
"learning_rate": 0.0004828775334269198,
"epoch": 0.11839323467230443,
"step": 132
},
{
"loss": 389.1474,
"grad_norm": 63.28241729736328,
"learning_rate": 0.0004826201792018986,
"epoch": 0.1192901531167916,
"step": 133
},
{
"loss": 386.0185,
"grad_norm": 60.13338851928711,
"learning_rate": 0.0004823609749567138,
"epoch": 0.12018707156127875,
"step": 134
},
{
"loss": 393.0312,
"grad_norm": 50.345890045166016,
"learning_rate": 0.0004820999227528079,
"epoch": 0.1210839900057659,
"step": 135
},
{
"loss": 388.9017,
"grad_norm": 54.398582458496094,
"learning_rate": 0.00048183702466631986,
"epoch": 0.12198090845025306,
"step": 136
},
{
"loss": 390.3952,
"grad_norm": 58.791343688964844,
"learning_rate": 0.0004815722827880689,
"epoch": 0.12287782689474021,
"step": 137
},
{
"loss": 391.5972,
"grad_norm": 56.27891540527344,
"learning_rate": 0.000481305699223538,
"epoch": 0.12377474533922737,
"step": 138
},
{
"loss": 390.4619,
"grad_norm": 57.29872512817383,
"learning_rate": 0.000481037276092857,
"epoch": 0.12467166378371453,
"step": 139
},
{
"loss": 386.5269,
"grad_norm": 56.40953826904297,
"learning_rate": 0.0004807670155307856,
"epoch": 0.1255685822282017,
"step": 140
},
{
"loss": 386.9588,
"grad_norm": 56.36626434326172,
"learning_rate": 0.0004804949196866967,
"epoch": 0.12646550067268883,
"step": 141
},
{
"loss": 390.6064,
"grad_norm": 59.941890716552734,
"learning_rate": 0.00048022099072455893,
"epoch": 0.127362419117176,
"step": 142
},
{
"loss": 389.5639,
"grad_norm": 55.42548370361328,
"learning_rate": 0.0004799452308229199,
"epoch": 0.12825933756166313,
"step": 143
},
{
"loss": 389.1144,
"grad_norm": 59.46462631225586,
"learning_rate": 0.0004796676421748883,
"epoch": 0.1291562560061503,
"step": 144
},
{
"loss": 387.238,
"grad_norm": 61.307960510253906,
"learning_rate": 0.0004793882269881172,
"epoch": 0.13005317445063747,
"step": 145
},
{
"loss": 385.9282,
"grad_norm": 53.019859313964844,
"learning_rate": 0.00047910698748478565,
"epoch": 0.1309500928951246,
"step": 146
},
{
"loss": 388.6133,
"grad_norm": 59.57033920288086,
"learning_rate": 0.00047882392590158166,
"epoch": 0.13184701133961177,
"step": 147
},
{
"loss": 385.2765,
"grad_norm": 55.921993255615234,
"learning_rate": 0.000478539044489684,
"epoch": 0.1327439297840989,
"step": 148
},
{
"loss": 387.315,
"grad_norm": 53.27146911621094,
"learning_rate": 0.0004782523455147448,
"epoch": 0.13364084822858607,
"step": 149
},
{
"loss": 384.9127,
"grad_norm": 61.21531295776367,
"learning_rate": 0.0004779638312568708,
"epoch": 0.13453776667307324,
"step": 150
},
{
"eval_loss": 1.7258449792861938,
"eval_runtime": 36.7008,
"eval_samples_per_second": 55.803,
"eval_steps_per_second": 3.488,
"epoch": 0.13453776667307324,
"step": 150
},
{
"loss": 385.8539,
"grad_norm": 60.04133605957031,
"learning_rate": 0.00047767350401060606,
"epoch": 0.13543468511756038,
"step": 151
},
{
"loss": 384.8003,
"grad_norm": 59.11763000488281,
"learning_rate": 0.0004773813660849128,
"epoch": 0.13633160356204754,
"step": 152
},
{
"loss": 387.7485,
"grad_norm": 56.51465606689453,
"learning_rate": 0.0004770874198031538,
"epoch": 0.13722852200653468,
"step": 153
},
{
"loss": 383.2278,
"grad_norm": 56.18191146850586,
"learning_rate": 0.0004767916675030736,
"epoch": 0.13812544045102185,
"step": 154
},
{
"loss": 383.6736,
"grad_norm": 57.308799743652344,
"learning_rate": 0.00047649411153678,
"epoch": 0.139022358895509,
"step": 155
},
{
"loss": 383.3135,
"grad_norm": 56.1787109375,
"learning_rate": 0.0004761947542707251,
"epoch": 0.13991927733999615,
"step": 156
},
{
"loss": 380.7021,
"grad_norm": 59.29663848876953,
"learning_rate": 0.0004758935980856868,
"epoch": 0.14081619578448332,
"step": 157
},
{
"loss": 388.3537,
"grad_norm": 56.997901916503906,
"learning_rate": 0.00047559064537674973,
"epoch": 0.14171311422897045,
"step": 158
},
{
"loss": 382.6107,
"grad_norm": 54.997398376464844,
"learning_rate": 0.0004752858985532862,
"epoch": 0.14261003267345762,
"step": 159
},
{
"loss": 390.4788,
"grad_norm": 61.30497360229492,
"learning_rate": 0.00047497936003893713,
"epoch": 0.1435069511179448,
"step": 160
},
{
"loss": 383.9597,
"grad_norm": 56.59492492675781,
"learning_rate": 0.0004746710322715926,
"epoch": 0.14440386956243192,
"step": 161
},
{
"loss": 392.4949,
"grad_norm": 63.977073669433594,
"learning_rate": 0.0004743609177033725,
"epoch": 0.1453007880069191,
"step": 162
},
{
"loss": 385.7721,
"grad_norm": 63.132537841796875,
"learning_rate": 0.0004740490188006072,
"epoch": 0.14619770645140623,
"step": 163
},
{
"loss": 385.057,
"grad_norm": 61.54987716674805,
"learning_rate": 0.0004737353380438178,
"epoch": 0.1470946248958934,
"step": 164
},
{
"loss": 384.8288,
"grad_norm": 64.65653228759766,
"learning_rate": 0.00047341987792769635,
"epoch": 0.14799154334038056,
"step": 165
},
{
"loss": 385.061,
"grad_norm": 52.979087829589844,
"learning_rate": 0.0004731026409610863,
"epoch": 0.1488884617848677,
"step": 166
},
{
"loss": 385.9828,
"grad_norm": 66.97553253173828,
"learning_rate": 0.00047278362966696197,
"epoch": 0.14978538022935486,
"step": 167
},
{
"loss": 381.6645,
"grad_norm": 49.72977066040039,
"learning_rate": 0.00047246284658240925,
"epoch": 0.150682298673842,
"step": 168
},
{
"loss": 387.0713,
"grad_norm": 59.0352668762207,
"learning_rate": 0.0004721402942586046,
"epoch": 0.15157921711832917,
"step": 169
},
{
"loss": 388.6861,
"grad_norm": 56.49056625366211,
"learning_rate": 0.0004718159752607955,
"epoch": 0.15247613556281633,
"step": 170
},
{
"loss": 386.6622,
"grad_norm": 61.9783935546875,
"learning_rate": 0.00047148989216827964,
"epoch": 0.15337305400730347,
"step": 171
},
{
"loss": 385.3264,
"grad_norm": 60.84406280517578,
"learning_rate": 0.0004711620475743844,
"epoch": 0.15426997245179064,
"step": 172
},
{
"loss": 383.2025,
"grad_norm": 55.59370803833008,
"learning_rate": 0.00047083244408644646,
"epoch": 0.15516689089627778,
"step": 173
},
{
"loss": 383.7802,
"grad_norm": 59.102760314941406,
"learning_rate": 0.0004705010843257908,
"epoch": 0.15606380934076494,
"step": 174
},
{
"loss": 387.181,
"grad_norm": 63.97918701171875,
"learning_rate": 0.00047016797092771004,
"epoch": 0.1569607277852521,
"step": 175
},
{
"loss": 382.4706,
"grad_norm": 58.40498733520508,
"learning_rate": 0.0004698331065414434,
"epoch": 0.15785764622973925,
"step": 176
},
{
"loss": 374.7974,
"grad_norm": 57.276405334472656,
"learning_rate": 0.0004694964938301556,
"epoch": 0.1587545646742264,
"step": 177
},
{
"loss": 383.6686,
"grad_norm": 65.17239379882812,
"learning_rate": 0.0004691581354709159,
"epoch": 0.15965148311871355,
"step": 178
},
{
"loss": 382.2492,
"grad_norm": 54.67914962768555,
"learning_rate": 0.0004688180341546765,
"epoch": 0.16054840156320072,
"step": 179
},
{
"loss": 379.0845,
"grad_norm": 61.17100524902344,
"learning_rate": 0.0004684761925862512,
"epoch": 0.16144532000768788,
"step": 180
},
{
"loss": 380.5147,
"grad_norm": 53.48952102661133,
"learning_rate": 0.00046813261348429403,
"epoch": 0.16234223845217502,
"step": 181
},
{
"loss": 388.3456,
"grad_norm": 62.524898529052734,
"learning_rate": 0.0004677872995812778,
"epoch": 0.16323915689666219,
"step": 182
},
{
"loss": 384.9105,
"grad_norm": 55.23896026611328,
"learning_rate": 0.00046744025362347174,
"epoch": 0.16413607534114932,
"step": 183
},
{
"loss": 388.0769,
"grad_norm": 58.2794075012207,
"learning_rate": 0.0004670914783709203,
"epoch": 0.1650329937856365,
"step": 184
},
{
"loss": 375.4843,
"grad_norm": 57.62440872192383,
"learning_rate": 0.00046674097659742087,
"epoch": 0.16592991223012366,
"step": 185
},
{
"loss": 388.4005,
"grad_norm": 54.49860763549805,
"learning_rate": 0.00046638875109050184,
"epoch": 0.1668268306746108,
"step": 186
},
{
"loss": 379.2246,
"grad_norm": 56.57727813720703,
"learning_rate": 0.00046603480465140035,
"epoch": 0.16772374911909796,
"step": 187
},
{
"loss": 390.5371,
"grad_norm": 53.35488510131836,
"learning_rate": 0.0004656791400950401,
"epoch": 0.16862066756358512,
"step": 188
},
{
"loss": 376.5087,
"grad_norm": 57.38853454589844,
"learning_rate": 0.0004653217602500088,
"epoch": 0.16951758600807226,
"step": 189
},
{
"loss": 383.3448,
"grad_norm": 53.162269592285156,
"learning_rate": 0.00046496266795853606,
"epoch": 0.17041450445255943,
"step": 190
},
{
"loss": 385.954,
"grad_norm": 56.76969528198242,
"learning_rate": 0.0004646018660764701,
"epoch": 0.17131142289704657,
"step": 191
},
{
"loss": 380.8749,
"grad_norm": 55.99345016479492,
"learning_rate": 0.0004642393574732559,
"epoch": 0.17220834134153373,
"step": 192
},
{
"loss": 379.5312,
"grad_norm": 49.73320770263672,
"learning_rate": 0.0004638751450319116,
"epoch": 0.1731052597860209,
"step": 193
},
{
"loss": 385.7988,
"grad_norm": 56.80336380004883,
"learning_rate": 0.00046350923164900604,
"epoch": 0.17400217823050804,
"step": 194
},
{
"loss": 380.8796,
"grad_norm": 57.32421875,
"learning_rate": 0.0004631416202346357,
"epoch": 0.1748990966749952,
"step": 195
},
{
"loss": 382.128,
"grad_norm": 62.81551742553711,
"learning_rate": 0.00046277231371240113,
"epoch": 0.17579601511948234,
"step": 196
},
{
"loss": 383.9042,
"grad_norm": 60.5498046875,
"learning_rate": 0.00046240131501938436,
"epoch": 0.1766929335639695,
"step": 197
},
{
"loss": 380.0457,
"grad_norm": 54.78828811645508,
"learning_rate": 0.000462028627106125,
"epoch": 0.17758985200845667,
"step": 198
},
{
"loss": 383.6067,
"grad_norm": 60.62177276611328,
"learning_rate": 0.00046165425293659694,
"epoch": 0.1784867704529438,
"step": 199
},
{
"loss": 385.004,
"grad_norm": 53.65549850463867,
"learning_rate": 0.00046127819548818507,
"epoch": 0.17938368889743098,
"step": 200
},
{
"eval_loss": 1.6973483562469482,
"eval_runtime": 57.4311,
"eval_samples_per_second": 35.66,
"eval_steps_per_second": 2.229,
"epoch": 0.17938368889743098,
"step": 200
},
{
"loss": 381.3797,
"grad_norm": 60.24985885620117,
"learning_rate": 0.0004609004577516609,
"epoch": 0.18028060734191811,
"step": 201
},
{
"loss": 384.8868,
"grad_norm": 55.66313552856445,
"learning_rate": 0.00046052104273115957,
"epoch": 0.18117752578640528,
"step": 202
},
{
"loss": 381.8181,
"grad_norm": 58.7210807800293,
"learning_rate": 0.0004601399534441556,
"epoch": 0.18207444423089245,
"step": 203
},
{
"loss": 381.6777,
"grad_norm": 51.48910903930664,
"learning_rate": 0.0004597571929214386,
"epoch": 0.18297136267537958,
"step": 204
},
{
"loss": 389.5296,
"grad_norm": 55.63520050048828,
"learning_rate": 0.00045937276420708985,
"epoch": 0.18386828111986675,
"step": 205
},
{
"loss": 379.7319,
"grad_norm": 56.91200637817383,
"learning_rate": 0.00045898667035845726,
"epoch": 0.1847651995643539,
"step": 206
},
{
"loss": 383.4648,
"grad_norm": 60.174800872802734,
"learning_rate": 0.0004585989144461319,
"epoch": 0.18566211800884105,
"step": 207
},
{
"loss": 381.6614,
"grad_norm": 46.41486740112305,
"learning_rate": 0.00045820949955392286,
"epoch": 0.18655903645332822,
"step": 208
},
{
"loss": 388.843,
"grad_norm": 66.20514678955078,
"learning_rate": 0.0004578184287788333,
"epoch": 0.18745595489781536,
"step": 209
},
{
"loss": 382.3195,
"grad_norm": 52.08879470825195,
"learning_rate": 0.0004574257052310355,
"epoch": 0.18835287334230252,
"step": 210
},
{
"loss": 376.9011,
"grad_norm": 59.04060363769531,
"learning_rate": 0.00045703133203384594,
"epoch": 0.18924979178678966,
"step": 211
},
{
"loss": 382.9858,
"grad_norm": 57.139583587646484,
"learning_rate": 0.000456635312323701,
"epoch": 0.19014671023127683,
"step": 212
},
{
"loss": 386.4098,
"grad_norm": 56.69694137573242,
"learning_rate": 0.00045623764925013154,
"epoch": 0.191043628675764,
"step": 213
},
{
"loss": 381.0145,
"grad_norm": 54.969146728515625,
"learning_rate": 0.00045583834597573826,
"epoch": 0.19194054712025113,
"step": 214
},
{
"loss": 386.2006,
"grad_norm": 55.187095642089844,
"learning_rate": 0.000455437405676166,
"epoch": 0.1928374655647383,
"step": 215
},
{
"loss": 385.4291,
"grad_norm": 56.27381896972656,
"learning_rate": 0.000455034831540079,
"epoch": 0.19373438400922544,
"step": 216
},
{
"loss": 382.2878,
"grad_norm": 55.81896209716797,
"learning_rate": 0.00045463062676913527,
"epoch": 0.1946313024537126,
"step": 217
},
{
"loss": 381.0126,
"grad_norm": 60.54517364501953,
"learning_rate": 0.0004542247945779613,
"epoch": 0.19552822089819977,
"step": 218
},
{
"loss": 382.4228,
"grad_norm": 51.44652557373047,
"learning_rate": 0.0004538173381941264,
"epoch": 0.1964251393426869,
"step": 219
},
{
"loss": 374.3478,
"grad_norm": 57.77920150756836,
"learning_rate": 0.0004534082608581168,
"epoch": 0.19732205778717407,
"step": 220
},
{
"loss": 379.4279,
"grad_norm": 52.3509635925293,
"learning_rate": 0.0004529975658233104,
"epoch": 0.1982189762316612,
"step": 221
},
{
"loss": 380.0542,
"grad_norm": 53.75742721557617,
"learning_rate": 0.0004525852563559505,
"epoch": 0.19911589467614838,
"step": 222
},
{
"loss": 387.0319,
"grad_norm": 59.18511199951172,
"learning_rate": 0.0004521713357351198,
"epoch": 0.20001281312063554,
"step": 223
},
{
"loss": 375.638,
"grad_norm": 53.67622375488281,
"learning_rate": 0.00045175580725271457,
"epoch": 0.20090973156512268,
"step": 224
},
{
"loss": 383.951,
"grad_norm": 67.28981018066406,
"learning_rate": 0.00045133867421341835,
"epoch": 0.20180665000960984,
"step": 225
},
{
"loss": 380.0722,
"grad_norm": 62.926700592041016,
"learning_rate": 0.00045091993993467554,
"epoch": 0.20270356845409698,
"step": 226
},
{
"loss": 377.9981,
"grad_norm": 53.50834274291992,
"learning_rate": 0.0004504996077466654,
"epoch": 0.20360048689858415,
"step": 227
},
{
"loss": 380.4308,
"grad_norm": 61.55268096923828,
"learning_rate": 0.0004500776809922751,
"epoch": 0.20449740534307131,
"step": 228
},
{
"loss": 375.9146,
"grad_norm": 55.11613845825195,
"learning_rate": 0.0004496541630270733,
"epoch": 0.20539432378755845,
"step": 229
},
{
"loss": 381.8729,
"grad_norm": 61.67683410644531,
"learning_rate": 0.00044922905721928366,
"epoch": 0.20629124223204562,
"step": 230
},
{
"loss": 377.6188,
"grad_norm": 55.07930374145508,
"learning_rate": 0.00044880236694975773,
"epoch": 0.20718816067653276,
"step": 231
},
{
"loss": 383.7285,
"grad_norm": 56.17093276977539,
"learning_rate": 0.0004483740956119485,
"epoch": 0.20808507912101992,
"step": 232
},
{
"loss": 379.3219,
"grad_norm": 57.20262908935547,
"learning_rate": 0.0004479442466118828,
"epoch": 0.2089819975655071,
"step": 233
},
{
"loss": 378.996,
"grad_norm": 52.91606521606445,
"learning_rate": 0.0004475128233681349,
"epoch": 0.20987891600999423,
"step": 234
},
{
"loss": 376.5712,
"grad_norm": 53.59124755859375,
"learning_rate": 0.00044707982931179856,
"epoch": 0.2107758344544814,
"step": 235
},
{
"loss": 385.7614,
"grad_norm": 57.6840705871582,
"learning_rate": 0.00044664526788646064,
"epoch": 0.21167275289896853,
"step": 236
},
{
"loss": 381.0049,
"grad_norm": 54.7835578918457,
"learning_rate": 0.0004462091425481728,
"epoch": 0.2125696713434557,
"step": 237
},
{
"loss": 380.4299,
"grad_norm": 56.61455535888672,
"learning_rate": 0.0004457714567654247,
"epoch": 0.21346658978794286,
"step": 238
},
{
"loss": 377.3007,
"grad_norm": 54.04520797729492,
"learning_rate": 0.0004453322140191162,
"epoch": 0.21436350823243,
"step": 239
},
{
"loss": 376.2494,
"grad_norm": 61.18534469604492,
"learning_rate": 0.0004448914178025293,
"epoch": 0.21526042667691717,
"step": 240
},
{
"loss": 379.0678,
"grad_norm": 58.791934967041016,
"learning_rate": 0.000444449071621301,
"epoch": 0.21615734512140433,
"step": 241
},
{
"loss": 383.8186,
"grad_norm": 54.751407623291016,
"learning_rate": 0.0004440051789933951,
"epoch": 0.21705426356589147,
"step": 242
},
{
"loss": 374.9797,
"grad_norm": 54.97734451293945,
"learning_rate": 0.0004435597434490741,
"epoch": 0.21795118201037864,
"step": 243
},
{
"loss": 381.2922,
"grad_norm": 55.37065887451172,
"learning_rate": 0.00044311276853087144,
"epoch": 0.21884810045486577,
"step": 244
},
{
"loss": 378.8845,
"grad_norm": 58.74147033691406,
"learning_rate": 0.0004426642577935629,
"epoch": 0.21974501889935294,
"step": 245
},
{
"loss": 386.1524,
"grad_norm": 58.316097259521484,
"learning_rate": 0.0004422142148041388,
"epoch": 0.2206419373438401,
"step": 246
},
{
"loss": 378.2374,
"grad_norm": 54.42732238769531,
"learning_rate": 0.00044176264314177535,
"epoch": 0.22153885578832724,
"step": 247
},
{
"loss": 378.246,
"grad_norm": 56.714080810546875,
"learning_rate": 0.00044130954639780615,
"epoch": 0.2224357742328144,
"step": 248
},
{
"loss": 373.9691,
"grad_norm": 51.52580642700195,
"learning_rate": 0.0004408549281756937,
"epoch": 0.22333269267730155,
"step": 249
},
{
"loss": 377.4944,
"grad_norm": 61.44560241699219,
"learning_rate": 0.0004403987920910011,
"epoch": 0.2242296111217887,
"step": 250
},
{
"eval_loss": 1.6841200590133667,
"eval_runtime": 35.8648,
"eval_samples_per_second": 57.103,
"eval_steps_per_second": 3.569,
"epoch": 0.2242296111217887,
"step": 250
},
{
"loss": 372.7726,
"grad_norm": 52.64440155029297,
"learning_rate": 0.00043994114177136245,
"epoch": 0.22512652956627588,
"step": 251
},
{
"loss": 374.3314,
"grad_norm": 57.64458084106445,
"learning_rate": 0.0004394819808564549,
"epoch": 0.22602344801076302,
"step": 252
},
{
"loss": 380.1327,
"grad_norm": 48.348487854003906,
"learning_rate": 0.00043902131299796923,
"epoch": 0.22692036645525018,
"step": 253
},
{
"loss": 376.8272,
"grad_norm": 55.306766510009766,
"learning_rate": 0.00043855914185958066,
"epoch": 0.22781728489973732,
"step": 254
},
{
"loss": 373.5811,
"grad_norm": 50.16413879394531,
"learning_rate": 0.0004380954711169202,
"epoch": 0.2287142033442245,
"step": 255
},
{
"loss": 380.8544,
"grad_norm": 52.902305603027344,
"learning_rate": 0.00043763030445754516,
"epoch": 0.22961112178871165,
"step": 256
},
{
"loss": 380.7617,
"grad_norm": 55.323490142822266,
"learning_rate": 0.0004371636455809096,
"epoch": 0.2305080402331988,
"step": 257
},
{
"loss": 378.9308,
"grad_norm": 53.362361907958984,
"learning_rate": 0.00043669549819833536,
"epoch": 0.23140495867768596,
"step": 258
},
{
"loss": 378.0917,
"grad_norm": 51.511932373046875,
"learning_rate": 0.0004362258660329822,
"epoch": 0.2323018771221731,
"step": 259
},
{
"loss": 374.3557,
"grad_norm": 60.112728118896484,
"learning_rate": 0.0004357547528198184,
"epoch": 0.23319879556666026,
"step": 260
},
{
"loss": 382.0044,
"grad_norm": 52.59751510620117,
"learning_rate": 0.0004352821623055908,
"epoch": 0.23409571401114743,
"step": 261
},
{
"loss": 379.4641,
"grad_norm": 54.482444763183594,
"learning_rate": 0.0004348080982487953,
"epoch": 0.23499263245563456,
"step": 262
},
{
"loss": 376.0202,
"grad_norm": 57.2796516418457,
"learning_rate": 0.0004343325644196468,
"epoch": 0.23588955090012173,
"step": 263
},
{
"loss": 380.4021,
"grad_norm": 51.36527633666992,
"learning_rate": 0.0004338555646000492,
"epoch": 0.23678646934460887,
"step": 264
},
{
"loss": 382.1948,
"grad_norm": 54.246639251708984,
"learning_rate": 0.0004333771025835655,
"epoch": 0.23768338778909603,
"step": 265
},
{
"loss": 376.0016,
"grad_norm": 53.845367431640625,
"learning_rate": 0.0004328971821753873,
"epoch": 0.2385803062335832,
"step": 266
},
{
"loss": 378.0241,
"grad_norm": 55.82734298706055,
"learning_rate": 0.0004324158071923049,
"epoch": 0.23947722467807034,
"step": 267
},
{
"loss": 376.6841,
"grad_norm": 52.28315734863281,
"learning_rate": 0.0004319329814626768,
"epoch": 0.2403741431225575,
"step": 268
},
{
"loss": 376.4868,
"grad_norm": 59.60106658935547,
"learning_rate": 0.00043144870882639907,
"epoch": 0.24127106156704464,
"step": 269
},
{
"loss": 376.3779,
"grad_norm": 58.55453109741211,
"learning_rate": 0.0004309629931348752,
"epoch": 0.2421679800115318,
"step": 270
},
{
"loss": 379.1783,
"grad_norm": 52.10798263549805,
"learning_rate": 0.0004304758382509849,
"epoch": 0.24306489845601897,
"step": 271
},
{
"loss": 379.3161,
"grad_norm": 53.941673278808594,
"learning_rate": 0.0004299872480490542,
"epoch": 0.2439618169005061,
"step": 272
},
{
"loss": 379.5319,
"grad_norm": 53.70753860473633,
"learning_rate": 0.00042949722641482383,
"epoch": 0.24485873534499328,
"step": 273
},
{
"loss": 379.6953,
"grad_norm": 61.60326385498047,
"learning_rate": 0.0004290057772454187,
"epoch": 0.24575565378948042,
"step": 274
},
{
"loss": 379.7555,
"grad_norm": 57.09893798828125,
"learning_rate": 0.0004285129044493169,
"epoch": 0.24665257223396758,
"step": 275
},
{
"loss": 381.1754,
"grad_norm": 60.31880187988281,
"learning_rate": 0.0004280186119463186,
"epoch": 0.24754949067845475,
"step": 276
},
{
"loss": 379.8077,
"grad_norm": 57.53593826293945,
"learning_rate": 0.0004275229036675148,
"epoch": 0.24844640912294189,
"step": 277
},
{
"loss": 381.0815,
"grad_norm": 56.55409240722656,
"learning_rate": 0.00042702578355525615,
"epoch": 0.24934332756742905,
"step": 278
},
{
"loss": 378.2445,
"grad_norm": 50.37730026245117,
"learning_rate": 0.00042652725556312156,
"epoch": 0.2502402460119162,
"step": 279
},
{
"loss": 376.4951,
"grad_norm": 50.24005889892578,
"learning_rate": 0.0004260273236558867,
"epoch": 0.2511371644564034,
"step": 280
},
{
"loss": 379.3927,
"grad_norm": 52.99737548828125,
"learning_rate": 0.0004255259918094926,
"epoch": 0.2520340829008905,
"step": 281
},
{
"loss": 379.7873,
"grad_norm": 53.95462417602539,
"learning_rate": 0.00042502326401101386,
"epoch": 0.25293100134537766,
"step": 282
},
{
"loss": 370.9284,
"grad_norm": 51.21118927001953,
"learning_rate": 0.0004245191442586273,
"epoch": 0.2538279197898648,
"step": 283
},
{
"loss": 374.7379,
"grad_norm": 53.918975830078125,
"learning_rate": 0.00042401363656157954,
"epoch": 0.254724838234352,
"step": 284
},
{
"loss": 373.7905,
"grad_norm": 51.7956428527832,
"learning_rate": 0.00042350674494015566,
"epoch": 0.25562175667883913,
"step": 285
},
{
"loss": 376.9342,
"grad_norm": 51.80348205566406,
"learning_rate": 0.0004229984734256471,
"epoch": 0.25651867512332627,
"step": 286
},
{
"loss": 378.537,
"grad_norm": 53.50684356689453,
"learning_rate": 0.0004224888260603195,
"epoch": 0.25741559356781346,
"step": 287
},
{
"loss": 374.9467,
"grad_norm": 52.037200927734375,
"learning_rate": 0.0004219778068973804,
"epoch": 0.2583125120123006,
"step": 288
},
{
"loss": 382.1371,
"grad_norm": 48.98027420043945,
"learning_rate": 0.0004214654200009475,
"epoch": 0.25920943045678774,
"step": 289
},
{
"loss": 378.7361,
"grad_norm": 51.1038818359375,
"learning_rate": 0.0004209516694460157,
"epoch": 0.26010634890127493,
"step": 290
},
{
"loss": 379.9825,
"grad_norm": 53.03129577636719,
"learning_rate": 0.0004204365593184255,
"epoch": 0.26100326734576207,
"step": 291
},
{
"loss": 376.35,
"grad_norm": 54.52887725830078,
"learning_rate": 0.0004199200937148297,
"epoch": 0.2619001857902492,
"step": 292
},
{
"loss": 376.654,
"grad_norm": 51.10536575317383,
"learning_rate": 0.00041940227674266105,
"epoch": 0.26279710423473635,
"step": 293
},
{
"loss": 372.8873,
"grad_norm": 57.231117248535156,
"learning_rate": 0.0004188831125201,
"epoch": 0.26369402267922354,
"step": 294
},
{
"loss": 372.2591,
"grad_norm": 54.170921325683594,
"learning_rate": 0.0004183626051760415,
"epoch": 0.2645909411237107,
"step": 295
},
{
"loss": 376.232,
"grad_norm": 48.81595230102539,
"learning_rate": 0.0004178407588500621,
"epoch": 0.2654878595681978,
"step": 296
},
{
"loss": 377.493,
"grad_norm": 51.22395324707031,
"learning_rate": 0.00041731757769238764,
"epoch": 0.266384778012685,
"step": 297
},
{
"loss": 373.4135,
"grad_norm": 50.80076217651367,
"learning_rate": 0.00041679306586385944,
"epoch": 0.26728169645717215,
"step": 298
},
{
"loss": 373.3929,
"grad_norm": 52.78483581542969,
"learning_rate": 0.00041626722753590185,
"epoch": 0.2681786149016593,
"step": 299
},
{
"loss": 374.4973,
"grad_norm": 59.0179328918457,
"learning_rate": 0.0004157400668904887,
"epoch": 0.2690755333461465,
"step": 300
},
{
"eval_loss": 1.6736700534820557,
"eval_runtime": 48.4303,
"eval_samples_per_second": 42.288,
"eval_steps_per_second": 2.643,
"epoch": 0.2690755333461465,
"step": 300
},
{
"loss": 370.586,
"grad_norm": 51.39365005493164,
"learning_rate": 0.0004152115881201102,
"epoch": 0.2699724517906336,
"step": 301
},
{
"loss": 371.1306,
"grad_norm": 53.13943862915039,
"learning_rate": 0.0004146817954277395,
"epoch": 0.27086937023512075,
"step": 302
},
{
"loss": 375.8091,
"grad_norm": 46.9393310546875,
"learning_rate": 0.0004141506930267995,
"epoch": 0.2717662886796079,
"step": 303
},
{
"loss": 378.5063,
"grad_norm": 56.166954040527344,
"learning_rate": 0.00041361828514112884,
"epoch": 0.2726632071240951,
"step": 304
},
{
"loss": 372.5772,
"grad_norm": 52.24879455566406,
"learning_rate": 0.00041308457600494917,
"epoch": 0.2735601255685822,
"step": 305
},
{
"loss": 371.29,
"grad_norm": 53.966949462890625,
"learning_rate": 0.00041254956986283044,
"epoch": 0.27445704401306936,
"step": 306
},
{
"loss": 376.5358,
"grad_norm": 51.999046325683594,
"learning_rate": 0.0004120132709696578,
"epoch": 0.27535396245755656,
"step": 307
},
{
"loss": 377.9629,
"grad_norm": 53.83307647705078,
"learning_rate": 0.0004114756835905976,
"epoch": 0.2762508809020437,
"step": 308
},
{
"loss": 372.8809,
"grad_norm": 55.104217529296875,
"learning_rate": 0.0004109368120010636,
"epoch": 0.27714779934653083,
"step": 309
},
{
"loss": 377.9377,
"grad_norm": 51.1360969543457,
"learning_rate": 0.00041039666048668265,
"epoch": 0.278044717791018,
"step": 310
},
{
"loss": 377.1788,
"grad_norm": 50.87997817993164,
"learning_rate": 0.00040985523334326093,
"epoch": 0.27894163623550516,
"step": 311
},
{
"loss": 375.3121,
"grad_norm": 49.86625289916992,
"learning_rate": 0.00040931253487674955,
"epoch": 0.2798385546799923,
"step": 312
},
{
"loss": 373.2664,
"grad_norm": 51.52640151977539,
"learning_rate": 0.00040876856940321056,
"epoch": 0.28073547312447944,
"step": 313
},
{
"loss": 373.2856,
"grad_norm": 49.00104904174805,
"learning_rate": 0.00040822334124878236,
"epoch": 0.28163239156896663,
"step": 314
},
{
"loss": 377.6501,
"grad_norm": 52.83418655395508,
"learning_rate": 0.00040767685474964535,
"epoch": 0.28252931001345377,
"step": 315
},
{
"loss": 370.6684,
"grad_norm": 49.96600341796875,
"learning_rate": 0.00040712911425198764,
"epoch": 0.2834262284579409,
"step": 316
},
{
"loss": 376.3713,
"grad_norm": 50.470123291015625,
"learning_rate": 0.0004065801241119702,
"epoch": 0.2843231469024281,
"step": 317
},
{
"loss": 374.6679,
"grad_norm": 47.91783142089844,
"learning_rate": 0.0004060298886956926,
"epoch": 0.28522006534691524,
"step": 318
},
{
"loss": 376.8799,
"grad_norm": 52.6668586730957,
"learning_rate": 0.0004054784123791577,
"epoch": 0.2861169837914024,
"step": 319
},
{
"loss": 371.9651,
"grad_norm": 50.082279205322266,
"learning_rate": 0.00040492569954823763,
"epoch": 0.2870139022358896,
"step": 320
},
{
"loss": 373.8972,
"grad_norm": 56.001190185546875,
"learning_rate": 0.0004043717545986381,
"epoch": 0.2879108206803767,
"step": 321
},
{
"loss": 370.1523,
"grad_norm": 53.00112533569336,
"learning_rate": 0.0004038165819358639,
"epoch": 0.28880773912486385,
"step": 322
},
{
"loss": 377.1375,
"grad_norm": 52.706729888916016,
"learning_rate": 0.0004032601859751839,
"epoch": 0.28970465756935104,
"step": 323
},
{
"loss": 375.1089,
"grad_norm": 51.362571716308594,
"learning_rate": 0.00040270257114159583,
"epoch": 0.2906015760138382,
"step": 324
},
{
"loss": 370.7276,
"grad_norm": 54.43815994262695,
"learning_rate": 0.00040214374186979074,
"epoch": 0.2914984944583253,
"step": 325
},
{
"loss": 375.119,
"grad_norm": 51.00381851196289,
"learning_rate": 0.0004015837026041186,
"epoch": 0.29239541290281246,
"step": 326
},
{
"loss": 371.2367,
"grad_norm": 57.776222229003906,
"learning_rate": 0.000401022457798552,
"epoch": 0.29329233134729965,
"step": 327
},
{
"loss": 380.1667,
"grad_norm": 53.284149169921875,
"learning_rate": 0.0004004600119166513,
"epoch": 0.2941892497917868,
"step": 328
},
{
"loss": 369.6853,
"grad_norm": 56.30731964111328,
"learning_rate": 0.000399896369431529,
"epoch": 0.2950861682362739,
"step": 329
},
{
"loss": 374.0436,
"grad_norm": 54.28211975097656,
"learning_rate": 0.00039933153482581406,
"epoch": 0.2959830866807611,
"step": 330
},
{
"loss": 372.2117,
"grad_norm": 50.88725280761719,
"learning_rate": 0.00039876551259161643,
"epoch": 0.29688000512524826,
"step": 331
},
{
"loss": 374.7655,
"grad_norm": 54.17941665649414,
"learning_rate": 0.00039819830723049105,
"epoch": 0.2977769235697354,
"step": 332
},
{
"loss": 376.0198,
"grad_norm": 52.40755081176758,
"learning_rate": 0.0003976299232534024,
"epoch": 0.2986738420142226,
"step": 333
},
{
"loss": 371.5096,
"grad_norm": 50.74897384643555,
"learning_rate": 0.0003970603651806886,
"epoch": 0.29957076045870973,
"step": 334
},
{
"loss": 375.5447,
"grad_norm": 47.52690124511719,
"learning_rate": 0.00039648963754202496,
"epoch": 0.30046767890319687,
"step": 335
},
{
"loss": 376.1951,
"grad_norm": 52.93135070800781,
"learning_rate": 0.0003959177448763883,
"epoch": 0.301364597347684,
"step": 336
},
{
"loss": 371.1348,
"grad_norm": 50.335418701171875,
"learning_rate": 0.0003953446917320214,
"epoch": 0.3022615157921712,
"step": 337
},
{
"loss": 375.4595,
"grad_norm": 51.26169204711914,
"learning_rate": 0.0003947704826663955,
"epoch": 0.30315843423665834,
"step": 338
},
{
"loss": 372.898,
"grad_norm": 54.89933776855469,
"learning_rate": 0.0003941951222461756,
"epoch": 0.3040553526811455,
"step": 339
},
{
"loss": 370.8462,
"grad_norm": 54.09654235839844,
"learning_rate": 0.00039361861504718276,
"epoch": 0.30495227112563267,
"step": 340
},
{
"loss": 373.6092,
"grad_norm": 52.41168975830078,
"learning_rate": 0.0003930409656543588,
"epoch": 0.3058491895701198,
"step": 341
},
{
"loss": 374.9025,
"grad_norm": 45.53563690185547,
"learning_rate": 0.00039246217866172907,
"epoch": 0.30674610801460694,
"step": 342
},
{
"loss": 376.0628,
"grad_norm": 51.11941146850586,
"learning_rate": 0.00039188225867236643,
"epoch": 0.30764302645909414,
"step": 343
},
{
"loss": 374.4197,
"grad_norm": 50.10179901123047,
"learning_rate": 0.0003913012102983542,
"epoch": 0.3085399449035813,
"step": 344
},
{
"loss": 370.0171,
"grad_norm": 50.524696350097656,
"learning_rate": 0.00039071903816074977,
"epoch": 0.3094368633480684,
"step": 345
},
{
"loss": 371.2375,
"grad_norm": 51.18245315551758,
"learning_rate": 0.00039013574688954793,
"epoch": 0.31033378179255555,
"step": 346
},
{
"loss": 374.7748,
"grad_norm": 64.64472198486328,
"learning_rate": 0.0003895513411236438,
"epoch": 0.31123070023704275,
"step": 347
},
{
"loss": 377.3275,
"grad_norm": 56.01545715332031,
"learning_rate": 0.0003889658255107959,
"epoch": 0.3121276186815299,
"step": 348
},
{
"loss": 369.5843,
"grad_norm": 56.439754486083984,
"learning_rate": 0.0003883792047075896,
"epoch": 0.313024537126017,
"step": 349
},
{
"loss": 368.456,
"grad_norm": 58.23375701904297,
"learning_rate": 0.0003877914833793996,
"epoch": 0.3139214555705042,
"step": 350
},
{
"eval_loss": 1.661989450454712,
"eval_runtime": 36.2255,
"eval_samples_per_second": 56.535,
"eval_steps_per_second": 3.533,
"epoch": 0.3139214555705042,
"step": 350
},
{
"loss": 374.9042,
"grad_norm": 52.63510513305664,
"learning_rate": 0.00038720266620035314,
"epoch": 0.31481837401499135,
"step": 351
},
{
"loss": 367.9091,
"grad_norm": 55.49558639526367,
"learning_rate": 0.0003866127578532927,
"epoch": 0.3157152924594785,
"step": 352
},
{
"loss": 374.5601,
"grad_norm": 52.941497802734375,
"learning_rate": 0.0003860217630297387,
"epoch": 0.3166122109039657,
"step": 353
},
{
"loss": 371.4058,
"grad_norm": 44.237648010253906,
"learning_rate": 0.0003854296864298523,
"epoch": 0.3175091293484528,
"step": 354
},
{
"loss": 376.094,
"grad_norm": 52.86402893066406,
"learning_rate": 0.00038483653276239816,
"epoch": 0.31840604779293996,
"step": 355
},
{
"loss": 374.3872,
"grad_norm": 49.61796569824219,
"learning_rate": 0.0003842423067447066,
"epoch": 0.3193029662374271,
"step": 356
},
{
"loss": 371.5387,
"grad_norm": 49.825504302978516,
"learning_rate": 0.0003836470131026365,
"epoch": 0.3201998846819143,
"step": 357
},
{
"loss": 371.4422,
"grad_norm": 53.598228454589844,
"learning_rate": 0.0003830506565705372,
"epoch": 0.32109680312640143,
"step": 358
},
{
"loss": 371.03,
"grad_norm": 48.73537063598633,
"learning_rate": 0.00038245324189121153,
"epoch": 0.32199372157088857,
"step": 359
},
{
"loss": 377.8967,
"grad_norm": 48.377281188964844,
"learning_rate": 0.00038185477381587763,
"epoch": 0.32289064001537576,
"step": 360
},
{
"loss": 374.9411,
"grad_norm": 53.932228088378906,
"learning_rate": 0.0003812552571041311,
"epoch": 0.3237875584598629,
"step": 361
},
{
"loss": 374.6432,
"grad_norm": 52.54889678955078,
"learning_rate": 0.00038065469652390736,
"epoch": 0.32468447690435004,
"step": 362
},
{
"loss": 371.9634,
"grad_norm": 53.84141159057617,
"learning_rate": 0.000380053096851444,
"epoch": 0.32558139534883723,
"step": 363
},
{
"loss": 371.487,
"grad_norm": 49.041019439697266,
"learning_rate": 0.00037945046287124197,
"epoch": 0.32647831379332437,
"step": 364
},
{
"loss": 370.3628,
"grad_norm": 51.356388092041016,
"learning_rate": 0.00037884679937602827,
"epoch": 0.3273752322378115,
"step": 365
},
{
"loss": 371.4878,
"grad_norm": 49.55571746826172,
"learning_rate": 0.0003782421111667178,
"epoch": 0.32827215068229865,
"step": 366
},
{
"loss": 373.209,
"grad_norm": 51.30101013183594,
"learning_rate": 0.00037763640305237456,
"epoch": 0.32916906912678584,
"step": 367
},
{
"loss": 369.0127,
"grad_norm": 51.14597702026367,
"learning_rate": 0.000377029679850174,
"epoch": 0.330065987571273,
"step": 368
},
{
"loss": 374.4203,
"grad_norm": 51.925132751464844,
"learning_rate": 0.00037642194638536487,
"epoch": 0.3309629060157601,
"step": 369
},
{
"loss": 370.4622,
"grad_norm": 53.620052337646484,
"learning_rate": 0.00037581320749123,
"epoch": 0.3318598244602473,
"step": 370
},
{
"loss": 369.0265,
"grad_norm": 47.18992233276367,
"learning_rate": 0.0003752034680090485,
"epoch": 0.33275674290473445,
"step": 371
},
{
"loss": 372.8077,
"grad_norm": 56.7562141418457,
"learning_rate": 0.0003745927327880574,
"epoch": 0.3336536613492216,
"step": 372
},
{
"loss": 368.2184,
"grad_norm": 56.05765914916992,
"learning_rate": 0.00037398100668541227,
"epoch": 0.3345505797937088,
"step": 373
},
{
"loss": 376.1522,
"grad_norm": 50.888771057128906,
"learning_rate": 0.00037336829456614975,
"epoch": 0.3354474982381959,
"step": 374
},
{
"loss": 371.1161,
"grad_norm": 49.758975982666016,
"learning_rate": 0.0003727546013031478,
"epoch": 0.33634441668268306,
"step": 375
},
{
"loss": 371.6988,
"grad_norm": 53.891990661621094,
"learning_rate": 0.00037213993177708746,
"epoch": 0.33724133512717025,
"step": 376
},
{
"loss": 370.6019,
"grad_norm": 50.557762145996094,
"learning_rate": 0.000371524290876414,
"epoch": 0.3381382535716574,
"step": 377
},
{
"loss": 373.2912,
"grad_norm": 51.6466064453125,
"learning_rate": 0.00037090768349729833,
"epoch": 0.3390351720161445,
"step": 378
},
{
"loss": 372.9784,
"grad_norm": 48.213077545166016,
"learning_rate": 0.00037029011454359695,
"epoch": 0.33993209046063166,
"step": 379
},
{
"loss": 368.0577,
"grad_norm": 49.39459991455078,
"learning_rate": 0.0003696715889268145,
"epoch": 0.34082900890511886,
"step": 380
},
{
"loss": 371.9662,
"grad_norm": 49.54859924316406,
"learning_rate": 0.00036905211156606344,
"epoch": 0.341725927349606,
"step": 381
},
{
"loss": 376.1466,
"grad_norm": 54.29618835449219,
"learning_rate": 0.00036843168738802574,
"epoch": 0.34262284579409313,
"step": 382
},
{
"loss": 372.8206,
"grad_norm": 47.55562210083008,
"learning_rate": 0.00036781032132691304,
"epoch": 0.3435197642385803,
"step": 383
},
{
"loss": 370.9735,
"grad_norm": 49.289615631103516,
"learning_rate": 0.00036718801832442814,
"epoch": 0.34441668268306747,
"step": 384
},
{
"loss": 370.5686,
"grad_norm": 50.339176177978516,
"learning_rate": 0.000366564783329725,
"epoch": 0.3453136011275546,
"step": 385
},
{
"loss": 371.3257,
"grad_norm": 49.51339340209961,
"learning_rate": 0.00036594062129936974,
"epoch": 0.3462105195720418,
"step": 386
},
{
"loss": 366.3475,
"grad_norm": 48.21767044067383,
"learning_rate": 0.0003653155371973012,
"epoch": 0.34710743801652894,
"step": 387
},
{
"loss": 369.8744,
"grad_norm": 52.45291519165039,
"learning_rate": 0.0003646895359947915,
"epoch": 0.3480043564610161,
"step": 388
},
{
"loss": 372.5318,
"grad_norm": 49.45993423461914,
"learning_rate": 0.00036406262267040624,
"epoch": 0.3489012749055032,
"step": 389
},
{
"loss": 369.184,
"grad_norm": 48.8317756652832,
"learning_rate": 0.0003634348022099652,
"epoch": 0.3497981933499904,
"step": 390
},
{
"loss": 373.9739,
"grad_norm": 50.6275634765625,
"learning_rate": 0.0003628060796065027,
"epoch": 0.35069511179447754,
"step": 391
},
{
"loss": 372.0473,
"grad_norm": 48.547447204589844,
"learning_rate": 0.00036217645986022756,
"epoch": 0.3515920302389647,
"step": 392
},
{
"loss": 364.9705,
"grad_norm": 48.18462371826172,
"learning_rate": 0.0003615459479784837,
"epoch": 0.3524889486834519,
"step": 393
},
{
"loss": 369.6471,
"grad_norm": 46.10414123535156,
"learning_rate": 0.0003609145489757101,
"epoch": 0.353385867127939,
"step": 394
},
{
"loss": 371.7173,
"grad_norm": 46.38992691040039,
"learning_rate": 0.0003602822678734008,
"epoch": 0.35428278557242615,
"step": 395
},
{
"loss": 367.3975,
"grad_norm": 45.87107467651367,
"learning_rate": 0.00035964910970006557,
"epoch": 0.35517970401691334,
"step": 396
},
{
"loss": 371.2871,
"grad_norm": 46.54446029663086,
"learning_rate": 0.00035901507949118915,
"epoch": 0.3560766224614005,
"step": 397
},
{
"loss": 368.7915,
"grad_norm": 45.7996826171875,
"learning_rate": 0.0003583801822891917,
"epoch": 0.3569735409058876,
"step": 398
},
{
"loss": 371.0395,
"grad_norm": 48.34632873535156,
"learning_rate": 0.0003577444231433885,
"epoch": 0.35787045935037476,
"step": 399
},
{
"loss": 374.4672,
"grad_norm": 48.63014221191406,
"learning_rate": 0.00035710780710994985,
"epoch": 0.35876737779486195,
"step": 400
},
{
"eval_loss": 1.6527702808380127,
"eval_runtime": 51.2432,
"eval_samples_per_second": 39.966,
"eval_steps_per_second": 2.498,
"epoch": 0.35876737779486195,
"step": 400
},
{
"loss": 369.2286,
"grad_norm": 50.575950622558594,
"learning_rate": 0.00035647033925186066,
"epoch": 0.3596642962393491,
"step": 401
},
{
"loss": 366.6179,
"grad_norm": 50.074954986572266,
"learning_rate": 0.0003558320246388808,
"epoch": 0.36056121468383623,
"step": 402
},
{
"loss": 370.1017,
"grad_norm": 51.92937088012695,
"learning_rate": 0.00035519286834750403,
"epoch": 0.3614581331283234,
"step": 403
},
{
"loss": 366.74,
"grad_norm": 52.75185775756836,
"learning_rate": 0.00035455287546091785,
"epoch": 0.36235505157281056,
"step": 404
},
{
"loss": 369.307,
"grad_norm": 50.451271057128906,
"learning_rate": 0.0003539120510689636,
"epoch": 0.3632519700172977,
"step": 405
},
{
"loss": 374.2456,
"grad_norm": 56.06875228881836,
"learning_rate": 0.0003532704002680951,
"epoch": 0.3641488884617849,
"step": 406
},
{
"loss": 371.9364,
"grad_norm": 49.18859100341797,
"learning_rate": 0.0003526279281613388,
"epoch": 0.36504580690627203,
"step": 407
},
{
"loss": 375.3452,
"grad_norm": 60.49544143676758,
"learning_rate": 0.00035198463985825303,
"epoch": 0.36594272535075917,
"step": 408
},
{
"loss": 364.7332,
"grad_norm": 55.390960693359375,
"learning_rate": 0.0003513405404748872,
"epoch": 0.3668396437952463,
"step": 409
},
{
"loss": 367.328,
"grad_norm": 45.79146194458008,
"learning_rate": 0.00035069563513374105,
"epoch": 0.3677365622397335,
"step": 410
},
{
"loss": 372.7194,
"grad_norm": 50.601531982421875,
"learning_rate": 0.0003500499289637243,
"epoch": 0.36863348068422064,
"step": 411
},
{
"loss": 373.3177,
"grad_norm": 58.5416374206543,
"learning_rate": 0.0003494034271001158,
"epoch": 0.3695303991287078,
"step": 412
},
{
"loss": 367.5529,
"grad_norm": 48.93236541748047,
"learning_rate": 0.00034875613468452203,
"epoch": 0.37042731757319497,
"step": 413
},
{
"loss": 368.6186,
"grad_norm": 49.043251037597656,
"learning_rate": 0.00034810805686483713,
"epoch": 0.3713242360176821,
"step": 414
},
{
"loss": 363.3611,
"grad_norm": 48.577144622802734,
"learning_rate": 0.0003474591987952013,
"epoch": 0.37222115446216925,
"step": 415
},
{
"loss": 368.0312,
"grad_norm": 48.73127746582031,
"learning_rate": 0.0003468095656359601,
"epoch": 0.37311807290665644,
"step": 416
},
{
"loss": 367.3114,
"grad_norm": 51.46812057495117,
"learning_rate": 0.0003461591625536234,
"epoch": 0.3740149913511436,
"step": 417
},
{
"loss": 375.6931,
"grad_norm": 49.236141204833984,
"learning_rate": 0.0003455079947208242,
"epoch": 0.3749119097956307,
"step": 418
},
{
"loss": 365.6711,
"grad_norm": 48.81379318237305,
"learning_rate": 0.00034485606731627755,
"epoch": 0.37580882824011785,
"step": 419
},
{
"loss": 364.9393,
"grad_norm": 51.185340881347656,
"learning_rate": 0.0003442033855247394,
"epoch": 0.37670574668460505,
"step": 420
},
{
"loss": 369.8553,
"grad_norm": 53.58812713623047,
"learning_rate": 0.000343549954536965,
"epoch": 0.3776026651290922,
"step": 421
},
{
"loss": 372.3922,
"grad_norm": 51.472042083740234,
"learning_rate": 0.0003428957795496685,
"epoch": 0.3784995835735793,
"step": 422
},
{
"loss": 371.9807,
"grad_norm": 54.97187805175781,
"learning_rate": 0.0003422408657654805,
"epoch": 0.3793965020180665,
"step": 423
},
{
"loss": 370.048,
"grad_norm": 54.97746276855469,
"learning_rate": 0.0003415852183929077,
"epoch": 0.38029342046255366,
"step": 424
},
{
"loss": 370.0667,
"grad_norm": 46.41242980957031,
"learning_rate": 0.0003409288426462904,
"epoch": 0.3811903389070408,
"step": 425
},
{
"loss": 366.4669,
"grad_norm": 51.722904205322266,
"learning_rate": 0.0003402717437457624,
"epoch": 0.382087257351528,
"step": 426
},
{
"loss": 367.8651,
"grad_norm": 51.60542678833008,
"learning_rate": 0.00033961392691720803,
"epoch": 0.3829841757960151,
"step": 427
},
{
"loss": 364.8575,
"grad_norm": 46.896331787109375,
"learning_rate": 0.0003389553973922217,
"epoch": 0.38388109424050226,
"step": 428
},
{
"loss": 366.1106,
"grad_norm": 47.48381042480469,
"learning_rate": 0.00033829616040806566,
"epoch": 0.38477801268498946,
"step": 429
},
{
"loss": 369.6983,
"grad_norm": 47.15787124633789,
"learning_rate": 0.0003376362212076287,
"epoch": 0.3856749311294766,
"step": 430
},
{
"loss": 372.8012,
"grad_norm": 49.67255401611328,
"learning_rate": 0.0003369755850393841,
"epoch": 0.38657184957396373,
"step": 431
},
{
"loss": 369.0824,
"grad_norm": 50.87350082397461,
"learning_rate": 0.0003363142571573484,
"epoch": 0.38746876801845087,
"step": 432
},
{
"loss": 368.5385,
"grad_norm": 52.32754135131836,
"learning_rate": 0.0003356522428210391,
"epoch": 0.38836568646293806,
"step": 433
},
{
"loss": 370.1974,
"grad_norm": 46.638084411621094,
"learning_rate": 0.0003349895472954331,
"epoch": 0.3892626049074252,
"step": 434
},
{
"loss": 367.2549,
"grad_norm": 51.39384460449219,
"learning_rate": 0.00033432617585092467,
"epoch": 0.39015952335191234,
"step": 435
},
{
"loss": 368.2899,
"grad_norm": 49.1676139831543,
"learning_rate": 0.00033366213376328396,
"epoch": 0.39105644179639953,
"step": 436
},
{
"loss": 372.2977,
"grad_norm": 51.6141242980957,
"learning_rate": 0.0003329974263136144,
"epoch": 0.3919533602408867,
"step": 437
},
{
"loss": 368.3735,
"grad_norm": 49.94230270385742,
"learning_rate": 0.0003323320587883111,
"epoch": 0.3928502786853738,
"step": 438
},
{
"loss": 370.6481,
"grad_norm": 49.947837829589844,
"learning_rate": 0.0003316660364790188,
"epoch": 0.393747197129861,
"step": 439
},
{
"loss": 369.6432,
"grad_norm": 48.53517532348633,
"learning_rate": 0.0003309993646825896,
"epoch": 0.39464411557434814,
"step": 440
},
{
"loss": 366.7539,
"grad_norm": 50.93443298339844,
"learning_rate": 0.00033033204870104116,
"epoch": 0.3955410340188353,
"step": 441
},
{
"loss": 367.3075,
"grad_norm": 49.63651657104492,
"learning_rate": 0.000329664093841514,
"epoch": 0.3964379524633224,
"step": 442
},
{
"loss": 369.597,
"grad_norm": 48.85470962524414,
"learning_rate": 0.00032899550541623,
"epoch": 0.3973348709078096,
"step": 443
},
{
"loss": 366.1455,
"grad_norm": 49.675559997558594,
"learning_rate": 0.0003283262887424494,
"epoch": 0.39823178935229675,
"step": 444
},
{
"loss": 362.2254,
"grad_norm": 48.583370208740234,
"learning_rate": 0.0003276564491424292,
"epoch": 0.3991287077967839,
"step": 445
},
{
"loss": 372.5689,
"grad_norm": 50.507293701171875,
"learning_rate": 0.0003269859919433802,
"epoch": 0.4000256262412711,
"step": 446
},
{
"loss": 366.7801,
"grad_norm": 50.75261688232422,
"learning_rate": 0.0003263149224774251,
"epoch": 0.4009225446857582,
"step": 447
},
{
"loss": 369.5224,
"grad_norm": 49.42384719848633,
"learning_rate": 0.00032564324608155604,
"epoch": 0.40181946313024536,
"step": 448
},
{
"loss": 369.6519,
"grad_norm": 49.12044143676758,
"learning_rate": 0.00032497096809759184,
"epoch": 0.40271638157473255,
"step": 449
},
{
"loss": 370.9763,
"grad_norm": 53.04697036743164,
"learning_rate": 0.0003242980938721359,
"epoch": 0.4036133000192197,
"step": 450
},
{
"eval_loss": 1.6399173736572266,
"eval_runtime": 36.1587,
"eval_samples_per_second": 56.639,
"eval_steps_per_second": 3.54,
"epoch": 0.4036133000192197,
"step": 450
},
{
"loss": 367.9265,
"grad_norm": 52.0450553894043,
"learning_rate": 0.00032362462875653355,
"epoch": 0.4045102184637068,
"step": 451
},
{
"loss": 372.4974,
"grad_norm": 48.33359146118164,
"learning_rate": 0.0003229505781068291,
"epoch": 0.40540713690819397,
"step": 452
},
{
"loss": 366.6081,
"grad_norm": 49.462974548339844,
"learning_rate": 0.00032227594728372397,
"epoch": 0.40630405535268116,
"step": 453
},
{
"loss": 366.3152,
"grad_norm": 48.31398391723633,
"learning_rate": 0.0003216007416525335,
"epoch": 0.4072009737971683,
"step": 454
},
{
"loss": 369.983,
"grad_norm": 47.523338317871094,
"learning_rate": 0.0003209249665831445,
"epoch": 0.40809789224165544,
"step": 455
},
{
"loss": 366.8036,
"grad_norm": 45.295806884765625,
"learning_rate": 0.00032024862744997265,
"epoch": 0.40899481068614263,
"step": 456
},
{
"loss": 366.4848,
"grad_norm": 49.89873504638672,
"learning_rate": 0.0003195717296319193,
"epoch": 0.40989172913062977,
"step": 457
},
{
"loss": 365.4414,
"grad_norm": 46.948055267333984,
"learning_rate": 0.00031889427851232915,
"epoch": 0.4107886475751169,
"step": 458
},
{
"loss": 369.7285,
"grad_norm": 48.40359115600586,
"learning_rate": 0.0003182162794789474,
"epoch": 0.4116855660196041,
"step": 459
},
{
"loss": 370.345,
"grad_norm": 48.55045700073242,
"learning_rate": 0.0003175377379238767,
"epoch": 0.41258248446409124,
"step": 460
},
{
"loss": 366.95,
"grad_norm": 47.37104415893555,
"learning_rate": 0.0003168586592435341,
"epoch": 0.4134794029085784,
"step": 461
},
{
"loss": 370.2368,
"grad_norm": 51.285888671875,
"learning_rate": 0.00031617904883860903,
"epoch": 0.4143763213530655,
"step": 462
},
{
"loss": 365.4067,
"grad_norm": 50.595340728759766,
"learning_rate": 0.000315498912114019,
"epoch": 0.4152732397975527,
"step": 463
},
{
"loss": 366.4186,
"grad_norm": 45.943519592285156,
"learning_rate": 0.0003148182544788678,
"epoch": 0.41617015824203984,
"step": 464
},
{
"loss": 362.8856,
"grad_norm": 52.45280075073242,
"learning_rate": 0.0003141370813464018,
"epoch": 0.417067076686527,
"step": 465
},
{
"loss": 366.827,
"grad_norm": 47.95954132080078,
"learning_rate": 0.0003134553981339672,
"epoch": 0.4179639951310142,
"step": 466
},
{
"loss": 370.8824,
"grad_norm": 51.57919692993164,
"learning_rate": 0.00031277321026296657,
"epoch": 0.4188609135755013,
"step": 467
},
{
"loss": 368.826,
"grad_norm": 51.78611755371094,
"learning_rate": 0.0003120905231588164,
"epoch": 0.41975783201998845,
"step": 468
},
{
"loss": 369.1159,
"grad_norm": 46.962074279785156,
"learning_rate": 0.0003114073422509034,
"epoch": 0.42065475046447565,
"step": 469
},
{
"loss": 361.8488,
"grad_norm": 46.85802459716797,
"learning_rate": 0.0003107236729725414,
"epoch": 0.4215516689089628,
"step": 470
},
{
"loss": 367.4666,
"grad_norm": 54.017906188964844,
"learning_rate": 0.0003100395207609284,
"epoch": 0.4224485873534499,
"step": 471
},
{
"loss": 366.9775,
"grad_norm": 53.34091567993164,
"learning_rate": 0.000309354891057103,
"epoch": 0.42334550579793706,
"step": 472
},
{
"loss": 366.0834,
"grad_norm": 47.76055908203125,
"learning_rate": 0.00030866978930590126,
"epoch": 0.42424242424242425,
"step": 473
},
{
"loss": 368.5773,
"grad_norm": 49.945613861083984,
"learning_rate": 0.00030798422095591364,
"epoch": 0.4251393426869114,
"step": 474
},
{
"loss": 363.8445,
"grad_norm": 48.995609283447266,
"learning_rate": 0.00030729819145944114,
"epoch": 0.42603626113139853,
"step": 475
},
{
"loss": 362.6448,
"grad_norm": 45.06385040283203,
"learning_rate": 0.00030661170627245256,
"epoch": 0.4269331795758857,
"step": 476
},
{
"loss": 364.0858,
"grad_norm": 49.73957061767578,
"learning_rate": 0.00030592477085454047,
"epoch": 0.42783009802037286,
"step": 477
},
{
"loss": 371.1085,
"grad_norm": 49.45321273803711,
"learning_rate": 0.00030523739066887836,
"epoch": 0.42872701646486,
"step": 478
},
{
"loss": 363.6934,
"grad_norm": 49.325355529785156,
"learning_rate": 0.00030454957118217674,
"epoch": 0.4296239349093472,
"step": 479
},
{
"loss": 368.4297,
"grad_norm": 47.509742736816406,
"learning_rate": 0.0003038613178646401,
"epoch": 0.43052085335383433,
"step": 480
},
{
"loss": 366.2455,
"grad_norm": 48.50214767456055,
"learning_rate": 0.000303172636189923,
"epoch": 0.43141777179832147,
"step": 481
},
{
"loss": 362.4247,
"grad_norm": 46.59059143066406,
"learning_rate": 0.00030248353163508674,
"epoch": 0.43231469024280866,
"step": 482
},
{
"loss": 368.7481,
"grad_norm": 47.74319839477539,
"learning_rate": 0.0003017940096805557,
"epoch": 0.4332116086872958,
"step": 483
},
{
"loss": 365.7433,
"grad_norm": 53.59490203857422,
"learning_rate": 0.0003011040758100741,
"epoch": 0.43410852713178294,
"step": 484
},
{
"loss": 366.9239,
"grad_norm": 49.87615966796875,
"learning_rate": 0.00030041373551066173,
"epoch": 0.4350054455762701,
"step": 485
},
{
"loss": 360.9555,
"grad_norm": 44.795536041259766,
"learning_rate": 0.0002997229942725711,
"epoch": 0.43590236402075727,
"step": 486
},
{
"loss": 370.6934,
"grad_norm": 56.454227447509766,
"learning_rate": 0.000299031857589243,
"epoch": 0.4367992824652444,
"step": 487
},
{
"loss": 369.9133,
"grad_norm": 48.472312927246094,
"learning_rate": 0.00029834033095726335,
"epoch": 0.43769620090973155,
"step": 488
},
{
"loss": 361.5723,
"grad_norm": 51.665260314941406,
"learning_rate": 0.00029764841987631933,
"epoch": 0.43859311935421874,
"step": 489
},
{
"loss": 366.223,
"grad_norm": 51.25084686279297,
"learning_rate": 0.0002969561298491557,
"epoch": 0.4394900377987059,
"step": 490
},
{
"loss": 367.7071,
"grad_norm": 50.52541732788086,
"learning_rate": 0.00029626346638153073,
"epoch": 0.440386956243193,
"step": 491
},
{
"loss": 367.0807,
"grad_norm": 50.71653366088867,
"learning_rate": 0.0002955704349821729,
"epoch": 0.4412838746876802,
"step": 492
},
{
"loss": 366.5776,
"grad_norm": 44.603485107421875,
"learning_rate": 0.0002948770411627367,
"epoch": 0.44218079313216735,
"step": 493
},
{
"loss": 367.2019,
"grad_norm": 49.68048858642578,
"learning_rate": 0.0002941832904377589,
"epoch": 0.4430777115766545,
"step": 494
},
{
"loss": 367.4325,
"grad_norm": 56.277896881103516,
"learning_rate": 0.000293489188324615,
"epoch": 0.4439746300211416,
"step": 495
},
{
"loss": 369.3215,
"grad_norm": 46.4665412902832,
"learning_rate": 0.00029279474034347465,
"epoch": 0.4448715484656288,
"step": 496
},
{
"loss": 368.6407,
"grad_norm": 51.84563446044922,
"learning_rate": 0.00029209995201725836,
"epoch": 0.44576846691011596,
"step": 497
},
{
"loss": 366.8856,
"grad_norm": 55.93694305419922,
"learning_rate": 0.0002914048288715937,
"epoch": 0.4466653853546031,
"step": 498
},
{
"loss": 367.8516,
"grad_norm": 50.97298812866211,
"learning_rate": 0.00029070937643477056,
"epoch": 0.4475623037990903,
"step": 499
},
{
"loss": 364.7996,
"grad_norm": 53.179847717285156,
"learning_rate": 0.000290013600237698,
"epoch": 0.4484592222435774,
"step": 500
},
{
"eval_loss": 1.6293703317642212,
"eval_runtime": 47.4683,
"eval_samples_per_second": 43.145,
"eval_steps_per_second": 2.697,
"epoch": 0.4484592222435774,
"step": 500
},
{
"loss": 364.7999,
"grad_norm": 53.32307434082031,
"learning_rate": 0.00028931750581385975,
"epoch": 0.44935614068806456,
"step": 501
},
{
"loss": 368.2321,
"grad_norm": 48.1343994140625,
"learning_rate": 0.00028862109869927057,
"epoch": 0.45025305913255176,
"step": 502
},
{
"loss": 363.4522,
"grad_norm": 48.97591781616211,
"learning_rate": 0.00028792438443243175,
"epoch": 0.4511499775770389,
"step": 503
},
{
"loss": 367.3519,
"grad_norm": 48.5214729309082,
"learning_rate": 0.00028722736855428755,
"epoch": 0.45204689602152603,
"step": 504
},
{
"loss": 366.9135,
"grad_norm": 48.30058288574219,
"learning_rate": 0.00028653005660818115,
"epoch": 0.4529438144660132,
"step": 505
},
{
"loss": 365.4208,
"grad_norm": 48.56584548950195,
"learning_rate": 0.00028583245413980993,
"epoch": 0.45384073291050037,
"step": 506
},
{
"loss": 366.6342,
"grad_norm": 44.84033203125,
"learning_rate": 0.0002851345666971819,
"epoch": 0.4547376513549875,
"step": 507
},
{
"loss": 366.2589,
"grad_norm": 46.03631591796875,
"learning_rate": 0.0002844363998305717,
"epoch": 0.45563456979947464,
"step": 508
},
{
"loss": 368.2724,
"grad_norm": 52.3626708984375,
"learning_rate": 0.0002837379590924759,
"epoch": 0.45653148824396184,
"step": 509
},
{
"loss": 366.9325,
"grad_norm": 42.26225280761719,
"learning_rate": 0.0002830392500375694,
"epoch": 0.457428406688449,
"step": 510
},
{
"loss": 363.1102,
"grad_norm": 47.719661712646484,
"learning_rate": 0.0002823402782226608,
"epoch": 0.4583253251329361,
"step": 511
},
{
"loss": 369.943,
"grad_norm": 48.35748291015625,
"learning_rate": 0.00028164104920664864,
"epoch": 0.4592222435774233,
"step": 512
},
{
"loss": 366.7622,
"grad_norm": 47.81887435913086,
"learning_rate": 0.00028094156855047687,
"epoch": 0.46011916202191044,
"step": 513
},
{
"loss": 369.4684,
"grad_norm": 51.35517883300781,
"learning_rate": 0.0002802418418170908,
"epoch": 0.4610160804663976,
"step": 514
},
{
"loss": 367.9245,
"grad_norm": 52.903011322021484,
"learning_rate": 0.0002795418745713925,
"epoch": 0.4619129989108847,
"step": 515
},
{
"loss": 363.503,
"grad_norm": 50.455223083496094,
"learning_rate": 0.00027884167238019714,
"epoch": 0.4628099173553719,
"step": 516
},
{
"loss": 361.0208,
"grad_norm": 48.27017593383789,
"learning_rate": 0.0002781412408121884,
"epoch": 0.46370683579985905,
"step": 517
},
{
"loss": 364.5886,
"grad_norm": 49.851619720458984,
"learning_rate": 0.0002774405854378739,
"epoch": 0.4646037542443462,
"step": 518
},
{
"loss": 359.5211,
"grad_norm": 49.12308120727539,
"learning_rate": 0.00027673971182954157,
"epoch": 0.4655006726888334,
"step": 519
},
{
"loss": 366.8299,
"grad_norm": 47.60043716430664,
"learning_rate": 0.00027603862556121463,
"epoch": 0.4663975911333205,
"step": 520
},
{
"loss": 368.2267,
"grad_norm": 41.944801330566406,
"learning_rate": 0.0002753373322086077,
"epoch": 0.46729450957780766,
"step": 521
},
{
"loss": 368.1608,
"grad_norm": 45.84396743774414,
"learning_rate": 0.00027463583734908234,
"epoch": 0.46819142802229485,
"step": 522
},
{
"loss": 359.4468,
"grad_norm": 44.122989654541016,
"learning_rate": 0.0002739341465616026,
"epoch": 0.469088346466782,
"step": 523
},
{
"loss": 367.6043,
"grad_norm": 44.97038269042969,
"learning_rate": 0.000273232265426691,
"epoch": 0.46998526491126913,
"step": 524
},
{
"loss": 367.8859,
"grad_norm": 49.4835319519043,
"learning_rate": 0.0002725301995263835,
"epoch": 0.47088218335575627,
"step": 525
},
{
"loss": 365.9901,
"grad_norm": 46.08525466918945,
"learning_rate": 0.00027182795444418583,
"epoch": 0.47177910180024346,
"step": 526
},
{
"loss": 362.7762,
"grad_norm": 45.26884841918945,
"learning_rate": 0.0002711255357650286,
"epoch": 0.4726760202447306,
"step": 527
},
{
"loss": 363.5254,
"grad_norm": 52.6630973815918,
"learning_rate": 0.0002704229490752229,
"epoch": 0.47357293868921774,
"step": 528
},
{
"loss": 362.2083,
"grad_norm": 49.639488220214844,
"learning_rate": 0.00026972019996241635,
"epoch": 0.47446985713370493,
"step": 529
},
{
"loss": 370.2541,
"grad_norm": 51.361610412597656,
"learning_rate": 0.00026901729401554805,
"epoch": 0.47536677557819207,
"step": 530
},
{
"loss": 364.9506,
"grad_norm": 45.84967803955078,
"learning_rate": 0.00026831423682480425,
"epoch": 0.4762636940226792,
"step": 531
},
{
"loss": 373.7259,
"grad_norm": 48.99913024902344,
"learning_rate": 0.00026761103398157456,
"epoch": 0.4771606124671664,
"step": 532
},
{
"loss": 367.0407,
"grad_norm": 53.0494270324707,
"learning_rate": 0.00026690769107840634,
"epoch": 0.47805753091165354,
"step": 533
},
{
"loss": 366.3498,
"grad_norm": 46.16975784301758,
"learning_rate": 0.00026620421370896136,
"epoch": 0.4789544493561407,
"step": 534
},
{
"loss": 363.5735,
"grad_norm": 45.147125244140625,
"learning_rate": 0.00026550060746797057,
"epoch": 0.47985136780062787,
"step": 535
},
{
"loss": 362.9278,
"grad_norm": 47.262821197509766,
"learning_rate": 0.0002647968779511897,
"epoch": 0.480748286245115,
"step": 536
},
{
"loss": 366.6017,
"grad_norm": 49.1768913269043,
"learning_rate": 0.00026409303075535504,
"epoch": 0.48164520468960215,
"step": 537
},
{
"loss": 363.7893,
"grad_norm": 47.41939163208008,
"learning_rate": 0.00026338907147813894,
"epoch": 0.4825421231340893,
"step": 538
},
{
"loss": 362.325,
"grad_norm": 45.2095947265625,
"learning_rate": 0.0002626850057181048,
"epoch": 0.4834390415785765,
"step": 539
},
{
"loss": 368.0108,
"grad_norm": 44.87570571899414,
"learning_rate": 0.000261980839074663,
"epoch": 0.4843359600230636,
"step": 540
},
{
"loss": 363.8844,
"grad_norm": 44.87836456298828,
"learning_rate": 0.0002612765771480264,
"epoch": 0.48523287846755075,
"step": 541
},
{
"loss": 366.2256,
"grad_norm": 52.47968292236328,
"learning_rate": 0.00026057222553916545,
"epoch": 0.48612979691203795,
"step": 542
},
{
"loss": 364.6898,
"grad_norm": 49.18819808959961,
"learning_rate": 0.0002598677898497638,
"epoch": 0.4870267153565251,
"step": 543
},
{
"loss": 364.0697,
"grad_norm": 47.542850494384766,
"learning_rate": 0.00025916327568217416,
"epoch": 0.4879236338010122,
"step": 544
},
{
"loss": 362.7703,
"grad_norm": 44.471256256103516,
"learning_rate": 0.0002584586886393729,
"epoch": 0.4888205522454994,
"step": 545
},
{
"loss": 370.4043,
"grad_norm": 46.374263763427734,
"learning_rate": 0.0002577540343249162,
"epoch": 0.48971747068998656,
"step": 546
},
{
"loss": 362.8738,
"grad_norm": 44.021278381347656,
"learning_rate": 0.0002570493183428952,
"epoch": 0.4906143891344737,
"step": 547
},
{
"loss": 365.418,
"grad_norm": 47.044212341308594,
"learning_rate": 0.00025634454629789156,
"epoch": 0.49151130757896083,
"step": 548
},
{
"loss": 363.5009,
"grad_norm": 48.60353469848633,
"learning_rate": 0.00025563972379493273,
"epoch": 0.492408226023448,
"step": 549
},
{
"loss": 365.955,
"grad_norm": 47.8569221496582,
"learning_rate": 0.00025493485643944753,
"epoch": 0.49330514446793516,
"step": 550
},
{
"eval_loss": 1.6247297525405884,
"eval_runtime": 36.2552,
"eval_samples_per_second": 56.488,
"eval_steps_per_second": 3.531,
"epoch": 0.49330514446793516,
"step": 550
},
{
"loss": 361.769,
"grad_norm": 52.47264099121094,
"learning_rate": 0.00025422994983722127,
"epoch": 0.4942020629124223,
"step": 551
},
{
"loss": 369.0356,
"grad_norm": 51.903358459472656,
"learning_rate": 0.0002535250095943517,
"epoch": 0.4950989813569095,
"step": 552
},
{
"loss": 362.5946,
"grad_norm": 55.91824722290039,
"learning_rate": 0.0002528200413172039,
"epoch": 0.49599589980139663,
"step": 553
},
{
"loss": 364.1907,
"grad_norm": 49.117069244384766,
"learning_rate": 0.00025211505061236583,
"epoch": 0.49689281824588377,
"step": 554
},
{
"loss": 363.2774,
"grad_norm": 44.69606018066406,
"learning_rate": 0.00025141004308660414,
"epoch": 0.49778973669037097,
"step": 555
},
{
"loss": 363.2139,
"grad_norm": 52.18587112426758,
"learning_rate": 0.00025070502434681915,
"epoch": 0.4986866551348581,
"step": 556
},
{
"loss": 365.6665,
"grad_norm": 57.393428802490234,
"learning_rate": 0.00025,
"epoch": 0.49958357357934524,
"step": 557
},
{
"loss": 363.4536,
"grad_norm": 52.89313507080078,
"learning_rate": 0.0002492949756531809,
"epoch": 0.5004804920238324,
"step": 558
},
{
"loss": 363.2097,
"grad_norm": 51.265533447265625,
"learning_rate": 0.00024858995691339587,
"epoch": 0.5013774104683195,
"step": 559
},
{
"loss": 366.4611,
"grad_norm": 56.473567962646484,
"learning_rate": 0.0002478849493876342,
"epoch": 0.5022743289128068,
"step": 560
},
{
"loss": 361.8987,
"grad_norm": 49.68058776855469,
"learning_rate": 0.0002471799586827962,
"epoch": 0.5031712473572939,
"step": 561
},
{
"loss": 360.8694,
"grad_norm": 42.74179458618164,
"learning_rate": 0.00024647499040564844,
"epoch": 0.504068165801781,
"step": 562
},
{
"loss": 364.9089,
"grad_norm": 45.61265563964844,
"learning_rate": 0.00024577005016277885,
"epoch": 0.5049650842462682,
"step": 563
},
{
"loss": 365.8124,
"grad_norm": 46.97050857543945,
"learning_rate": 0.0002450651435605526,
"epoch": 0.5058620026907553,
"step": 564
},
{
"loss": 360.1623,
"grad_norm": 46.26262664794922,
"learning_rate": 0.0002443602762050673,
"epoch": 0.5067589211352425,
"step": 565
},
{
"loss": 363.2248,
"grad_norm": 44.43347930908203,
"learning_rate": 0.00024365545370210842,
"epoch": 0.5076558395797296,
"step": 566
},
{
"loss": 365.1527,
"grad_norm": 46.19889831542969,
"learning_rate": 0.00024295068165710478,
"epoch": 0.5085527580242168,
"step": 567
},
{
"loss": 365.0658,
"grad_norm": 49.645484924316406,
"learning_rate": 0.00024224596567508385,
"epoch": 0.509449676468704,
"step": 568
},
{
"loss": 362.5722,
"grad_norm": 47.69388961791992,
"learning_rate": 0.00024154131136062715,
"epoch": 0.5103465949131911,
"step": 569
},
{
"loss": 361.0171,
"grad_norm": 44.855857849121094,
"learning_rate": 0.00024083672431782585,
"epoch": 0.5112435133576783,
"step": 570
},
{
"loss": 361.5502,
"grad_norm": 48.860435485839844,
"learning_rate": 0.00024013221015023619,
"epoch": 0.5121404318021654,
"step": 571
},
{
"loss": 360.8487,
"grad_norm": 45.69166564941406,
"learning_rate": 0.0002394277744608346,
"epoch": 0.5130373502466525,
"step": 572
},
{
"loss": 361.6857,
"grad_norm": 45.67158889770508,
"learning_rate": 0.00023872342285197366,
"epoch": 0.5139342686911397,
"step": 573
},
{
"loss": 364.0296,
"grad_norm": 51.487369537353516,
"learning_rate": 0.00023801916092533706,
"epoch": 0.5148311871356269,
"step": 574
},
{
"loss": 366.4655,
"grad_norm": 49.884727478027344,
"learning_rate": 0.0002373149942818953,
"epoch": 0.5157281055801141,
"step": 575
},
{
"loss": 360.9107,
"grad_norm": 42.73551940917969,
"learning_rate": 0.00023661092852186118,
"epoch": 0.5166250240246012,
"step": 576
},
{
"loss": 364.7719,
"grad_norm": 44.425777435302734,
"learning_rate": 0.000235906969244645,
"epoch": 0.5175219424690883,
"step": 577
},
{
"loss": 362.6983,
"grad_norm": 52.82978057861328,
"learning_rate": 0.00023520312204881045,
"epoch": 0.5184188609135755,
"step": 578
},
{
"loss": 359.655,
"grad_norm": 46.826904296875,
"learning_rate": 0.0002344993925320295,
"epoch": 0.5193157793580626,
"step": 579
},
{
"loss": 364.8085,
"grad_norm": 42.24338150024414,
"learning_rate": 0.00023379578629103865,
"epoch": 0.5202126978025499,
"step": 580
},
{
"loss": 358.4188,
"grad_norm": 49.714271545410156,
"learning_rate": 0.00023309230892159364,
"epoch": 0.521109616247037,
"step": 581
},
{
"loss": 364.1614,
"grad_norm": 47.561073303222656,
"learning_rate": 0.0002323889660184255,
"epoch": 0.5220065346915241,
"step": 582
},
{
"loss": 361.0988,
"grad_norm": 45.20221710205078,
"learning_rate": 0.00023168576317519576,
"epoch": 0.5229034531360113,
"step": 583
},
{
"loss": 367.0533,
"grad_norm": 47.38787078857422,
"learning_rate": 0.00023098270598445204,
"epoch": 0.5238003715804984,
"step": 584
},
{
"loss": 366.2763,
"grad_norm": 47.23054122924805,
"learning_rate": 0.00023027980003758363,
"epoch": 0.5246972900249856,
"step": 585
},
{
"loss": 365.6816,
"grad_norm": 43.855403900146484,
"learning_rate": 0.0002295770509247771,
"epoch": 0.5255942084694727,
"step": 586
},
{
"loss": 365.6198,
"grad_norm": 51.30084228515625,
"learning_rate": 0.00022887446423497146,
"epoch": 0.5264911269139599,
"step": 587
},
{
"loss": 362.4194,
"grad_norm": 50.142330169677734,
"learning_rate": 0.00022817204555581418,
"epoch": 0.5273880453584471,
"step": 588
},
{
"loss": 364.2704,
"grad_norm": 46.52515411376953,
"learning_rate": 0.00022746980047361654,
"epoch": 0.5282849638029342,
"step": 589
},
{
"loss": 362.0045,
"grad_norm": 48.26958465576172,
"learning_rate": 0.00022676773457330906,
"epoch": 0.5291818822474214,
"step": 590
},
{
"loss": 364.3056,
"grad_norm": 45.78593063354492,
"learning_rate": 0.0002260658534383974,
"epoch": 0.5300788006919085,
"step": 591
},
{
"loss": 364.2805,
"grad_norm": 47.130184173583984,
"learning_rate": 0.00022536416265091775,
"epoch": 0.5309757191363956,
"step": 592
},
{
"loss": 362.9882,
"grad_norm": 43.309181213378906,
"learning_rate": 0.0002246626677913923,
"epoch": 0.5318726375808829,
"step": 593
},
{
"loss": 362.9743,
"grad_norm": 40.39152145385742,
"learning_rate": 0.00022396137443878535,
"epoch": 0.53276955602537,
"step": 594
},
{
"loss": 359.4163,
"grad_norm": 47.722068786621094,
"learning_rate": 0.00022326028817045844,
"epoch": 0.5336664744698572,
"step": 595
},
{
"loss": 364.6919,
"grad_norm": 42.61846160888672,
"learning_rate": 0.00022255941456212605,
"epoch": 0.5345633929143443,
"step": 596
},
{
"loss": 368.3342,
"grad_norm": 44.96833038330078,
"learning_rate": 0.00022185875918781162,
"epoch": 0.5354603113588314,
"step": 597
},
{
"loss": 363.2259,
"grad_norm": 43.944881439208984,
"learning_rate": 0.00022115832761980287,
"epoch": 0.5363572298033186,
"step": 598
},
{
"loss": 362.7245,
"grad_norm": 47.073341369628906,
"learning_rate": 0.00022045812542860756,
"epoch": 0.5372541482478057,
"step": 599
},
{
"loss": 363.0497,
"grad_norm": 44.11311721801758,
"learning_rate": 0.00021975815818290928,
"epoch": 0.538151066692293,
"step": 600
},
{
"eval_loss": 1.61993408203125,
"eval_runtime": 65.3564,
"eval_samples_per_second": 31.336,
"eval_steps_per_second": 1.958,
"epoch": 0.538151066692293,
"step": 600
},
{
"loss": 360.9368,
"grad_norm": 45.97838592529297,
"learning_rate": 0.00021905843144952316,
"epoch": 0.5390479851367801,
"step": 601
},
{
"loss": 363.959,
"grad_norm": 45.36203384399414,
"learning_rate": 0.0002183589507933514,
"epoch": 0.5399449035812672,
"step": 602
},
{
"loss": 363.9291,
"grad_norm": 43.02581024169922,
"learning_rate": 0.00021765972177733924,
"epoch": 0.5408418220257544,
"step": 603
},
{
"loss": 363.5491,
"grad_norm": 47.46310806274414,
"learning_rate": 0.0002169607499624307,
"epoch": 0.5417387404702415,
"step": 604
},
{
"loss": 367.6017,
"grad_norm": 47.89605712890625,
"learning_rate": 0.00021626204090752422,
"epoch": 0.5426356589147286,
"step": 605
},
{
"loss": 364.9732,
"grad_norm": 45.463443756103516,
"learning_rate": 0.00021556360016942842,
"epoch": 0.5435325773592158,
"step": 606
},
{
"loss": 364.4341,
"grad_norm": 43.64617919921875,
"learning_rate": 0.00021486543330281812,
"epoch": 0.544429495803703,
"step": 607
},
{
"loss": 366.3894,
"grad_norm": 41.575531005859375,
"learning_rate": 0.0002141675458601901,
"epoch": 0.5453264142481902,
"step": 608
},
{
"loss": 363.112,
"grad_norm": 46.79388427734375,
"learning_rate": 0.00021346994339181883,
"epoch": 0.5462233326926773,
"step": 609
},
{
"loss": 361.5751,
"grad_norm": 48.13455581665039,
"learning_rate": 0.0002127726314457124,
"epoch": 0.5471202511371644,
"step": 610
},
{
"loss": 361.1321,
"grad_norm": 45.220550537109375,
"learning_rate": 0.0002120756155675683,
"epoch": 0.5480171695816516,
"step": 611
},
{
"loss": 365.0866,
"grad_norm": 46.22264099121094,
"learning_rate": 0.0002113789013007295,
"epoch": 0.5489140880261387,
"step": 612
},
{
"loss": 360.2099,
"grad_norm": 47.99028015136719,
"learning_rate": 0.00021068249418614027,
"epoch": 0.549811006470626,
"step": 613
},
{
"loss": 362.4004,
"grad_norm": 45.35298538208008,
"learning_rate": 0.00020998639976230202,
"epoch": 0.5507079249151131,
"step": 614
},
{
"loss": 362.9482,
"grad_norm": 45.84006118774414,
"learning_rate": 0.00020929062356522942,
"epoch": 0.5516048433596002,
"step": 615
},
{
"loss": 361.6893,
"grad_norm": 46.06373977661133,
"learning_rate": 0.00020859517112840637,
"epoch": 0.5525017618040874,
"step": 616
},
{
"loss": 368.1667,
"grad_norm": 43.56032180786133,
"learning_rate": 0.00020790004798274165,
"epoch": 0.5533986802485745,
"step": 617
},
{
"loss": 363.2073,
"grad_norm": 43.215370178222656,
"learning_rate": 0.00020720525965652544,
"epoch": 0.5542955986930617,
"step": 618
},
{
"loss": 358.3785,
"grad_norm": 47.84462356567383,
"learning_rate": 0.00020651081167538508,
"epoch": 0.5551925171375488,
"step": 619
},
{
"loss": 365.6581,
"grad_norm": 49.96092987060547,
"learning_rate": 0.00020581670956224113,
"epoch": 0.556089435582036,
"step": 620
},
{
"loss": 363.1918,
"grad_norm": 44.61714172363281,
"learning_rate": 0.00020512295883726338,
"epoch": 0.5569863540265232,
"step": 621
},
{
"loss": 363.2948,
"grad_norm": 44.841495513916016,
"learning_rate": 0.00020442956501782713,
"epoch": 0.5578832724710103,
"step": 622
},
{
"loss": 358.7636,
"grad_norm": 46.29624938964844,
"learning_rate": 0.00020373653361846925,
"epoch": 0.5587801909154975,
"step": 623
},
{
"loss": 362.0233,
"grad_norm": 43.61477279663086,
"learning_rate": 0.0002030438701508443,
"epoch": 0.5596771093599846,
"step": 624
},
{
"loss": 366.3086,
"grad_norm": 44.28224182128906,
"learning_rate": 0.00020235158012368065,
"epoch": 0.5605740278044717,
"step": 625
},
{
"loss": 357.9655,
"grad_norm": 43.08799362182617,
"learning_rate": 0.00020165966904273666,
"epoch": 0.5614709462489589,
"step": 626
},
{
"loss": 364.1879,
"grad_norm": 45.73900604248047,
"learning_rate": 0.00020096814241075703,
"epoch": 0.5623678646934461,
"step": 627
},
{
"loss": 359.9633,
"grad_norm": 48.213985443115234,
"learning_rate": 0.00020027700572742895,
"epoch": 0.5632647831379333,
"step": 628
},
{
"loss": 365.9498,
"grad_norm": 43.3817253112793,
"learning_rate": 0.00019958626448933825,
"epoch": 0.5641617015824204,
"step": 629
},
{
"loss": 362.1366,
"grad_norm": 42.70503234863281,
"learning_rate": 0.00019889592418992594,
"epoch": 0.5650586200269075,
"step": 630
},
{
"loss": 361.433,
"grad_norm": 46.60575485229492,
"learning_rate": 0.00019820599031944436,
"epoch": 0.5659555384713947,
"step": 631
},
{
"loss": 364.1061,
"grad_norm": 42.36573791503906,
"learning_rate": 0.00019751646836491338,
"epoch": 0.5668524569158818,
"step": 632
},
{
"loss": 360.4161,
"grad_norm": 43.14451599121094,
"learning_rate": 0.00019682736381007707,
"epoch": 0.5677493753603691,
"step": 633
},
{
"loss": 357.0567,
"grad_norm": 44.19496154785156,
"learning_rate": 0.00019613868213535997,
"epoch": 0.5686462938048562,
"step": 634
},
{
"loss": 361.1339,
"grad_norm": 42.32905960083008,
"learning_rate": 0.00019545042881782333,
"epoch": 0.5695432122493433,
"step": 635
},
{
"loss": 361.2873,
"grad_norm": 47.53689956665039,
"learning_rate": 0.00019476260933112163,
"epoch": 0.5704401306938305,
"step": 636
},
{
"loss": 362.2348,
"grad_norm": 47.5960578918457,
"learning_rate": 0.00019407522914545957,
"epoch": 0.5713370491383176,
"step": 637
},
{
"loss": 366.9183,
"grad_norm": 43.92160415649414,
"learning_rate": 0.00019338829372754745,
"epoch": 0.5722339675828048,
"step": 638
},
{
"loss": 361.6643,
"grad_norm": 46.373863220214844,
"learning_rate": 0.0001927018085405588,
"epoch": 0.5731308860272919,
"step": 639
},
{
"loss": 362.9005,
"grad_norm": 45.955814361572266,
"learning_rate": 0.0001920157790440864,
"epoch": 0.5740278044717791,
"step": 640
},
{
"loss": 360.8845,
"grad_norm": 46.01215362548828,
"learning_rate": 0.00019133021069409872,
"epoch": 0.5749247229162663,
"step": 641
},
{
"loss": 361.9622,
"grad_norm": 46.09065628051758,
"learning_rate": 0.00019064510894289705,
"epoch": 0.5758216413607534,
"step": 642
},
{
"loss": 363.2684,
"grad_norm": 45.370140075683594,
"learning_rate": 0.00018996047923907166,
"epoch": 0.5767185598052406,
"step": 643
},
{
"loss": 362.285,
"grad_norm": 43.416664123535156,
"learning_rate": 0.00018927632702745866,
"epoch": 0.5776154782497277,
"step": 644
},
{
"loss": 360.188,
"grad_norm": 44.63084030151367,
"learning_rate": 0.00018859265774909668,
"epoch": 0.5785123966942148,
"step": 645
},
{
"loss": 362.1082,
"grad_norm": 43.95875930786133,
"learning_rate": 0.00018790947684118364,
"epoch": 0.5794093151387021,
"step": 646
},
{
"loss": 364.6595,
"grad_norm": 46.196041107177734,
"learning_rate": 0.00018722678973703355,
"epoch": 0.5803062335831892,
"step": 647
},
{
"loss": 367.5318,
"grad_norm": 52.50529479980469,
"learning_rate": 0.00018654460186603295,
"epoch": 0.5812031520276764,
"step": 648
},
{
"loss": 364.7477,
"grad_norm": 44.10645294189453,
"learning_rate": 0.00018586291865359822,
"epoch": 0.5821000704721635,
"step": 649
},
{
"loss": 362.5089,
"grad_norm": 42.808326721191406,
"learning_rate": 0.00018518174552113216,
"epoch": 0.5829969889166506,
"step": 650
},
{
"eval_loss": 1.6019372940063477,
"eval_runtime": 17.6903,
"eval_samples_per_second": 115.769,
"eval_steps_per_second": 14.471,
"epoch": 0.5829969889166506,
"step": 650
},
{
"loss": 361.447,
"grad_norm": 45.0283088684082,
"learning_rate": 0.0001845010878859809,
"epoch": 0.5838939073611378,
"step": 651
},
{
"loss": 363.9907,
"grad_norm": 45.77663040161133,
"learning_rate": 0.00018382095116139098,
"epoch": 0.5847908258056249,
"step": 652
},
{
"loss": 358.2193,
"grad_norm": 47.19649124145508,
"learning_rate": 0.00018314134075646582,
"epoch": 0.5856877442501122,
"step": 653
},
{
"loss": 362.618,
"grad_norm": 45.46641540527344,
"learning_rate": 0.00018246226207612338,
"epoch": 0.5865846626945993,
"step": 654
},
{
"loss": 364.6533,
"grad_norm": 45.993873596191406,
"learning_rate": 0.00018178372052105263,
"epoch": 0.5874815811390864,
"step": 655
},
{
"loss": 359.9103,
"grad_norm": 49.62721252441406,
"learning_rate": 0.00018110572148767089,
"epoch": 0.5883784995835736,
"step": 656
},
{
"loss": 362.929,
"grad_norm": 47.14739227294922,
"learning_rate": 0.00018042827036808074,
"epoch": 0.5892754180280607,
"step": 657
},
{
"loss": 364.1747,
"grad_norm": 46.9727897644043,
"learning_rate": 0.00017975137255002744,
"epoch": 0.5901723364725479,
"step": 658
},
{
"loss": 362.2029,
"grad_norm": 45.876277923583984,
"learning_rate": 0.0001790750334168555,
"epoch": 0.591069254917035,
"step": 659
},
{
"loss": 359.2526,
"grad_norm": 42.93642807006836,
"learning_rate": 0.00017839925834746653,
"epoch": 0.5919661733615222,
"step": 660
},
{
"loss": 363.6162,
"grad_norm": 41.57487487792969,
"learning_rate": 0.0001777240527162761,
"epoch": 0.5928630918060094,
"step": 661
},
{
"loss": 361.9038,
"grad_norm": 46.25205993652344,
"learning_rate": 0.00017704942189317104,
"epoch": 0.5937600102504965,
"step": 662
},
{
"loss": 358.8016,
"grad_norm": 45.354007720947266,
"learning_rate": 0.0001763753712434666,
"epoch": 0.5946569286949837,
"step": 663
},
{
"loss": 361.5577,
"grad_norm": 42.980037689208984,
"learning_rate": 0.00017570190612786413,
"epoch": 0.5955538471394708,
"step": 664
},
{
"loss": 361.3445,
"grad_norm": 44.7468147277832,
"learning_rate": 0.00017502903190240815,
"epoch": 0.5964507655839579,
"step": 665
},
{
"loss": 360.489,
"grad_norm": 43.96569061279297,
"learning_rate": 0.00017435675391844397,
"epoch": 0.5973476840284452,
"step": 666
},
{
"loss": 365.539,
"grad_norm": 45.040103912353516,
"learning_rate": 0.00017368507752257495,
"epoch": 0.5982446024729323,
"step": 667
},
{
"loss": 363.3497,
"grad_norm": 45.93570327758789,
"learning_rate": 0.00017301400805661989,
"epoch": 0.5991415209174195,
"step": 668
},
{
"loss": 356.2852,
"grad_norm": 41.94508743286133,
"learning_rate": 0.00017234355085757086,
"epoch": 0.6000384393619066,
"step": 669
},
{
"loss": 364.3321,
"grad_norm": 40.20936584472656,
"learning_rate": 0.00017167371125755064,
"epoch": 0.6009353578063937,
"step": 670
},
{
"loss": 365.0333,
"grad_norm": 42.29598617553711,
"learning_rate": 0.00017100449458377003,
"epoch": 0.6018322762508809,
"step": 671
},
{
"loss": 356.7194,
"grad_norm": 41.43622589111328,
"learning_rate": 0.00017033590615848598,
"epoch": 0.602729194695368,
"step": 672
},
{
"loss": 362.7276,
"grad_norm": 44.03760528564453,
"learning_rate": 0.0001696679512989589,
"epoch": 0.6036261131398553,
"step": 673
},
{
"loss": 359.1711,
"grad_norm": 39.68849182128906,
"learning_rate": 0.00016900063531741048,
"epoch": 0.6045230315843424,
"step": 674
},
{
"loss": 357.2,
"grad_norm": 40.92485809326172,
"learning_rate": 0.0001683339635209813,
"epoch": 0.6054199500288295,
"step": 675
},
{
"loss": 362.3214,
"grad_norm": 41.29072189331055,
"learning_rate": 0.000167667941211689,
"epoch": 0.6063168684733167,
"step": 676
},
{
"loss": 361.0124,
"grad_norm": 41.026676177978516,
"learning_rate": 0.00016700257368638572,
"epoch": 0.6072137869178038,
"step": 677
},
{
"loss": 360.2582,
"grad_norm": 43.93520736694336,
"learning_rate": 0.0001663378662367161,
"epoch": 0.608110705362291,
"step": 678
},
{
"loss": 358.0945,
"grad_norm": 43.4892578125,
"learning_rate": 0.00016567382414907532,
"epoch": 0.6090076238067781,
"step": 679
},
{
"loss": 360.7998,
"grad_norm": 43.67966842651367,
"learning_rate": 0.00016501045270456694,
"epoch": 0.6099045422512653,
"step": 680
},
{
"loss": 359.6815,
"grad_norm": 42.92584991455078,
"learning_rate": 0.0001643477571789609,
"epoch": 0.6108014606957525,
"step": 681
},
{
"loss": 361.6625,
"grad_norm": 42.53407287597656,
"learning_rate": 0.00016368574284265165,
"epoch": 0.6116983791402396,
"step": 682
},
{
"loss": 363.5579,
"grad_norm": 41.2686767578125,
"learning_rate": 0.00016302441496061592,
"epoch": 0.6125952975847268,
"step": 683
},
{
"loss": 360.9108,
"grad_norm": 42.09267044067383,
"learning_rate": 0.00016236377879237136,
"epoch": 0.6134922160292139,
"step": 684
},
{
"loss": 360.2266,
"grad_norm": 42.135650634765625,
"learning_rate": 0.0001617038395919344,
"epoch": 0.614389134473701,
"step": 685
},
{
"loss": 355.2124,
"grad_norm": 41.78007888793945,
"learning_rate": 0.00016104460260777837,
"epoch": 0.6152860529181883,
"step": 686
},
{
"loss": 357.8339,
"grad_norm": 41.49577713012695,
"learning_rate": 0.00016038607308279198,
"epoch": 0.6161829713626754,
"step": 687
},
{
"loss": 361.7785,
"grad_norm": 47.102848052978516,
"learning_rate": 0.00015972825625423765,
"epoch": 0.6170798898071626,
"step": 688
},
{
"loss": 357.3535,
"grad_norm": 41.43706512451172,
"learning_rate": 0.0001590711573537096,
"epoch": 0.6179768082516497,
"step": 689
},
{
"loss": 359.8207,
"grad_norm": 40.92182540893555,
"learning_rate": 0.00015841478160709242,
"epoch": 0.6188737266961368,
"step": 690
},
{
"loss": 358.1373,
"grad_norm": 49.461273193359375,
"learning_rate": 0.0001577591342345195,
"epoch": 0.619770645140624,
"step": 691
},
{
"loss": 361.2856,
"grad_norm": 50.03120040893555,
"learning_rate": 0.00015710422045033158,
"epoch": 0.6206675635851111,
"step": 692
},
{
"loss": 359.0531,
"grad_norm": 43.81147003173828,
"learning_rate": 0.00015645004546303493,
"epoch": 0.6215644820295984,
"step": 693
},
{
"loss": 357.6739,
"grad_norm": 44.85881042480469,
"learning_rate": 0.00015579661447526067,
"epoch": 0.6224614004740855,
"step": 694
},
{
"loss": 358.5413,
"grad_norm": 45.34134292602539,
"learning_rate": 0.00015514393268372247,
"epoch": 0.6233583189185726,
"step": 695
},
{
"loss": 362.4291,
"grad_norm": 44.94168472290039,
"learning_rate": 0.00015449200527917578,
"epoch": 0.6242552373630598,
"step": 696
},
{
"loss": 353.4212,
"grad_norm": 43.28814697265625,
"learning_rate": 0.00015384083744637663,
"epoch": 0.6251521558075469,
"step": 697
},
{
"loss": 361.8906,
"grad_norm": 42.88665008544922,
"learning_rate": 0.00015319043436403992,
"epoch": 0.626049074252034,
"step": 698
},
{
"loss": 357.3509,
"grad_norm": 46.005001068115234,
"learning_rate": 0.00015254080120479874,
"epoch": 0.6269459926965213,
"step": 699
},
{
"loss": 356.4296,
"grad_norm": 44.4104118347168,
"learning_rate": 0.00015189194313516288,
"epoch": 0.6278429111410084,
"step": 700
},
{
"eval_loss": 1.597915768623352,
"eval_runtime": 17.571,
"eval_samples_per_second": 116.555,
"eval_steps_per_second": 14.569,
"epoch": 0.6278429111410084,
"step": 700
},
{
"loss": 358.631,
"grad_norm": 43.341407775878906,
"learning_rate": 0.000151243865315478,
"epoch": 0.6287398295854956,
"step": 701
},
{
"loss": 361.772,
"grad_norm": 43.18885803222656,
"learning_rate": 0.00015059657289988426,
"epoch": 0.6296367480299827,
"step": 702
},
{
"loss": 359.0464,
"grad_norm": 41.106483459472656,
"learning_rate": 0.00014995007103627567,
"epoch": 0.6305336664744698,
"step": 703
},
{
"loss": 358.0773,
"grad_norm": 42.815834045410156,
"learning_rate": 0.00014930436486625907,
"epoch": 0.631430584918957,
"step": 704
},
{
"loss": 358.7279,
"grad_norm": 39.7459602355957,
"learning_rate": 0.00014865945952511296,
"epoch": 0.6323275033634441,
"step": 705
},
{
"loss": 358.3263,
"grad_norm": 42.54743576049805,
"learning_rate": 0.00014801536014174706,
"epoch": 0.6332244218079314,
"step": 706
},
{
"loss": 365.4639,
"grad_norm": 45.69781494140625,
"learning_rate": 0.00014737207183866118,
"epoch": 0.6341213402524185,
"step": 707
},
{
"loss": 357.4766,
"grad_norm": 44.834136962890625,
"learning_rate": 0.0001467295997319049,
"epoch": 0.6350182586969056,
"step": 708
},
{
"loss": 361.5132,
"grad_norm": 40.79405975341797,
"learning_rate": 0.00014608794893103646,
"epoch": 0.6359151771413928,
"step": 709
},
{
"loss": 361.108,
"grad_norm": 40.1624870300293,
"learning_rate": 0.00014544712453908216,
"epoch": 0.6368120955858799,
"step": 710
},
{
"loss": 357.4099,
"grad_norm": 42.602073669433594,
"learning_rate": 0.00014480713165249609,
"epoch": 0.6377090140303671,
"step": 711
},
{
"loss": 360.979,
"grad_norm": 43.97264099121094,
"learning_rate": 0.00014416797536111919,
"epoch": 0.6386059324748542,
"step": 712
},
{
"loss": 361.3081,
"grad_norm": 40.94137191772461,
"learning_rate": 0.00014352966074813932,
"epoch": 0.6395028509193414,
"step": 713
},
{
"loss": 359.9567,
"grad_norm": 40.18381881713867,
"learning_rate": 0.00014289219289005027,
"epoch": 0.6403997693638286,
"step": 714
},
{
"loss": 353.732,
"grad_norm": 45.907203674316406,
"learning_rate": 0.0001422555768566115,
"epoch": 0.6412966878083157,
"step": 715
},
{
"loss": 358.1761,
"grad_norm": 46.9672737121582,
"learning_rate": 0.0001416198177108083,
"epoch": 0.6421936062528029,
"step": 716
},
{
"loss": 358.2166,
"grad_norm": 40.92546081542969,
"learning_rate": 0.0001409849205088109,
"epoch": 0.64309052469729,
"step": 717
},
{
"loss": 358.0281,
"grad_norm": 39.04634475708008,
"learning_rate": 0.00014035089029993444,
"epoch": 0.6439874431417771,
"step": 718
},
{
"loss": 358.9151,
"grad_norm": 41.55719757080078,
"learning_rate": 0.00013971773212659929,
"epoch": 0.6448843615862644,
"step": 719
},
{
"loss": 356.5345,
"grad_norm": 41.81498336791992,
"learning_rate": 0.00013908545102429,
"epoch": 0.6457812800307515,
"step": 720
},
{
"loss": 358.3629,
"grad_norm": 40.042484283447266,
"learning_rate": 0.00013845405202151637,
"epoch": 0.6466781984752387,
"step": 721
},
{
"loss": 360.9086,
"grad_norm": 44.207122802734375,
"learning_rate": 0.00013782354013977245,
"epoch": 0.6475751169197258,
"step": 722
},
{
"loss": 357.7452,
"grad_norm": 45.20026779174805,
"learning_rate": 0.00013719392039349734,
"epoch": 0.6484720353642129,
"step": 723
},
{
"loss": 358.4982,
"grad_norm": 41.07488250732422,
"learning_rate": 0.00013656519779003476,
"epoch": 0.6493689538087001,
"step": 724
},
{
"loss": 361.3215,
"grad_norm": 43.69713592529297,
"learning_rate": 0.00013593737732959382,
"epoch": 0.6502658722531872,
"step": 725
},
{
"loss": 356.6879,
"grad_norm": 45.356109619140625,
"learning_rate": 0.00013531046400520858,
"epoch": 0.6511627906976745,
"step": 726
},
{
"loss": 363.6577,
"grad_norm": 44.325103759765625,
"learning_rate": 0.0001346844628026988,
"epoch": 0.6520597091421616,
"step": 727
},
{
"loss": 358.3399,
"grad_norm": 40.79582595825195,
"learning_rate": 0.0001340593787006303,
"epoch": 0.6529566275866487,
"step": 728
},
{
"loss": 360.8162,
"grad_norm": 40.47697448730469,
"learning_rate": 0.0001334352166702751,
"epoch": 0.6538535460311359,
"step": 729
},
{
"loss": 356.254,
"grad_norm": 43.549407958984375,
"learning_rate": 0.00013281198167557185,
"epoch": 0.654750464475623,
"step": 730
},
{
"loss": 356.3695,
"grad_norm": 41.08717727661133,
"learning_rate": 0.00013218967867308694,
"epoch": 0.6556473829201102,
"step": 731
},
{
"loss": 359.2961,
"grad_norm": 44.06740951538086,
"learning_rate": 0.00013156831261197438,
"epoch": 0.6565443013645973,
"step": 732
},
{
"loss": 354.8276,
"grad_norm": 44.14928436279297,
"learning_rate": 0.00013094788843393657,
"epoch": 0.6574412198090845,
"step": 733
},
{
"loss": 356.655,
"grad_norm": 41.25139236450195,
"learning_rate": 0.0001303284110731856,
"epoch": 0.6583381382535717,
"step": 734
},
{
"loss": 359.9945,
"grad_norm": 43.141475677490234,
"learning_rate": 0.00012970988545640307,
"epoch": 0.6592350566980588,
"step": 735
},
{
"loss": 354.7369,
"grad_norm": 45.27100372314453,
"learning_rate": 0.0001290923165027017,
"epoch": 0.660131975142546,
"step": 736
},
{
"loss": 357.4191,
"grad_norm": 41.795658111572266,
"learning_rate": 0.0001284757091235859,
"epoch": 0.6610288935870331,
"step": 737
},
{
"loss": 353.508,
"grad_norm": 43.1330680847168,
"learning_rate": 0.0001278600682229126,
"epoch": 0.6619258120315202,
"step": 738
},
{
"loss": 356.3365,
"grad_norm": 43.488121032714844,
"learning_rate": 0.00012724539869685226,
"epoch": 0.6628227304760075,
"step": 739
},
{
"loss": 357.6046,
"grad_norm": 42.182777404785156,
"learning_rate": 0.0001266317054338503,
"epoch": 0.6637196489204946,
"step": 740
},
{
"loss": 358.7371,
"grad_norm": 43.06134796142578,
"learning_rate": 0.00012601899331458777,
"epoch": 0.6646165673649818,
"step": 741
},
{
"loss": 358.2452,
"grad_norm": 40.01738357543945,
"learning_rate": 0.00012540726721194266,
"epoch": 0.6655134858094689,
"step": 742
},
{
"loss": 361.5233,
"grad_norm": 40.66733169555664,
"learning_rate": 0.0001247965319909515,
"epoch": 0.666410404253956,
"step": 743
},
{
"loss": 354.1553,
"grad_norm": 39.47666931152344,
"learning_rate": 0.0001241867925087701,
"epoch": 0.6673073226984432,
"step": 744
},
{
"loss": 358.3203,
"grad_norm": 39.22403335571289,
"learning_rate": 0.00012357805361463514,
"epoch": 0.6682042411429303,
"step": 745
},
{
"loss": 357.0617,
"grad_norm": 39.071529388427734,
"learning_rate": 0.00012297032014982597,
"epoch": 0.6691011595874176,
"step": 746
},
{
"loss": 362.905,
"grad_norm": 40.75625228881836,
"learning_rate": 0.0001223635969476255,
"epoch": 0.6699980780319047,
"step": 747
},
{
"loss": 354.9351,
"grad_norm": 42.89009094238281,
"learning_rate": 0.00012175788883328232,
"epoch": 0.6708949964763918,
"step": 748
},
{
"loss": 359.415,
"grad_norm": 43.072513580322266,
"learning_rate": 0.0001211532006239718,
"epoch": 0.671791914920879,
"step": 749
},
{
"loss": 357.7546,
"grad_norm": 40.25785446166992,
"learning_rate": 0.00012054953712875807,
"epoch": 0.6726888333653661,
"step": 750
},
{
"eval_loss": 1.609327793121338,
"eval_runtime": 17.5285,
"eval_samples_per_second": 116.839,
"eval_steps_per_second": 14.605,
"epoch": 0.6726888333653661,
"step": 750
},
{
"loss": 357.2794,
"grad_norm": 41.602596282958984,
"learning_rate": 0.00011994690314855598,
"epoch": 0.6735857518098533,
"step": 751
},
{
"loss": 361.091,
"grad_norm": 41.749717712402344,
"learning_rate": 0.00011934530347609257,
"epoch": 0.6744826702543405,
"step": 752
},
{
"loss": 362.0817,
"grad_norm": 39.51606369018555,
"learning_rate": 0.00011874474289586895,
"epoch": 0.6753795886988276,
"step": 753
},
{
"loss": 356.8317,
"grad_norm": 40.00758743286133,
"learning_rate": 0.00011814522618412235,
"epoch": 0.6762765071433148,
"step": 754
},
{
"loss": 359.7722,
"grad_norm": 41.676292419433594,
"learning_rate": 0.00011754675810878845,
"epoch": 0.6771734255878019,
"step": 755
},
{
"loss": 359.641,
"grad_norm": 41.25587463378906,
"learning_rate": 0.00011694934342946287,
"epoch": 0.678070344032289,
"step": 756
},
{
"loss": 352.955,
"grad_norm": 40.348514556884766,
"learning_rate": 0.00011635298689736357,
"epoch": 0.6789672624767762,
"step": 757
},
{
"loss": 362.8987,
"grad_norm": 43.387184143066406,
"learning_rate": 0.00011575769325529342,
"epoch": 0.6798641809212633,
"step": 758
},
{
"loss": 357.0482,
"grad_norm": 40.06668472290039,
"learning_rate": 0.00011516346723760193,
"epoch": 0.6807610993657506,
"step": 759
},
{
"loss": 359.7377,
"grad_norm": 39.39516830444336,
"learning_rate": 0.00011457031357014772,
"epoch": 0.6816580178102377,
"step": 760
},
{
"loss": 362.0869,
"grad_norm": 39.07398223876953,
"learning_rate": 0.0001139782369702614,
"epoch": 0.6825549362547249,
"step": 761
},
{
"loss": 357.4482,
"grad_norm": 42.54057312011719,
"learning_rate": 0.00011338724214670734,
"epoch": 0.683451854699212,
"step": 762
},
{
"loss": 360.6057,
"grad_norm": 40.7839241027832,
"learning_rate": 0.00011279733379964691,
"epoch": 0.6843487731436991,
"step": 763
},
{
"loss": 362.9106,
"grad_norm": 41.402889251708984,
"learning_rate": 0.00011220851662060047,
"epoch": 0.6852456915881863,
"step": 764
},
{
"loss": 357.1811,
"grad_norm": 41.3732795715332,
"learning_rate": 0.00011162079529241042,
"epoch": 0.6861426100326734,
"step": 765
},
{
"loss": 358.0857,
"grad_norm": 42.31522750854492,
"learning_rate": 0.00011103417448920406,
"epoch": 0.6870395284771607,
"step": 766
},
{
"loss": 357.946,
"grad_norm": 38.36897277832031,
"learning_rate": 0.00011044865887635625,
"epoch": 0.6879364469216478,
"step": 767
},
{
"loss": 360.9647,
"grad_norm": 43.01420974731445,
"learning_rate": 0.00010986425311045212,
"epoch": 0.6888333653661349,
"step": 768
},
{
"loss": 362.1032,
"grad_norm": 40.731163024902344,
"learning_rate": 0.00010928096183925024,
"epoch": 0.6897302838106221,
"step": 769
},
{
"loss": 363.3222,
"grad_norm": 41.69025421142578,
"learning_rate": 0.00010869878970164587,
"epoch": 0.6906272022551092,
"step": 770
},
{
"loss": 358.3542,
"grad_norm": 37.463043212890625,
"learning_rate": 0.00010811774132763366,
"epoch": 0.6915241206995963,
"step": 771
},
{
"loss": 364.5648,
"grad_norm": 38.481815338134766,
"learning_rate": 0.00010753782133827093,
"epoch": 0.6924210391440836,
"step": 772
},
{
"loss": 361.0055,
"grad_norm": 39.70282745361328,
"learning_rate": 0.00010695903434564124,
"epoch": 0.6933179575885707,
"step": 773
},
{
"loss": 359.3154,
"grad_norm": 38.182132720947266,
"learning_rate": 0.00010638138495281725,
"epoch": 0.6942148760330579,
"step": 774
},
{
"loss": 356.322,
"grad_norm": 37.12331008911133,
"learning_rate": 0.00010580487775382449,
"epoch": 0.695111794477545,
"step": 775
},
{
"loss": 356.3972,
"grad_norm": 40.065006256103516,
"learning_rate": 0.00010522951733360456,
"epoch": 0.6960087129220321,
"step": 776
},
{
"loss": 351.4366,
"grad_norm": 40.21229553222656,
"learning_rate": 0.0001046553082679787,
"epoch": 0.6969056313665193,
"step": 777
},
{
"loss": 356.3872,
"grad_norm": 39.17121124267578,
"learning_rate": 0.00010408225512361171,
"epoch": 0.6978025498110064,
"step": 778
},
{
"loss": 358.5863,
"grad_norm": 38.62257766723633,
"learning_rate": 0.0001035103624579751,
"epoch": 0.6986994682554937,
"step": 779
},
{
"loss": 359.1902,
"grad_norm": 39.73896408081055,
"learning_rate": 0.00010293963481931143,
"epoch": 0.6995963866999808,
"step": 780
},
{
"loss": 357.0757,
"grad_norm": 38.72207260131836,
"learning_rate": 0.00010237007674659752,
"epoch": 0.700493305144468,
"step": 781
},
{
"loss": 359.07,
"grad_norm": 39.15367126464844,
"learning_rate": 0.00010180169276950899,
"epoch": 0.7013902235889551,
"step": 782
},
{
"loss": 357.7226,
"grad_norm": 39.2513542175293,
"learning_rate": 0.00010123448740838367,
"epoch": 0.7022871420334422,
"step": 783
},
{
"loss": 359.4571,
"grad_norm": 41.660953521728516,
"learning_rate": 0.00010066846517418596,
"epoch": 0.7031840604779294,
"step": 784
},
{
"loss": 358.3033,
"grad_norm": 40.074806213378906,
"learning_rate": 0.00010010363056847103,
"epoch": 0.7040809789224165,
"step": 785
},
{
"loss": 358.5859,
"grad_norm": 40.53306198120117,
"learning_rate": 9.953998808334874e-05,
"epoch": 0.7049778973669037,
"step": 786
},
{
"loss": 353.3639,
"grad_norm": 43.58430099487305,
"learning_rate": 9.8977542201448e-05,
"epoch": 0.7058748158113909,
"step": 787
},
{
"loss": 359.5676,
"grad_norm": 39.986785888671875,
"learning_rate": 9.841629739588145e-05,
"epoch": 0.706771734255878,
"step": 788
},
{
"loss": 361.0522,
"grad_norm": 41.356590270996094,
"learning_rate": 9.785625813020923e-05,
"epoch": 0.7076686527003652,
"step": 789
},
{
"loss": 355.244,
"grad_norm": 40.596397399902344,
"learning_rate": 9.729742885840429e-05,
"epoch": 0.7085655711448523,
"step": 790
},
{
"loss": 358.6471,
"grad_norm": 39.8510627746582,
"learning_rate": 9.673981402481619e-05,
"epoch": 0.7094624895893394,
"step": 791
},
{
"loss": 355.7997,
"grad_norm": 37.443397521972656,
"learning_rate": 9.618341806413614e-05,
"epoch": 0.7103594080338267,
"step": 792
},
{
"loss": 358.5055,
"grad_norm": 38.937034606933594,
"learning_rate": 9.562824540136192e-05,
"epoch": 0.7112563264783138,
"step": 793
},
{
"loss": 357.9367,
"grad_norm": 39.378326416015625,
"learning_rate": 9.507430045176238e-05,
"epoch": 0.712153244922801,
"step": 794
},
{
"loss": 356.7012,
"grad_norm": 40.44821548461914,
"learning_rate": 9.452158762084228e-05,
"epoch": 0.7130501633672881,
"step": 795
},
{
"loss": 361.7253,
"grad_norm": 39.721378326416016,
"learning_rate": 9.397011130430741e-05,
"epoch": 0.7139470818117752,
"step": 796
},
{
"loss": 359.5762,
"grad_norm": 40.48420333862305,
"learning_rate": 9.341987588802984e-05,
"epoch": 0.7148440002562624,
"step": 797
},
{
"loss": 355.1304,
"grad_norm": 38.8956413269043,
"learning_rate": 9.287088574801248e-05,
"epoch": 0.7157409187007495,
"step": 798
},
{
"loss": 360.5678,
"grad_norm": 41.26605987548828,
"learning_rate": 9.23231452503547e-05,
"epoch": 0.7166378371452368,
"step": 799
},
{
"loss": 359.8319,
"grad_norm": 36.14881134033203,
"learning_rate": 9.177665875121774e-05,
"epoch": 0.7175347555897239,
"step": 800
},
{
"eval_loss": 1.5968618392944336,
"eval_runtime": 17.8479,
"eval_samples_per_second": 114.747,
"eval_steps_per_second": 14.343,
"epoch": 0.7175347555897239,
"step": 800
},
{
"loss": 361.1777,
"grad_norm": 40.25320053100586,
"learning_rate": 9.123143059678952e-05,
"epoch": 0.718431674034211,
"step": 801
},
{
"loss": 355.5561,
"grad_norm": 39.248783111572266,
"learning_rate": 9.068746512325046e-05,
"epoch": 0.7193285924786982,
"step": 802
},
{
"loss": 353.493,
"grad_norm": 41.21136474609375,
"learning_rate": 9.014476665673915e-05,
"epoch": 0.7202255109231853,
"step": 803
},
{
"loss": 355.8681,
"grad_norm": 38.923973083496094,
"learning_rate": 8.960333951331739e-05,
"epoch": 0.7211224293676725,
"step": 804
},
{
"loss": 355.0969,
"grad_norm": 43.01164627075195,
"learning_rate": 8.906318799893648e-05,
"epoch": 0.7220193478121597,
"step": 805
},
{
"loss": 354.1833,
"grad_norm": 39.02459716796875,
"learning_rate": 8.852431640940247e-05,
"epoch": 0.7229162662566468,
"step": 806
},
{
"loss": 359.125,
"grad_norm": 37.63704299926758,
"learning_rate": 8.798672903034225e-05,
"epoch": 0.723813184701134,
"step": 807
},
{
"loss": 355.6418,
"grad_norm": 38.401512145996094,
"learning_rate": 8.745043013716955e-05,
"epoch": 0.7247101031456211,
"step": 808
},
{
"loss": 358.6194,
"grad_norm": 37.391685485839844,
"learning_rate": 8.691542399505081e-05,
"epoch": 0.7256070215901083,
"step": 809
},
{
"loss": 359.1611,
"grad_norm": 40.48008728027344,
"learning_rate": 8.638171485887111e-05,
"epoch": 0.7265039400345954,
"step": 810
},
{
"loss": 359.4613,
"grad_norm": 40.47174835205078,
"learning_rate": 8.584930697320053e-05,
"epoch": 0.7274008584790825,
"step": 811
},
{
"loss": 351.1801,
"grad_norm": 39.59210968017578,
"learning_rate": 8.531820457226055e-05,
"epoch": 0.7282977769235698,
"step": 812
},
{
"loss": 355.662,
"grad_norm": 36.89620590209961,
"learning_rate": 8.478841187988992e-05,
"epoch": 0.7291946953680569,
"step": 813
},
{
"loss": 361.7194,
"grad_norm": 38.956214904785156,
"learning_rate": 8.425993310951132e-05,
"epoch": 0.7300916138125441,
"step": 814
},
{
"loss": 359.9547,
"grad_norm": 36.15619659423828,
"learning_rate": 8.373277246409818e-05,
"epoch": 0.7309885322570312,
"step": 815
},
{
"loss": 353.2803,
"grad_norm": 41.085899353027344,
"learning_rate": 8.320693413614053e-05,
"epoch": 0.7318854507015183,
"step": 816
},
{
"loss": 356.6743,
"grad_norm": 40.31721878051758,
"learning_rate": 8.268242230761239e-05,
"epoch": 0.7327823691460055,
"step": 817
},
{
"loss": 356.205,
"grad_norm": 41.351558685302734,
"learning_rate": 8.215924114993792e-05,
"epoch": 0.7336792875904926,
"step": 818
},
{
"loss": 360.4526,
"grad_norm": 39.119476318359375,
"learning_rate": 8.163739482395851e-05,
"epoch": 0.7345762060349799,
"step": 819
},
{
"loss": 361.5057,
"grad_norm": 38.80229949951172,
"learning_rate": 8.111688747990001e-05,
"epoch": 0.735473124479467,
"step": 820
},
{
"loss": 352.7518,
"grad_norm": 40.22185134887695,
"learning_rate": 8.059772325733899e-05,
"epoch": 0.7363700429239541,
"step": 821
},
{
"loss": 356.2066,
"grad_norm": 40.426979064941406,
"learning_rate": 8.007990628517034e-05,
"epoch": 0.7372669613684413,
"step": 822
},
{
"loss": 358.5974,
"grad_norm": 39.50589370727539,
"learning_rate": 7.956344068157443e-05,
"epoch": 0.7381638798129284,
"step": 823
},
{
"loss": 360.1032,
"grad_norm": 38.537113189697266,
"learning_rate": 7.904833055398428e-05,
"epoch": 0.7390607982574156,
"step": 824
},
{
"loss": 358.6521,
"grad_norm": 38.09297180175781,
"learning_rate": 7.853457999905264e-05,
"epoch": 0.7399577167019028,
"step": 825
},
{
"loss": 358.724,
"grad_norm": 38.27792739868164,
"learning_rate": 7.802219310261965e-05,
"epoch": 0.7408546351463899,
"step": 826
},
{
"loss": 361.0538,
"grad_norm": 40.946353912353516,
"learning_rate": 7.75111739396806e-05,
"epoch": 0.7417515535908771,
"step": 827
},
{
"loss": 354.2574,
"grad_norm": 37.80830764770508,
"learning_rate": 7.700152657435297e-05,
"epoch": 0.7426484720353642,
"step": 828
},
{
"loss": 356.4567,
"grad_norm": 39.698429107666016,
"learning_rate": 7.649325505984434e-05,
"epoch": 0.7435453904798514,
"step": 829
},
{
"loss": 355.0162,
"grad_norm": 38.21966552734375,
"learning_rate": 7.598636343842053e-05,
"epoch": 0.7444423089243385,
"step": 830
},
{
"loss": 356.4822,
"grad_norm": 39.37642288208008,
"learning_rate": 7.548085574137273e-05,
"epoch": 0.7453392273688256,
"step": 831
},
{
"loss": 357.8192,
"grad_norm": 37.3087158203125,
"learning_rate": 7.497673598898613e-05,
"epoch": 0.7462361458133129,
"step": 832
},
{
"loss": 363.7517,
"grad_norm": 35.9515266418457,
"learning_rate": 7.447400819050751e-05,
"epoch": 0.7471330642578,
"step": 833
},
{
"loss": 355.3728,
"grad_norm": 36.964534759521484,
"learning_rate": 7.397267634411337e-05,
"epoch": 0.7480299827022872,
"step": 834
},
{
"loss": 354.5074,
"grad_norm": 39.167415618896484,
"learning_rate": 7.347274443687855e-05,
"epoch": 0.7489269011467743,
"step": 835
},
{
"loss": 361.1248,
"grad_norm": 40.1679801940918,
"learning_rate": 7.297421644474387e-05,
"epoch": 0.7498238195912614,
"step": 836
},
{
"loss": 357.9431,
"grad_norm": 38.67217254638672,
"learning_rate": 7.247709633248526e-05,
"epoch": 0.7507207380357486,
"step": 837
},
{
"loss": 360.9297,
"grad_norm": 37.734153747558594,
"learning_rate": 7.198138805368143e-05,
"epoch": 0.7516176564802357,
"step": 838
},
{
"loss": 350.7899,
"grad_norm": 36.58796691894531,
"learning_rate": 7.148709555068314e-05,
"epoch": 0.752514574924723,
"step": 839
},
{
"loss": 358.5099,
"grad_norm": 37.6004753112793,
"learning_rate": 7.09942227545814e-05,
"epoch": 0.7534114933692101,
"step": 840
},
{
"loss": 350.2813,
"grad_norm": 39.31602096557617,
"learning_rate": 7.05027735851762e-05,
"epoch": 0.7543084118136972,
"step": 841
},
{
"loss": 361.4473,
"grad_norm": 37.72463607788086,
"learning_rate": 7.001275195094581e-05,
"epoch": 0.7552053302581844,
"step": 842
},
{
"loss": 356.7912,
"grad_norm": 36.68344497680664,
"learning_rate": 6.952416174901504e-05,
"epoch": 0.7561022487026715,
"step": 843
},
{
"loss": 360.7002,
"grad_norm": 39.82998275756836,
"learning_rate": 6.903700686512485e-05,
"epoch": 0.7569991671471586,
"step": 844
},
{
"loss": 357.1058,
"grad_norm": 39.26710510253906,
"learning_rate": 6.855129117360095e-05,
"epoch": 0.7578960855916459,
"step": 845
},
{
"loss": 356.4349,
"grad_norm": 37.95897674560547,
"learning_rate": 6.806701853732319e-05,
"epoch": 0.758793004036133,
"step": 846
},
{
"loss": 353.9336,
"grad_norm": 36.72467041015625,
"learning_rate": 6.75841928076951e-05,
"epoch": 0.7596899224806202,
"step": 847
},
{
"loss": 355.9283,
"grad_norm": 38.29819869995117,
"learning_rate": 6.710281782461275e-05,
"epoch": 0.7605868409251073,
"step": 848
},
{
"loss": 357.5876,
"grad_norm": 39.196720123291016,
"learning_rate": 6.662289741643454e-05,
"epoch": 0.7614837593695944,
"step": 849
},
{
"loss": 359.8077,
"grad_norm": 40.00128936767578,
"learning_rate": 6.614443539995078e-05,
"epoch": 0.7623806778140816,
"step": 850
},
{
"eval_loss": 1.582360863685608,
"eval_runtime": 18.4592,
"eval_samples_per_second": 110.947,
"eval_steps_per_second": 13.868,
"epoch": 0.7623806778140816,
"step": 850
},
{
"loss": 355.6048,
"grad_norm": 38.59453582763672,
"learning_rate": 6.56674355803532e-05,
"epoch": 0.7632775962585687,
"step": 851
},
{
"loss": 360.1093,
"grad_norm": 39.37229537963867,
"learning_rate": 6.519190175120473e-05,
"epoch": 0.764174514703056,
"step": 852
},
{
"loss": 357.6195,
"grad_norm": 36.07246017456055,
"learning_rate": 6.47178376944092e-05,
"epoch": 0.7650714331475431,
"step": 853
},
{
"loss": 357.4596,
"grad_norm": 36.77618408203125,
"learning_rate": 6.424524718018163e-05,
"epoch": 0.7659683515920302,
"step": 854
},
{
"loss": 359.593,
"grad_norm": 36.766483306884766,
"learning_rate": 6.377413396701781e-05,
"epoch": 0.7668652700365174,
"step": 855
},
{
"loss": 356.4777,
"grad_norm": 43.47877502441406,
"learning_rate": 6.330450180166464e-05,
"epoch": 0.7677621884810045,
"step": 856
},
{
"loss": 353.8591,
"grad_norm": 39.65815353393555,
"learning_rate": 6.283635441909044e-05,
"epoch": 0.7686591069254917,
"step": 857
},
{
"loss": 358.9107,
"grad_norm": 42.22090148925781,
"learning_rate": 6.236969554245486e-05,
"epoch": 0.7695560253699789,
"step": 858
},
{
"loss": 361.3808,
"grad_norm": 37.009342193603516,
"learning_rate": 6.19045288830798e-05,
"epoch": 0.770452943814466,
"step": 859
},
{
"loss": 359.7101,
"grad_norm": 36.62922668457031,
"learning_rate": 6.144085814041941e-05,
"epoch": 0.7713498622589532,
"step": 860
},
{
"loss": 360.3506,
"grad_norm": 35.92998123168945,
"learning_rate": 6.097868700203082e-05,
"epoch": 0.7722467807034403,
"step": 861
},
{
"loss": 352.6364,
"grad_norm": 40.08286666870117,
"learning_rate": 6.05180191435451e-05,
"epoch": 0.7731436991479275,
"step": 862
},
{
"loss": 356.8879,
"grad_norm": 38.76757049560547,
"learning_rate": 6.0058858228637605e-05,
"epoch": 0.7740406175924146,
"step": 863
},
{
"loss": 355.7852,
"grad_norm": 37.80318069458008,
"learning_rate": 5.960120790899895e-05,
"epoch": 0.7749375360369017,
"step": 864
},
{
"loss": 357.245,
"grad_norm": 36.61247253417969,
"learning_rate": 5.914507182430626e-05,
"epoch": 0.775834454481389,
"step": 865
},
{
"loss": 355.3506,
"grad_norm": 37.76987838745117,
"learning_rate": 5.869045360219391e-05,
"epoch": 0.7767313729258761,
"step": 866
},
{
"loss": 351.2185,
"grad_norm": 37.881492614746094,
"learning_rate": 5.8237356858224704e-05,
"epoch": 0.7776282913703633,
"step": 867
},
{
"loss": 360.2768,
"grad_norm": 39.45249557495117,
"learning_rate": 5.7785785195861194e-05,
"epoch": 0.7785252098148504,
"step": 868
},
{
"loss": 353.9251,
"grad_norm": 39.94224548339844,
"learning_rate": 5.733574220643712e-05,
"epoch": 0.7794221282593375,
"step": 869
},
{
"loss": 355.1441,
"grad_norm": 37.91038513183594,
"learning_rate": 5.688723146912858e-05,
"epoch": 0.7803190467038247,
"step": 870
},
{
"loss": 359.303,
"grad_norm": 36.14017105102539,
"learning_rate": 5.644025655092591e-05,
"epoch": 0.7812159651483118,
"step": 871
},
{
"loss": 359.8912,
"grad_norm": 37.15394592285156,
"learning_rate": 5.5994821006604965e-05,
"epoch": 0.7821128835927991,
"step": 872
},
{
"loss": 360.2237,
"grad_norm": 35.74496078491211,
"learning_rate": 5.555092837869902e-05,
"epoch": 0.7830098020372862,
"step": 873
},
{
"loss": 352.0333,
"grad_norm": 37.32427215576172,
"learning_rate": 5.5108582197470784e-05,
"epoch": 0.7839067204817733,
"step": 874
},
{
"loss": 359.9949,
"grad_norm": 40.355411529541016,
"learning_rate": 5.4667785980883897e-05,
"epoch": 0.7848036389262605,
"step": 875
},
{
"loss": 351.2752,
"grad_norm": 36.727745056152344,
"learning_rate": 5.422854323457527e-05,
"epoch": 0.7857005573707476,
"step": 876
},
{
"loss": 352.9948,
"grad_norm": 37.40601348876953,
"learning_rate": 5.379085745182721e-05,
"epoch": 0.7865974758152348,
"step": 877
},
{
"loss": 357.7682,
"grad_norm": 36.147159576416016,
"learning_rate": 5.335473211353942e-05,
"epoch": 0.787494394259722,
"step": 878
},
{
"loss": 360.3233,
"grad_norm": 36.26030349731445,
"learning_rate": 5.29201706882014e-05,
"epoch": 0.7883913127042091,
"step": 879
},
{
"loss": 354.8234,
"grad_norm": 34.958744049072266,
"learning_rate": 5.2487176631865114e-05,
"epoch": 0.7892882311486963,
"step": 880
},
{
"loss": 358.086,
"grad_norm": 36.89348602294922,
"learning_rate": 5.205575338811719e-05,
"epoch": 0.7901851495931834,
"step": 881
},
{
"loss": 357.6668,
"grad_norm": 39.996177673339844,
"learning_rate": 5.1625904388051564e-05,
"epoch": 0.7910820680376706,
"step": 882
},
{
"loss": 353.7882,
"grad_norm": 36.440711975097656,
"learning_rate": 5.119763305024225e-05,
"epoch": 0.7919789864821577,
"step": 883
},
{
"loss": 356.1277,
"grad_norm": 36.0537223815918,
"learning_rate": 5.077094278071642e-05,
"epoch": 0.7928759049266448,
"step": 884
},
{
"loss": 359.5157,
"grad_norm": 35.76783752441406,
"learning_rate": 5.034583697292674e-05,
"epoch": 0.7937728233711321,
"step": 885
},
{
"loss": 353.6391,
"grad_norm": 34.94169998168945,
"learning_rate": 4.9922319007724954e-05,
"epoch": 0.7946697418156192,
"step": 886
},
{
"loss": 361.0958,
"grad_norm": 38.87442398071289,
"learning_rate": 4.9500392253334635e-05,
"epoch": 0.7955666602601064,
"step": 887
},
{
"loss": 357.8425,
"grad_norm": 36.01359558105469,
"learning_rate": 4.908006006532445e-05,
"epoch": 0.7964635787045935,
"step": 888
},
{
"loss": 358.4057,
"grad_norm": 39.11752700805664,
"learning_rate": 4.866132578658172e-05,
"epoch": 0.7973604971490806,
"step": 889
},
{
"loss": 355.1286,
"grad_norm": 37.169158935546875,
"learning_rate": 4.8244192747285507e-05,
"epoch": 0.7982574155935678,
"step": 890
},
{
"loss": 356.0285,
"grad_norm": 35.89703369140625,
"learning_rate": 4.7828664264880254e-05,
"epoch": 0.7991543340380549,
"step": 891
},
{
"loss": 353.9138,
"grad_norm": 35.52785873413086,
"learning_rate": 4.741474364404955e-05,
"epoch": 0.8000512524825422,
"step": 892
},
{
"loss": 359.8646,
"grad_norm": 35.992713928222656,
"learning_rate": 4.7002434176689564e-05,
"epoch": 0.8009481709270293,
"step": 893
},
{
"loss": 360.1763,
"grad_norm": 36.50730514526367,
"learning_rate": 4.659173914188319e-05,
"epoch": 0.8018450893715164,
"step": 894
},
{
"loss": 356.7962,
"grad_norm": 36.77907180786133,
"learning_rate": 4.618266180587363e-05,
"epoch": 0.8027420078160036,
"step": 895
},
{
"loss": 354.5534,
"grad_norm": 36.69013214111328,
"learning_rate": 4.5775205422038695e-05,
"epoch": 0.8036389262604907,
"step": 896
},
{
"loss": 355.8555,
"grad_norm": 36.079769134521484,
"learning_rate": 4.536937323086479e-05,
"epoch": 0.8045358447049779,
"step": 897
},
{
"loss": 352.4216,
"grad_norm": 36.98958969116211,
"learning_rate": 4.4965168459921076e-05,
"epoch": 0.8054327631494651,
"step": 898
},
{
"loss": 354.3763,
"grad_norm": 36.339656829833984,
"learning_rate": 4.456259432383408e-05,
"epoch": 0.8063296815939522,
"step": 899
},
{
"loss": 353.9048,
"grad_norm": 35.602909088134766,
"learning_rate": 4.4161654024261756e-05,
"epoch": 0.8072266000384394,
"step": 900
},
{
"eval_loss": 1.581258773803711,
"eval_runtime": 19.1453,
"eval_samples_per_second": 106.971,
"eval_steps_per_second": 13.371,
"epoch": 0.8072266000384394,
"step": 900
},
{
"loss": 353.9864,
"grad_norm": 37.425819396972656,
"learning_rate": 4.3762350749868425e-05,
"epoch": 0.8081235184829265,
"step": 901
},
{
"loss": 352.1746,
"grad_norm": 36.96770095825195,
"learning_rate": 4.336468767629906e-05,
"epoch": 0.8090204369274137,
"step": 902
},
{
"loss": 362.0162,
"grad_norm": 36.64163589477539,
"learning_rate": 4.296866796615406e-05,
"epoch": 0.8099173553719008,
"step": 903
},
{
"loss": 356.8323,
"grad_norm": 37.755550384521484,
"learning_rate": 4.257429476896454e-05,
"epoch": 0.8108142738163879,
"step": 904
},
{
"loss": 355.0851,
"grad_norm": 35.74870300292969,
"learning_rate": 4.2181571221166696e-05,
"epoch": 0.8117111922608752,
"step": 905
},
{
"loss": 354.1617,
"grad_norm": 35.670047760009766,
"learning_rate": 4.179050044607713e-05,
"epoch": 0.8126081107053623,
"step": 906
},
{
"loss": 354.9214,
"grad_norm": 36.92220687866211,
"learning_rate": 4.140108555386812e-05,
"epoch": 0.8135050291498495,
"step": 907
},
{
"loss": 351.6111,
"grad_norm": 38.204166412353516,
"learning_rate": 4.101332964154275e-05,
"epoch": 0.8144019475943366,
"step": 908
},
{
"loss": 355.9622,
"grad_norm": 35.54768753051758,
"learning_rate": 4.0627235792910224e-05,
"epoch": 0.8152988660388237,
"step": 909
},
{
"loss": 359.8922,
"grad_norm": 37.4915771484375,
"learning_rate": 4.024280707856134e-05,
"epoch": 0.8161957844833109,
"step": 910
},
{
"loss": 356.2166,
"grad_norm": 36.84100341796875,
"learning_rate": 3.9860046555844406e-05,
"epoch": 0.8170927029277981,
"step": 911
},
{
"loss": 355.0562,
"grad_norm": 35.636878967285156,
"learning_rate": 3.947895726884038e-05,
"epoch": 0.8179896213722853,
"step": 912
},
{
"loss": 360.0903,
"grad_norm": 36.50727081298828,
"learning_rate": 3.909954224833911e-05,
"epoch": 0.8188865398167724,
"step": 913
},
{
"loss": 359.0554,
"grad_norm": 37.51554489135742,
"learning_rate": 3.8721804511815007e-05,
"epoch": 0.8197834582612595,
"step": 914
},
{
"loss": 356.6491,
"grad_norm": 36.2037239074707,
"learning_rate": 3.834574706340302e-05,
"epoch": 0.8206803767057467,
"step": 915
},
{
"loss": 357.358,
"grad_norm": 39.62883758544922,
"learning_rate": 3.797137289387503e-05,
"epoch": 0.8215772951502338,
"step": 916
},
{
"loss": 356.6225,
"grad_norm": 35.792728424072266,
"learning_rate": 3.7598684980615694e-05,
"epoch": 0.822474213594721,
"step": 917
},
{
"loss": 351.0151,
"grad_norm": 35.77069854736328,
"learning_rate": 3.7227686287598874e-05,
"epoch": 0.8233711320392082,
"step": 918
},
{
"loss": 356.1569,
"grad_norm": 36.655330657958984,
"learning_rate": 3.685837976536435e-05,
"epoch": 0.8242680504836953,
"step": 919
},
{
"loss": 356.6186,
"grad_norm": 35.82206726074219,
"learning_rate": 3.649076835099399e-05,
"epoch": 0.8251649689281825,
"step": 920
},
{
"loss": 352.9849,
"grad_norm": 36.314361572265625,
"learning_rate": 3.612485496808843e-05,
"epoch": 0.8260618873726696,
"step": 921
},
{
"loss": 355.4819,
"grad_norm": 37.96638870239258,
"learning_rate": 3.57606425267441e-05,
"epoch": 0.8269588058171568,
"step": 922
},
{
"loss": 358.6233,
"grad_norm": 36.10899353027344,
"learning_rate": 3.539813392352989e-05,
"epoch": 0.8278557242616439,
"step": 923
},
{
"loss": 353.3172,
"grad_norm": 34.54022216796875,
"learning_rate": 3.5037332041464e-05,
"epoch": 0.828752642706131,
"step": 924
},
{
"loss": 357.7184,
"grad_norm": 36.95024108886719,
"learning_rate": 3.467823974999115e-05,
"epoch": 0.8296495611506183,
"step": 925
},
{
"loss": 352.9876,
"grad_norm": 37.89804458618164,
"learning_rate": 3.4320859904959924e-05,
"epoch": 0.8305464795951054,
"step": 926
},
{
"loss": 354.4651,
"grad_norm": 36.63965606689453,
"learning_rate": 3.3965195348599626e-05,
"epoch": 0.8314433980395926,
"step": 927
},
{
"loss": 356.9139,
"grad_norm": 35.67973709106445,
"learning_rate": 3.361124890949816e-05,
"epoch": 0.8323403164840797,
"step": 928
},
{
"loss": 358.1943,
"grad_norm": 35.843719482421875,
"learning_rate": 3.325902340257914e-05,
"epoch": 0.8332372349285668,
"step": 929
},
{
"loss": 352.4489,
"grad_norm": 36.6231803894043,
"learning_rate": 3.2908521629079704e-05,
"epoch": 0.834134153373054,
"step": 930
},
{
"loss": 350.1209,
"grad_norm": 34.934112548828125,
"learning_rate": 3.255974637652828e-05,
"epoch": 0.8350310718175412,
"step": 931
},
{
"loss": 356.8803,
"grad_norm": 34.707252502441406,
"learning_rate": 3.2212700418722265e-05,
"epoch": 0.8359279902620284,
"step": 932
},
{
"loss": 356.7214,
"grad_norm": 35.543949127197266,
"learning_rate": 3.186738651570595e-05,
"epoch": 0.8368249087065155,
"step": 933
},
{
"loss": 354.0534,
"grad_norm": 35.74333572387695,
"learning_rate": 3.1523807413748887e-05,
"epoch": 0.8377218271510026,
"step": 934
},
{
"loss": 350.9949,
"grad_norm": 36.81149673461914,
"learning_rate": 3.118196584532359e-05,
"epoch": 0.8386187455954898,
"step": 935
},
{
"loss": 355.0341,
"grad_norm": 36.43380355834961,
"learning_rate": 3.084186452908411e-05,
"epoch": 0.8395156640399769,
"step": 936
},
{
"loss": 357.6827,
"grad_norm": 35.787872314453125,
"learning_rate": 3.0503506169844373e-05,
"epoch": 0.840412582484464,
"step": 937
},
{
"loss": 353.5415,
"grad_norm": 35.96485137939453,
"learning_rate": 3.0166893458556666e-05,
"epoch": 0.8413095009289513,
"step": 938
},
{
"loss": 357.3773,
"grad_norm": 33.9022216796875,
"learning_rate": 2.983202907228999e-05,
"epoch": 0.8422064193734384,
"step": 939
},
{
"loss": 355.6847,
"grad_norm": 36.94380187988281,
"learning_rate": 2.949891567420923e-05,
"epoch": 0.8431033378179256,
"step": 940
},
{
"loss": 352.4488,
"grad_norm": 36.33073043823242,
"learning_rate": 2.9167555913553577e-05,
"epoch": 0.8440002562624127,
"step": 941
},
{
"loss": 355.2479,
"grad_norm": 34.81533432006836,
"learning_rate": 2.88379524256156e-05,
"epoch": 0.8448971747068998,
"step": 942
},
{
"loss": 359.0098,
"grad_norm": 34.85913848876953,
"learning_rate": 2.8510107831720393e-05,
"epoch": 0.845794093151387,
"step": 943
},
{
"loss": 355.3041,
"grad_norm": 35.2500114440918,
"learning_rate": 2.8184024739204534e-05,
"epoch": 0.8466910115958741,
"step": 944
},
{
"loss": 357.6105,
"grad_norm": 36.625144958496094,
"learning_rate": 2.7859705741395403e-05,
"epoch": 0.8475879300403614,
"step": 945
},
{
"loss": 355.7482,
"grad_norm": 34.630428314208984,
"learning_rate": 2.7537153417590803e-05,
"epoch": 0.8484848484848485,
"step": 946
},
{
"loss": 358.0374,
"grad_norm": 35.17256164550781,
"learning_rate": 2.721637033303803e-05,
"epoch": 0.8493817669293356,
"step": 947
},
{
"loss": 352.4902,
"grad_norm": 36.90748596191406,
"learning_rate": 2.6897359038913716e-05,
"epoch": 0.8502786853738228,
"step": 948
},
{
"loss": 356.3272,
"grad_norm": 35.69559097290039,
"learning_rate": 2.6580122072303647e-05,
"epoch": 0.8511756038183099,
"step": 949
},
{
"loss": 351.9118,
"grad_norm": 34.44248580932617,
"learning_rate": 2.6264661956182212e-05,
"epoch": 0.8520725222627971,
"step": 950
},
{
"eval_loss": 1.5959553718566895,
"eval_runtime": 18.4817,
"eval_samples_per_second": 110.812,
"eval_steps_per_second": 13.852,
"epoch": 0.8520725222627971,
"step": 950
},
{
"loss": 356.2447,
"grad_norm": 34.08928680419922,
"learning_rate": 2.5950981199392847e-05,
"epoch": 0.8529694407072843,
"step": 951
},
{
"loss": 357.2951,
"grad_norm": 35.93143844604492,
"learning_rate": 2.5639082296627537e-05,
"epoch": 0.8538663591517714,
"step": 952
},
{
"loss": 357.1935,
"grad_norm": 34.351898193359375,
"learning_rate": 2.5328967728407454e-05,
"epoch": 0.8547632775962586,
"step": 953
},
{
"loss": 352.3139,
"grad_norm": 36.010223388671875,
"learning_rate": 2.5020639961062853e-05,
"epoch": 0.8556601960407457,
"step": 954
},
{
"loss": 356.4665,
"grad_norm": 34.825042724609375,
"learning_rate": 2.4714101446713793e-05,
"epoch": 0.8565571144852329,
"step": 955
},
{
"loss": 354.6561,
"grad_norm": 35.965755462646484,
"learning_rate": 2.4409354623250307e-05,
"epoch": 0.85745403292972,
"step": 956
},
{
"loss": 350.8446,
"grad_norm": 34.73567199707031,
"learning_rate": 2.4106401914313238e-05,
"epoch": 0.8583509513742071,
"step": 957
},
{
"loss": 357.6875,
"grad_norm": 34.63365936279297,
"learning_rate": 2.3805245729274947e-05,
"epoch": 0.8592478698186944,
"step": 958
},
{
"loss": 352.3867,
"grad_norm": 37.33460235595703,
"learning_rate": 2.3505888463220047e-05,
"epoch": 0.8601447882631815,
"step": 959
},
{
"loss": 357.7318,
"grad_norm": 35.54653549194336,
"learning_rate": 2.3208332496926387e-05,
"epoch": 0.8610417067076687,
"step": 960
},
{
"loss": 356.5225,
"grad_norm": 34.780433654785156,
"learning_rate": 2.2912580196846222e-05,
"epoch": 0.8619386251521558,
"step": 961
},
{
"loss": 358.1692,
"grad_norm": 37.751983642578125,
"learning_rate": 2.2618633915087282e-05,
"epoch": 0.8628355435966429,
"step": 962
},
{
"loss": 359.3351,
"grad_norm": 35.848167419433594,
"learning_rate": 2.2326495989393985e-05,
"epoch": 0.8637324620411301,
"step": 963
},
{
"loss": 354.9636,
"grad_norm": 34.292728424072266,
"learning_rate": 2.203616874312919e-05,
"epoch": 0.8646293804856173,
"step": 964
},
{
"loss": 350.5273,
"grad_norm": 35.46641540527344,
"learning_rate": 2.174765448525523e-05,
"epoch": 0.8655262989301045,
"step": 965
},
{
"loss": 355.4344,
"grad_norm": 34.72315979003906,
"learning_rate": 2.1460955510315962e-05,
"epoch": 0.8664232173745916,
"step": 966
},
{
"loss": 353.3275,
"grad_norm": 36.16691589355469,
"learning_rate": 2.1176074098418402e-05,
"epoch": 0.8673201358190787,
"step": 967
},
{
"loss": 355.2486,
"grad_norm": 36.415794372558594,
"learning_rate": 2.0893012515214388e-05,
"epoch": 0.8682170542635659,
"step": 968
},
{
"loss": 355.4182,
"grad_norm": 35.465538024902344,
"learning_rate": 2.06117730118828e-05,
"epoch": 0.869113972708053,
"step": 969
},
{
"loss": 354.304,
"grad_norm": 35.425926208496094,
"learning_rate": 2.0332357825111668e-05,
"epoch": 0.8700108911525402,
"step": 970
},
{
"loss": 351.7629,
"grad_norm": 34.78888702392578,
"learning_rate": 2.0054769177080185e-05,
"epoch": 0.8709078095970274,
"step": 971
},
{
"loss": 358.8823,
"grad_norm": 35.0769157409668,
"learning_rate": 1.97790092754411e-05,
"epoch": 0.8718047280415145,
"step": 972
},
{
"loss": 353.2525,
"grad_norm": 35.73164749145508,
"learning_rate": 1.9505080313303365e-05,
"epoch": 0.8727016464860017,
"step": 973
},
{
"loss": 355.5436,
"grad_norm": 35.51607894897461,
"learning_rate": 1.9232984469214453e-05,
"epoch": 0.8735985649304888,
"step": 974
},
{
"loss": 353.8528,
"grad_norm": 35.09918975830078,
"learning_rate": 1.8962723907143044e-05,
"epoch": 0.874495483374976,
"step": 975
},
{
"loss": 358.7514,
"grad_norm": 36.12480926513672,
"learning_rate": 1.869430077646203e-05,
"epoch": 0.8753924018194631,
"step": 976
},
{
"loss": 354.3459,
"grad_norm": 34.32866287231445,
"learning_rate": 1.8427717211931177e-05,
"epoch": 0.8762893202639502,
"step": 977
},
{
"loss": 350.5236,
"grad_norm": 35.1101188659668,
"learning_rate": 1.816297533368022e-05,
"epoch": 0.8771862387084375,
"step": 978
},
{
"loss": 353.4749,
"grad_norm": 36.59587478637695,
"learning_rate": 1.7900077247192087e-05,
"epoch": 0.8780831571529246,
"step": 979
},
{
"loss": 353.3892,
"grad_norm": 34.86069869995117,
"learning_rate": 1.7639025043286155e-05,
"epoch": 0.8789800755974118,
"step": 980
},
{
"loss": 354.1761,
"grad_norm": 35.580291748046875,
"learning_rate": 1.7379820798101383e-05,
"epoch": 0.8798769940418989,
"step": 981
},
{
"loss": 355.6291,
"grad_norm": 34.58673095703125,
"learning_rate": 1.7122466573080196e-05,
"epoch": 0.880773912486386,
"step": 982
},
{
"loss": 357.7327,
"grad_norm": 33.76737976074219,
"learning_rate": 1.6866964414951698e-05,
"epoch": 0.8816708309308732,
"step": 983
},
{
"loss": 355.4995,
"grad_norm": 34.57607650756836,
"learning_rate": 1.6613316355715558e-05,
"epoch": 0.8825677493753604,
"step": 984
},
{
"loss": 357.9588,
"grad_norm": 34.49372100830078,
"learning_rate": 1.6361524412626088e-05,
"epoch": 0.8834646678198476,
"step": 985
},
{
"loss": 357.0802,
"grad_norm": 34.17061996459961,
"learning_rate": 1.611159058817571e-05,
"epoch": 0.8843615862643347,
"step": 986
},
{
"loss": 354.1526,
"grad_norm": 36.93791198730469,
"learning_rate": 1.5863516870079418e-05,
"epoch": 0.8852585047088218,
"step": 987
},
{
"loss": 358.1216,
"grad_norm": 35.566646575927734,
"learning_rate": 1.5617305231258898e-05,
"epoch": 0.886155423153309,
"step": 988
},
{
"loss": 351.2595,
"grad_norm": 35.77732467651367,
"learning_rate": 1.5372957629826655e-05,
"epoch": 0.8870523415977961,
"step": 989
},
{
"loss": 353.016,
"grad_norm": 37.376441955566406,
"learning_rate": 1.513047600907061e-05,
"epoch": 0.8879492600422833,
"step": 990
},
{
"loss": 352.4042,
"grad_norm": 34.55933380126953,
"learning_rate": 1.4889862297438688e-05,
"epoch": 0.8888461784867705,
"step": 991
},
{
"loss": 352.0331,
"grad_norm": 34.30587387084961,
"learning_rate": 1.4651118408523317e-05,
"epoch": 0.8897430969312576,
"step": 992
},
{
"loss": 356.2885,
"grad_norm": 34.28126525878906,
"learning_rate": 1.4414246241046286e-05,
"epoch": 0.8906400153757448,
"step": 993
},
{
"loss": 356.9485,
"grad_norm": 35.106529235839844,
"learning_rate": 1.4179247678843681e-05,
"epoch": 0.8915369338202319,
"step": 994
},
{
"loss": 357.6618,
"grad_norm": 33.811737060546875,
"learning_rate": 1.3946124590850901e-05,
"epoch": 0.892433852264719,
"step": 995
},
{
"loss": 361.4888,
"grad_norm": 33.41731643676758,
"learning_rate": 1.3714878831087657e-05,
"epoch": 0.8933307707092062,
"step": 996
},
{
"loss": 358.7178,
"grad_norm": 34.46256637573242,
"learning_rate": 1.3485512238643499e-05,
"epoch": 0.8942276891536933,
"step": 997
},
{
"loss": 357.5736,
"grad_norm": 35.067893981933594,
"learning_rate": 1.3258026637662846e-05,
"epoch": 0.8951246075981806,
"step": 998
},
{
"loss": 353.149,
"grad_norm": 34.04292678833008,
"learning_rate": 1.3032423837330748e-05,
"epoch": 0.8960215260426677,
"step": 999
},
{
"loss": 356.1142,
"grad_norm": 34.39286422729492,
"learning_rate": 1.2808705631858459e-05,
"epoch": 0.8969184444871549,
"step": 1000
},
{
"eval_loss": 1.586561918258667,
"eval_runtime": 20.2668,
"eval_samples_per_second": 101.052,
"eval_steps_per_second": 12.631,
"epoch": 0.8969184444871549,
"step": 1000
},
{
"loss": 354.0248,
"grad_norm": 36.2171516418457,
"learning_rate": 1.2586873800468996e-05,
"epoch": 0.897815362931642,
"step": 1001
},
{
"loss": 362.0434,
"grad_norm": 34.42704391479492,
"learning_rate": 1.2366930107383156e-05,
"epoch": 0.8987122813761291,
"step": 1002
},
{
"loss": 354.9637,
"grad_norm": 34.4918212890625,
"learning_rate": 1.2148876301805528e-05,
"epoch": 0.8996091998206163,
"step": 1003
},
{
"loss": 348.8729,
"grad_norm": 34.57630157470703,
"learning_rate": 1.1932714117910386e-05,
"epoch": 0.9005061182651035,
"step": 1004
},
{
"loss": 352.9299,
"grad_norm": 35.46476745605469,
"learning_rate": 1.171844527482796e-05,
"epoch": 0.9014030367095907,
"step": 1005
},
{
"loss": 355.247,
"grad_norm": 34.4285888671875,
"learning_rate": 1.1506071476630964e-05,
"epoch": 0.9022999551540778,
"step": 1006
},
{
"loss": 352.168,
"grad_norm": 34.935569763183594,
"learning_rate": 1.1295594412320754e-05,
"epoch": 0.9031968735985649,
"step": 1007
},
{
"loss": 357.9673,
"grad_norm": 33.162166595458984,
"learning_rate": 1.1087015755814084e-05,
"epoch": 0.9040937920430521,
"step": 1008
},
{
"loss": 350.8712,
"grad_norm": 34.0540657043457,
"learning_rate": 1.088033716592976e-05,
"epoch": 0.9049907104875392,
"step": 1009
},
{
"loss": 356.8466,
"grad_norm": 33.83312225341797,
"learning_rate": 1.0675560286375369e-05,
"epoch": 0.9058876289320263,
"step": 1010
},
{
"loss": 353.7512,
"grad_norm": 34.7866096496582,
"learning_rate": 1.0472686745734233e-05,
"epoch": 0.9067845473765136,
"step": 1011
},
{
"loss": 354.8209,
"grad_norm": 34.10197067260742,
"learning_rate": 1.027171815745262e-05,
"epoch": 0.9076814658210007,
"step": 1012
},
{
"loss": 354.7816,
"grad_norm": 34.292598724365234,
"learning_rate": 1.0072656119826662e-05,
"epoch": 0.9085783842654879,
"step": 1013
},
{
"loss": 356.8245,
"grad_norm": 34.5960693359375,
"learning_rate": 9.875502215989791e-06,
"epoch": 0.909475302709975,
"step": 1014
},
{
"loss": 353.8681,
"grad_norm": 33.786537170410156,
"learning_rate": 9.680258013900129e-06,
"epoch": 0.9103722211544621,
"step": 1015
},
{
"loss": 355.527,
"grad_norm": 35.2137565612793,
"learning_rate": 9.486925066327978e-06,
"epoch": 0.9112691395989493,
"step": 1016
},
{
"loss": 352.3827,
"grad_norm": 34.659767150878906,
"learning_rate": 9.295504910843522e-06,
"epoch": 0.9121660580434365,
"step": 1017
},
{
"loss": 355.3458,
"grad_norm": 33.41202926635742,
"learning_rate": 9.10599906980461e-06,
"epoch": 0.9130629764879237,
"step": 1018
},
{
"loss": 357.3716,
"grad_norm": 32.52941131591797,
"learning_rate": 8.91840905034455e-06,
"epoch": 0.9139598949324108,
"step": 1019
},
{
"loss": 354.1408,
"grad_norm": 33.926963806152344,
"learning_rate": 8.732736344360198e-06,
"epoch": 0.914856813376898,
"step": 1020
},
{
"loss": 357.4122,
"grad_norm": 33.29584503173828,
"learning_rate": 8.548982428500163e-06,
"epoch": 0.9157537318213851,
"step": 1021
},
{
"loss": 356.5175,
"grad_norm": 35.51197814941406,
"learning_rate": 8.367148764152843e-06,
"epoch": 0.9166506502658722,
"step": 1022
},
{
"loss": 361.666,
"grad_norm": 35.082054138183594,
"learning_rate": 8.187236797435077e-06,
"epoch": 0.9175475687103594,
"step": 1023
},
{
"loss": 350.1344,
"grad_norm": 34.95941925048828,
"learning_rate": 8.009247959180482e-06,
"epoch": 0.9184444871548466,
"step": 1024
},
{
"loss": 359.1797,
"grad_norm": 34.81248474121094,
"learning_rate": 7.833183664928023e-06,
"epoch": 0.9193414055993337,
"step": 1025
},
{
"loss": 352.5403,
"grad_norm": 34.408485412597656,
"learning_rate": 7.659045314910879e-06,
"epoch": 0.9202383240438209,
"step": 1026
},
{
"loss": 353.7971,
"grad_norm": 34.32902526855469,
"learning_rate": 7.486834294045286e-06,
"epoch": 0.921135242488308,
"step": 1027
},
{
"loss": 352.8156,
"grad_norm": 33.39252471923828,
"learning_rate": 7.316551971919522e-06,
"epoch": 0.9220321609327952,
"step": 1028
},
{
"loss": 355.1404,
"grad_norm": 35.65606689453125,
"learning_rate": 7.148199702782854e-06,
"epoch": 0.9229290793772823,
"step": 1029
},
{
"loss": 358.3244,
"grad_norm": 35.14055252075195,
"learning_rate": 6.981778825535079e-06,
"epoch": 0.9238259978217694,
"step": 1030
},
{
"loss": 356.6115,
"grad_norm": 32.90983581542969,
"learning_rate": 6.817290663715614e-06,
"epoch": 0.9247229162662567,
"step": 1031
},
{
"loss": 354.6003,
"grad_norm": 33.653778076171875,
"learning_rate": 6.654736525493033e-06,
"epoch": 0.9256198347107438,
"step": 1032
},
{
"loss": 356.817,
"grad_norm": 35.58637619018555,
"learning_rate": 6.494117703654739e-06,
"epoch": 0.926516753155231,
"step": 1033
},
{
"loss": 355.3286,
"grad_norm": 33.73952102661133,
"learning_rate": 6.335435475596646e-06,
"epoch": 0.9274136715997181,
"step": 1034
},
{
"loss": 355.2651,
"grad_norm": 33.62116241455078,
"learning_rate": 6.1786911033129e-06,
"epoch": 0.9283105900442052,
"step": 1035
},
{
"loss": 357.9323,
"grad_norm": 33.39925003051758,
"learning_rate": 6.023885833386061e-06,
"epoch": 0.9292075084886924,
"step": 1036
},
{
"loss": 351.2944,
"grad_norm": 34.47417068481445,
"learning_rate": 5.87102089697708e-06,
"epoch": 0.9301044269331796,
"step": 1037
},
{
"loss": 355.5925,
"grad_norm": 33.980857849121094,
"learning_rate": 5.720097509815392e-06,
"epoch": 0.9310013453776668,
"step": 1038
},
{
"loss": 355.6397,
"grad_norm": 32.85739517211914,
"learning_rate": 5.571116872189475e-06,
"epoch": 0.9318982638221539,
"step": 1039
},
{
"loss": 355.7616,
"grad_norm": 33.64262390136719,
"learning_rate": 5.424080168937112e-06,
"epoch": 0.932795182266641,
"step": 1040
},
{
"loss": 357.7719,
"grad_norm": 34.275169372558594,
"learning_rate": 5.278988569436066e-06,
"epoch": 0.9336921007111282,
"step": 1041
},
{
"loss": 357.6499,
"grad_norm": 34.75218963623047,
"learning_rate": 5.1358432275947775e-06,
"epoch": 0.9345890191556153,
"step": 1042
},
{
"loss": 353.3368,
"grad_norm": 34.046241760253906,
"learning_rate": 4.994645281843152e-06,
"epoch": 0.9354859376001025,
"step": 1043
},
{
"loss": 354.6295,
"grad_norm": 34.62663269042969,
"learning_rate": 4.855395855123512e-06,
"epoch": 0.9363828560445897,
"step": 1044
},
{
"loss": 352.3897,
"grad_norm": 35.12565231323242,
"learning_rate": 4.718096054881688e-06,
"epoch": 0.9372797744890768,
"step": 1045
},
{
"loss": 352.5993,
"grad_norm": 33.51365661621094,
"learning_rate": 4.582746973058216e-06,
"epoch": 0.938176692933564,
"step": 1046
},
{
"loss": 354.0611,
"grad_norm": 33.32587814331055,
"learning_rate": 4.449349686079574e-06,
"epoch": 0.9390736113780511,
"step": 1047
},
{
"loss": 361.4709,
"grad_norm": 35.336490631103516,
"learning_rate": 4.317905254849791e-06,
"epoch": 0.9399705298225383,
"step": 1048
},
{
"loss": 360.2202,
"grad_norm": 34.51678466796875,
"learning_rate": 4.188414724741768e-06,
"epoch": 0.9408674482670254,
"step": 1049
},
{
"loss": 354.1904,
"grad_norm": 34.459373474121094,
"learning_rate": 4.060879125589195e-06,
"epoch": 0.9417643667115125,
"step": 1050
},
{
"eval_loss": 1.5787107944488525,
"eval_runtime": 18.3575,
"eval_samples_per_second": 111.562,
"eval_steps_per_second": 13.945,
"epoch": 0.9417643667115125,
"step": 1050
},
{
"loss": 353.3853,
"grad_norm": 33.25263214111328,
"learning_rate": 3.9352994716783105e-06,
"epoch": 0.9426612851559998,
"step": 1051
},
{
"loss": 350.3391,
"grad_norm": 35.57413101196289,
"learning_rate": 3.8116767617396298e-06,
"epoch": 0.9435582036004869,
"step": 1052
},
{
"loss": 356.2869,
"grad_norm": 33.38325881958008,
"learning_rate": 3.690011978940255e-06,
"epoch": 0.9444551220449741,
"step": 1053
},
{
"loss": 356.4574,
"grad_norm": 34.5271110534668,
"learning_rate": 3.570306090876024e-06,
"epoch": 0.9453520404894612,
"step": 1054
},
{
"loss": 359.7423,
"grad_norm": 35.02552795410156,
"learning_rate": 3.4525600495636246e-06,
"epoch": 0.9462489589339483,
"step": 1055
},
{
"loss": 353.1874,
"grad_norm": 35.6952018737793,
"learning_rate": 3.3367747914331838e-06,
"epoch": 0.9471458773784355,
"step": 1056
},
{
"loss": 355.9973,
"grad_norm": 35.45086669921875,
"learning_rate": 3.222951237320915e-06,
"epoch": 0.9480427958229227,
"step": 1057
},
{
"loss": 355.2783,
"grad_norm": 32.976966857910156,
"learning_rate": 3.1110902924615102e-06,
"epoch": 0.9489397142674099,
"step": 1058
},
{
"loss": 358.506,
"grad_norm": 34.06571960449219,
"learning_rate": 3.0011928464811213e-06,
"epoch": 0.949836632711897,
"step": 1059
},
{
"loss": 358.1763,
"grad_norm": 33.59235382080078,
"learning_rate": 2.8932597733903886e-06,
"epoch": 0.9507335511563841,
"step": 1060
},
{
"loss": 357.5705,
"grad_norm": 32.182106018066406,
"learning_rate": 2.7872919315772017e-06,
"epoch": 0.9516304696008713,
"step": 1061
},
{
"loss": 354.619,
"grad_norm": 35.46062469482422,
"learning_rate": 2.683290163800145e-06,
"epoch": 0.9525273880453584,
"step": 1062
},
{
"loss": 350.0426,
"grad_norm": 32.130767822265625,
"learning_rate": 2.581255297181617e-06,
"epoch": 0.9534243064898456,
"step": 1063
},
{
"loss": 351.98,
"grad_norm": 32.878875732421875,
"learning_rate": 2.4811881432013905e-06,
"epoch": 0.9543212249343328,
"step": 1064
},
{
"loss": 353.1487,
"grad_norm": 33.90510559082031,
"learning_rate": 2.3830894976899774e-06,
"epoch": 0.9552181433788199,
"step": 1065
},
{
"loss": 357.164,
"grad_norm": 34.16891860961914,
"learning_rate": 2.2869601408225805e-06,
"epoch": 0.9561150618233071,
"step": 1066
},
{
"loss": 351.2288,
"grad_norm": 33.57730484008789,
"learning_rate": 2.1928008371125406e-06,
"epoch": 0.9570119802677942,
"step": 1067
},
{
"loss": 356.0024,
"grad_norm": 33.691978454589844,
"learning_rate": 2.1006123354055384e-06,
"epoch": 0.9579088987122814,
"step": 1068
},
{
"loss": 361.7596,
"grad_norm": 33.60329055786133,
"learning_rate": 2.0103953688734853e-06,
"epoch": 0.9588058171567685,
"step": 1069
},
{
"loss": 354.5997,
"grad_norm": 35.25307083129883,
"learning_rate": 1.9221506550088365e-06,
"epoch": 0.9597027356012557,
"step": 1070
},
{
"loss": 355.2119,
"grad_norm": 34.94419860839844,
"learning_rate": 1.83587889561862e-06,
"epoch": 0.9605996540457429,
"step": 1071
},
{
"loss": 355.9485,
"grad_norm": 34.35773468017578,
"learning_rate": 1.7515807768192228e-06,
"epoch": 0.96149657249023,
"step": 1072
},
{
"loss": 353.5008,
"grad_norm": 33.7717170715332,
"learning_rate": 1.6692569690305859e-06,
"epoch": 0.9623934909347172,
"step": 1073
},
{
"loss": 357.9717,
"grad_norm": 35.07488250732422,
"learning_rate": 1.5889081269710726e-06,
"epoch": 0.9632904093792043,
"step": 1074
},
{
"loss": 361.8947,
"grad_norm": 34.685150146484375,
"learning_rate": 1.5105348896522486e-06,
"epoch": 0.9641873278236914,
"step": 1075
},
{
"loss": 357.5904,
"grad_norm": 34.1632080078125,
"learning_rate": 1.4341378803737204e-06,
"epoch": 0.9650842462681786,
"step": 1076
},
{
"loss": 357.5146,
"grad_norm": 34.23555374145508,
"learning_rate": 1.3597177067181943e-06,
"epoch": 0.9659811647126658,
"step": 1077
},
{
"loss": 356.91,
"grad_norm": 32.962257385253906,
"learning_rate": 1.2872749605468137e-06,
"epoch": 0.966878083157153,
"step": 1078
},
{
"loss": 351.4866,
"grad_norm": 34.07936096191406,
"learning_rate": 1.2168102179941076e-06,
"epoch": 0.9677750016016401,
"step": 1079
},
{
"loss": 355.5893,
"grad_norm": 33.35137939453125,
"learning_rate": 1.1483240394637717e-06,
"epoch": 0.9686719200461272,
"step": 1080
},
{
"loss": 355.4586,
"grad_norm": 34.09134292602539,
"learning_rate": 1.0818169696239776e-06,
"epoch": 0.9695688384906144,
"step": 1081
},
{
"loss": 354.5378,
"grad_norm": 32.67642593383789,
"learning_rate": 1.0172895374031265e-06,
"epoch": 0.9704657569351015,
"step": 1082
},
{
"loss": 354.3784,
"grad_norm": 32.6947021484375,
"learning_rate": 9.5474225598563e-07,
"epoch": 0.9713626753795886,
"step": 1083
},
{
"loss": 355.8788,
"grad_norm": 33.51148986816406,
"learning_rate": 8.941756228078579e-07,
"epoch": 0.9722595938240759,
"step": 1084
},
{
"loss": 353.8372,
"grad_norm": 33.57039260864258,
"learning_rate": 8.35590119554086e-07,
"epoch": 0.973156512268563,
"step": 1085
},
{
"loss": 353.2452,
"grad_norm": 33.60462188720703,
"learning_rate": 7.789862121528324e-07,
"epoch": 0.9740534307130502,
"step": 1086
},
{
"loss": 357.0675,
"grad_norm": 33.704349517822266,
"learning_rate": 7.243643507729436e-07,
"epoch": 0.9749503491575373,
"step": 1087
},
{
"loss": 354.5553,
"grad_norm": 34.90256881713867,
"learning_rate": 6.717249698202088e-07,
"epoch": 0.9758472676020244,
"step": 1088
},
{
"loss": 349.4813,
"grad_norm": 34.148128509521484,
"learning_rate": 6.210684879337513e-07,
"epoch": 0.9767441860465116,
"step": 1089
},
{
"loss": 357.7331,
"grad_norm": 34.612762451171875,
"learning_rate": 5.72395307982837e-07,
"epoch": 0.9776411044909988,
"step": 1090
},
{
"loss": 358.809,
"grad_norm": 32.881195068359375,
"learning_rate": 5.257058170635709e-07,
"epoch": 0.978538022935486,
"step": 1091
},
{
"loss": 356.2231,
"grad_norm": 32.4294319152832,
"learning_rate": 4.810003864958168e-07,
"epoch": 0.9794349413799731,
"step": 1092
},
{
"loss": 354.6883,
"grad_norm": 35.39781951904297,
"learning_rate": 4.3827937182033815e-07,
"epoch": 0.9803318598244602,
"step": 1093
},
{
"loss": 352.7607,
"grad_norm": 34.17608642578125,
"learning_rate": 3.9754311279582844e-07,
"epoch": 0.9812287782689474,
"step": 1094
},
{
"loss": 353.8497,
"grad_norm": 31.340768814086914,
"learning_rate": 3.587919333963574e-07,
"epoch": 0.9821256967134345,
"step": 1095
},
{
"loss": 357.9939,
"grad_norm": 33.75115966796875,
"learning_rate": 3.2202614180870673e-07,
"epoch": 0.9830226151579217,
"step": 1096
},
{
"loss": 356.0656,
"grad_norm": 32.56006622314453,
"learning_rate": 2.872460304299274e-07,
"epoch": 0.9839195336024089,
"step": 1097
},
{
"loss": 353.62,
"grad_norm": 34.134193420410156,
"learning_rate": 2.5445187586503603e-07,
"epoch": 0.984816452046896,
"step": 1098
},
{
"loss": 355.838,
"grad_norm": 34.15678024291992,
"learning_rate": 2.2364393892479462e-07,
"epoch": 0.9857133704913832,
"step": 1099
},
{
"loss": 358.3669,
"grad_norm": 32.837039947509766,
"learning_rate": 1.9482246462365626e-07,
"epoch": 0.9866102889358703,
"step": 1100
},
{
"eval_loss": 1.5716547966003418,
"eval_runtime": 18.217,
"eval_samples_per_second": 112.422,
"eval_steps_per_second": 14.053,
"epoch": 0.9866102889358703,
"step": 1100
},
{
"loss": 356.8408,
"grad_norm": 33.33000183105469,
"learning_rate": 1.6798768217776706e-07,
"epoch": 0.9875072073803575,
"step": 1101
},
{
"loss": 356.4636,
"grad_norm": 34.879573822021484,
"learning_rate": 1.4313980500327283e-07,
"epoch": 0.9884041258248446,
"step": 1102
},
{
"loss": 356.378,
"grad_norm": 33.825469970703125,
"learning_rate": 1.2027903071440415e-07,
"epoch": 0.9893010442693317,
"step": 1103
},
{
"loss": 359.4078,
"grad_norm": 34.18437957763672,
"learning_rate": 9.94055411221717e-08,
"epoch": 0.990197962713819,
"step": 1104
},
{
"loss": 356.8303,
"grad_norm": 35.02104187011719,
"learning_rate": 8.051950223267323e-08,
"epoch": 0.9910948811583061,
"step": 1105
},
{
"loss": 351.9132,
"grad_norm": 33.7501220703125,
"learning_rate": 6.362106424590009e-08,
"epoch": 0.9919917996027933,
"step": 1106
},
{
"loss": 356.2349,
"grad_norm": 34.74052810668945,
"learning_rate": 4.871036155454367e-08,
"epoch": 0.9928887180472804,
"step": 1107
},
{
"loss": 357.3864,
"grad_norm": 33.26545715332031,
"learning_rate": 3.578751274294079e-08,
"epoch": 0.9937856364917675,
"step": 1108
},
{
"loss": 358.4432,
"grad_norm": 33.61418914794922,
"learning_rate": 2.4852620586046647e-08,
"epoch": 0.9946825549362547,
"step": 1109
},
{
"loss": 356.3781,
"grad_norm": 33.90690612792969,
"learning_rate": 1.5905772048629975e-08,
"epoch": 0.9955794733807419,
"step": 1110
},
{
"loss": 355.2562,
"grad_norm": 36.185489654541016,
"learning_rate": 8.947038284717879e-09,
"epoch": 0.9964763918252291,
"step": 1111
},
{
"loss": 353.4495,
"grad_norm": 35.645416259765625,
"learning_rate": 3.976474636874228e-09,
"epoch": 0.9973733102697162,
"step": 1112
},
{
"loss": 358.9317,
"grad_norm": 34.38767623901367,
"learning_rate": 9.941206357555465e-10,
"epoch": 0.9982702287142033,
"step": 1113
},
{
"loss": 355.1901,
"grad_norm": 33.96023941040039,
"learning_rate": 0.0,
"epoch": 0.9991671471586905,
"step": 1114
},
{
"train_runtime": 10703.3349,
"train_samples_per_second": 186.666,
"train_steps_per_second": 0.104,
"total_flos": 6.811715592467251e+17,
"train_loss": 100.33408414611269,
"epoch": 0.9991671471586905,
"step": 1114
},
{
"eval_loss": 1.585738182067871,
"eval_runtime": 19.5932,
"eval_samples_per_second": 104.526,
"eval_steps_per_second": 13.066,
"epoch": 0.9991671471586905,
"step": 1114
}
],
"best_metric": null,
"best_model_checkpoint": null,
"is_local_process_zero": true,
"is_world_process_zero": true,
"is_hyper_param_search": false,
"trial_name": null,
"trial_params": null,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_training_stop": true,
"should_epoch_stop": false,
"should_save": true,
"should_evaluate": false,
"should_log": false
},
"attributes": {}
}
}
}