{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5708840656431975,
"eval_steps": 337,
"global_step": 1348,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00042350449973530967,
"grad_norm": 0.027733758091926575,
"learning_rate": 2e-05,
"loss": 10.3743,
"step": 1
},
{
"epoch": 0.00042350449973530967,
"eval_loss": 10.376607894897461,
"eval_runtime": 3.5039,
"eval_samples_per_second": 283.969,
"eval_steps_per_second": 142.127,
"step": 1
},
{
"epoch": 0.0008470089994706193,
"grad_norm": 0.02669823355972767,
"learning_rate": 4e-05,
"loss": 10.374,
"step": 2
},
{
"epoch": 0.001270513499205929,
"grad_norm": 0.021611209958791733,
"learning_rate": 6e-05,
"loss": 10.3801,
"step": 3
},
{
"epoch": 0.0016940179989412387,
"grad_norm": 0.027095356956124306,
"learning_rate": 8e-05,
"loss": 10.3786,
"step": 4
},
{
"epoch": 0.0021175224986765486,
"grad_norm": 0.030345361679792404,
"learning_rate": 0.0001,
"loss": 10.378,
"step": 5
},
{
"epoch": 0.002541026998411858,
"grad_norm": 0.025746231898665428,
"learning_rate": 0.00012,
"loss": 10.3767,
"step": 6
},
{
"epoch": 0.002964531498147168,
"grad_norm": 0.026296626776456833,
"learning_rate": 0.00014,
"loss": 10.375,
"step": 7
},
{
"epoch": 0.0033880359978824773,
"grad_norm": 0.026994528248906136,
"learning_rate": 0.00016,
"loss": 10.3775,
"step": 8
},
{
"epoch": 0.0038115404976177872,
"grad_norm": 0.02642114832997322,
"learning_rate": 0.00018,
"loss": 10.3785,
"step": 9
},
{
"epoch": 0.004235044997353097,
"grad_norm": 0.03136637434363365,
"learning_rate": 0.0002,
"loss": 10.3753,
"step": 10
},
{
"epoch": 0.004658549497088407,
"grad_norm": 0.022933412343263626,
"learning_rate": 0.00019999972435042745,
"loss": 10.3753,
"step": 11
},
{
"epoch": 0.005082053996823716,
"grad_norm": 0.02334180846810341,
"learning_rate": 0.0001999988974032295,
"loss": 10.3753,
"step": 12
},
{
"epoch": 0.005505558496559026,
"grad_norm": 0.03419239819049835,
"learning_rate": 0.00019999751916296505,
"loss": 10.3767,
"step": 13
},
{
"epoch": 0.005929062996294336,
"grad_norm": 0.022363845258951187,
"learning_rate": 0.0001999955896372324,
"loss": 10.3773,
"step": 14
},
{
"epoch": 0.006352567496029645,
"grad_norm": 0.025751987472176552,
"learning_rate": 0.0001999931088366689,
"loss": 10.3788,
"step": 15
},
{
"epoch": 0.006776071995764955,
"grad_norm": 0.02451767958700657,
"learning_rate": 0.00019999007677495127,
"loss": 10.3781,
"step": 16
},
{
"epoch": 0.007199576495500265,
"grad_norm": 0.023951657116413116,
"learning_rate": 0.00019998649346879524,
"loss": 10.3746,
"step": 17
},
{
"epoch": 0.0076230809952355745,
"grad_norm": 0.02496548369526863,
"learning_rate": 0.0001999823589379555,
"loss": 10.3739,
"step": 18
},
{
"epoch": 0.008046585494970884,
"grad_norm": 0.02257522940635681,
"learning_rate": 0.0001999776732052257,
"loss": 10.3755,
"step": 19
},
{
"epoch": 0.008470089994706194,
"grad_norm": 0.027529660612344742,
"learning_rate": 0.00019997243629643827,
"loss": 10.3753,
"step": 20
},
{
"epoch": 0.008893594494441503,
"grad_norm": 0.025733161717653275,
"learning_rate": 0.0001999666482404642,
"loss": 10.376,
"step": 21
},
{
"epoch": 0.009317098994176813,
"grad_norm": 0.028723513707518578,
"learning_rate": 0.00019996030906921302,
"loss": 10.373,
"step": 22
},
{
"epoch": 0.009740603493912123,
"grad_norm": 0.03213539347052574,
"learning_rate": 0.00019995341881763254,
"loss": 10.3749,
"step": 23
},
{
"epoch": 0.010164107993647432,
"grad_norm": 0.033838726580142975,
"learning_rate": 0.0001999459775237086,
"loss": 10.374,
"step": 24
},
{
"epoch": 0.010587612493382742,
"grad_norm": 0.029485682025551796,
"learning_rate": 0.00019993798522846508,
"loss": 10.3769,
"step": 25
},
{
"epoch": 0.011011116993118053,
"grad_norm": 0.03559406101703644,
"learning_rate": 0.00019992944197596337,
"loss": 10.3748,
"step": 26
},
{
"epoch": 0.011434621492853361,
"grad_norm": 0.033679552376270294,
"learning_rate": 0.00019992034781330235,
"loss": 10.3733,
"step": 27
},
{
"epoch": 0.011858125992588672,
"grad_norm": 0.032387569546699524,
"learning_rate": 0.00019991070279061808,
"loss": 10.3711,
"step": 28
},
{
"epoch": 0.01228163049232398,
"grad_norm": 0.03400762379169464,
"learning_rate": 0.0001999005069610835,
"loss": 10.3726,
"step": 29
},
{
"epoch": 0.01270513499205929,
"grad_norm": 0.030853325501084328,
"learning_rate": 0.0001998897603809081,
"loss": 10.3756,
"step": 30
},
{
"epoch": 0.0131286394917946,
"grad_norm": 0.039369914680719376,
"learning_rate": 0.00019987846310933768,
"loss": 10.373,
"step": 31
},
{
"epoch": 0.01355214399152991,
"grad_norm": 0.04837853088974953,
"learning_rate": 0.00019986661520865405,
"loss": 10.3751,
"step": 32
},
{
"epoch": 0.01397564849126522,
"grad_norm": 0.045920245349407196,
"learning_rate": 0.00019985421674417452,
"loss": 10.3693,
"step": 33
},
{
"epoch": 0.01439915299100053,
"grad_norm": 0.04273354262113571,
"learning_rate": 0.00019984126778425178,
"loss": 10.3702,
"step": 34
},
{
"epoch": 0.014822657490735839,
"grad_norm": 0.048800382763147354,
"learning_rate": 0.0001998277684002733,
"loss": 10.3745,
"step": 35
},
{
"epoch": 0.015246161990471149,
"grad_norm": 0.05085352063179016,
"learning_rate": 0.00019981371866666109,
"loss": 10.3745,
"step": 36
},
{
"epoch": 0.015669666490206458,
"grad_norm": 0.05351710319519043,
"learning_rate": 0.0001997991186608712,
"loss": 10.3718,
"step": 37
},
{
"epoch": 0.016093170989941768,
"grad_norm": 0.048667021095752716,
"learning_rate": 0.0001997839684633933,
"loss": 10.3713,
"step": 38
},
{
"epoch": 0.016516675489677078,
"grad_norm": 0.04797323793172836,
"learning_rate": 0.0001997682681577504,
"loss": 10.3696,
"step": 39
},
{
"epoch": 0.01694017998941239,
"grad_norm": 0.04910498112440109,
"learning_rate": 0.00019975201783049805,
"loss": 10.3753,
"step": 40
},
{
"epoch": 0.0173636844891477,
"grad_norm": 0.04971903935074806,
"learning_rate": 0.00019973521757122418,
"loss": 10.3724,
"step": 41
},
{
"epoch": 0.017787188988883006,
"grad_norm": 0.06753654778003693,
"learning_rate": 0.00019971786747254852,
"loss": 10.3717,
"step": 42
},
{
"epoch": 0.018210693488618316,
"grad_norm": 0.04575066268444061,
"learning_rate": 0.00019969996763012198,
"loss": 10.3708,
"step": 43
},
{
"epoch": 0.018634197988353626,
"grad_norm": 0.0645640566945076,
"learning_rate": 0.00019968151814262627,
"loss": 10.37,
"step": 44
},
{
"epoch": 0.019057702488088937,
"grad_norm": 0.07102999091148376,
"learning_rate": 0.00019966251911177323,
"loss": 10.371,
"step": 45
},
{
"epoch": 0.019481206987824247,
"grad_norm": 0.07168291509151459,
"learning_rate": 0.00019964297064230436,
"loss": 10.3691,
"step": 46
},
{
"epoch": 0.019904711487559554,
"grad_norm": 0.06409385055303574,
"learning_rate": 0.0001996228728419902,
"loss": 10.3712,
"step": 47
},
{
"epoch": 0.020328215987294864,
"grad_norm": 0.06838654726743698,
"learning_rate": 0.00019960222582162976,
"loss": 10.3681,
"step": 48
},
{
"epoch": 0.020751720487030174,
"grad_norm": 0.10280771553516388,
"learning_rate": 0.0001995810296950499,
"loss": 10.3681,
"step": 49
},
{
"epoch": 0.021175224986765485,
"grad_norm": 0.08676479011774063,
"learning_rate": 0.00019955928457910464,
"loss": 10.3678,
"step": 50
},
{
"epoch": 0.021598729486500795,
"grad_norm": 0.0853012353181839,
"learning_rate": 0.00019953699059367468,
"loss": 10.3662,
"step": 51
},
{
"epoch": 0.022022233986236105,
"grad_norm": 0.09723170846700668,
"learning_rate": 0.00019951414786166654,
"loss": 10.3658,
"step": 52
},
{
"epoch": 0.022445738485971412,
"grad_norm": 0.08303584158420563,
"learning_rate": 0.00019949075650901196,
"loss": 10.3637,
"step": 53
},
{
"epoch": 0.022869242985706723,
"grad_norm": 0.09252041578292847,
"learning_rate": 0.00019946681666466737,
"loss": 10.3663,
"step": 54
},
{
"epoch": 0.023292747485442033,
"grad_norm": 0.07753727585077286,
"learning_rate": 0.00019944232846061283,
"loss": 10.363,
"step": 55
},
{
"epoch": 0.023716251985177343,
"grad_norm": 0.09255944937467575,
"learning_rate": 0.00019941729203185165,
"loss": 10.3598,
"step": 56
},
{
"epoch": 0.024139756484912653,
"grad_norm": 0.1016978919506073,
"learning_rate": 0.0001993917075164095,
"loss": 10.3632,
"step": 57
},
{
"epoch": 0.02456326098464796,
"grad_norm": 0.09657126665115356,
"learning_rate": 0.00019936557505533344,
"loss": 10.363,
"step": 58
},
{
"epoch": 0.02498676548438327,
"grad_norm": 0.08984223008155823,
"learning_rate": 0.00019933889479269162,
"loss": 10.366,
"step": 59
},
{
"epoch": 0.02541026998411858,
"grad_norm": 0.11775655299425125,
"learning_rate": 0.0001993116668755721,
"loss": 10.3619,
"step": 60
},
{
"epoch": 0.02583377448385389,
"grad_norm": 0.10796835273504257,
"learning_rate": 0.00019928389145408213,
"loss": 10.3609,
"step": 61
},
{
"epoch": 0.0262572789835892,
"grad_norm": 0.10575428605079651,
"learning_rate": 0.00019925556868134736,
"loss": 10.3571,
"step": 62
},
{
"epoch": 0.026680783483324512,
"grad_norm": 0.12180919945240021,
"learning_rate": 0.000199226698713511,
"loss": 10.3553,
"step": 63
},
{
"epoch": 0.02710428798305982,
"grad_norm": 0.10506236553192139,
"learning_rate": 0.00019919728170973296,
"loss": 10.3593,
"step": 64
},
{
"epoch": 0.02752779248279513,
"grad_norm": 0.09984668344259262,
"learning_rate": 0.00019916731783218888,
"loss": 10.3593,
"step": 65
},
{
"epoch": 0.02795129698253044,
"grad_norm": 0.10555399954319,
"learning_rate": 0.00019913680724606945,
"loss": 10.3591,
"step": 66
},
{
"epoch": 0.02837480148226575,
"grad_norm": 0.08873631060123444,
"learning_rate": 0.00019910575011957918,
"loss": 10.3568,
"step": 67
},
{
"epoch": 0.02879830598200106,
"grad_norm": 0.09549526870250702,
"learning_rate": 0.00019907414662393574,
"loss": 10.3574,
"step": 68
},
{
"epoch": 0.029221810481736367,
"grad_norm": 0.09750920534133911,
"learning_rate": 0.000199041996933369,
"loss": 10.3536,
"step": 69
},
{
"epoch": 0.029645314981471677,
"grad_norm": 0.10485312342643738,
"learning_rate": 0.00019900930122511993,
"loss": 10.3566,
"step": 70
},
{
"epoch": 0.030068819481206988,
"grad_norm": 0.09972581267356873,
"learning_rate": 0.00019897605967943963,
"loss": 10.3531,
"step": 71
},
{
"epoch": 0.030492323980942298,
"grad_norm": 0.08210822939872742,
"learning_rate": 0.00019894227247958845,
"loss": 10.3534,
"step": 72
},
{
"epoch": 0.030915828480677608,
"grad_norm": 0.08600655943155289,
"learning_rate": 0.00019890793981183503,
"loss": 10.356,
"step": 73
},
{
"epoch": 0.031339332980412915,
"grad_norm": 0.08058468252420425,
"learning_rate": 0.00019887306186545497,
"loss": 10.3549,
"step": 74
},
{
"epoch": 0.03176283748014823,
"grad_norm": 0.0659925639629364,
"learning_rate": 0.00019883763883273012,
"loss": 10.3507,
"step": 75
},
{
"epoch": 0.032186341979883536,
"grad_norm": 0.06881393492221832,
"learning_rate": 0.0001988016709089474,
"loss": 10.3529,
"step": 76
},
{
"epoch": 0.03260984647961884,
"grad_norm": 0.0784982293844223,
"learning_rate": 0.00019876515829239763,
"loss": 10.3528,
"step": 77
},
{
"epoch": 0.033033350979354156,
"grad_norm": 0.06941844522953033,
"learning_rate": 0.00019872810118437456,
"loss": 10.351,
"step": 78
},
{
"epoch": 0.03345685547908946,
"grad_norm": 0.06965084373950958,
"learning_rate": 0.00019869049978917368,
"loss": 10.3507,
"step": 79
},
{
"epoch": 0.03388035997882478,
"grad_norm": 0.0600489042699337,
"learning_rate": 0.00019865235431409123,
"loss": 10.3514,
"step": 80
},
{
"epoch": 0.034303864478560084,
"grad_norm": 0.06106571480631828,
"learning_rate": 0.00019861366496942283,
"loss": 10.3501,
"step": 81
},
{
"epoch": 0.0347273689782954,
"grad_norm": 0.05668988823890686,
"learning_rate": 0.0001985744319684625,
"loss": 10.3479,
"step": 82
},
{
"epoch": 0.035150873478030704,
"grad_norm": 0.05988716706633568,
"learning_rate": 0.00019853465552750147,
"loss": 10.3472,
"step": 83
},
{
"epoch": 0.03557437797776601,
"grad_norm": 0.047210004180669785,
"learning_rate": 0.00019849433586582692,
"loss": 10.3522,
"step": 84
},
{
"epoch": 0.035997882477501325,
"grad_norm": 0.04648837819695473,
"learning_rate": 0.00019845347320572078,
"loss": 10.3489,
"step": 85
},
{
"epoch": 0.03642138697723663,
"grad_norm": 0.057975709438323975,
"learning_rate": 0.00019841206777245857,
"loss": 10.3482,
"step": 86
},
{
"epoch": 0.036844891476971946,
"grad_norm": 0.06715747714042664,
"learning_rate": 0.00019837011979430806,
"loss": 10.3486,
"step": 87
},
{
"epoch": 0.03726839597670725,
"grad_norm": 0.05633699893951416,
"learning_rate": 0.00019832762950252813,
"loss": 10.3506,
"step": 88
},
{
"epoch": 0.03769190047644256,
"grad_norm": 0.04711679369211197,
"learning_rate": 0.00019828459713136737,
"loss": 10.349,
"step": 89
},
{
"epoch": 0.03811540497617787,
"grad_norm": 0.050088070333004,
"learning_rate": 0.0001982410229180629,
"loss": 10.3457,
"step": 90
},
{
"epoch": 0.03853890947591318,
"grad_norm": 0.0481443926692009,
"learning_rate": 0.00019819690710283893,
"loss": 10.3488,
"step": 91
},
{
"epoch": 0.038962413975648494,
"grad_norm": 0.04781080409884453,
"learning_rate": 0.0001981522499289056,
"loss": 10.3476,
"step": 92
},
{
"epoch": 0.0393859184753838,
"grad_norm": 0.04098181053996086,
"learning_rate": 0.00019810705164245756,
"loss": 10.3486,
"step": 93
},
{
"epoch": 0.03980942297511911,
"grad_norm": 0.050709549337625504,
"learning_rate": 0.00019806131249267255,
"loss": 10.3465,
"step": 94
},
{
"epoch": 0.04023292747485442,
"grad_norm": 0.04031967371702194,
"learning_rate": 0.00019801503273171012,
"loss": 10.3497,
"step": 95
},
{
"epoch": 0.04065643197458973,
"grad_norm": 0.029422029852867126,
"learning_rate": 0.00019796821261471018,
"loss": 10.3476,
"step": 96
},
{
"epoch": 0.04107993647432504,
"grad_norm": 0.04418569803237915,
"learning_rate": 0.00019792085239979162,
"loss": 10.3488,
"step": 97
},
{
"epoch": 0.04150344097406035,
"grad_norm": 0.05277906730771065,
"learning_rate": 0.00019787295234805096,
"loss": 10.3495,
"step": 98
},
{
"epoch": 0.041926945473795656,
"grad_norm": 0.03719155862927437,
"learning_rate": 0.00019782451272356075,
"loss": 10.3493,
"step": 99
},
{
"epoch": 0.04235044997353097,
"grad_norm": 0.042001668363809586,
"learning_rate": 0.0001977755337933682,
"loss": 10.3474,
"step": 100
},
{
"epoch": 0.042773954473266276,
"grad_norm": 0.045230720192193985,
"learning_rate": 0.00019772601582749376,
"loss": 10.3498,
"step": 101
},
{
"epoch": 0.04319745897300159,
"grad_norm": 0.03940007835626602,
"learning_rate": 0.00019767595909892953,
"loss": 10.3499,
"step": 102
},
{
"epoch": 0.0436209634727369,
"grad_norm": 0.044866979122161865,
"learning_rate": 0.00019762536388363784,
"loss": 10.3464,
"step": 103
},
{
"epoch": 0.04404446797247221,
"grad_norm": 0.039521895349025726,
"learning_rate": 0.00019757423046054968,
"loss": 10.3491,
"step": 104
},
{
"epoch": 0.04446797247220752,
"grad_norm": 0.04928427189588547,
"learning_rate": 0.00019752255911156317,
"loss": 10.345,
"step": 105
},
{
"epoch": 0.044891476971942824,
"grad_norm": 0.04378641024231911,
"learning_rate": 0.00019747035012154202,
"loss": 10.3488,
"step": 106
},
{
"epoch": 0.04531498147167814,
"grad_norm": 0.048980504274368286,
"learning_rate": 0.00019741760377831396,
"loss": 10.3468,
"step": 107
},
{
"epoch": 0.045738485971413445,
"grad_norm": 0.04159266874194145,
"learning_rate": 0.00019736432037266912,
"loss": 10.3478,
"step": 108
},
{
"epoch": 0.04616199047114876,
"grad_norm": 0.0287900660187006,
"learning_rate": 0.00019731050019835842,
"loss": 10.3497,
"step": 109
},
{
"epoch": 0.046585494970884066,
"grad_norm": 0.03839430958032608,
"learning_rate": 0.00019725614355209204,
"loss": 10.35,
"step": 110
},
{
"epoch": 0.04700899947061937,
"grad_norm": 0.04616628587245941,
"learning_rate": 0.00019720125073353776,
"loss": 10.3471,
"step": 111
},
{
"epoch": 0.047432503970354686,
"grad_norm": 0.05492490157485008,
"learning_rate": 0.00019714582204531918,
"loss": 10.3503,
"step": 112
},
{
"epoch": 0.04785600847008999,
"grad_norm": 0.037890441715717316,
"learning_rate": 0.00019708985779301417,
"loss": 10.3488,
"step": 113
},
{
"epoch": 0.04827951296982531,
"grad_norm": 0.036491066217422485,
"learning_rate": 0.00019703335828515322,
"loss": 10.3476,
"step": 114
},
{
"epoch": 0.048703017469560614,
"grad_norm": 0.03580768033862114,
"learning_rate": 0.00019697632383321756,
"loss": 10.3509,
"step": 115
},
{
"epoch": 0.04912652196929592,
"grad_norm": 0.04286257550120354,
"learning_rate": 0.0001969187547516377,
"loss": 10.3475,
"step": 116
},
{
"epoch": 0.049550026469031234,
"grad_norm": 0.06037011742591858,
"learning_rate": 0.00019686065135779144,
"loss": 10.3534,
"step": 117
},
{
"epoch": 0.04997353096876654,
"grad_norm": 0.05510607734322548,
"learning_rate": 0.00019680201397200236,
"loss": 10.3529,
"step": 118
},
{
"epoch": 0.050397035468501855,
"grad_norm": 0.04488476365804672,
"learning_rate": 0.00019674284291753785,
"loss": 10.3482,
"step": 119
},
{
"epoch": 0.05082053996823716,
"grad_norm": 0.02746366150677204,
"learning_rate": 0.00019668313852060735,
"loss": 10.3507,
"step": 120
},
{
"epoch": 0.05124404446797247,
"grad_norm": 0.031117988750338554,
"learning_rate": 0.00019662290111036078,
"loss": 10.3472,
"step": 121
},
{
"epoch": 0.05166754896770778,
"grad_norm": 0.04041313752532005,
"learning_rate": 0.00019656213101888645,
"loss": 10.3468,
"step": 122
},
{
"epoch": 0.05209105346744309,
"grad_norm": 0.04518342763185501,
"learning_rate": 0.00019650082858120932,
"loss": 10.35,
"step": 123
},
{
"epoch": 0.0525145579671784,
"grad_norm": 0.034027136862277985,
"learning_rate": 0.00019643899413528926,
"loss": 10.3474,
"step": 124
},
{
"epoch": 0.05293806246691371,
"grad_norm": 0.0336722694337368,
"learning_rate": 0.000196376628022019,
"loss": 10.347,
"step": 125
},
{
"epoch": 0.053361566966649024,
"grad_norm": 0.03731876611709595,
"learning_rate": 0.00019631373058522238,
"loss": 10.3484,
"step": 126
},
{
"epoch": 0.05378507146638433,
"grad_norm": 0.038337815552949905,
"learning_rate": 0.00019625030217165245,
"loss": 10.3493,
"step": 127
},
{
"epoch": 0.05420857596611964,
"grad_norm": 0.036029715090990067,
"learning_rate": 0.00019618634313098952,
"loss": 10.346,
"step": 128
},
{
"epoch": 0.05463208046585495,
"grad_norm": 0.031205767765641212,
"learning_rate": 0.00019612185381583924,
"loss": 10.3502,
"step": 129
},
{
"epoch": 0.05505558496559026,
"grad_norm": 0.04413217306137085,
"learning_rate": 0.0001960568345817306,
"loss": 10.3507,
"step": 130
},
{
"epoch": 0.05547908946532557,
"grad_norm": 0.03828402981162071,
"learning_rate": 0.00019599128578711415,
"loss": 10.3485,
"step": 131
},
{
"epoch": 0.05590259396506088,
"grad_norm": 0.03328114375472069,
"learning_rate": 0.0001959252077933598,
"loss": 10.3481,
"step": 132
},
{
"epoch": 0.056326098464796186,
"grad_norm": 0.04720017686486244,
"learning_rate": 0.000195858600964755,
"loss": 10.3468,
"step": 133
},
{
"epoch": 0.0567496029645315,
"grad_norm": 0.03394393250346184,
"learning_rate": 0.00019579146566850252,
"loss": 10.3457,
"step": 134
},
{
"epoch": 0.057173107464266806,
"grad_norm": 0.03747075796127319,
"learning_rate": 0.0001957238022747188,
"loss": 10.3488,
"step": 135
},
{
"epoch": 0.05759661196400212,
"grad_norm": 0.03510262444615364,
"learning_rate": 0.00019565561115643152,
"loss": 10.3504,
"step": 136
},
{
"epoch": 0.05802011646373743,
"grad_norm": 0.03729300945997238,
"learning_rate": 0.00019558689268957767,
"loss": 10.3464,
"step": 137
},
{
"epoch": 0.058443620963472734,
"grad_norm": 0.029604580253362656,
"learning_rate": 0.00019551764725300166,
"loss": 10.3438,
"step": 138
},
{
"epoch": 0.05886712546320805,
"grad_norm": 0.039334215223789215,
"learning_rate": 0.0001954478752284529,
"loss": 10.3472,
"step": 139
},
{
"epoch": 0.059290629962943354,
"grad_norm": 0.04949035122990608,
"learning_rate": 0.00019537757700058403,
"loss": 10.3476,
"step": 140
},
{
"epoch": 0.05971413446267867,
"grad_norm": 0.034930143505334854,
"learning_rate": 0.00019530675295694857,
"loss": 10.3475,
"step": 141
},
{
"epoch": 0.060137638962413975,
"grad_norm": 0.02766244113445282,
"learning_rate": 0.00019523540348799885,
"loss": 10.3457,
"step": 142
},
{
"epoch": 0.06056114346214928,
"grad_norm": 0.02754233032464981,
"learning_rate": 0.0001951635289870839,
"loss": 10.3471,
"step": 143
},
{
"epoch": 0.060984647961884596,
"grad_norm": 0.05756423994898796,
"learning_rate": 0.00019509112985044717,
"loss": 10.348,
"step": 144
},
{
"epoch": 0.0614081524616199,
"grad_norm": 0.03342543542385101,
"learning_rate": 0.00019501820647722457,
"loss": 10.349,
"step": 145
},
{
"epoch": 0.061831656961355216,
"grad_norm": 0.04082402214407921,
"learning_rate": 0.00019494475926944195,
"loss": 10.3486,
"step": 146
},
{
"epoch": 0.06225516146109052,
"grad_norm": 0.03864405304193497,
"learning_rate": 0.00019487078863201322,
"loss": 10.351,
"step": 147
},
{
"epoch": 0.06267866596082583,
"grad_norm": 0.028355760499835014,
"learning_rate": 0.00019479629497273781,
"loss": 10.3474,
"step": 148
},
{
"epoch": 0.06310217046056114,
"grad_norm": 0.03946223482489586,
"learning_rate": 0.00019472127870229867,
"loss": 10.349,
"step": 149
},
{
"epoch": 0.06352567496029646,
"grad_norm": 0.04293173551559448,
"learning_rate": 0.00019464574023425984,
"loss": 10.3508,
"step": 150
},
{
"epoch": 0.06394917946003176,
"grad_norm": 0.04612809792160988,
"learning_rate": 0.0001945696799850642,
"loss": 10.3473,
"step": 151
},
{
"epoch": 0.06437268395976707,
"grad_norm": 0.04514515772461891,
"learning_rate": 0.00019449309837403137,
"loss": 10.3484,
"step": 152
},
{
"epoch": 0.06479618845950239,
"grad_norm": 0.03168589621782303,
"learning_rate": 0.00019441599582335498,
"loss": 10.3465,
"step": 153
},
{
"epoch": 0.06521969295923769,
"grad_norm": 0.04755236580967903,
"learning_rate": 0.00019433837275810082,
"loss": 10.3474,
"step": 154
},
{
"epoch": 0.065643197458973,
"grad_norm": 0.031274329870939255,
"learning_rate": 0.00019426022960620417,
"loss": 10.3451,
"step": 155
},
{
"epoch": 0.06606670195870831,
"grad_norm": 0.036476653069257736,
"learning_rate": 0.00019418156679846754,
"loss": 10.3483,
"step": 156
},
{
"epoch": 0.06649020645844363,
"grad_norm": 0.0386991873383522,
"learning_rate": 0.0001941023847685583,
"loss": 10.3474,
"step": 157
},
{
"epoch": 0.06691371095817893,
"grad_norm": 0.034830257296562195,
"learning_rate": 0.00019402268395300637,
"loss": 10.3493,
"step": 158
},
{
"epoch": 0.06733721545791424,
"grad_norm": 0.03715137764811516,
"learning_rate": 0.00019394246479120163,
"loss": 10.3529,
"step": 159
},
{
"epoch": 0.06776071995764955,
"grad_norm": 0.03950640186667442,
"learning_rate": 0.00019386172772539162,
"loss": 10.3479,
"step": 160
},
{
"epoch": 0.06818422445738485,
"grad_norm": 0.04391263425350189,
"learning_rate": 0.0001937804732006791,
"loss": 10.3456,
"step": 161
},
{
"epoch": 0.06860772895712017,
"grad_norm": 0.047799207270145416,
"learning_rate": 0.00019369870166501959,
"loss": 10.3451,
"step": 162
},
{
"epoch": 0.06903123345685548,
"grad_norm": 0.031426433473825455,
"learning_rate": 0.00019361641356921883,
"loss": 10.3499,
"step": 163
},
{
"epoch": 0.0694547379565908,
"grad_norm": 0.0359007902443409,
"learning_rate": 0.00019353360936693041,
"loss": 10.3433,
"step": 164
},
{
"epoch": 0.0698782424563261,
"grad_norm": 0.02672567404806614,
"learning_rate": 0.00019345028951465318,
"loss": 10.343,
"step": 165
},
{
"epoch": 0.07030174695606141,
"grad_norm": 0.04336037486791611,
"learning_rate": 0.0001933664544717288,
"loss": 10.3488,
"step": 166
},
{
"epoch": 0.07072525145579672,
"grad_norm": 0.030480332672595978,
"learning_rate": 0.0001932821047003391,
"loss": 10.3464,
"step": 167
},
{
"epoch": 0.07114875595553202,
"grad_norm": 0.03520766645669937,
"learning_rate": 0.00019319724066550373,
"loss": 10.3475,
"step": 168
},
{
"epoch": 0.07157226045526734,
"grad_norm": 0.020646894350647926,
"learning_rate": 0.0001931118628350773,
"loss": 10.3476,
"step": 169
},
{
"epoch": 0.07199576495500265,
"grad_norm": 0.04156513512134552,
"learning_rate": 0.00019302597167974707,
"loss": 10.3485,
"step": 170
},
{
"epoch": 0.07241926945473795,
"grad_norm": 0.02938881516456604,
"learning_rate": 0.0001929395676730303,
"loss": 10.3464,
"step": 171
},
{
"epoch": 0.07284277395447326,
"grad_norm": 0.03371270000934601,
"learning_rate": 0.00019285265129127151,
"loss": 10.3443,
"step": 172
},
{
"epoch": 0.07326627845420858,
"grad_norm": 0.045955732464790344,
"learning_rate": 0.00019276522301363996,
"loss": 10.346,
"step": 173
},
{
"epoch": 0.07368978295394389,
"grad_norm": 0.022017156705260277,
"learning_rate": 0.000192677283322127,
"loss": 10.3461,
"step": 174
},
{
"epoch": 0.07411328745367919,
"grad_norm": 0.045463208109140396,
"learning_rate": 0.0001925888327015434,
"loss": 10.3462,
"step": 175
},
{
"epoch": 0.0745367919534145,
"grad_norm": 0.041146885603666306,
"learning_rate": 0.00019249987163951667,
"loss": 10.3453,
"step": 176
},
{
"epoch": 0.07496029645314982,
"grad_norm": 0.04077988117933273,
"learning_rate": 0.0001924104006264884,
"loss": 10.3472,
"step": 177
},
{
"epoch": 0.07538380095288512,
"grad_norm": 0.033624131232500076,
"learning_rate": 0.00019232042015571152,
"loss": 10.3493,
"step": 178
},
{
"epoch": 0.07580730545262043,
"grad_norm": 0.04149757698178291,
"learning_rate": 0.00019222993072324758,
"loss": 10.347,
"step": 179
},
{
"epoch": 0.07623080995235575,
"grad_norm": 0.04390670359134674,
"learning_rate": 0.00019213893282796405,
"loss": 10.3499,
"step": 180
},
{
"epoch": 0.07665431445209106,
"grad_norm": 0.04238109290599823,
"learning_rate": 0.00019204742697153155,
"loss": 10.3482,
"step": 181
},
{
"epoch": 0.07707781895182636,
"grad_norm": 0.0415191613137722,
"learning_rate": 0.0001919554136584211,
"loss": 10.3485,
"step": 182
},
{
"epoch": 0.07750132345156167,
"grad_norm": 0.04313662648200989,
"learning_rate": 0.0001918628933959013,
"loss": 10.3447,
"step": 183
},
{
"epoch": 0.07792482795129699,
"grad_norm": 0.0481775663793087,
"learning_rate": 0.00019176986669403555,
"loss": 10.3456,
"step": 184
},
{
"epoch": 0.07834833245103229,
"grad_norm": 0.031192272901535034,
"learning_rate": 0.0001916763340656793,
"loss": 10.3488,
"step": 185
},
{
"epoch": 0.0787718369507676,
"grad_norm": 0.04172395542263985,
"learning_rate": 0.00019158229602647708,
"loss": 10.3442,
"step": 186
},
{
"epoch": 0.07919534145050292,
"grad_norm": 0.03788716346025467,
"learning_rate": 0.00019148775309485983,
"loss": 10.3443,
"step": 187
},
{
"epoch": 0.07961884595023822,
"grad_norm": 0.0322580486536026,
"learning_rate": 0.00019139270579204194,
"loss": 10.3478,
"step": 188
},
{
"epoch": 0.08004235044997353,
"grad_norm": 0.035218119621276855,
"learning_rate": 0.00019129715464201832,
"loss": 10.3475,
"step": 189
},
{
"epoch": 0.08046585494970884,
"grad_norm": 0.0283061470836401,
"learning_rate": 0.0001912011001715617,
"loss": 10.3469,
"step": 190
},
{
"epoch": 0.08088935944944416,
"grad_norm": 0.03684883192181587,
"learning_rate": 0.00019110454291021954,
"loss": 10.3483,
"step": 191
},
{
"epoch": 0.08131286394917946,
"grad_norm": 0.028339441865682602,
"learning_rate": 0.00019100748339031113,
"loss": 10.3484,
"step": 192
},
{
"epoch": 0.08173636844891477,
"grad_norm": 0.03159940615296364,
"learning_rate": 0.00019090992214692488,
"loss": 10.346,
"step": 193
},
{
"epoch": 0.08215987294865008,
"grad_norm": 0.029895318672060966,
"learning_rate": 0.00019081185971791504,
"loss": 10.3481,
"step": 194
},
{
"epoch": 0.08258337744838538,
"grad_norm": 0.04218447580933571,
"learning_rate": 0.0001907132966438989,
"loss": 10.3453,
"step": 195
},
{
"epoch": 0.0830068819481207,
"grad_norm": 0.042808372527360916,
"learning_rate": 0.00019061423346825395,
"loss": 10.3466,
"step": 196
},
{
"epoch": 0.08343038644785601,
"grad_norm": 0.03805699571967125,
"learning_rate": 0.00019051467073711456,
"loss": 10.3466,
"step": 197
},
{
"epoch": 0.08385389094759131,
"grad_norm": 0.04781021177768707,
"learning_rate": 0.00019041460899936921,
"loss": 10.3436,
"step": 198
},
{
"epoch": 0.08427739544732663,
"grad_norm": 0.025532910600304604,
"learning_rate": 0.00019031404880665739,
"loss": 10.3478,
"step": 199
},
{
"epoch": 0.08470089994706194,
"grad_norm": 0.030978702008724213,
"learning_rate": 0.00019021299071336664,
"loss": 10.3455,
"step": 200
},
{
"epoch": 0.08512440444679725,
"grad_norm": 0.03757680207490921,
"learning_rate": 0.00019011143527662935,
"loss": 10.3481,
"step": 201
},
{
"epoch": 0.08554790894653255,
"grad_norm": 0.04030987620353699,
"learning_rate": 0.00019000938305631975,
"loss": 10.3465,
"step": 202
},
{
"epoch": 0.08597141344626787,
"grad_norm": 0.04490538313984871,
"learning_rate": 0.00018990683461505087,
"loss": 10.3444,
"step": 203
},
{
"epoch": 0.08639491794600318,
"grad_norm": 0.03259282931685448,
"learning_rate": 0.00018980379051817138,
"loss": 10.3471,
"step": 204
},
{
"epoch": 0.08681842244573848,
"grad_norm": 0.04348522052168846,
"learning_rate": 0.00018970025133376253,
"loss": 10.3488,
"step": 205
},
{
"epoch": 0.0872419269454738,
"grad_norm": 0.0327000729739666,
"learning_rate": 0.00018959621763263494,
"loss": 10.347,
"step": 206
},
{
"epoch": 0.08766543144520911,
"grad_norm": 0.043357010930776596,
"learning_rate": 0.0001894916899883255,
"loss": 10.3514,
"step": 207
},
{
"epoch": 0.08808893594494442,
"grad_norm": 0.03237845376133919,
"learning_rate": 0.00018938666897709425,
"loss": 10.3454,
"step": 208
},
{
"epoch": 0.08851244044467972,
"grad_norm": 0.040286242961883545,
"learning_rate": 0.0001892811551779211,
"loss": 10.3446,
"step": 209
},
{
"epoch": 0.08893594494441504,
"grad_norm": 0.03772817552089691,
"learning_rate": 0.00018917514917250275,
"loss": 10.3458,
"step": 210
},
{
"epoch": 0.08935944944415035,
"grad_norm": 0.0332292765378952,
"learning_rate": 0.00018906865154524942,
"loss": 10.3453,
"step": 211
},
{
"epoch": 0.08978295394388565,
"grad_norm": 0.038554366677999496,
"learning_rate": 0.00018896166288328155,
"loss": 10.3463,
"step": 212
},
{
"epoch": 0.09020645844362096,
"grad_norm": 0.05035999044775963,
"learning_rate": 0.00018885418377642674,
"loss": 10.346,
"step": 213
},
{
"epoch": 0.09062996294335628,
"grad_norm": 0.03604661673307419,
"learning_rate": 0.00018874621481721645,
"loss": 10.3474,
"step": 214
},
{
"epoch": 0.09105346744309158,
"grad_norm": 0.0357426181435585,
"learning_rate": 0.00018863775660088258,
"loss": 10.347,
"step": 215
},
{
"epoch": 0.09147697194282689,
"grad_norm": 0.031967032700777054,
"learning_rate": 0.00018852880972535432,
"loss": 10.3471,
"step": 216
},
{
"epoch": 0.0919004764425622,
"grad_norm": 0.031692031770944595,
"learning_rate": 0.0001884193747912549,
"loss": 10.3457,
"step": 217
},
{
"epoch": 0.09232398094229752,
"grad_norm": 0.04586448892951012,
"learning_rate": 0.00018830945240189817,
"loss": 10.3457,
"step": 218
},
{
"epoch": 0.09274748544203282,
"grad_norm": 0.039624523371458054,
"learning_rate": 0.00018819904316328532,
"loss": 10.3455,
"step": 219
},
{
"epoch": 0.09317098994176813,
"grad_norm": 0.042145851999521255,
"learning_rate": 0.00018808814768410157,
"loss": 10.3445,
"step": 220
},
{
"epoch": 0.09359449444150345,
"grad_norm": 0.0323745459318161,
"learning_rate": 0.0001879767665757127,
"loss": 10.3408,
"step": 221
},
{
"epoch": 0.09401799894123875,
"grad_norm": 0.03138961270451546,
"learning_rate": 0.00018786490045216182,
"loss": 10.3448,
"step": 222
},
{
"epoch": 0.09444150344097406,
"grad_norm": 0.0241545382887125,
"learning_rate": 0.00018775254993016595,
"loss": 10.3481,
"step": 223
},
{
"epoch": 0.09486500794070937,
"grad_norm": 0.03655562922358513,
"learning_rate": 0.0001876397156291125,
"loss": 10.3438,
"step": 224
},
{
"epoch": 0.09528851244044469,
"grad_norm": 0.042878881096839905,
"learning_rate": 0.00018752639817105606,
"loss": 10.345,
"step": 225
},
{
"epoch": 0.09571201694017999,
"grad_norm": 0.03456420823931694,
"learning_rate": 0.0001874125981807148,
"loss": 10.3447,
"step": 226
},
{
"epoch": 0.0961355214399153,
"grad_norm": 0.033554937690496445,
"learning_rate": 0.00018729831628546702,
"loss": 10.3467,
"step": 227
},
{
"epoch": 0.09655902593965061,
"grad_norm": 0.028273334726691246,
"learning_rate": 0.00018718355311534793,
"loss": 10.348,
"step": 228
},
{
"epoch": 0.09698253043938591,
"grad_norm": 0.03176790103316307,
"learning_rate": 0.00018706830930304585,
"loss": 10.3438,
"step": 229
},
{
"epoch": 0.09740603493912123,
"grad_norm": 0.04405641928315163,
"learning_rate": 0.000186952585483899,
"loss": 10.3432,
"step": 230
},
{
"epoch": 0.09782953943885654,
"grad_norm": 0.039611659944057465,
"learning_rate": 0.00018683638229589168,
"loss": 10.3477,
"step": 231
},
{
"epoch": 0.09825304393859184,
"grad_norm": 0.03426756337285042,
"learning_rate": 0.00018671970037965118,
"loss": 10.3482,
"step": 232
},
{
"epoch": 0.09867654843832716,
"grad_norm": 0.0545201450586319,
"learning_rate": 0.00018660254037844388,
"loss": 10.3437,
"step": 233
},
{
"epoch": 0.09910005293806247,
"grad_norm": 0.041258279234170914,
"learning_rate": 0.00018648490293817185,
"loss": 10.3463,
"step": 234
},
{
"epoch": 0.09952355743779778,
"grad_norm": 0.025181951001286507,
"learning_rate": 0.00018636678870736928,
"loss": 10.3454,
"step": 235
},
{
"epoch": 0.09994706193753308,
"grad_norm": 0.02877328358590603,
"learning_rate": 0.00018624819833719896,
"loss": 10.3448,
"step": 236
},
{
"epoch": 0.1003705664372684,
"grad_norm": 0.049800995737314224,
"learning_rate": 0.00018612913248144852,
"loss": 10.3473,
"step": 237
},
{
"epoch": 0.10079407093700371,
"grad_norm": 0.02784821018576622,
"learning_rate": 0.0001860095917965271,
"loss": 10.3458,
"step": 238
},
{
"epoch": 0.10121757543673901,
"grad_norm": 0.0490335077047348,
"learning_rate": 0.00018588957694146138,
"loss": 10.3444,
"step": 239
},
{
"epoch": 0.10164107993647432,
"grad_norm": 0.03331589698791504,
"learning_rate": 0.0001857690885778923,
"loss": 10.3478,
"step": 240
},
{
"epoch": 0.10206458443620964,
"grad_norm": 0.03565964475274086,
"learning_rate": 0.00018564812737007112,
"loss": 10.3445,
"step": 241
},
{
"epoch": 0.10248808893594494,
"grad_norm": 0.036929886788129807,
"learning_rate": 0.00018552669398485598,
"loss": 10.3427,
"step": 242
},
{
"epoch": 0.10291159343568025,
"grad_norm": 0.03922504186630249,
"learning_rate": 0.0001854047890917081,
"loss": 10.3466,
"step": 243
},
{
"epoch": 0.10333509793541557,
"grad_norm": 0.03552815318107605,
"learning_rate": 0.0001852824133626881,
"loss": 10.3414,
"step": 244
},
{
"epoch": 0.10375860243515088,
"grad_norm": 0.051186930388212204,
"learning_rate": 0.0001851595674724523,
"loss": 10.3479,
"step": 245
},
{
"epoch": 0.10418210693488618,
"grad_norm": 0.03887473791837692,
"learning_rate": 0.00018503625209824906,
"loss": 10.3456,
"step": 246
},
{
"epoch": 0.10460561143462149,
"grad_norm": 0.03032403625547886,
"learning_rate": 0.00018491246791991502,
"loss": 10.3421,
"step": 247
},
{
"epoch": 0.1050291159343568,
"grad_norm": 0.03820972517132759,
"learning_rate": 0.0001847882156198713,
"loss": 10.3479,
"step": 248
},
{
"epoch": 0.1054526204340921,
"grad_norm": 0.04590925946831703,
"learning_rate": 0.0001846634958831197,
"loss": 10.3442,
"step": 249
},
{
"epoch": 0.10587612493382742,
"grad_norm": 0.033261772245168686,
"learning_rate": 0.00018453830939723913,
"loss": 10.3457,
"step": 250
},
{
"epoch": 0.10629962943356273,
"grad_norm": 0.037462469190359116,
"learning_rate": 0.00018441265685238158,
"loss": 10.3421,
"step": 251
},
{
"epoch": 0.10672313393329805,
"grad_norm": 0.0378030389547348,
"learning_rate": 0.00018428653894126846,
"loss": 10.345,
"step": 252
},
{
"epoch": 0.10714663843303335,
"grad_norm": 0.030371299013495445,
"learning_rate": 0.00018415995635918676,
"loss": 10.3488,
"step": 253
},
{
"epoch": 0.10757014293276866,
"grad_norm": 0.028029056265950203,
"learning_rate": 0.00018403290980398512,
"loss": 10.3436,
"step": 254
},
{
"epoch": 0.10799364743250398,
"grad_norm": 0.03999907523393631,
"learning_rate": 0.00018390539997607014,
"loss": 10.3432,
"step": 255
},
{
"epoch": 0.10841715193223928,
"grad_norm": 0.046100400388240814,
"learning_rate": 0.00018377742757840244,
"loss": 10.3444,
"step": 256
},
{
"epoch": 0.10884065643197459,
"grad_norm": 0.03245000168681145,
"learning_rate": 0.0001836489933164927,
"loss": 10.3434,
"step": 257
},
{
"epoch": 0.1092641609317099,
"grad_norm": 0.028921889141201973,
"learning_rate": 0.000183520097898398,
"loss": 10.3453,
"step": 258
},
{
"epoch": 0.1096876654314452,
"grad_norm": 0.033345550298690796,
"learning_rate": 0.00018339074203471757,
"loss": 10.3431,
"step": 259
},
{
"epoch": 0.11011116993118052,
"grad_norm": 0.036411840468645096,
"learning_rate": 0.00018326092643858923,
"loss": 10.3433,
"step": 260
},
{
"epoch": 0.11053467443091583,
"grad_norm": 0.039705585688352585,
"learning_rate": 0.00018313065182568527,
"loss": 10.3447,
"step": 261
},
{
"epoch": 0.11095817893065114,
"grad_norm": 0.0351327620446682,
"learning_rate": 0.00018299991891420847,
"loss": 10.3451,
"step": 262
},
{
"epoch": 0.11138168343038644,
"grad_norm": 0.04147129878401756,
"learning_rate": 0.00018286872842488832,
"loss": 10.3408,
"step": 263
},
{
"epoch": 0.11180518793012176,
"grad_norm": 0.036880653351545334,
"learning_rate": 0.00018273708108097677,
"loss": 10.3433,
"step": 264
},
{
"epoch": 0.11222869242985707,
"grad_norm": 0.05368657410144806,
"learning_rate": 0.00018260497760824458,
"loss": 10.3491,
"step": 265
},
{
"epoch": 0.11265219692959237,
"grad_norm": 0.03917551413178444,
"learning_rate": 0.00018247241873497707,
"loss": 10.3421,
"step": 266
},
{
"epoch": 0.11307570142932769,
"grad_norm": 0.03874152526259422,
"learning_rate": 0.0001823394051919701,
"loss": 10.3435,
"step": 267
},
{
"epoch": 0.113499205929063,
"grad_norm": 0.04026317596435547,
"learning_rate": 0.0001822059377125263,
"loss": 10.3456,
"step": 268
},
{
"epoch": 0.11392271042879831,
"grad_norm": 0.06843981891870499,
"learning_rate": 0.00018207201703245062,
"loss": 10.3463,
"step": 269
},
{
"epoch": 0.11434621492853361,
"grad_norm": 0.05057435482740402,
"learning_rate": 0.00018193764389004674,
"loss": 10.3409,
"step": 270
},
{
"epoch": 0.11476971942826893,
"grad_norm": 0.03488187864422798,
"learning_rate": 0.0001818028190261126,
"loss": 10.3471,
"step": 271
},
{
"epoch": 0.11519322392800424,
"grad_norm": 0.05958685651421547,
"learning_rate": 0.0001816675431839365,
"loss": 10.3482,
"step": 272
},
{
"epoch": 0.11561672842773954,
"grad_norm": 0.03940315917134285,
"learning_rate": 0.000181531817109293,
"loss": 10.3487,
"step": 273
},
{
"epoch": 0.11604023292747485,
"grad_norm": 0.044273462146520615,
"learning_rate": 0.00018139564155043885,
"loss": 10.3443,
"step": 274
},
{
"epoch": 0.11646373742721017,
"grad_norm": 0.028659025207161903,
"learning_rate": 0.00018125901725810865,
"loss": 10.346,
"step": 275
},
{
"epoch": 0.11688724192694547,
"grad_norm": 0.054137472063302994,
"learning_rate": 0.00018112194498551106,
"loss": 10.3445,
"step": 276
},
{
"epoch": 0.11731074642668078,
"grad_norm": 0.028421467170119286,
"learning_rate": 0.00018098442548832426,
"loss": 10.3408,
"step": 277
},
{
"epoch": 0.1177342509264161,
"grad_norm": 0.036345433443784714,
"learning_rate": 0.0001808464595246921,
"loss": 10.3425,
"step": 278
},
{
"epoch": 0.11815775542615141,
"grad_norm": 0.034704405814409256,
"learning_rate": 0.00018070804785521975,
"loss": 10.3469,
"step": 279
},
{
"epoch": 0.11858125992588671,
"grad_norm": 0.04459795728325844,
"learning_rate": 0.0001805691912429696,
"loss": 10.3433,
"step": 280
},
{
"epoch": 0.11900476442562202,
"grad_norm": 0.03889624401926994,
"learning_rate": 0.0001804298904534569,
"loss": 10.3421,
"step": 281
},
{
"epoch": 0.11942826892535734,
"grad_norm": 0.03947869688272476,
"learning_rate": 0.0001802901462546457,
"loss": 10.3403,
"step": 282
},
{
"epoch": 0.11985177342509264,
"grad_norm": 0.055074214935302734,
"learning_rate": 0.00018014995941694468,
"loss": 10.344,
"step": 283
},
{
"epoch": 0.12027527792482795,
"grad_norm": 0.029949650168418884,
"learning_rate": 0.00018000933071320258,
"loss": 10.3431,
"step": 284
},
{
"epoch": 0.12069878242456326,
"grad_norm": 0.041310764849185944,
"learning_rate": 0.0001798682609187043,
"loss": 10.3423,
"step": 285
},
{
"epoch": 0.12112228692429856,
"grad_norm": 0.0433335155248642,
"learning_rate": 0.00017972675081116637,
"loss": 10.3431,
"step": 286
},
{
"epoch": 0.12154579142403388,
"grad_norm": 0.03031458891928196,
"learning_rate": 0.0001795848011707328,
"loss": 10.3417,
"step": 287
},
{
"epoch": 0.12196929592376919,
"grad_norm": 0.03494204208254814,
"learning_rate": 0.00017944241277997077,
"loss": 10.345,
"step": 288
},
{
"epoch": 0.1223928004235045,
"grad_norm": 0.027185741811990738,
"learning_rate": 0.0001792995864238663,
"loss": 10.3429,
"step": 289
},
{
"epoch": 0.1228163049232398,
"grad_norm": 0.056868940591812134,
"learning_rate": 0.00017915632288981978,
"loss": 10.3404,
"step": 290
},
{
"epoch": 0.12323980942297512,
"grad_norm": 0.03859318792819977,
"learning_rate": 0.0001790126229676419,
"loss": 10.3404,
"step": 291
},
{
"epoch": 0.12366331392271043,
"grad_norm": 0.029093610122799873,
"learning_rate": 0.0001788684874495491,
"loss": 10.3451,
"step": 292
},
{
"epoch": 0.12408681842244573,
"grad_norm": 0.0350499302148819,
"learning_rate": 0.00017872391713015924,
"loss": 10.3397,
"step": 293
},
{
"epoch": 0.12451032292218105,
"grad_norm": 0.03968917950987816,
"learning_rate": 0.00017857891280648728,
"loss": 10.3428,
"step": 294
},
{
"epoch": 0.12493382742191636,
"grad_norm": 0.038241248577833176,
"learning_rate": 0.00017843347527794081,
"loss": 10.3415,
"step": 295
},
{
"epoch": 0.12535733192165166,
"grad_norm": 0.03694219887256622,
"learning_rate": 0.00017828760534631565,
"loss": 10.341,
"step": 296
},
{
"epoch": 0.125780836421387,
"grad_norm": 0.03630373254418373,
"learning_rate": 0.00017814130381579155,
"loss": 10.3388,
"step": 297
},
{
"epoch": 0.1262043409211223,
"grad_norm": 0.031546298414468765,
"learning_rate": 0.00017799457149292753,
"loss": 10.3418,
"step": 298
},
{
"epoch": 0.1266278454208576,
"grad_norm": 0.05403247848153114,
"learning_rate": 0.00017784740918665767,
"loss": 10.3451,
"step": 299
},
{
"epoch": 0.12705134992059292,
"grad_norm": 0.03717590495944023,
"learning_rate": 0.00017769981770828652,
"loss": 10.3419,
"step": 300
},
{
"epoch": 0.12747485442032822,
"grad_norm": 0.05108652263879776,
"learning_rate": 0.0001775517978714846,
"loss": 10.3414,
"step": 301
},
{
"epoch": 0.12789835892006352,
"grad_norm": 0.04460764676332474,
"learning_rate": 0.000177403350492284,
"loss": 10.3443,
"step": 302
},
{
"epoch": 0.12832186341979884,
"grad_norm": 0.049197494983673096,
"learning_rate": 0.00017725447638907392,
"loss": 10.3426,
"step": 303
},
{
"epoch": 0.12874536791953414,
"grad_norm": 0.029134899377822876,
"learning_rate": 0.0001771051763825959,
"loss": 10.3409,
"step": 304
},
{
"epoch": 0.12916887241926944,
"grad_norm": 0.03543701395392418,
"learning_rate": 0.00017695545129593973,
"loss": 10.3442,
"step": 305
},
{
"epoch": 0.12959237691900477,
"grad_norm": 0.03356650099158287,
"learning_rate": 0.00017680530195453845,
"loss": 10.3429,
"step": 306
},
{
"epoch": 0.13001588141874007,
"grad_norm": 0.0437590628862381,
"learning_rate": 0.00017665472918616413,
"loss": 10.3449,
"step": 307
},
{
"epoch": 0.13043938591847537,
"grad_norm": 0.03688879683613777,
"learning_rate": 0.00017650373382092314,
"loss": 10.3398,
"step": 308
},
{
"epoch": 0.1308628904182107,
"grad_norm": 0.02486329711973667,
"learning_rate": 0.00017635231669125165,
"loss": 10.3408,
"step": 309
},
{
"epoch": 0.131286394917946,
"grad_norm": 0.03931661695241928,
"learning_rate": 0.000176200478631911,
"loss": 10.3393,
"step": 310
},
{
"epoch": 0.13170989941768133,
"grad_norm": 0.03328394889831543,
"learning_rate": 0.00017604822047998306,
"loss": 10.3418,
"step": 311
},
{
"epoch": 0.13213340391741663,
"grad_norm": 0.04236508905887604,
"learning_rate": 0.0001758955430748658,
"loss": 10.3432,
"step": 312
},
{
"epoch": 0.13255690841715193,
"grad_norm": 0.026039429008960724,
"learning_rate": 0.0001757424472582684,
"loss": 10.3464,
"step": 313
},
{
"epoch": 0.13298041291688725,
"grad_norm": 0.0246927160769701,
"learning_rate": 0.00017558893387420682,
"loss": 10.3451,
"step": 314
},
{
"epoch": 0.13340391741662255,
"grad_norm": 0.030340131372213364,
"learning_rate": 0.00017543500376899902,
"loss": 10.3401,
"step": 315
},
{
"epoch": 0.13382742191635785,
"grad_norm": 0.04260968044400215,
"learning_rate": 0.00017528065779126033,
"loss": 10.3414,
"step": 316
},
{
"epoch": 0.13425092641609318,
"grad_norm": 0.03421700373291969,
"learning_rate": 0.00017512589679189887,
"loss": 10.3402,
"step": 317
},
{
"epoch": 0.13467443091582848,
"grad_norm": 0.03428565710783005,
"learning_rate": 0.0001749707216241106,
"loss": 10.3406,
"step": 318
},
{
"epoch": 0.13509793541556378,
"grad_norm": 0.042442288249731064,
"learning_rate": 0.000174815133143375,
"loss": 10.3381,
"step": 319
},
{
"epoch": 0.1355214399152991,
"grad_norm": 0.0397978350520134,
"learning_rate": 0.00017465913220744998,
"loss": 10.3427,
"step": 320
},
{
"epoch": 0.1359449444150344,
"grad_norm": 0.03605269640684128,
"learning_rate": 0.00017450271967636737,
"loss": 10.3397,
"step": 321
},
{
"epoch": 0.1363684489147697,
"grad_norm": 0.034129634499549866,
"learning_rate": 0.00017434589641242813,
"loss": 10.3463,
"step": 322
},
{
"epoch": 0.13679195341450504,
"grad_norm": 0.03590450435876846,
"learning_rate": 0.0001741886632801976,
"loss": 10.3416,
"step": 323
},
{
"epoch": 0.13721545791424034,
"grad_norm": 0.040069352835416794,
"learning_rate": 0.0001740310211465006,
"loss": 10.3427,
"step": 324
},
{
"epoch": 0.13763896241397564,
"grad_norm": 0.03840317204594612,
"learning_rate": 0.00017387297088041693,
"loss": 10.3431,
"step": 325
},
{
"epoch": 0.13806246691371096,
"grad_norm": 0.04085763916373253,
"learning_rate": 0.0001737145133532764,
"loss": 10.3379,
"step": 326
},
{
"epoch": 0.13848597141344626,
"grad_norm": 0.03601207211613655,
"learning_rate": 0.0001735556494386539,
"loss": 10.3407,
"step": 327
},
{
"epoch": 0.1389094759131816,
"grad_norm": 0.03058718331158161,
"learning_rate": 0.00017339638001236492,
"loss": 10.3411,
"step": 328
},
{
"epoch": 0.1393329804129169,
"grad_norm": 0.03896321728825569,
"learning_rate": 0.0001732367059524604,
"loss": 10.3426,
"step": 329
},
{
"epoch": 0.1397564849126522,
"grad_norm": 0.040502067655324936,
"learning_rate": 0.0001730766281392221,
"loss": 10.3411,
"step": 330
},
{
"epoch": 0.14017998941238752,
"grad_norm": 0.032813332974910736,
"learning_rate": 0.0001729161474551576,
"loss": 10.343,
"step": 331
},
{
"epoch": 0.14060349391212282,
"grad_norm": 0.032831039279699326,
"learning_rate": 0.00017275526478499555,
"loss": 10.3403,
"step": 332
},
{
"epoch": 0.14102699841185812,
"grad_norm": 0.033066242933273315,
"learning_rate": 0.00017259398101568076,
"loss": 10.3439,
"step": 333
},
{
"epoch": 0.14145050291159345,
"grad_norm": 0.032812707126140594,
"learning_rate": 0.00017243229703636922,
"loss": 10.3396,
"step": 334
},
{
"epoch": 0.14187400741132875,
"grad_norm": 0.03849957883358002,
"learning_rate": 0.0001722702137384234,
"loss": 10.3437,
"step": 335
},
{
"epoch": 0.14229751191106405,
"grad_norm": 0.047831226140260696,
"learning_rate": 0.00017210773201540707,
"loss": 10.3375,
"step": 336
},
{
"epoch": 0.14272101641079937,
"grad_norm": 0.04042219743132591,
"learning_rate": 0.0001719448527630806,
"loss": 10.3405,
"step": 337
},
{
"epoch": 0.14272101641079937,
"eval_loss": 10.340270042419434,
"eval_runtime": 3.4931,
"eval_samples_per_second": 284.85,
"eval_steps_per_second": 142.568,
"step": 337
},
{
"epoch": 0.14314452091053467,
"grad_norm": 0.030297674238681793,
"learning_rate": 0.00017178157687939592,
"loss": 10.3392,
"step": 338
},
{
"epoch": 0.14356802541026997,
"grad_norm": 0.030716104432940483,
"learning_rate": 0.00017161790526449156,
"loss": 10.3387,
"step": 339
},
{
"epoch": 0.1439915299100053,
"grad_norm": 0.034860242158174515,
"learning_rate": 0.00017145383882068778,
"loss": 10.3383,
"step": 340
},
{
"epoch": 0.1444150344097406,
"grad_norm": 0.04767249897122383,
"learning_rate": 0.00017128937845248146,
"loss": 10.3434,
"step": 341
},
{
"epoch": 0.1448385389094759,
"grad_norm": 0.02438390627503395,
"learning_rate": 0.00017112452506654117,
"loss": 10.3438,
"step": 342
},
{
"epoch": 0.14526204340921123,
"grad_norm": 0.04478878155350685,
"learning_rate": 0.00017095927957170228,
"loss": 10.3411,
"step": 343
},
{
"epoch": 0.14568554790894653,
"grad_norm": 0.03832190856337547,
"learning_rate": 0.00017079364287896174,
"loss": 10.3427,
"step": 344
},
{
"epoch": 0.14610905240868186,
"grad_norm": 0.03669346123933792,
"learning_rate": 0.00017062761590147323,
"loss": 10.3416,
"step": 345
},
{
"epoch": 0.14653255690841716,
"grad_norm": 0.03234705701470375,
"learning_rate": 0.00017046119955454206,
"loss": 10.3382,
"step": 346
},
{
"epoch": 0.14695606140815246,
"grad_norm": 0.028199292719364166,
"learning_rate": 0.00017029439475562015,
"loss": 10.3395,
"step": 347
},
{
"epoch": 0.14737956590788778,
"grad_norm": 0.03456572815775871,
"learning_rate": 0.0001701272024243009,
"loss": 10.3412,
"step": 348
},
{
"epoch": 0.14780307040762308,
"grad_norm": 0.041843071579933167,
"learning_rate": 0.00016995962348231424,
"loss": 10.3384,
"step": 349
},
{
"epoch": 0.14822657490735838,
"grad_norm": 0.0353543683886528,
"learning_rate": 0.0001697916588535214,
"loss": 10.3402,
"step": 350
},
{
"epoch": 0.1486500794070937,
"grad_norm": 0.03074280545115471,
"learning_rate": 0.00016962330946391,
"loss": 10.3408,
"step": 351
},
{
"epoch": 0.149073583906829,
"grad_norm": 0.029107527807354927,
"learning_rate": 0.00016945457624158871,
"loss": 10.3404,
"step": 352
},
{
"epoch": 0.1494970884065643,
"grad_norm": 0.028990836814045906,
"learning_rate": 0.00016928546011678238,
"loss": 10.3366,
"step": 353
},
{
"epoch": 0.14992059290629964,
"grad_norm": 0.026332266628742218,
"learning_rate": 0.00016911596202182677,
"loss": 10.3423,
"step": 354
},
{
"epoch": 0.15034409740603494,
"grad_norm": 0.044704243540763855,
"learning_rate": 0.00016894608289116342,
"loss": 10.3407,
"step": 355
},
{
"epoch": 0.15076760190577024,
"grad_norm": 0.036491744220256805,
"learning_rate": 0.00016877582366133455,
"loss": 10.3393,
"step": 356
},
{
"epoch": 0.15119110640550557,
"grad_norm": 0.02925538271665573,
"learning_rate": 0.0001686051852709778,
"loss": 10.3394,
"step": 357
},
{
"epoch": 0.15161461090524087,
"grad_norm": 0.03139074891805649,
"learning_rate": 0.00016843416866082117,
"loss": 10.3381,
"step": 358
},
{
"epoch": 0.15203811540497617,
"grad_norm": 0.03710184246301651,
"learning_rate": 0.00016826277477367775,
"loss": 10.3378,
"step": 359
},
{
"epoch": 0.1524616199047115,
"grad_norm": 0.0361437126994133,
"learning_rate": 0.0001680910045544406,
"loss": 10.3408,
"step": 360
},
{
"epoch": 0.1528851244044468,
"grad_norm": 0.04383867606520653,
"learning_rate": 0.0001679188589500775,
"loss": 10.3415,
"step": 361
},
{
"epoch": 0.15330862890418212,
"grad_norm": 0.03228599205613136,
"learning_rate": 0.0001677463389096256,
"loss": 10.3413,
"step": 362
},
{
"epoch": 0.15373213340391742,
"grad_norm": 0.03311069682240486,
"learning_rate": 0.00016757344538418653,
"loss": 10.3409,
"step": 363
},
{
"epoch": 0.15415563790365272,
"grad_norm": 0.037153564393520355,
"learning_rate": 0.00016740017932692075,
"loss": 10.338,
"step": 364
},
{
"epoch": 0.15457914240338805,
"grad_norm": 0.03567847982048988,
"learning_rate": 0.00016722654169304253,
"loss": 10.3395,
"step": 365
},
{
"epoch": 0.15500264690312335,
"grad_norm": 0.026938440278172493,
"learning_rate": 0.0001670525334398147,
"loss": 10.3397,
"step": 366
},
{
"epoch": 0.15542615140285865,
"grad_norm": 0.02322826161980629,
"learning_rate": 0.00016687815552654327,
"loss": 10.3386,
"step": 367
},
{
"epoch": 0.15584965590259398,
"grad_norm": 0.03586160019040108,
"learning_rate": 0.00016670340891457216,
"loss": 10.3396,
"step": 368
},
{
"epoch": 0.15627316040232928,
"grad_norm": 0.03536440059542656,
"learning_rate": 0.00016652829456727797,
"loss": 10.3412,
"step": 369
},
{
"epoch": 0.15669666490206458,
"grad_norm": 0.025009091943502426,
"learning_rate": 0.00016635281345006461,
"loss": 10.34,
"step": 370
},
{
"epoch": 0.1571201694017999,
"grad_norm": 0.02612980827689171,
"learning_rate": 0.00016617696653035795,
"loss": 10.3401,
"step": 371
},
{
"epoch": 0.1575436739015352,
"grad_norm": 0.04117359593510628,
"learning_rate": 0.00016600075477760058,
"loss": 10.3393,
"step": 372
},
{
"epoch": 0.1579671784012705,
"grad_norm": 0.0326978899538517,
"learning_rate": 0.00016582417916324635,
"loss": 10.3384,
"step": 373
},
{
"epoch": 0.15839068290100583,
"grad_norm": 0.044377487152814865,
"learning_rate": 0.00016564724066075515,
"loss": 10.3382,
"step": 374
},
{
"epoch": 0.15881418740074113,
"grad_norm": 0.050321295857429504,
"learning_rate": 0.00016546994024558743,
"loss": 10.3387,
"step": 375
},
{
"epoch": 0.15923769190047643,
"grad_norm": 0.022547969594597816,
"learning_rate": 0.00016529227889519886,
"loss": 10.3385,
"step": 376
},
{
"epoch": 0.15966119640021176,
"grad_norm": 0.034384775906801224,
"learning_rate": 0.00016511425758903493,
"loss": 10.3391,
"step": 377
},
{
"epoch": 0.16008470089994706,
"grad_norm": 0.02677147649228573,
"learning_rate": 0.00016493587730852558,
"loss": 10.3399,
"step": 378
},
{
"epoch": 0.16050820539968239,
"grad_norm": 0.03600003570318222,
"learning_rate": 0.00016475713903707978,
"loss": 10.3418,
"step": 379
},
{
"epoch": 0.16093170989941769,
"grad_norm": 0.032675545662641525,
"learning_rate": 0.00016457804376008008,
"loss": 10.3388,
"step": 380
},
{
"epoch": 0.16135521439915299,
"grad_norm": 0.03568057715892792,
"learning_rate": 0.00016439859246487724,
"loss": 10.3362,
"step": 381
},
{
"epoch": 0.1617787188988883,
"grad_norm": 0.035958852618932724,
"learning_rate": 0.00016421878614078468,
"loss": 10.3396,
"step": 382
},
{
"epoch": 0.1622022233986236,
"grad_norm": 0.03066675178706646,
"learning_rate": 0.00016403862577907315,
"loss": 10.3426,
"step": 383
},
{
"epoch": 0.1626257278983589,
"grad_norm": 0.042271457612514496,
"learning_rate": 0.0001638581123729652,
"loss": 10.3404,
"step": 384
},
{
"epoch": 0.16304923239809424,
"grad_norm": 0.034284938126802444,
"learning_rate": 0.00016367724691762967,
"loss": 10.3381,
"step": 385
},
{
"epoch": 0.16347273689782954,
"grad_norm": 0.023705342784523964,
"learning_rate": 0.00016349603041017626,
"loss": 10.3375,
"step": 386
},
{
"epoch": 0.16389624139756484,
"grad_norm": 0.031792912632226944,
"learning_rate": 0.00016331446384965003,
"loss": 10.3383,
"step": 387
},
{
"epoch": 0.16431974589730017,
"grad_norm": 0.035305608063936234,
"learning_rate": 0.0001631325482370259,
"loss": 10.3434,
"step": 388
},
{
"epoch": 0.16474325039703547,
"grad_norm": 0.03486499562859535,
"learning_rate": 0.00016295028457520306,
"loss": 10.3428,
"step": 389
},
{
"epoch": 0.16516675489677077,
"grad_norm": 0.03409821167588234,
"learning_rate": 0.00016276767386899955,
"loss": 10.3386,
"step": 390
},
{
"epoch": 0.1655902593965061,
"grad_norm": 0.02966834418475628,
"learning_rate": 0.0001625847171251466,
"loss": 10.3393,
"step": 391
},
{
"epoch": 0.1660137638962414,
"grad_norm": 0.02835707552731037,
"learning_rate": 0.00016240141535228323,
"loss": 10.3388,
"step": 392
},
{
"epoch": 0.1664372683959767,
"grad_norm": 0.03911609947681427,
"learning_rate": 0.00016221776956095046,
"loss": 10.3423,
"step": 393
},
{
"epoch": 0.16686077289571202,
"grad_norm": 0.02803829312324524,
"learning_rate": 0.00016203378076358598,
"loss": 10.3427,
"step": 394
},
{
"epoch": 0.16728427739544732,
"grad_norm": 0.03135819733142853,
"learning_rate": 0.00016184944997451854,
"loss": 10.3364,
"step": 395
},
{
"epoch": 0.16770778189518262,
"grad_norm": 0.03102540783584118,
"learning_rate": 0.00016166477820996216,
"loss": 10.3403,
"step": 396
},
{
"epoch": 0.16813128639491795,
"grad_norm": 0.026423562318086624,
"learning_rate": 0.0001614797664880107,
"loss": 10.3372,
"step": 397
},
{
"epoch": 0.16855479089465325,
"grad_norm": 0.03439650684595108,
"learning_rate": 0.00016129441582863217,
"loss": 10.342,
"step": 398
},
{
"epoch": 0.16897829539438858,
"grad_norm": 0.03104579448699951,
"learning_rate": 0.00016110872725366316,
"loss": 10.3377,
"step": 399
},
{
"epoch": 0.16940179989412388,
"grad_norm": 0.03529537469148636,
"learning_rate": 0.0001609227017868033,
"loss": 10.3395,
"step": 400
},
{
"epoch": 0.16982530439385918,
"grad_norm": 0.03002871572971344,
"learning_rate": 0.00016073634045360932,
"loss": 10.3411,
"step": 401
},
{
"epoch": 0.1702488088935945,
"grad_norm": 0.03288958594202995,
"learning_rate": 0.00016054964428148963,
"loss": 10.3367,
"step": 402
},
{
"epoch": 0.1706723133933298,
"grad_norm": 0.026648705825209618,
"learning_rate": 0.00016036261429969867,
"loss": 10.3367,
"step": 403
},
{
"epoch": 0.1710958178930651,
"grad_norm": 0.035582203418016434,
"learning_rate": 0.00016017525153933114,
"loss": 10.3375,
"step": 404
},
{
"epoch": 0.17151932239280043,
"grad_norm": 0.024190323427319527,
"learning_rate": 0.00015998755703331634,
"loss": 10.3416,
"step": 405
},
{
"epoch": 0.17194282689253573,
"grad_norm": 0.03089403547346592,
"learning_rate": 0.00015979953181641246,
"loss": 10.3418,
"step": 406
},
{
"epoch": 0.17236633139227103,
"grad_norm": 0.026094770058989525,
"learning_rate": 0.00015961117692520088,
"loss": 10.3357,
"step": 407
},
{
"epoch": 0.17278983589200636,
"grad_norm": 0.04286188259720802,
"learning_rate": 0.00015942249339808058,
"loss": 10.3408,
"step": 408
},
{
"epoch": 0.17321334039174166,
"grad_norm": 0.0313500352203846,
"learning_rate": 0.00015923348227526218,
"loss": 10.3354,
"step": 409
},
{
"epoch": 0.17363684489147696,
"grad_norm": 0.03544219583272934,
"learning_rate": 0.00015904414459876238,
"loss": 10.3367,
"step": 410
},
{
"epoch": 0.1740603493912123,
"grad_norm": 0.03017052263021469,
"learning_rate": 0.00015885448141239822,
"loss": 10.3418,
"step": 411
},
{
"epoch": 0.1744838538909476,
"grad_norm": 0.030451800674200058,
"learning_rate": 0.00015866449376178117,
"loss": 10.3386,
"step": 412
},
{
"epoch": 0.1749073583906829,
"grad_norm": 0.035226162523031235,
"learning_rate": 0.00015847418269431153,
"loss": 10.3412,
"step": 413
},
{
"epoch": 0.17533086289041822,
"grad_norm": 0.02857392653822899,
"learning_rate": 0.00015828354925917262,
"loss": 10.3414,
"step": 414
},
{
"epoch": 0.17575436739015352,
"grad_norm": 0.050622567534446716,
"learning_rate": 0.00015809259450732494,
"loss": 10.3392,
"step": 415
},
{
"epoch": 0.17617787188988884,
"grad_norm": 0.0338461808860302,
"learning_rate": 0.00015790131949150035,
"loss": 10.3419,
"step": 416
},
{
"epoch": 0.17660137638962414,
"grad_norm": 0.027923308312892914,
"learning_rate": 0.00015770972526619646,
"loss": 10.3385,
"step": 417
},
{
"epoch": 0.17702488088935944,
"grad_norm": 0.03212830424308777,
"learning_rate": 0.0001575178128876705,
"loss": 10.339,
"step": 418
},
{
"epoch": 0.17744838538909477,
"grad_norm": 0.020661218091845512,
"learning_rate": 0.00015732558341393385,
"loss": 10.338,
"step": 419
},
{
"epoch": 0.17787188988883007,
"grad_norm": 0.02785920538008213,
"learning_rate": 0.00015713303790474594,
"loss": 10.3392,
"step": 420
},
{
"epoch": 0.17829539438856537,
"grad_norm": 0.018963869661092758,
"learning_rate": 0.00015694017742160846,
"loss": 10.3381,
"step": 421
},
{
"epoch": 0.1787188988883007,
"grad_norm": 0.02660539373755455,
"learning_rate": 0.0001567470030277597,
"loss": 10.3389,
"step": 422
},
{
"epoch": 0.179142403388036,
"grad_norm": 0.03342144191265106,
"learning_rate": 0.00015655351578816834,
"loss": 10.3395,
"step": 423
},
{
"epoch": 0.1795659078877713,
"grad_norm": 0.03541478142142296,
"learning_rate": 0.00015635971676952797,
"loss": 10.3356,
"step": 424
},
{
"epoch": 0.17998941238750663,
"grad_norm": 0.04339861124753952,
"learning_rate": 0.00015616560704025088,
"loss": 10.34,
"step": 425
},
{
"epoch": 0.18041291688724193,
"grad_norm": 0.030052557587623596,
"learning_rate": 0.00015597118767046232,
"loss": 10.3366,
"step": 426
},
{
"epoch": 0.18083642138697723,
"grad_norm": 0.03362065181136131,
"learning_rate": 0.00015577645973199465,
"loss": 10.3446,
"step": 427
},
{
"epoch": 0.18125992588671255,
"grad_norm": 0.033407680690288544,
"learning_rate": 0.00015558142429838133,
"loss": 10.3382,
"step": 428
},
{
"epoch": 0.18168343038644785,
"grad_norm": 0.03306809812784195,
"learning_rate": 0.00015538608244485103,
"loss": 10.3391,
"step": 429
},
{
"epoch": 0.18210693488618315,
"grad_norm": 0.035972122102975845,
"learning_rate": 0.0001551904352483217,
"loss": 10.3378,
"step": 430
},
{
"epoch": 0.18253043938591848,
"grad_norm": 0.02942793443799019,
"learning_rate": 0.0001549944837873947,
"loss": 10.341,
"step": 431
},
{
"epoch": 0.18295394388565378,
"grad_norm": 0.0311295036226511,
"learning_rate": 0.00015479822914234875,
"loss": 10.3427,
"step": 432
},
{
"epoch": 0.1833774483853891,
"grad_norm": 0.03349452093243599,
"learning_rate": 0.00015460167239513396,
"loss": 10.3335,
"step": 433
},
{
"epoch": 0.1838009528851244,
"grad_norm": 0.024683522060513496,
"learning_rate": 0.00015440481462936613,
"loss": 10.3403,
"step": 434
},
{
"epoch": 0.1842244573848597,
"grad_norm": 0.02533009834587574,
"learning_rate": 0.00015420765693032035,
"loss": 10.3352,
"step": 435
},
{
"epoch": 0.18464796188459504,
"grad_norm": 0.02682666666805744,
"learning_rate": 0.0001540102003849253,
"loss": 10.3351,
"step": 436
},
{
"epoch": 0.18507146638433034,
"grad_norm": 0.026133093982934952,
"learning_rate": 0.0001538124460817573,
"loss": 10.3377,
"step": 437
},
{
"epoch": 0.18549497088406564,
"grad_norm": 0.04049040377140045,
"learning_rate": 0.00015361439511103414,
"loss": 10.3402,
"step": 438
},
{
"epoch": 0.18591847538380096,
"grad_norm": 0.02733178623020649,
"learning_rate": 0.00015341604856460904,
"loss": 10.3352,
"step": 439
},
{
"epoch": 0.18634197988353626,
"grad_norm": 0.02330494113266468,
"learning_rate": 0.0001532174075359649,
"loss": 10.341,
"step": 440
},
{
"epoch": 0.18676548438327156,
"grad_norm": 0.03259949013590813,
"learning_rate": 0.00015301847312020796,
"loss": 10.3403,
"step": 441
},
{
"epoch": 0.1871889888830069,
"grad_norm": 0.05194835364818573,
"learning_rate": 0.000152819246414062,
"loss": 10.3413,
"step": 442
},
{
"epoch": 0.1876124933827422,
"grad_norm": 0.0325242318212986,
"learning_rate": 0.0001526197285158621,
"loss": 10.3396,
"step": 443
},
{
"epoch": 0.1880359978824775,
"grad_norm": 0.02710815891623497,
"learning_rate": 0.00015241992052554876,
"loss": 10.34,
"step": 444
},
{
"epoch": 0.18845950238221282,
"grad_norm": 0.024676240980625153,
"learning_rate": 0.0001522198235446617,
"loss": 10.3362,
"step": 445
},
{
"epoch": 0.18888300688194812,
"grad_norm": 0.02788936160504818,
"learning_rate": 0.0001520194386763339,
"loss": 10.3376,
"step": 446
},
{
"epoch": 0.18930651138168342,
"grad_norm": 0.03856251761317253,
"learning_rate": 0.00015181876702528537,
"loss": 10.3352,
"step": 447
},
{
"epoch": 0.18973001588141875,
"grad_norm": 0.03264036402106285,
"learning_rate": 0.00015161780969781728,
"loss": 10.338,
"step": 448
},
{
"epoch": 0.19015352038115405,
"grad_norm": 0.027694035321474075,
"learning_rate": 0.00015141656780180558,
"loss": 10.3354,
"step": 449
},
{
"epoch": 0.19057702488088937,
"grad_norm": 0.030413135886192322,
"learning_rate": 0.00015121504244669515,
"loss": 10.3383,
"step": 450
},
{
"epoch": 0.19100052938062467,
"grad_norm": 0.03150556609034538,
"learning_rate": 0.0001510132347434936,
"loss": 10.3389,
"step": 451
},
{
"epoch": 0.19142403388035997,
"grad_norm": 0.029888266697525978,
"learning_rate": 0.000150811145804765,
"loss": 10.3394,
"step": 452
},
{
"epoch": 0.1918475383800953,
"grad_norm": 0.03171524032950401,
"learning_rate": 0.000150608776744624,
"loss": 10.334,
"step": 453
},
{
"epoch": 0.1922710428798306,
"grad_norm": 0.032492250204086304,
"learning_rate": 0.00015040612867872947,
"loss": 10.3366,
"step": 454
},
{
"epoch": 0.1926945473795659,
"grad_norm": 0.030303264036774635,
"learning_rate": 0.00015020320272427843,
"loss": 10.3366,
"step": 455
},
{
"epoch": 0.19311805187930123,
"grad_norm": 0.03860599547624588,
"learning_rate": 0.00015000000000000001,
"loss": 10.3379,
"step": 456
},
{
"epoch": 0.19354155637903653,
"grad_norm": 0.03272419795393944,
"learning_rate": 0.00014979652162614904,
"loss": 10.3352,
"step": 457
},
{
"epoch": 0.19396506087877183,
"grad_norm": 0.038201820105314255,
"learning_rate": 0.00014959276872450006,
"loss": 10.3362,
"step": 458
},
{
"epoch": 0.19438856537850716,
"grad_norm": 0.025923024863004684,
"learning_rate": 0.00014938874241834108,
"loss": 10.3403,
"step": 459
},
{
"epoch": 0.19481206987824246,
"grad_norm": 0.03889621049165726,
"learning_rate": 0.00014918444383246737,
"loss": 10.3385,
"step": 460
},
{
"epoch": 0.19523557437797776,
"grad_norm": 0.031947895884513855,
"learning_rate": 0.00014897987409317532,
"loss": 10.3385,
"step": 461
},
{
"epoch": 0.19565907887771308,
"grad_norm": 0.03579488396644592,
"learning_rate": 0.00014877503432825614,
"loss": 10.3339,
"step": 462
},
{
"epoch": 0.19608258337744838,
"grad_norm": 0.033163949847221375,
"learning_rate": 0.00014856992566698965,
"loss": 10.3402,
"step": 463
},
{
"epoch": 0.19650608787718368,
"grad_norm": 0.03128167986869812,
"learning_rate": 0.00014836454924013824,
"loss": 10.3408,
"step": 464
},
{
"epoch": 0.196929592376919,
"grad_norm": 0.04108097031712532,
"learning_rate": 0.00014815890617994034,
"loss": 10.3394,
"step": 465
},
{
"epoch": 0.1973530968766543,
"grad_norm": 0.04260754585266113,
"learning_rate": 0.0001479529976201044,
"loss": 10.3428,
"step": 466
},
{
"epoch": 0.19777660137638964,
"grad_norm": 0.027531959116458893,
"learning_rate": 0.00014774682469580248,
"loss": 10.3395,
"step": 467
},
{
"epoch": 0.19820010587612494,
"grad_norm": 0.028333760797977448,
"learning_rate": 0.00014754038854366424,
"loss": 10.3374,
"step": 468
},
{
"epoch": 0.19862361037586024,
"grad_norm": 0.029396837577223778,
"learning_rate": 0.00014733369030177042,
"loss": 10.3363,
"step": 469
},
{
"epoch": 0.19904711487559557,
"grad_norm": 0.029380813241004944,
"learning_rate": 0.00014712673110964665,
"loss": 10.3372,
"step": 470
},
{
"epoch": 0.19947061937533087,
"grad_norm": 0.02283712849020958,
"learning_rate": 0.0001469195121082571,
"loss": 10.3408,
"step": 471
},
{
"epoch": 0.19989412387506617,
"grad_norm": 0.025367606431245804,
"learning_rate": 0.00014671203443999845,
"loss": 10.3383,
"step": 472
},
{
"epoch": 0.2003176283748015,
"grad_norm": 0.034685924649238586,
"learning_rate": 0.0001465042992486933,
"loss": 10.3373,
"step": 473
},
{
"epoch": 0.2007411328745368,
"grad_norm": 0.0398382693529129,
"learning_rate": 0.00014629630767958396,
"loss": 10.3374,
"step": 474
},
{
"epoch": 0.2011646373742721,
"grad_norm": 0.03815117105841637,
"learning_rate": 0.00014608806087932619,
"loss": 10.3382,
"step": 475
},
{
"epoch": 0.20158814187400742,
"grad_norm": 0.028847893700003624,
"learning_rate": 0.0001458795599959828,
"loss": 10.3355,
"step": 476
},
{
"epoch": 0.20201164637374272,
"grad_norm": 0.033290982246398926,
"learning_rate": 0.00014567080617901735,
"loss": 10.3353,
"step": 477
},
{
"epoch": 0.20243515087347802,
"grad_norm": 0.03120148368179798,
"learning_rate": 0.00014546180057928792,
"loss": 10.3365,
"step": 478
},
{
"epoch": 0.20285865537321335,
"grad_norm": 0.03227855637669563,
"learning_rate": 0.00014525254434904055,
"loss": 10.3373,
"step": 479
},
{
"epoch": 0.20328215987294865,
"grad_norm": 0.02253713831305504,
"learning_rate": 0.00014504303864190307,
"loss": 10.3379,
"step": 480
},
{
"epoch": 0.20370566437268395,
"grad_norm": 0.027942582964897156,
"learning_rate": 0.00014483328461287862,
"loss": 10.3387,
"step": 481
},
{
"epoch": 0.20412916887241928,
"grad_norm": 0.028897034004330635,
"learning_rate": 0.0001446232834183394,
"loss": 10.3406,
"step": 482
},
{
"epoch": 0.20455267337215458,
"grad_norm": 0.03516876697540283,
"learning_rate": 0.00014441303621602017,
"loss": 10.3317,
"step": 483
},
{
"epoch": 0.20497617787188988,
"grad_norm": 0.030100248754024506,
"learning_rate": 0.00014420254416501197,
"loss": 10.3365,
"step": 484
},
{
"epoch": 0.2053996823716252,
"grad_norm": 0.020048066973686218,
"learning_rate": 0.00014399180842575575,
"loss": 10.3426,
"step": 485
},
{
"epoch": 0.2058231868713605,
"grad_norm": 0.031375959515571594,
"learning_rate": 0.00014378083016003572,
"loss": 10.3376,
"step": 486
},
{
"epoch": 0.20624669137109583,
"grad_norm": 0.034831635653972626,
"learning_rate": 0.00014356961053097332,
"loss": 10.3354,
"step": 487
},
{
"epoch": 0.20667019587083113,
"grad_norm": 0.030198190361261368,
"learning_rate": 0.00014335815070302054,
"loss": 10.3361,
"step": 488
},
{
"epoch": 0.20709370037056643,
"grad_norm": 0.031040605157613754,
"learning_rate": 0.00014314645184195364,
"loss": 10.3412,
"step": 489
},
{
"epoch": 0.20751720487030176,
"grad_norm": 0.05391615629196167,
"learning_rate": 0.00014293451511486658,
"loss": 10.3402,
"step": 490
},
{
"epoch": 0.20794070937003706,
"grad_norm": 0.030534790828824043,
"learning_rate": 0.00014272234169016474,
"loss": 10.3402,
"step": 491
},
{
"epoch": 0.20836421386977236,
"grad_norm": 0.03578052297234535,
"learning_rate": 0.00014250993273755844,
"loss": 10.3348,
"step": 492
},
{
"epoch": 0.20878771836950769,
"grad_norm": 0.03920895233750343,
"learning_rate": 0.00014229728942805636,
"loss": 10.3417,
"step": 493
},
{
"epoch": 0.20921122286924299,
"grad_norm": 0.030715953558683395,
"learning_rate": 0.00014208441293395925,
"loss": 10.3379,
"step": 494
},
{
"epoch": 0.20963472736897829,
"grad_norm": 0.036160390824079514,
"learning_rate": 0.00014187130442885345,
"loss": 10.3368,
"step": 495
},
{
"epoch": 0.2100582318687136,
"grad_norm": 0.032142747193574905,
"learning_rate": 0.0001416579650876043,
"loss": 10.3404,
"step": 496
},
{
"epoch": 0.2104817363684489,
"grad_norm": 0.02567223645746708,
"learning_rate": 0.00014144439608634976,
"loss": 10.3387,
"step": 497
},
{
"epoch": 0.2109052408681842,
"grad_norm": 0.03470413014292717,
"learning_rate": 0.0001412305986024939,
"loss": 10.3419,
"step": 498
},
{
"epoch": 0.21132874536791954,
"grad_norm": 0.036063164472579956,
"learning_rate": 0.00014101657381470045,
"loss": 10.3335,
"step": 499
},
{
"epoch": 0.21175224986765484,
"grad_norm": 0.02859325334429741,
"learning_rate": 0.00014080232290288622,
"loss": 10.3385,
"step": 500
},
{
"epoch": 0.21217575436739014,
"grad_norm": 0.03691897913813591,
"learning_rate": 0.00014058784704821465,
"loss": 10.3371,
"step": 501
},
{
"epoch": 0.21259925886712547,
"grad_norm": 0.02370496280491352,
"learning_rate": 0.0001403731474330893,
"loss": 10.3373,
"step": 502
},
{
"epoch": 0.21302276336686077,
"grad_norm": 0.02717514894902706,
"learning_rate": 0.0001401582252411473,
"loss": 10.3362,
"step": 503
},
{
"epoch": 0.2134462678665961,
"grad_norm": 0.027684593573212624,
"learning_rate": 0.00013994308165725288,
"loss": 10.3407,
"step": 504
},
{
"epoch": 0.2138697723663314,
"grad_norm": 0.027036601677536964,
"learning_rate": 0.00013972771786749074,
"loss": 10.3387,
"step": 505
},
{
"epoch": 0.2142932768660667,
"grad_norm": 0.03559018298983574,
"learning_rate": 0.00013951213505915969,
"loss": 10.3398,
"step": 506
},
{
"epoch": 0.21471678136580202,
"grad_norm": 0.04133779555559158,
"learning_rate": 0.0001392963344207658,
"loss": 10.3355,
"step": 507
},
{
"epoch": 0.21514028586553732,
"grad_norm": 0.03785044327378273,
"learning_rate": 0.0001390803171420162,
"loss": 10.3344,
"step": 508
},
{
"epoch": 0.21556379036527262,
"grad_norm": 0.023411711677908897,
"learning_rate": 0.00013886408441381233,
"loss": 10.3362,
"step": 509
},
{
"epoch": 0.21598729486500795,
"grad_norm": 0.0443277508020401,
"learning_rate": 0.00013864763742824334,
"loss": 10.339,
"step": 510
},
{
"epoch": 0.21641079936474325,
"grad_norm": 0.036806512624025345,
"learning_rate": 0.0001384309773785796,
"loss": 10.338,
"step": 511
},
{
"epoch": 0.21683430386447855,
"grad_norm": 0.02885564975440502,
"learning_rate": 0.00013821410545926613,
"loss": 10.3333,
"step": 512
},
{
"epoch": 0.21725780836421388,
"grad_norm": 0.03067517653107643,
"learning_rate": 0.00013799702286591598,
"loss": 10.3356,
"step": 513
},
{
"epoch": 0.21768131286394918,
"grad_norm": 0.03321646526455879,
"learning_rate": 0.00013777973079530362,
"loss": 10.3388,
"step": 514
},
{
"epoch": 0.21810481736368448,
"grad_norm": 0.03147870674729347,
"learning_rate": 0.00013756223044535833,
"loss": 10.3391,
"step": 515
},
{
"epoch": 0.2185283218634198,
"grad_norm": 0.02573389932513237,
"learning_rate": 0.00013734452301515776,
"loss": 10.3377,
"step": 516
},
{
"epoch": 0.2189518263631551,
"grad_norm": 0.026358777657151222,
"learning_rate": 0.00013712660970492107,
"loss": 10.3371,
"step": 517
},
{
"epoch": 0.2193753308628904,
"grad_norm": 0.02714933454990387,
"learning_rate": 0.00013690849171600245,
"loss": 10.3378,
"step": 518
},
{
"epoch": 0.21979883536262573,
"grad_norm": 0.02859034389257431,
"learning_rate": 0.00013669017025088456,
"loss": 10.3365,
"step": 519
},
{
"epoch": 0.22022233986236103,
"grad_norm": 0.044585928320884705,
"learning_rate": 0.00013647164651317176,
"loss": 10.3362,
"step": 520
},
{
"epoch": 0.22064584436209636,
"grad_norm": 0.053858619183301926,
"learning_rate": 0.00013625292170758356,
"loss": 10.3373,
"step": 521
},
{
"epoch": 0.22106934886183166,
"grad_norm": 0.03403494879603386,
"learning_rate": 0.00013603399703994787,
"loss": 10.3309,
"step": 522
},
{
"epoch": 0.22149285336156696,
"grad_norm": 0.028249001130461693,
"learning_rate": 0.00013581487371719457,
"loss": 10.3379,
"step": 523
},
{
"epoch": 0.2219163578613023,
"grad_norm": 0.028280075639486313,
"learning_rate": 0.00013559555294734868,
"loss": 10.3388,
"step": 524
},
{
"epoch": 0.2223398623610376,
"grad_norm": 0.04397103190422058,
"learning_rate": 0.00013537603593952367,
"loss": 10.3335,
"step": 525
},
{
"epoch": 0.2227633668607729,
"grad_norm": 0.035089749842882156,
"learning_rate": 0.000135156323903915,
"loss": 10.34,
"step": 526
},
{
"epoch": 0.22318687136050822,
"grad_norm": 0.03598684072494507,
"learning_rate": 0.00013493641805179319,
"loss": 10.3348,
"step": 527
},
{
"epoch": 0.22361037586024352,
"grad_norm": 0.03583105653524399,
"learning_rate": 0.0001347163195954973,
"loss": 10.3383,
"step": 528
},
{
"epoch": 0.22403388035997882,
"grad_norm": 0.03622949495911598,
"learning_rate": 0.0001344960297484283,
"loss": 10.3378,
"step": 529
},
{
"epoch": 0.22445738485971414,
"grad_norm": 0.027924714609980583,
"learning_rate": 0.00013427554972504226,
"loss": 10.3372,
"step": 530
},
{
"epoch": 0.22488088935944944,
"grad_norm": 0.047317858785390854,
"learning_rate": 0.00013405488074084358,
"loss": 10.3375,
"step": 531
},
{
"epoch": 0.22530439385918474,
"grad_norm": 0.031993038952350616,
"learning_rate": 0.0001338340240123785,
"loss": 10.3371,
"step": 532
},
{
"epoch": 0.22572789835892007,
"grad_norm": 0.03276574984192848,
"learning_rate": 0.00013361298075722833,
"loss": 10.3376,
"step": 533
},
{
"epoch": 0.22615140285865537,
"grad_norm": 0.024694286286830902,
"learning_rate": 0.00013339175219400257,
"loss": 10.34,
"step": 534
},
{
"epoch": 0.22657490735839067,
"grad_norm": 0.031688570976257324,
"learning_rate": 0.00013317033954233246,
"loss": 10.3411,
"step": 535
},
{
"epoch": 0.226998411858126,
"grad_norm": 0.03652056306600571,
"learning_rate": 0.00013294874402286402,
"loss": 10.3329,
"step": 536
},
{
"epoch": 0.2274219163578613,
"grad_norm": 0.03224468603730202,
"learning_rate": 0.0001327269668572515,
"loss": 10.3386,
"step": 537
},
{
"epoch": 0.22784542085759663,
"grad_norm": 0.034342508763074875,
"learning_rate": 0.00013250500926815045,
"loss": 10.3371,
"step": 538
},
{
"epoch": 0.22826892535733193,
"grad_norm": 0.030163973569869995,
"learning_rate": 0.0001322828724792112,
"loss": 10.336,
"step": 539
},
{
"epoch": 0.22869242985706723,
"grad_norm": 0.030578266829252243,
"learning_rate": 0.00013206055771507197,
"loss": 10.3391,
"step": 540
},
{
"epoch": 0.22911593435680255,
"grad_norm": 0.035477470606565475,
"learning_rate": 0.00013183806620135216,
"loss": 10.3384,
"step": 541
},
{
"epoch": 0.22953943885653785,
"grad_norm": 0.026009559631347656,
"learning_rate": 0.00013161539916464558,
"loss": 10.3369,
"step": 542
},
{
"epoch": 0.22996294335627315,
"grad_norm": 0.033704426139593124,
"learning_rate": 0.00013139255783251367,
"loss": 10.3369,
"step": 543
},
{
"epoch": 0.23038644785600848,
"grad_norm": 0.03469805791974068,
"learning_rate": 0.00013116954343347882,
"loss": 10.3359,
"step": 544
},
{
"epoch": 0.23080995235574378,
"grad_norm": 0.029503265395760536,
"learning_rate": 0.0001309463571970175,
"loss": 10.3337,
"step": 545
},
{
"epoch": 0.23123345685547908,
"grad_norm": 0.027178343385457993,
"learning_rate": 0.0001307230003535535,
"loss": 10.3383,
"step": 546
},
{
"epoch": 0.2316569613552144,
"grad_norm": 0.026484569534659386,
"learning_rate": 0.00013049947413445125,
"loss": 10.3411,
"step": 547
},
{
"epoch": 0.2320804658549497,
"grad_norm": 0.03568257763981819,
"learning_rate": 0.00013027577977200883,
"loss": 10.3351,
"step": 548
},
{
"epoch": 0.232503970354685,
"grad_norm": 0.044057317078113556,
"learning_rate": 0.0001300519184994513,
"loss": 10.3367,
"step": 549
},
{
"epoch": 0.23292747485442034,
"grad_norm": 0.03619583323597908,
"learning_rate": 0.00012982789155092407,
"loss": 10.3385,
"step": 550
},
{
"epoch": 0.23335097935415564,
"grad_norm": 0.042276639491319656,
"learning_rate": 0.00012960370016148567,
"loss": 10.337,
"step": 551
},
{
"epoch": 0.23377448385389094,
"grad_norm": 0.03055988810956478,
"learning_rate": 0.00012937934556710143,
"loss": 10.3385,
"step": 552
},
{
"epoch": 0.23419798835362626,
"grad_norm": 0.02854546532034874,
"learning_rate": 0.00012915482900463624,
"loss": 10.3393,
"step": 553
},
{
"epoch": 0.23462149285336156,
"grad_norm": 0.029309969395399094,
"learning_rate": 0.00012893015171184797,
"loss": 10.3319,
"step": 554
},
{
"epoch": 0.23504499735309686,
"grad_norm": 0.0332510843873024,
"learning_rate": 0.00012870531492738065,
"loss": 10.3338,
"step": 555
},
{
"epoch": 0.2354685018528322,
"grad_norm": 0.03669944778084755,
"learning_rate": 0.00012848031989075754,
"loss": 10.3325,
"step": 556
},
{
"epoch": 0.2358920063525675,
"grad_norm": 0.027661770582199097,
"learning_rate": 0.00012825516784237436,
"loss": 10.3382,
"step": 557
},
{
"epoch": 0.23631551085230282,
"grad_norm": 0.029674025252461433,
"learning_rate": 0.0001280298600234924,
"loss": 10.3387,
"step": 558
},
{
"epoch": 0.23673901535203812,
"grad_norm": 0.03104621358215809,
"learning_rate": 0.00012780439767623181,
"loss": 10.3354,
"step": 559
},
{
"epoch": 0.23716251985177342,
"grad_norm": 0.0300068948417902,
"learning_rate": 0.0001275787820435645,
"loss": 10.3396,
"step": 560
},
{
"epoch": 0.23758602435150875,
"grad_norm": 0.03742906451225281,
"learning_rate": 0.00012735301436930758,
"loss": 10.3364,
"step": 561
},
{
"epoch": 0.23800952885124405,
"grad_norm": 0.029214419424533844,
"learning_rate": 0.0001271270958981163,
"loss": 10.3368,
"step": 562
},
{
"epoch": 0.23843303335097935,
"grad_norm": 0.034154172986745834,
"learning_rate": 0.00012690102787547722,
"loss": 10.3364,
"step": 563
},
{
"epoch": 0.23885653785071467,
"grad_norm": 0.024321483448147774,
"learning_rate": 0.00012667481154770148,
"loss": 10.3348,
"step": 564
},
{
"epoch": 0.23928004235044997,
"grad_norm": 0.030538305640220642,
"learning_rate": 0.0001264484481619177,
"loss": 10.3374,
"step": 565
},
{
"epoch": 0.23970354685018527,
"grad_norm": 0.028275547549128532,
"learning_rate": 0.00012622193896606528,
"loss": 10.3343,
"step": 566
},
{
"epoch": 0.2401270513499206,
"grad_norm": 0.024137398228049278,
"learning_rate": 0.00012599528520888757,
"loss": 10.3363,
"step": 567
},
{
"epoch": 0.2405505558496559,
"grad_norm": 0.0387752428650856,
"learning_rate": 0.00012576848813992475,
"loss": 10.3355,
"step": 568
},
{
"epoch": 0.2409740603493912,
"grad_norm": 0.02671218290925026,
"learning_rate": 0.00012554154900950708,
"loss": 10.339,
"step": 569
},
{
"epoch": 0.24139756484912653,
"grad_norm": 0.031162571161985397,
"learning_rate": 0.00012531446906874808,
"loss": 10.3402,
"step": 570
},
{
"epoch": 0.24182106934886183,
"grad_norm": 0.03754870593547821,
"learning_rate": 0.00012508724956953755,
"loss": 10.3392,
"step": 571
},
{
"epoch": 0.24224457384859713,
"grad_norm": 0.030516209080815315,
"learning_rate": 0.00012485989176453462,
"loss": 10.3373,
"step": 572
},
{
"epoch": 0.24266807834833246,
"grad_norm": 0.04033865034580231,
"learning_rate": 0.0001246323969071609,
"loss": 10.3358,
"step": 573
},
{
"epoch": 0.24309158284806776,
"grad_norm": 0.0301966555416584,
"learning_rate": 0.00012440476625159364,
"loss": 10.335,
"step": 574
},
{
"epoch": 0.24351508734780308,
"grad_norm": 0.036701519042253494,
"learning_rate": 0.00012417700105275866,
"loss": 10.3382,
"step": 575
},
{
"epoch": 0.24393859184753838,
"grad_norm": 0.02948085404932499,
"learning_rate": 0.00012394910256632356,
"loss": 10.3342,
"step": 576
},
{
"epoch": 0.24436209634727368,
"grad_norm": 0.0245877243578434,
"learning_rate": 0.00012372107204869077,
"loss": 10.3364,
"step": 577
},
{
"epoch": 0.244785600847009,
"grad_norm": 0.023439116775989532,
"learning_rate": 0.00012349291075699058,
"loss": 10.3361,
"step": 578
},
{
"epoch": 0.2452091053467443,
"grad_norm": 0.026123927906155586,
"learning_rate": 0.00012326461994907424,
"loss": 10.3398,
"step": 579
},
{
"epoch": 0.2456326098464796,
"grad_norm": 0.03437687084078789,
"learning_rate": 0.000123036200883507,
"loss": 10.3373,
"step": 580
},
{
"epoch": 0.24605611434621494,
"grad_norm": 0.03299521282315254,
"learning_rate": 0.00012280765481956124,
"loss": 10.3344,
"step": 581
},
{
"epoch": 0.24647961884595024,
"grad_norm": 0.03710121661424637,
"learning_rate": 0.0001225789830172094,
"loss": 10.3354,
"step": 582
},
{
"epoch": 0.24690312334568554,
"grad_norm": 0.032498303800821304,
"learning_rate": 0.0001223501867371173,
"loss": 10.3344,
"step": 583
},
{
"epoch": 0.24732662784542087,
"grad_norm": 0.03610834851861,
"learning_rate": 0.00012212126724063676,
"loss": 10.3359,
"step": 584
},
{
"epoch": 0.24775013234515617,
"grad_norm": 0.03149677813053131,
"learning_rate": 0.00012189222578979903,
"loss": 10.3376,
"step": 585
},
{
"epoch": 0.24817363684489147,
"grad_norm": 0.031013086438179016,
"learning_rate": 0.00012166306364730766,
"loss": 10.3333,
"step": 586
},
{
"epoch": 0.2485971413446268,
"grad_norm": 0.030261732637882233,
"learning_rate": 0.00012143378207653164,
"loss": 10.3327,
"step": 587
},
{
"epoch": 0.2490206458443621,
"grad_norm": 0.030076345428824425,
"learning_rate": 0.00012120438234149827,
"loss": 10.3393,
"step": 588
},
{
"epoch": 0.2494441503440974,
"grad_norm": 0.027937186881899834,
"learning_rate": 0.00012097486570688634,
"loss": 10.3386,
"step": 589
},
{
"epoch": 0.24986765484383272,
"grad_norm": 0.037603769451379776,
"learning_rate": 0.00012074523343801906,
"loss": 10.3306,
"step": 590
},
{
"epoch": 0.25029115934356805,
"grad_norm": 0.027752617374062538,
"learning_rate": 0.0001205154868008572,
"loss": 10.3352,
"step": 591
},
{
"epoch": 0.2507146638433033,
"grad_norm": 0.030105147510766983,
"learning_rate": 0.000120285627061992,
"loss": 10.3306,
"step": 592
},
{
"epoch": 0.25113816834303865,
"grad_norm": 0.026609288528561592,
"learning_rate": 0.00012005565548863822,
"loss": 10.3347,
"step": 593
},
{
"epoch": 0.251561672842774,
"grad_norm": 0.04250922426581383,
"learning_rate": 0.00011982557334862723,
"loss": 10.3303,
"step": 594
},
{
"epoch": 0.25198517734250925,
"grad_norm": 0.03030312806367874,
"learning_rate": 0.00011959538191039985,
"loss": 10.3389,
"step": 595
},
{
"epoch": 0.2524086818422446,
"grad_norm": 0.03314143419265747,
"learning_rate": 0.00011936508244299948,
"loss": 10.336,
"step": 596
},
{
"epoch": 0.2528321863419799,
"grad_norm": 0.03237884119153023,
"learning_rate": 0.0001191346762160652,
"loss": 10.3406,
"step": 597
},
{
"epoch": 0.2532556908417152,
"grad_norm": 0.02621031180024147,
"learning_rate": 0.00011890416449982451,
"loss": 10.3367,
"step": 598
},
{
"epoch": 0.2536791953414505,
"grad_norm": 0.023484721779823303,
"learning_rate": 0.00011867354856508656,
"loss": 10.3327,
"step": 599
},
{
"epoch": 0.25410269984118583,
"grad_norm": 0.02962653897702694,
"learning_rate": 0.00011844282968323501,
"loss": 10.3359,
"step": 600
},
{
"epoch": 0.2545262043409211,
"grad_norm": 0.028051255270838737,
"learning_rate": 0.0001182120091262211,
"loss": 10.3356,
"step": 601
},
{
"epoch": 0.25494970884065643,
"grad_norm": 0.029255535453557968,
"learning_rate": 0.00011798108816655657,
"loss": 10.3365,
"step": 602
},
{
"epoch": 0.25537321334039176,
"grad_norm": 0.029102357104420662,
"learning_rate": 0.00011775006807730667,
"loss": 10.3347,
"step": 603
},
{
"epoch": 0.25579671784012703,
"grad_norm": 0.02935311570763588,
"learning_rate": 0.00011751895013208325,
"loss": 10.3369,
"step": 604
},
{
"epoch": 0.25622022233986236,
"grad_norm": 0.033430956304073334,
"learning_rate": 0.00011728773560503751,
"loss": 10.3381,
"step": 605
},
{
"epoch": 0.2566437268395977,
"grad_norm": 0.03818434476852417,
"learning_rate": 0.00011705642577085316,
"loss": 10.3354,
"step": 606
},
{
"epoch": 0.25706723133933296,
"grad_norm": 0.029122449457645416,
"learning_rate": 0.00011682502190473938,
"loss": 10.3382,
"step": 607
},
{
"epoch": 0.2574907358390683,
"grad_norm": 0.030079467222094536,
"learning_rate": 0.00011659352528242366,
"loss": 10.3413,
"step": 608
},
{
"epoch": 0.2579142403388036,
"grad_norm": 0.02247581258416176,
"learning_rate": 0.00011636193718014494,
"loss": 10.3364,
"step": 609
},
{
"epoch": 0.2583377448385389,
"grad_norm": 0.032431941479444504,
"learning_rate": 0.00011613025887464641,
"loss": 10.3323,
"step": 610
},
{
"epoch": 0.2587612493382742,
"grad_norm": 0.032824281603097916,
"learning_rate": 0.00011589849164316862,
"loss": 10.3351,
"step": 611
},
{
"epoch": 0.25918475383800954,
"grad_norm": 0.036410853266716,
"learning_rate": 0.00011566663676344232,
"loss": 10.3414,
"step": 612
},
{
"epoch": 0.2596082583377448,
"grad_norm": 0.03686416149139404,
"learning_rate": 0.00011543469551368144,
"loss": 10.3375,
"step": 613
},
{
"epoch": 0.26003176283748014,
"grad_norm": 0.04031093418598175,
"learning_rate": 0.00011520266917257618,
"loss": 10.3361,
"step": 614
},
{
"epoch": 0.26045526733721547,
"grad_norm": 0.027354370802640915,
"learning_rate": 0.00011497055901928577,
"loss": 10.3334,
"step": 615
},
{
"epoch": 0.26087877183695074,
"grad_norm": 0.029079321771860123,
"learning_rate": 0.00011473836633343144,
"loss": 10.3376,
"step": 616
},
{
"epoch": 0.26130227633668607,
"grad_norm": 0.027393948286771774,
"learning_rate": 0.00011450609239508951,
"loss": 10.3359,
"step": 617
},
{
"epoch": 0.2617257808364214,
"grad_norm": 0.037023283541202545,
"learning_rate": 0.00011427373848478422,
"loss": 10.336,
"step": 618
},
{
"epoch": 0.2621492853361567,
"grad_norm": 0.04202662780880928,
"learning_rate": 0.00011404130588348072,
"loss": 10.3383,
"step": 619
},
{
"epoch": 0.262572789835892,
"grad_norm": 0.031701017171144485,
"learning_rate": 0.00011380879587257792,
"loss": 10.3356,
"step": 620
},
{
"epoch": 0.2629962943356273,
"grad_norm": 0.03459370136260986,
"learning_rate": 0.00011357620973390151,
"loss": 10.3337,
"step": 621
},
{
"epoch": 0.26341979883536265,
"grad_norm": 0.03404482826590538,
"learning_rate": 0.0001133435487496969,
"loss": 10.3373,
"step": 622
},
{
"epoch": 0.2638433033350979,
"grad_norm": 0.03435559198260307,
"learning_rate": 0.0001131108142026221,
"loss": 10.3394,
"step": 623
},
{
"epoch": 0.26426680783483325,
"grad_norm": 0.04172271490097046,
"learning_rate": 0.00011287800737574072,
"loss": 10.3312,
"step": 624
},
{
"epoch": 0.2646903123345686,
"grad_norm": 0.024423452094197273,
"learning_rate": 0.00011264512955251478,
"loss": 10.3384,
"step": 625
},
{
"epoch": 0.26511381683430385,
"grad_norm": 0.036313965916633606,
"learning_rate": 0.00011241218201679773,
"loss": 10.3343,
"step": 626
},
{
"epoch": 0.2655373213340392,
"grad_norm": 0.03670899197459221,
"learning_rate": 0.00011217916605282728,
"loss": 10.3421,
"step": 627
},
{
"epoch": 0.2659608258337745,
"grad_norm": 0.04206259921193123,
"learning_rate": 0.00011194608294521854,
"loss": 10.3304,
"step": 628
},
{
"epoch": 0.2663843303335098,
"grad_norm": 0.029241429641842842,
"learning_rate": 0.00011171293397895665,
"loss": 10.3403,
"step": 629
},
{
"epoch": 0.2668078348332451,
"grad_norm": 0.029772555455565453,
"learning_rate": 0.00011147972043938988,
"loss": 10.3356,
"step": 630
},
{
"epoch": 0.26723133933298043,
"grad_norm": 0.038933563977479935,
"learning_rate": 0.00011124644361222245,
"loss": 10.3396,
"step": 631
},
{
"epoch": 0.2676548438327157,
"grad_norm": 0.03326569125056267,
"learning_rate": 0.00011101310478350754,
"loss": 10.337,
"step": 632
},
{
"epoch": 0.26807834833245103,
"grad_norm": 0.03632461279630661,
"learning_rate": 0.00011077970523964011,
"loss": 10.337,
"step": 633
},
{
"epoch": 0.26850185283218636,
"grad_norm": 0.03578447178006172,
"learning_rate": 0.00011054624626734984,
"loss": 10.3358,
"step": 634
},
{
"epoch": 0.26892535733192163,
"grad_norm": 0.032311227172613144,
"learning_rate": 0.0001103127291536941,
"loss": 10.3417,
"step": 635
},
{
"epoch": 0.26934886183165696,
"grad_norm": 0.03721488639712334,
"learning_rate": 0.00011007915518605067,
"loss": 10.3341,
"step": 636
},
{
"epoch": 0.2697723663313923,
"grad_norm": 0.026686688885092735,
"learning_rate": 0.00010984552565211089,
"loss": 10.3337,
"step": 637
},
{
"epoch": 0.27019587083112756,
"grad_norm": 0.03955764323472977,
"learning_rate": 0.00010961184183987233,
"loss": 10.3331,
"step": 638
},
{
"epoch": 0.2706193753308629,
"grad_norm": 0.024867044761776924,
"learning_rate": 0.00010937810503763191,
"loss": 10.3319,
"step": 639
},
{
"epoch": 0.2710428798305982,
"grad_norm": 0.026639580726623535,
"learning_rate": 0.00010914431653397856,
"loss": 10.3394,
"step": 640
},
{
"epoch": 0.2714663843303335,
"grad_norm": 0.04265257716178894,
"learning_rate": 0.00010891047761778637,
"loss": 10.3355,
"step": 641
},
{
"epoch": 0.2718898888300688,
"grad_norm": 0.03401639685034752,
"learning_rate": 0.00010867658957820723,
"loss": 10.3362,
"step": 642
},
{
"epoch": 0.27231339332980414,
"grad_norm": 0.03278350457549095,
"learning_rate": 0.00010844265370466393,
"loss": 10.3369,
"step": 643
},
{
"epoch": 0.2727368978295394,
"grad_norm": 0.03625522553920746,
"learning_rate": 0.00010820867128684292,
"loss": 10.3386,
"step": 644
},
{
"epoch": 0.27316040232927474,
"grad_norm": 0.028470052406191826,
"learning_rate": 0.0001079746436146873,
"loss": 10.3359,
"step": 645
},
{
"epoch": 0.27358390682901007,
"grad_norm": 0.03894231840968132,
"learning_rate": 0.00010774057197838963,
"loss": 10.3363,
"step": 646
},
{
"epoch": 0.27400741132874534,
"grad_norm": 0.04798604175448418,
"learning_rate": 0.00010750645766838477,
"loss": 10.3351,
"step": 647
},
{
"epoch": 0.27443091582848067,
"grad_norm": 0.038566704839468,
"learning_rate": 0.00010727230197534299,
"loss": 10.3386,
"step": 648
},
{
"epoch": 0.274854420328216,
"grad_norm": 0.038909364491701126,
"learning_rate": 0.0001070381061901626,
"loss": 10.3376,
"step": 649
},
{
"epoch": 0.27527792482795127,
"grad_norm": 0.029502833262085915,
"learning_rate": 0.00010680387160396293,
"loss": 10.3356,
"step": 650
},
{
"epoch": 0.2757014293276866,
"grad_norm": 0.028758224099874496,
"learning_rate": 0.00010656959950807728,
"loss": 10.3313,
"step": 651
},
{
"epoch": 0.2761249338274219,
"grad_norm": 0.024828476831316948,
"learning_rate": 0.0001063352911940457,
"loss": 10.3318,
"step": 652
},
{
"epoch": 0.27654843832715725,
"grad_norm": 0.02429981529712677,
"learning_rate": 0.00010610094795360795,
"loss": 10.333,
"step": 653
},
{
"epoch": 0.2769719428268925,
"grad_norm": 0.028827672824263573,
"learning_rate": 0.00010586657107869626,
"loss": 10.3318,
"step": 654
},
{
"epoch": 0.27739544732662785,
"grad_norm": 0.04222332313656807,
"learning_rate": 0.00010563216186142839,
"loss": 10.3354,
"step": 655
},
{
"epoch": 0.2778189518263632,
"grad_norm": 0.04045010358095169,
"learning_rate": 0.00010539772159410036,
"loss": 10.3356,
"step": 656
},
{
"epoch": 0.27824245632609845,
"grad_norm": 0.02479146048426628,
"learning_rate": 0.00010516325156917926,
"loss": 10.3395,
"step": 657
},
{
"epoch": 0.2786659608258338,
"grad_norm": 0.036765843629837036,
"learning_rate": 0.00010492875307929644,
"loss": 10.3334,
"step": 658
},
{
"epoch": 0.2790894653255691,
"grad_norm": 0.02949843928217888,
"learning_rate": 0.00010469422741724003,
"loss": 10.3405,
"step": 659
},
{
"epoch": 0.2795129698253044,
"grad_norm": 0.02545243129134178,
"learning_rate": 0.000104459675875948,
"loss": 10.3339,
"step": 660
},
{
"epoch": 0.2799364743250397,
"grad_norm": 0.032835401594638824,
"learning_rate": 0.00010422509974850099,
"loss": 10.3426,
"step": 661
},
{
"epoch": 0.28035997882477504,
"grad_norm": 0.029005464166402817,
"learning_rate": 0.00010399050032811519,
"loss": 10.3353,
"step": 662
},
{
"epoch": 0.2807834833245103,
"grad_norm": 0.02459227293729782,
"learning_rate": 0.00010375587890813518,
"loss": 10.3345,
"step": 663
},
{
"epoch": 0.28120698782424564,
"grad_norm": 0.04449470341205597,
"learning_rate": 0.00010352123678202685,
"loss": 10.3358,
"step": 664
},
{
"epoch": 0.28163049232398096,
"grad_norm": 0.025347614660859108,
"learning_rate": 0.00010328657524337029,
"loss": 10.3357,
"step": 665
},
{
"epoch": 0.28205399682371624,
"grad_norm": 0.028995616361498833,
"learning_rate": 0.00010305189558585248,
"loss": 10.3386,
"step": 666
},
{
"epoch": 0.28247750132345156,
"grad_norm": 0.029563058167696,
"learning_rate": 0.00010281719910326042,
"loss": 10.3369,
"step": 667
},
{
"epoch": 0.2829010058231869,
"grad_norm": 0.03033272735774517,
"learning_rate": 0.00010258248708947375,
"loss": 10.337,
"step": 668
},
{
"epoch": 0.28332451032292216,
"grad_norm": 0.03558272868394852,
"learning_rate": 0.00010234776083845787,
"loss": 10.3345,
"step": 669
},
{
"epoch": 0.2837480148226575,
"grad_norm": 0.023746639490127563,
"learning_rate": 0.00010211302164425655,
"loss": 10.3326,
"step": 670
},
{
"epoch": 0.2841715193223928,
"grad_norm": 0.02846304513514042,
"learning_rate": 0.00010187827080098498,
"loss": 10.3353,
"step": 671
},
{
"epoch": 0.2845950238221281,
"grad_norm": 0.035858284682035446,
"learning_rate": 0.00010164350960282252,
"loss": 10.336,
"step": 672
},
{
"epoch": 0.2850185283218634,
"grad_norm": 0.026505351066589355,
"learning_rate": 0.00010140873934400567,
"loss": 10.3382,
"step": 673
},
{
"epoch": 0.28544203282159875,
"grad_norm": 0.02379724755883217,
"learning_rate": 0.00010117396131882087,
"loss": 10.3372,
"step": 674
},
{
"epoch": 0.28544203282159875,
"eval_loss": 10.334465026855469,
"eval_runtime": 3.4817,
"eval_samples_per_second": 285.783,
"eval_steps_per_second": 143.035,
"step": 674
},
{
"epoch": 0.285865537321334,
"grad_norm": 0.030138272792100906,
"learning_rate": 0.00010093917682159735,
"loss": 10.3361,
"step": 675
},
{
"epoch": 0.28628904182106935,
"grad_norm": 0.023656543344259262,
"learning_rate": 0.00010070438714670002,
"loss": 10.3345,
"step": 676
},
{
"epoch": 0.2867125463208047,
"grad_norm": 0.035104621201753616,
"learning_rate": 0.00010046959358852244,
"loss": 10.3347,
"step": 677
},
{
"epoch": 0.28713605082053995,
"grad_norm": 0.030601153150200844,
"learning_rate": 0.00010023479744147936,
"loss": 10.3325,
"step": 678
},
{
"epoch": 0.2875595553202753,
"grad_norm": 0.030649134889245033,
"learning_rate": 0.0001,
"loss": 10.3351,
"step": 679
},
{
"epoch": 0.2879830598200106,
"grad_norm": 0.04906442388892174,
"learning_rate": 9.976520255852065e-05,
"loss": 10.3382,
"step": 680
},
{
"epoch": 0.2884065643197459,
"grad_norm": 0.036667853593826294,
"learning_rate": 9.953040641147761e-05,
"loss": 10.3336,
"step": 681
},
{
"epoch": 0.2888300688194812,
"grad_norm": 0.032969675958156586,
"learning_rate": 9.929561285329999e-05,
"loss": 10.3347,
"step": 682
},
{
"epoch": 0.28925357331921653,
"grad_norm": 0.04049724340438843,
"learning_rate": 9.906082317840266e-05,
"loss": 10.337,
"step": 683
},
{
"epoch": 0.2896770778189518,
"grad_norm": 0.03217809647321701,
"learning_rate": 9.882603868117917e-05,
"loss": 10.332,
"step": 684
},
{
"epoch": 0.29010058231868713,
"grad_norm": 0.03514156490564346,
"learning_rate": 9.859126065599434e-05,
"loss": 10.3331,
"step": 685
},
{
"epoch": 0.29052408681842246,
"grad_norm": 0.02941136807203293,
"learning_rate": 9.83564903971775e-05,
"loss": 10.3325,
"step": 686
},
{
"epoch": 0.29094759131815773,
"grad_norm": 0.026193542405962944,
"learning_rate": 9.812172919901506e-05,
"loss": 10.3382,
"step": 687
},
{
"epoch": 0.29137109581789306,
"grad_norm": 0.031999170780181885,
"learning_rate": 9.788697835574347e-05,
"loss": 10.3378,
"step": 688
},
{
"epoch": 0.2917946003176284,
"grad_norm": 0.0316544771194458,
"learning_rate": 9.765223916154217e-05,
"loss": 10.3369,
"step": 689
},
{
"epoch": 0.2922181048173637,
"grad_norm": 0.030304009094834328,
"learning_rate": 9.741751291052626e-05,
"loss": 10.3381,
"step": 690
},
{
"epoch": 0.292641609317099,
"grad_norm": 0.035043906420469284,
"learning_rate": 9.718280089673959e-05,
"loss": 10.3327,
"step": 691
},
{
"epoch": 0.2930651138168343,
"grad_norm": 0.031086809933185577,
"learning_rate": 9.694810441414754e-05,
"loss": 10.3331,
"step": 692
},
{
"epoch": 0.29348861831656964,
"grad_norm": 0.03664236515760422,
"learning_rate": 9.671342475662975e-05,
"loss": 10.3384,
"step": 693
},
{
"epoch": 0.2939121228163049,
"grad_norm": 0.036936696618795395,
"learning_rate": 9.647876321797314e-05,
"loss": 10.3379,
"step": 694
},
{
"epoch": 0.29433562731604024,
"grad_norm": 0.03095340169966221,
"learning_rate": 9.624412109186484e-05,
"loss": 10.3351,
"step": 695
},
{
"epoch": 0.29475913181577557,
"grad_norm": 0.026670867577195168,
"learning_rate": 9.600949967188484e-05,
"loss": 10.3324,
"step": 696
},
{
"epoch": 0.29518263631551084,
"grad_norm": 0.03176816925406456,
"learning_rate": 9.577490025149903e-05,
"loss": 10.336,
"step": 697
},
{
"epoch": 0.29560614081524617,
"grad_norm": 0.041850414127111435,
"learning_rate": 9.554032412405204e-05,
"loss": 10.3335,
"step": 698
},
{
"epoch": 0.2960296453149815,
"grad_norm": 0.02709740586578846,
"learning_rate": 9.530577258275998e-05,
"loss": 10.335,
"step": 699
},
{
"epoch": 0.29645314981471677,
"grad_norm": 0.03338076174259186,
"learning_rate": 9.507124692070355e-05,
"loss": 10.3393,
"step": 700
},
{
"epoch": 0.2968766543144521,
"grad_norm": 0.03312176465988159,
"learning_rate": 9.483674843082075e-05,
"loss": 10.336,
"step": 701
},
{
"epoch": 0.2973001588141874,
"grad_norm": 0.026730258017778397,
"learning_rate": 9.460227840589967e-05,
"loss": 10.3366,
"step": 702
},
{
"epoch": 0.2977236633139227,
"grad_norm": 0.04017185419797897,
"learning_rate": 9.436783813857161e-05,
"loss": 10.3349,
"step": 703
},
{
"epoch": 0.298147167813658,
"grad_norm": 0.025352856144309044,
"learning_rate": 9.413342892130376e-05,
"loss": 10.331,
"step": 704
},
{
"epoch": 0.29857067231339335,
"grad_norm": 0.04028523713350296,
"learning_rate": 9.389905204639206e-05,
"loss": 10.3326,
"step": 705
},
{
"epoch": 0.2989941768131286,
"grad_norm": 0.034634605050086975,
"learning_rate": 9.366470880595434e-05,
"loss": 10.3326,
"step": 706
},
{
"epoch": 0.29941768131286395,
"grad_norm": 0.037610601633787155,
"learning_rate": 9.343040049192274e-05,
"loss": 10.3342,
"step": 707
},
{
"epoch": 0.2998411858125993,
"grad_norm": 0.0313008613884449,
"learning_rate": 9.31961283960371e-05,
"loss": 10.3337,
"step": 708
},
{
"epoch": 0.30026469031233455,
"grad_norm": 0.03718707337975502,
"learning_rate": 9.296189380983747e-05,
"loss": 10.3325,
"step": 709
},
{
"epoch": 0.3006881948120699,
"grad_norm": 0.03456999734044075,
"learning_rate": 9.272769802465705e-05,
"loss": 10.3325,
"step": 710
},
{
"epoch": 0.3011116993118052,
"grad_norm": 0.03181077539920807,
"learning_rate": 9.249354233161523e-05,
"loss": 10.3338,
"step": 711
},
{
"epoch": 0.3015352038115405,
"grad_norm": 0.0410895049571991,
"learning_rate": 9.225942802161042e-05,
"loss": 10.3376,
"step": 712
},
{
"epoch": 0.3019587083112758,
"grad_norm": 0.05550311505794525,
"learning_rate": 9.202535638531273e-05,
"loss": 10.3373,
"step": 713
},
{
"epoch": 0.30238221281101113,
"grad_norm": 0.03022390976548195,
"learning_rate": 9.179132871315708e-05,
"loss": 10.3323,
"step": 714
},
{
"epoch": 0.3028057173107464,
"grad_norm": 0.058899421244859695,
"learning_rate": 9.155734629533611e-05,
"loss": 10.3373,
"step": 715
},
{
"epoch": 0.30322922181048173,
"grad_norm": 0.0289511289447546,
"learning_rate": 9.132341042179279e-05,
"loss": 10.3365,
"step": 716
},
{
"epoch": 0.30365272631021706,
"grad_norm": 0.024074682965874672,
"learning_rate": 9.108952238221365e-05,
"loss": 10.3343,
"step": 717
},
{
"epoch": 0.30407623080995233,
"grad_norm": 0.03383636474609375,
"learning_rate": 9.085568346602145e-05,
"loss": 10.3376,
"step": 718
},
{
"epoch": 0.30449973530968766,
"grad_norm": 0.03680823743343353,
"learning_rate": 9.062189496236813e-05,
"loss": 10.332,
"step": 719
},
{
"epoch": 0.304923239809423,
"grad_norm": 0.034177515655756,
"learning_rate": 9.038815816012767e-05,
"loss": 10.3365,
"step": 720
},
{
"epoch": 0.30534674430915826,
"grad_norm": 0.04184051603078842,
"learning_rate": 9.015447434788915e-05,
"loss": 10.3308,
"step": 721
},
{
"epoch": 0.3057702488088936,
"grad_norm": 0.031081423163414,
"learning_rate": 8.992084481394934e-05,
"loss": 10.332,
"step": 722
},
{
"epoch": 0.3061937533086289,
"grad_norm": 0.04926011338829994,
"learning_rate": 8.968727084630594e-05,
"loss": 10.3388,
"step": 723
},
{
"epoch": 0.30661725780836424,
"grad_norm": 0.03448108211159706,
"learning_rate": 8.945375373265017e-05,
"loss": 10.3371,
"step": 724
},
{
"epoch": 0.3070407623080995,
"grad_norm": 0.030851799994707108,
"learning_rate": 8.92202947603599e-05,
"loss": 10.3402,
"step": 725
},
{
"epoch": 0.30746426680783484,
"grad_norm": 0.03434957191348076,
"learning_rate": 8.898689521649251e-05,
"loss": 10.3371,
"step": 726
},
{
"epoch": 0.30788777130757017,
"grad_norm": 0.034013282507658005,
"learning_rate": 8.875355638777757e-05,
"loss": 10.3344,
"step": 727
},
{
"epoch": 0.30831127580730544,
"grad_norm": 0.03570681810379028,
"learning_rate": 8.852027956061015e-05,
"loss": 10.3333,
"step": 728
},
{
"epoch": 0.30873478030704077,
"grad_norm": 0.04296912997961044,
"learning_rate": 8.828706602104337e-05,
"loss": 10.3388,
"step": 729
},
{
"epoch": 0.3091582848067761,
"grad_norm": 0.037189483642578125,
"learning_rate": 8.805391705478147e-05,
"loss": 10.335,
"step": 730
},
{
"epoch": 0.30958178930651137,
"grad_norm": 0.02627628669142723,
"learning_rate": 8.782083394717272e-05,
"loss": 10.3354,
"step": 731
},
{
"epoch": 0.3100052938062467,
"grad_norm": 0.026290280744433403,
"learning_rate": 8.758781798320233e-05,
"loss": 10.3344,
"step": 732
},
{
"epoch": 0.310428798305982,
"grad_norm": 0.033993784338235855,
"learning_rate": 8.735487044748523e-05,
"loss": 10.3324,
"step": 733
},
{
"epoch": 0.3108523028057173,
"grad_norm": 0.02894951030611992,
"learning_rate": 8.712199262425927e-05,
"loss": 10.3343,
"step": 734
},
{
"epoch": 0.3112758073054526,
"grad_norm": 0.02918967790901661,
"learning_rate": 8.68891857973779e-05,
"loss": 10.3364,
"step": 735
},
{
"epoch": 0.31169931180518795,
"grad_norm": 0.04133673012256622,
"learning_rate": 8.665645125030311e-05,
"loss": 10.3339,
"step": 736
},
{
"epoch": 0.3121228163049232,
"grad_norm": 0.03206159546971321,
"learning_rate": 8.642379026609849e-05,
"loss": 10.3422,
"step": 737
},
{
"epoch": 0.31254632080465855,
"grad_norm": 0.03564688563346863,
"learning_rate": 8.619120412742212e-05,
"loss": 10.3388,
"step": 738
},
{
"epoch": 0.3129698253043939,
"grad_norm": 0.033441901206970215,
"learning_rate": 8.595869411651931e-05,
"loss": 10.3375,
"step": 739
},
{
"epoch": 0.31339332980412915,
"grad_norm": 0.0351875014603138,
"learning_rate": 8.572626151521581e-05,
"loss": 10.3327,
"step": 740
},
{
"epoch": 0.3138168343038645,
"grad_norm": 0.046769220381975174,
"learning_rate": 8.549390760491051e-05,
"loss": 10.3333,
"step": 741
},
{
"epoch": 0.3142403388035998,
"grad_norm": 0.02873465232551098,
"learning_rate": 8.526163366656858e-05,
"loss": 10.3342,
"step": 742
},
{
"epoch": 0.3146638433033351,
"grad_norm": 0.03012407198548317,
"learning_rate": 8.502944098071427e-05,
"loss": 10.334,
"step": 743
},
{
"epoch": 0.3150873478030704,
"grad_norm": 0.03743249177932739,
"learning_rate": 8.479733082742384e-05,
"loss": 10.3344,
"step": 744
},
{
"epoch": 0.31551085230280573,
"grad_norm": 0.02463219314813614,
"learning_rate": 8.456530448631855e-05,
"loss": 10.3322,
"step": 745
},
{
"epoch": 0.315934356802541,
"grad_norm": 0.035319242626428604,
"learning_rate": 8.433336323655774e-05,
"loss": 10.3363,
"step": 746
},
{
"epoch": 0.31635786130227633,
"grad_norm": 0.03892083838582039,
"learning_rate": 8.41015083568314e-05,
"loss": 10.3344,
"step": 747
},
{
"epoch": 0.31678136580201166,
"grad_norm": 0.04276084899902344,
"learning_rate": 8.386974112535358e-05,
"loss": 10.3367,
"step": 748
},
{
"epoch": 0.31720487030174693,
"grad_norm": 0.03648482635617256,
"learning_rate": 8.363806281985509e-05,
"loss": 10.333,
"step": 749
},
{
"epoch": 0.31762837480148226,
"grad_norm": 0.03600320592522621,
"learning_rate": 8.340647471757636e-05,
"loss": 10.3314,
"step": 750
},
{
"epoch": 0.3180518793012176,
"grad_norm": 0.0343911312520504,
"learning_rate": 8.317497809526063e-05,
"loss": 10.3391,
"step": 751
},
{
"epoch": 0.31847538380095286,
"grad_norm": 0.028392106294631958,
"learning_rate": 8.294357422914685e-05,
"loss": 10.3343,
"step": 752
},
{
"epoch": 0.3188988883006882,
"grad_norm": 0.03276420384645462,
"learning_rate": 8.27122643949625e-05,
"loss": 10.3329,
"step": 753
},
{
"epoch": 0.3193223928004235,
"grad_norm": 0.030692044645547867,
"learning_rate": 8.248104986791676e-05,
"loss": 10.3287,
"step": 754
},
{
"epoch": 0.3197458973001588,
"grad_norm": 0.037886835634708405,
"learning_rate": 8.224993192269334e-05,
"loss": 10.3316,
"step": 755
},
{
"epoch": 0.3201694017998941,
"grad_norm": 0.029941901564598083,
"learning_rate": 8.201891183344345e-05,
"loss": 10.3293,
"step": 756
},
{
"epoch": 0.32059290629962944,
"grad_norm": 0.0404081791639328,
"learning_rate": 8.178799087377894e-05,
"loss": 10.3364,
"step": 757
},
{
"epoch": 0.32101641079936477,
"grad_norm": 0.03296668082475662,
"learning_rate": 8.1557170316765e-05,
"loss": 10.3363,
"step": 758
},
{
"epoch": 0.32143991529910004,
"grad_norm": 0.03453279659152031,
"learning_rate": 8.132645143491346e-05,
"loss": 10.3369,
"step": 759
},
{
"epoch": 0.32186341979883537,
"grad_norm": 0.042309049516916275,
"learning_rate": 8.10958355001755e-05,
"loss": 10.3325,
"step": 760
},
{
"epoch": 0.3222869242985707,
"grad_norm": 0.03626590222120285,
"learning_rate": 8.086532378393482e-05,
"loss": 10.3374,
"step": 761
},
{
"epoch": 0.32271042879830597,
"grad_norm": 0.029558565467596054,
"learning_rate": 8.063491755700051e-05,
"loss": 10.3367,
"step": 762
},
{
"epoch": 0.3231339332980413,
"grad_norm": 0.031136656180024147,
"learning_rate": 8.04046180896002e-05,
"loss": 10.3312,
"step": 763
},
{
"epoch": 0.3235574377977766,
"grad_norm": 0.03206343576312065,
"learning_rate": 8.017442665137278e-05,
"loss": 10.3357,
"step": 764
},
{
"epoch": 0.3239809422975119,
"grad_norm": 0.04191575571894646,
"learning_rate": 7.994434451136177e-05,
"loss": 10.3358,
"step": 765
},
{
"epoch": 0.3244044467972472,
"grad_norm": 0.03315071761608124,
"learning_rate": 7.971437293800803e-05,
"loss": 10.3338,
"step": 766
},
{
"epoch": 0.32482795129698255,
"grad_norm": 0.03882451355457306,
"learning_rate": 7.948451319914282e-05,
"loss": 10.3311,
"step": 767
},
{
"epoch": 0.3252514557967178,
"grad_norm": 0.046539660543203354,
"learning_rate": 7.925476656198095e-05,
"loss": 10.3364,
"step": 768
},
{
"epoch": 0.32567496029645315,
"grad_norm": 0.035186078399419785,
"learning_rate": 7.90251342931137e-05,
"loss": 10.3322,
"step": 769
},
{
"epoch": 0.3260984647961885,
"grad_norm": 0.02894584834575653,
"learning_rate": 7.879561765850176e-05,
"loss": 10.335,
"step": 770
},
{
"epoch": 0.32652196929592375,
"grad_norm": 0.05743710324168205,
"learning_rate": 7.856621792346837e-05,
"loss": 10.3358,
"step": 771
},
{
"epoch": 0.3269454737956591,
"grad_norm": 0.03184637799859047,
"learning_rate": 7.833693635269235e-05,
"loss": 10.3323,
"step": 772
},
{
"epoch": 0.3273689782953944,
"grad_norm": 0.028403330594301224,
"learning_rate": 7.8107774210201e-05,
"loss": 10.3368,
"step": 773
},
{
"epoch": 0.3277924827951297,
"grad_norm": 0.03520669415593147,
"learning_rate": 7.78787327593633e-05,
"loss": 10.3329,
"step": 774
},
{
"epoch": 0.328215987294865,
"grad_norm": 0.033936478197574615,
"learning_rate": 7.764981326288273e-05,
"loss": 10.3354,
"step": 775
},
{
"epoch": 0.32863949179460034,
"grad_norm": 0.04284480959177017,
"learning_rate": 7.74210169827906e-05,
"loss": 10.3353,
"step": 776
},
{
"epoch": 0.3290629962943356,
"grad_norm": 0.035401035100221634,
"learning_rate": 7.719234518043881e-05,
"loss": 10.3383,
"step": 777
},
{
"epoch": 0.32948650079407094,
"grad_norm": 0.02768767438828945,
"learning_rate": 7.696379911649303e-05,
"loss": 10.333,
"step": 778
},
{
"epoch": 0.32991000529380626,
"grad_norm": 0.03562779724597931,
"learning_rate": 7.673538005092578e-05,
"loss": 10.3365,
"step": 779
},
{
"epoch": 0.33033350979354154,
"grad_norm": 0.03725546598434448,
"learning_rate": 7.650708924300944e-05,
"loss": 10.3284,
"step": 780
},
{
"epoch": 0.33075701429327686,
"grad_norm": 0.032989148050546646,
"learning_rate": 7.627892795130925e-05,
"loss": 10.3375,
"step": 781
},
{
"epoch": 0.3311805187930122,
"grad_norm": 0.02557358518242836,
"learning_rate": 7.605089743367644e-05,
"loss": 10.3355,
"step": 782
},
{
"epoch": 0.33160402329274746,
"grad_norm": 0.03748362138867378,
"learning_rate": 7.582299894724138e-05,
"loss": 10.3362,
"step": 783
},
{
"epoch": 0.3320275277924828,
"grad_norm": 0.04423379525542259,
"learning_rate": 7.55952337484064e-05,
"loss": 10.3372,
"step": 784
},
{
"epoch": 0.3324510322922181,
"grad_norm": 0.03931692987680435,
"learning_rate": 7.536760309283912e-05,
"loss": 10.3319,
"step": 785
},
{
"epoch": 0.3328745367919534,
"grad_norm": 0.028346918523311615,
"learning_rate": 7.514010823546543e-05,
"loss": 10.3355,
"step": 786
},
{
"epoch": 0.3332980412916887,
"grad_norm": 0.041941095143556595,
"learning_rate": 7.491275043046246e-05,
"loss": 10.3351,
"step": 787
},
{
"epoch": 0.33372154579142405,
"grad_norm": 0.03487636148929596,
"learning_rate": 7.46855309312519e-05,
"loss": 10.3333,
"step": 788
},
{
"epoch": 0.3341450502911593,
"grad_norm": 0.032287437468767166,
"learning_rate": 7.445845099049294e-05,
"loss": 10.3308,
"step": 789
},
{
"epoch": 0.33456855479089465,
"grad_norm": 0.03427242115139961,
"learning_rate": 7.423151186007527e-05,
"loss": 10.3318,
"step": 790
},
{
"epoch": 0.33499205929063,
"grad_norm": 0.03202645853161812,
"learning_rate": 7.400471479111247e-05,
"loss": 10.3365,
"step": 791
},
{
"epoch": 0.33541556379036525,
"grad_norm": 0.036663834005594254,
"learning_rate": 7.377806103393473e-05,
"loss": 10.3315,
"step": 792
},
{
"epoch": 0.3358390682901006,
"grad_norm": 0.037792034447193146,
"learning_rate": 7.355155183808234e-05,
"loss": 10.3371,
"step": 793
},
{
"epoch": 0.3362625727898359,
"grad_norm": 0.03239692375063896,
"learning_rate": 7.332518845229859e-05,
"loss": 10.3333,
"step": 794
},
{
"epoch": 0.33668607728957123,
"grad_norm": 0.028021618723869324,
"learning_rate": 7.309897212452279e-05,
"loss": 10.3329,
"step": 795
},
{
"epoch": 0.3371095817893065,
"grad_norm": 0.03356965258717537,
"learning_rate": 7.287290410188373e-05,
"loss": 10.3318,
"step": 796
},
{
"epoch": 0.33753308628904183,
"grad_norm": 0.030086075887084007,
"learning_rate": 7.264698563069246e-05,
"loss": 10.3378,
"step": 797
},
{
"epoch": 0.33795659078877716,
"grad_norm": 0.03997505083680153,
"learning_rate": 7.242121795643552e-05,
"loss": 10.3386,
"step": 798
},
{
"epoch": 0.33838009528851243,
"grad_norm": 0.03527563437819481,
"learning_rate": 7.219560232376821e-05,
"loss": 10.338,
"step": 799
},
{
"epoch": 0.33880359978824776,
"grad_norm": 0.032303664833307266,
"learning_rate": 7.197013997650762e-05,
"loss": 10.3403,
"step": 800
},
{
"epoch": 0.3392271042879831,
"grad_norm": 0.03437490016222,
"learning_rate": 7.174483215762568e-05,
"loss": 10.3319,
"step": 801
},
{
"epoch": 0.33965060878771836,
"grad_norm": 0.03714921697974205,
"learning_rate": 7.151968010924249e-05,
"loss": 10.3357,
"step": 802
},
{
"epoch": 0.3400741132874537,
"grad_norm": 0.03493595868349075,
"learning_rate": 7.12946850726194e-05,
"loss": 10.333,
"step": 803
},
{
"epoch": 0.340497617787189,
"grad_norm": 0.027758195996284485,
"learning_rate": 7.106984828815206e-05,
"loss": 10.3392,
"step": 804
},
{
"epoch": 0.3409211222869243,
"grad_norm": 0.03370975703001022,
"learning_rate": 7.084517099536377e-05,
"loss": 10.3326,
"step": 805
},
{
"epoch": 0.3413446267866596,
"grad_norm": 0.03559848666191101,
"learning_rate": 7.062065443289859e-05,
"loss": 10.3339,
"step": 806
},
{
"epoch": 0.34176813128639494,
"grad_norm": 0.03199275955557823,
"learning_rate": 7.039629983851432e-05,
"loss": 10.3325,
"step": 807
},
{
"epoch": 0.3421916357861302,
"grad_norm": 0.0587792843580246,
"learning_rate": 7.017210844907598e-05,
"loss": 10.3334,
"step": 808
},
{
"epoch": 0.34261514028586554,
"grad_norm": 0.04129471629858017,
"learning_rate": 6.994808150054872e-05,
"loss": 10.3343,
"step": 809
},
{
"epoch": 0.34303864478560087,
"grad_norm": 0.03535553812980652,
"learning_rate": 6.972422022799121e-05,
"loss": 10.3325,
"step": 810
},
{
"epoch": 0.34346214928533614,
"grad_norm": 0.03517236188054085,
"learning_rate": 6.95005258655488e-05,
"loss": 10.3332,
"step": 811
},
{
"epoch": 0.34388565378507147,
"grad_norm": 0.03364865854382515,
"learning_rate": 6.927699964644652e-05,
"loss": 10.3341,
"step": 812
},
{
"epoch": 0.3443091582848068,
"grad_norm": 0.04013295844197273,
"learning_rate": 6.905364280298252e-05,
"loss": 10.3285,
"step": 813
},
{
"epoch": 0.34473266278454207,
"grad_norm": 0.03748088330030441,
"learning_rate": 6.883045656652122e-05,
"loss": 10.3321,
"step": 814
},
{
"epoch": 0.3451561672842774,
"grad_norm": 0.032113492488861084,
"learning_rate": 6.860744216748634e-05,
"loss": 10.3329,
"step": 815
},
{
"epoch": 0.3455796717840127,
"grad_norm": 0.031743258237838745,
"learning_rate": 6.838460083535445e-05,
"loss": 10.3353,
"step": 816
},
{
"epoch": 0.346003176283748,
"grad_norm": 0.0373242124915123,
"learning_rate": 6.816193379864786e-05,
"loss": 10.3358,
"step": 817
},
{
"epoch": 0.3464266807834833,
"grad_norm": 0.03148363158106804,
"learning_rate": 6.793944228492803e-05,
"loss": 10.3392,
"step": 818
},
{
"epoch": 0.34685018528321865,
"grad_norm": 0.02826239913702011,
"learning_rate": 6.77171275207888e-05,
"loss": 10.3354,
"step": 819
},
{
"epoch": 0.3472736897829539,
"grad_norm": 0.02493412233889103,
"learning_rate": 6.749499073184957e-05,
"loss": 10.3331,
"step": 820
},
{
"epoch": 0.34769719428268925,
"grad_norm": 0.030130870640277863,
"learning_rate": 6.727303314274852e-05,
"loss": 10.3335,
"step": 821
},
{
"epoch": 0.3481206987824246,
"grad_norm": 0.03829304128885269,
"learning_rate": 6.705125597713598e-05,
"loss": 10.337,
"step": 822
},
{
"epoch": 0.34854420328215985,
"grad_norm": 0.036645397543907166,
"learning_rate": 6.682966045766758e-05,
"loss": 10.3323,
"step": 823
},
{
"epoch": 0.3489677077818952,
"grad_norm": 0.03612329065799713,
"learning_rate": 6.660824780599744e-05,
"loss": 10.3288,
"step": 824
},
{
"epoch": 0.3493912122816305,
"grad_norm": 0.04094702750444412,
"learning_rate": 6.638701924277174e-05,
"loss": 10.3292,
"step": 825
},
{
"epoch": 0.3498147167813658,
"grad_norm": 0.031782373785972595,
"learning_rate": 6.61659759876215e-05,
"loss": 10.333,
"step": 826
},
{
"epoch": 0.3502382212811011,
"grad_norm": 0.03760769963264465,
"learning_rate": 6.594511925915646e-05,
"loss": 10.3337,
"step": 827
},
{
"epoch": 0.35066172578083643,
"grad_norm": 0.033052217215299606,
"learning_rate": 6.572445027495779e-05,
"loss": 10.3336,
"step": 828
},
{
"epoch": 0.35108523028057176,
"grad_norm": 0.0381859727203846,
"learning_rate": 6.550397025157169e-05,
"loss": 10.3385,
"step": 829
},
{
"epoch": 0.35150873478030703,
"grad_norm": 0.033314503729343414,
"learning_rate": 6.528368040450268e-05,
"loss": 10.3333,
"step": 830
},
{
"epoch": 0.35193223928004236,
"grad_norm": 0.029306232929229736,
"learning_rate": 6.506358194820685e-05,
"loss": 10.3326,
"step": 831
},
{
"epoch": 0.3523557437797777,
"grad_norm": 0.035479675978422165,
"learning_rate": 6.484367609608503e-05,
"loss": 10.3346,
"step": 832
},
{
"epoch": 0.35277924827951296,
"grad_norm": 0.028150904923677444,
"learning_rate": 6.462396406047634e-05,
"loss": 10.336,
"step": 833
},
{
"epoch": 0.3532027527792483,
"grad_norm": 0.029760006815195084,
"learning_rate": 6.440444705265136e-05,
"loss": 10.3317,
"step": 834
},
{
"epoch": 0.3536262572789836,
"grad_norm": 0.039765894412994385,
"learning_rate": 6.418512628280544e-05,
"loss": 10.3309,
"step": 835
},
{
"epoch": 0.3540497617787189,
"grad_norm": 0.02911820076406002,
"learning_rate": 6.396600296005213e-05,
"loss": 10.3351,
"step": 836
},
{
"epoch": 0.3544732662784542,
"grad_norm": 0.0354015938937664,
"learning_rate": 6.374707829241648e-05,
"loss": 10.3336,
"step": 837
},
{
"epoch": 0.35489677077818954,
"grad_norm": 0.030309785157442093,
"learning_rate": 6.352835348682823e-05,
"loss": 10.3339,
"step": 838
},
{
"epoch": 0.3553202752779248,
"grad_norm": 0.03831500932574272,
"learning_rate": 6.330982974911542e-05,
"loss": 10.3343,
"step": 839
},
{
"epoch": 0.35574377977766014,
"grad_norm": 0.02785351127386093,
"learning_rate": 6.309150828399754e-05,
"loss": 10.3333,
"step": 840
},
{
"epoch": 0.35616728427739547,
"grad_norm": 0.033175136893987656,
"learning_rate": 6.287339029507894e-05,
"loss": 10.3336,
"step": 841
},
{
"epoch": 0.35659078877713074,
"grad_norm": 0.03539146110415459,
"learning_rate": 6.265547698484226e-05,
"loss": 10.3291,
"step": 842
},
{
"epoch": 0.35701429327686607,
"grad_norm": 0.033553339540958405,
"learning_rate": 6.243776955464169e-05,
"loss": 10.3332,
"step": 843
},
{
"epoch": 0.3574377977766014,
"grad_norm": 0.02890482172369957,
"learning_rate": 6.22202692046964e-05,
"loss": 10.3323,
"step": 844
},
{
"epoch": 0.35786130227633667,
"grad_norm": 0.035188328474760056,
"learning_rate": 6.200297713408405e-05,
"loss": 10.3333,
"step": 845
},
{
"epoch": 0.358284806776072,
"grad_norm": 0.025319932028651237,
"learning_rate": 6.178589454073386e-05,
"loss": 10.3335,
"step": 846
},
{
"epoch": 0.3587083112758073,
"grad_norm": 0.03855932876467705,
"learning_rate": 6.156902262142041e-05,
"loss": 10.3339,
"step": 847
},
{
"epoch": 0.3591318157755426,
"grad_norm": 0.03646783158183098,
"learning_rate": 6.135236257175668e-05,
"loss": 10.3318,
"step": 848
},
{
"epoch": 0.3595553202752779,
"grad_norm": 0.030003085732460022,
"learning_rate": 6.11359155861877e-05,
"loss": 10.3356,
"step": 849
},
{
"epoch": 0.35997882477501325,
"grad_norm": 0.0374993160367012,
"learning_rate": 6.091968285798379e-05,
"loss": 10.3337,
"step": 850
},
{
"epoch": 0.3604023292747485,
"grad_norm": 0.030076105147600174,
"learning_rate": 6.0703665579234235e-05,
"loss": 10.3314,
"step": 851
},
{
"epoch": 0.36082583377448385,
"grad_norm": 0.03631633147597313,
"learning_rate": 6.048786494084036e-05,
"loss": 10.3328,
"step": 852
},
{
"epoch": 0.3612493382742192,
"grad_norm": 0.040016159415245056,
"learning_rate": 6.027228213250926e-05,
"loss": 10.3283,
"step": 853
},
{
"epoch": 0.36167284277395445,
"grad_norm": 0.0370076447725296,
"learning_rate": 6.005691834274716e-05,
"loss": 10.332,
"step": 854
},
{
"epoch": 0.3620963472736898,
"grad_norm": 0.030154719948768616,
"learning_rate": 5.984177475885272e-05,
"loss": 10.3317,
"step": 855
},
{
"epoch": 0.3625198517734251,
"grad_norm": 0.03587184473872185,
"learning_rate": 5.962685256691071e-05,
"loss": 10.3323,
"step": 856
},
{
"epoch": 0.3629433562731604,
"grad_norm": 0.03230219706892967,
"learning_rate": 5.941215295178537e-05,
"loss": 10.3358,
"step": 857
},
{
"epoch": 0.3633668607728957,
"grad_norm": 0.03372441977262497,
"learning_rate": 5.919767709711381e-05,
"loss": 10.3354,
"step": 858
},
{
"epoch": 0.36379036527263103,
"grad_norm": 0.03200405836105347,
"learning_rate": 5.898342618529955e-05,
"loss": 10.3328,
"step": 859
},
{
"epoch": 0.3642138697723663,
"grad_norm": 0.029012855142354965,
"learning_rate": 5.876940139750612e-05,
"loss": 10.332,
"step": 860
},
{
"epoch": 0.36463737427210163,
"grad_norm": 0.03257838636636734,
"learning_rate": 5.8555603913650246e-05,
"loss": 10.3345,
"step": 861
},
{
"epoch": 0.36506087877183696,
"grad_norm": 0.03526037186384201,
"learning_rate": 5.834203491239574e-05,
"loss": 10.3391,
"step": 862
},
{
"epoch": 0.36548438327157223,
"grad_norm": 0.02882193773984909,
"learning_rate": 5.812869557114658e-05,
"loss": 10.3312,
"step": 863
},
{
"epoch": 0.36590788777130756,
"grad_norm": 0.036499861627817154,
"learning_rate": 5.791558706604074e-05,
"loss": 10.3337,
"step": 864
},
{
"epoch": 0.3663313922710429,
"grad_norm": 0.029512615874409676,
"learning_rate": 5.7702710571943696e-05,
"loss": 10.3326,
"step": 865
},
{
"epoch": 0.3667548967707782,
"grad_norm": 0.029680045321583748,
"learning_rate": 5.7490067262441615e-05,
"loss": 10.3327,
"step": 866
},
{
"epoch": 0.3671784012705135,
"grad_norm": 0.03450106456875801,
"learning_rate": 5.727765830983525e-05,
"loss": 10.3335,
"step": 867
},
{
"epoch": 0.3676019057702488,
"grad_norm": 0.04042774438858032,
"learning_rate": 5.7065484885133466e-05,
"loss": 10.3325,
"step": 868
},
{
"epoch": 0.36802541026998414,
"grad_norm": 0.026330476626753807,
"learning_rate": 5.685354815804638e-05,
"loss": 10.3357,
"step": 869
},
{
"epoch": 0.3684489147697194,
"grad_norm": 0.04447445273399353,
"learning_rate": 5.664184929697945e-05,
"loss": 10.3358,
"step": 870
},
{
"epoch": 0.36887241926945474,
"grad_norm": 0.03038848005235195,
"learning_rate": 5.643038946902668e-05,
"loss": 10.3287,
"step": 871
},
{
"epoch": 0.36929592376919007,
"grad_norm": 0.03465115278959274,
"learning_rate": 5.621916983996429e-05,
"loss": 10.3332,
"step": 872
},
{
"epoch": 0.36971942826892534,
"grad_norm": 0.03414342552423477,
"learning_rate": 5.600819157424427e-05,
"loss": 10.3313,
"step": 873
},
{
"epoch": 0.37014293276866067,
"grad_norm": 0.03694528341293335,
"learning_rate": 5.579745583498801e-05,
"loss": 10.3327,
"step": 874
},
{
"epoch": 0.370566437268396,
"grad_norm": 0.03394203633069992,
"learning_rate": 5.558696378397983e-05,
"loss": 10.3348,
"step": 875
},
{
"epoch": 0.37098994176813127,
"grad_norm": 0.03003198839724064,
"learning_rate": 5.537671658166063e-05,
"loss": 10.3372,
"step": 876
},
{
"epoch": 0.3714134462678666,
"grad_norm": 0.03449974209070206,
"learning_rate": 5.51667153871214e-05,
"loss": 10.3282,
"step": 877
},
{
"epoch": 0.3718369507676019,
"grad_norm": 0.03920021653175354,
"learning_rate": 5.495696135809696e-05,
"loss": 10.3374,
"step": 878
},
{
"epoch": 0.3722604552673372,
"grad_norm": 0.02695903740823269,
"learning_rate": 5.4747455650959464e-05,
"loss": 10.3328,
"step": 879
},
{
"epoch": 0.3726839597670725,
"grad_norm": 0.02427126094698906,
"learning_rate": 5.453819942071211e-05,
"loss": 10.3338,
"step": 880
},
{
"epoch": 0.37310746426680785,
"grad_norm": 0.04181993380188942,
"learning_rate": 5.432919382098267e-05,
"loss": 10.3335,
"step": 881
},
{
"epoch": 0.3735309687665431,
"grad_norm": 0.03968915343284607,
"learning_rate": 5.412044000401726e-05,
"loss": 10.3298,
"step": 882
},
{
"epoch": 0.37395447326627845,
"grad_norm": 0.04720834270119667,
"learning_rate": 5.391193912067386e-05,
"loss": 10.3303,
"step": 883
},
{
"epoch": 0.3743779777660138,
"grad_norm": 0.03866041824221611,
"learning_rate": 5.3703692320416034e-05,
"loss": 10.3328,
"step": 884
},
{
"epoch": 0.37480148226574905,
"grad_norm": 0.031206615269184113,
"learning_rate": 5.3495700751306735e-05,
"loss": 10.3295,
"step": 885
},
{
"epoch": 0.3752249867654844,
"grad_norm": 0.02863953448832035,
"learning_rate": 5.328796556000153e-05,
"loss": 10.3352,
"step": 886
},
{
"epoch": 0.3756484912652197,
"grad_norm": 0.027401378378272057,
"learning_rate": 5.308048789174289e-05,
"loss": 10.3319,
"step": 887
},
{
"epoch": 0.376071995764955,
"grad_norm": 0.039630185812711716,
"learning_rate": 5.2873268890353424e-05,
"loss": 10.3303,
"step": 888
},
{
"epoch": 0.3764955002646903,
"grad_norm": 0.037442997097969055,
"learning_rate": 5.266630969822958e-05,
"loss": 10.3304,
"step": 889
},
{
"epoch": 0.37691900476442564,
"grad_norm": 0.03644905984401703,
"learning_rate": 5.2459611456335746e-05,
"loss": 10.3322,
"step": 890
},
{
"epoch": 0.3773425092641609,
"grad_norm": 0.03049355559051037,
"learning_rate": 5.225317530419751e-05,
"loss": 10.3303,
"step": 891
},
{
"epoch": 0.37776601376389624,
"grad_norm": 0.0509609617292881,
"learning_rate": 5.2047002379895636e-05,
"loss": 10.3276,
"step": 892
},
{
"epoch": 0.37818951826363156,
"grad_norm": 0.039859797805547714,
"learning_rate": 5.1841093820059686e-05,
"loss": 10.3278,
"step": 893
},
{
"epoch": 0.37861302276336684,
"grad_norm": 0.040018096566200256,
"learning_rate": 5.163545075986178e-05,
"loss": 10.3321,
"step": 894
},
{
"epoch": 0.37903652726310216,
"grad_norm": 0.030337341129779816,
"learning_rate": 5.143007433301035e-05,
"loss": 10.3373,
"step": 895
},
{
"epoch": 0.3794600317628375,
"grad_norm": 0.0383714959025383,
"learning_rate": 5.12249656717439e-05,
"loss": 10.3338,
"step": 896
},
{
"epoch": 0.37988353626257276,
"grad_norm": 0.03546814247965813,
"learning_rate": 5.10201259068247e-05,
"loss": 10.3348,
"step": 897
},
{
"epoch": 0.3803070407623081,
"grad_norm": 0.025767376646399498,
"learning_rate": 5.081555616753264e-05,
"loss": 10.336,
"step": 898
},
{
"epoch": 0.3807305452620434,
"grad_norm": 0.03003775328397751,
"learning_rate": 5.061125758165896e-05,
"loss": 10.3323,
"step": 899
},
{
"epoch": 0.38115404976177875,
"grad_norm": 0.04286766052246094,
"learning_rate": 5.040723127549998e-05,
"loss": 10.3369,
"step": 900
},
{
"epoch": 0.381577554261514,
"grad_norm": 0.03410165011882782,
"learning_rate": 5.0203478373850955e-05,
"loss": 10.3316,
"step": 901
},
{
"epoch": 0.38200105876124935,
"grad_norm": 0.03914531320333481,
"learning_rate": 5.000000000000002e-05,
"loss": 10.3333,
"step": 902
},
{
"epoch": 0.3824245632609847,
"grad_norm": 0.02953316643834114,
"learning_rate": 4.979679727572159e-05,
"loss": 10.3354,
"step": 903
},
{
"epoch": 0.38284806776071995,
"grad_norm": 0.034125540405511856,
"learning_rate": 4.959387132127054e-05,
"loss": 10.3298,
"step": 904
},
{
"epoch": 0.3832715722604553,
"grad_norm": 0.035765476524829865,
"learning_rate": 4.939122325537604e-05,
"loss": 10.3343,
"step": 905
},
{
"epoch": 0.3836950767601906,
"grad_norm": 0.04484931752085686,
"learning_rate": 4.918885419523499e-05,
"loss": 10.3357,
"step": 906
},
{
"epoch": 0.3841185812599259,
"grad_norm": 0.033943966031074524,
"learning_rate": 4.898676525650639e-05,
"loss": 10.3321,
"step": 907
},
{
"epoch": 0.3845420857596612,
"grad_norm": 0.03354055806994438,
"learning_rate": 4.8784957553304876e-05,
"loss": 10.3308,
"step": 908
},
{
"epoch": 0.38496559025939653,
"grad_norm": 0.029053689911961555,
"learning_rate": 4.858343219819442e-05,
"loss": 10.3289,
"step": 909
},
{
"epoch": 0.3853890947591318,
"grad_norm": 0.04069928824901581,
"learning_rate": 4.838219030218274e-05,
"loss": 10.3315,
"step": 910
},
{
"epoch": 0.38581259925886713,
"grad_norm": 0.035262517631053925,
"learning_rate": 4.818123297471463e-05,
"loss": 10.3373,
"step": 911
},
{
"epoch": 0.38623610375860246,
"grad_norm": 0.034540239721536636,
"learning_rate": 4.7980561323666115e-05,
"loss": 10.323,
"step": 912
},
{
"epoch": 0.38665960825833773,
"grad_norm": 0.031878579407930374,
"learning_rate": 4.77801764553383e-05,
"loss": 10.3362,
"step": 913
},
{
"epoch": 0.38708311275807306,
"grad_norm": 0.029519235715270042,
"learning_rate": 4.758007947445125e-05,
"loss": 10.3275,
"step": 914
},
{
"epoch": 0.3875066172578084,
"grad_norm": 0.03876268118619919,
"learning_rate": 4.7380271484137915e-05,
"loss": 10.3288,
"step": 915
},
{
"epoch": 0.38793012175754366,
"grad_norm": 0.029615303501486778,
"learning_rate": 4.718075358593802e-05,
"loss": 10.3347,
"step": 916
},
{
"epoch": 0.388353626257279,
"grad_norm": 0.030300240963697433,
"learning_rate": 4.698152687979205e-05,
"loss": 10.3329,
"step": 917
},
{
"epoch": 0.3887771307570143,
"grad_norm": 0.05134044587612152,
"learning_rate": 4.678259246403512e-05,
"loss": 10.3394,
"step": 918
},
{
"epoch": 0.3892006352567496,
"grad_norm": 0.04115286096930504,
"learning_rate": 4.6583951435390973e-05,
"loss": 10.3301,
"step": 919
},
{
"epoch": 0.3896241397564849,
"grad_norm": 0.033398233354091644,
"learning_rate": 4.638560488896589e-05,
"loss": 10.3336,
"step": 920
},
{
"epoch": 0.39004764425622024,
"grad_norm": 0.0268535315990448,
"learning_rate": 4.618755391824268e-05,
"loss": 10.3314,
"step": 921
},
{
"epoch": 0.3904711487559555,
"grad_norm": 0.042327847331762314,
"learning_rate": 4.598979961507471e-05,
"loss": 10.3317,
"step": 922
},
{
"epoch": 0.39089465325569084,
"grad_norm": 0.033836785703897476,
"learning_rate": 4.57923430696797e-05,
"loss": 10.3344,
"step": 923
},
{
"epoch": 0.39131815775542617,
"grad_norm": 0.03876091167330742,
"learning_rate": 4.5595185370633875e-05,
"loss": 10.3312,
"step": 924
},
{
"epoch": 0.39174166225516144,
"grad_norm": 0.04275533929467201,
"learning_rate": 4.5398327604866054e-05,
"loss": 10.3328,
"step": 925
},
{
"epoch": 0.39216516675489677,
"grad_norm": 0.038993559777736664,
"learning_rate": 4.5201770857651274e-05,
"loss": 10.3345,
"step": 926
},
{
"epoch": 0.3925886712546321,
"grad_norm": 0.028194980695843697,
"learning_rate": 4.50055162126053e-05,
"loss": 10.3356,
"step": 927
},
{
"epoch": 0.39301217575436737,
"grad_norm": 0.038792677223682404,
"learning_rate": 4.48095647516783e-05,
"loss": 10.3328,
"step": 928
},
{
"epoch": 0.3934356802541027,
"grad_norm": 0.031241275370121002,
"learning_rate": 4.461391755514899e-05,
"loss": 10.3274,
"step": 929
},
{
"epoch": 0.393859184753838,
"grad_norm": 0.04370317608118057,
"learning_rate": 4.4418575701618715e-05,
"loss": 10.3334,
"step": 930
},
{
"epoch": 0.3942826892535733,
"grad_norm": 0.032410670071840286,
"learning_rate": 4.422354026800536e-05,
"loss": 10.3373,
"step": 931
},
{
"epoch": 0.3947061937533086,
"grad_norm": 0.02156672440469265,
"learning_rate": 4.4028812329537694e-05,
"loss": 10.3344,
"step": 932
},
{
"epoch": 0.39512969825304395,
"grad_norm": 0.042322322726249695,
"learning_rate": 4.3834392959749146e-05,
"loss": 10.3309,
"step": 933
},
{
"epoch": 0.3955532027527793,
"grad_norm": 0.027538040652871132,
"learning_rate": 4.3640283230472044e-05,
"loss": 10.3305,
"step": 934
},
{
"epoch": 0.39597670725251455,
"grad_norm": 0.026913011446595192,
"learning_rate": 4.344648421183166e-05,
"loss": 10.3326,
"step": 935
},
{
"epoch": 0.3964002117522499,
"grad_norm": 0.03797266632318497,
"learning_rate": 4.3252996972240324e-05,
"loss": 10.3286,
"step": 936
},
{
"epoch": 0.3968237162519852,
"grad_norm": 0.03437899798154831,
"learning_rate": 4.305982257839154e-05,
"loss": 10.3333,
"step": 937
},
{
"epoch": 0.3972472207517205,
"grad_norm": 0.032235968858003616,
"learning_rate": 4.286696209525409e-05,
"loss": 10.3373,
"step": 938
},
{
"epoch": 0.3976707252514558,
"grad_norm": 0.03257599472999573,
"learning_rate": 4.2674416586066165e-05,
"loss": 10.3336,
"step": 939
},
{
"epoch": 0.39809422975119113,
"grad_norm": 0.03536880016326904,
"learning_rate": 4.248218711232952e-05,
"loss": 10.3347,
"step": 940
},
{
"epoch": 0.3985177342509264,
"grad_norm": 0.03932619467377663,
"learning_rate": 4.229027473380355e-05,
"loss": 10.3343,
"step": 941
},
{
"epoch": 0.39894123875066173,
"grad_norm": 0.03219004347920418,
"learning_rate": 4.2098680508499665e-05,
"loss": 10.3355,
"step": 942
},
{
"epoch": 0.39936474325039706,
"grad_norm": 0.03659631311893463,
"learning_rate": 4.1907405492675065e-05,
"loss": 10.3342,
"step": 943
},
{
"epoch": 0.39978824775013233,
"grad_norm": 0.02803085185587406,
"learning_rate": 4.171645074082737e-05,
"loss": 10.3313,
"step": 944
},
{
"epoch": 0.40021175224986766,
"grad_norm": 0.024601435288786888,
"learning_rate": 4.15258173056885e-05,
"loss": 10.3333,
"step": 945
},
{
"epoch": 0.400635256749603,
"grad_norm": 0.036193400621414185,
"learning_rate": 4.133550623821885e-05,
"loss": 10.3359,
"step": 946
},
{
"epoch": 0.40105876124933826,
"grad_norm": 0.03234044834971428,
"learning_rate": 4.114551858760183e-05,
"loss": 10.3351,
"step": 947
},
{
"epoch": 0.4014822657490736,
"grad_norm": 0.03343448042869568,
"learning_rate": 4.095585540123762e-05,
"loss": 10.3276,
"step": 948
},
{
"epoch": 0.4019057702488089,
"grad_norm": 0.030924122780561447,
"learning_rate": 4.076651772473783e-05,
"loss": 10.3379,
"step": 949
},
{
"epoch": 0.4023292747485442,
"grad_norm": 0.044260427355766296,
"learning_rate": 4.0577506601919467e-05,
"loss": 10.3332,
"step": 950
},
{
"epoch": 0.4027527792482795,
"grad_norm": 0.027923841029405594,
"learning_rate": 4.038882307479912e-05,
"loss": 10.3391,
"step": 951
},
{
"epoch": 0.40317628374801484,
"grad_norm": 0.0312094334512949,
"learning_rate": 4.0200468183587556e-05,
"loss": 10.3327,
"step": 952
},
{
"epoch": 0.4035997882477501,
"grad_norm": 0.03471310809254646,
"learning_rate": 4.0012442966683674e-05,
"loss": 10.3367,
"step": 953
},
{
"epoch": 0.40402329274748544,
"grad_norm": 0.03101627714931965,
"learning_rate": 3.982474846066886e-05,
"loss": 10.3284,
"step": 954
},
{
"epoch": 0.40444679724722077,
"grad_norm": 0.03931306675076485,
"learning_rate": 3.963738570030134e-05,
"loss": 10.3312,
"step": 955
},
{
"epoch": 0.40487030174695604,
"grad_norm": 0.024929361417889595,
"learning_rate": 3.94503557185104e-05,
"loss": 10.3326,
"step": 956
},
{
"epoch": 0.40529380624669137,
"grad_norm": 0.043676454573869705,
"learning_rate": 3.926365954639073e-05,
"loss": 10.3289,
"step": 957
},
{
"epoch": 0.4057173107464267,
"grad_norm": 0.03379151597619057,
"learning_rate": 3.90772982131967e-05,
"loss": 10.3342,
"step": 958
},
{
"epoch": 0.40614081524616197,
"grad_norm": 0.03445500135421753,
"learning_rate": 3.8891272746336845e-05,
"loss": 10.337,
"step": 959
},
{
"epoch": 0.4065643197458973,
"grad_norm": 0.03671969100832939,
"learning_rate": 3.8705584171367885e-05,
"loss": 10.3389,
"step": 960
},
{
"epoch": 0.4069878242456326,
"grad_norm": 0.03856462240219116,
"learning_rate": 3.8520233511989324e-05,
"loss": 10.3318,
"step": 961
},
{
"epoch": 0.4074113287453679,
"grad_norm": 0.037579286843538284,
"learning_rate": 3.833522179003788e-05,
"loss": 10.3312,
"step": 962
},
{
"epoch": 0.4078348332451032,
"grad_norm": 0.0324142761528492,
"learning_rate": 3.8150550025481445e-05,
"loss": 10.3357,
"step": 963
},
{
"epoch": 0.40825833774483855,
"grad_norm": 0.035630084574222565,
"learning_rate": 3.796621923641404e-05,
"loss": 10.3304,
"step": 964
},
{
"epoch": 0.4086818422445738,
"grad_norm": 0.029326455667614937,
"learning_rate": 3.77822304390496e-05,
"loss": 10.3306,
"step": 965
},
{
"epoch": 0.40910534674430915,
"grad_norm": 0.03198442980647087,
"learning_rate": 3.7598584647716804e-05,
"loss": 10.3319,
"step": 966
},
{
"epoch": 0.4095288512440445,
"grad_norm": 0.035467833280563354,
"learning_rate": 3.7415282874853444e-05,
"loss": 10.3316,
"step": 967
},
{
"epoch": 0.40995235574377975,
"grad_norm": 0.047377362847328186,
"learning_rate": 3.723232613100046e-05,
"loss": 10.3287,
"step": 968
},
{
"epoch": 0.4103758602435151,
"grad_norm": 0.036050811409950256,
"learning_rate": 3.704971542479695e-05,
"loss": 10.3347,
"step": 969
},
{
"epoch": 0.4107993647432504,
"grad_norm": 0.037851523607969284,
"learning_rate": 3.6867451762974114e-05,
"loss": 10.3334,
"step": 970
},
{
"epoch": 0.41122286924298573,
"grad_norm": 0.030836213380098343,
"learning_rate": 3.6685536150349986e-05,
"loss": 10.3328,
"step": 971
},
{
"epoch": 0.411646373742721,
"grad_norm": 0.026154899969697,
"learning_rate": 3.650396958982377e-05,
"loss": 10.3323,
"step": 972
},
{
"epoch": 0.41206987824245633,
"grad_norm": 0.036884456872940063,
"learning_rate": 3.6322753082370365e-05,
"loss": 10.33,
"step": 973
},
{
"epoch": 0.41249338274219166,
"grad_norm": 0.041880205273628235,
"learning_rate": 3.614188762703482e-05,
"loss": 10.3294,
"step": 974
},
{
"epoch": 0.41291688724192693,
"grad_norm": 0.04928620532155037,
"learning_rate": 3.596137422092686e-05,
"loss": 10.3351,
"step": 975
},
{
"epoch": 0.41334039174166226,
"grad_norm": 0.027833838015794754,
"learning_rate": 3.578121385921533e-05,
"loss": 10.3309,
"step": 976
},
{
"epoch": 0.4137638962413976,
"grad_norm": 0.03103015385568142,
"learning_rate": 3.560140753512279e-05,
"loss": 10.3359,
"step": 977
},
{
"epoch": 0.41418740074113286,
"grad_norm": 0.03528593108057976,
"learning_rate": 3.542195623991991e-05,
"loss": 10.3282,
"step": 978
},
{
"epoch": 0.4146109052408682,
"grad_norm": 0.03291507437825203,
"learning_rate": 3.524286096292025e-05,
"loss": 10.3309,
"step": 979
},
{
"epoch": 0.4150344097406035,
"grad_norm": 0.04097427800297737,
"learning_rate": 3.5064122691474454e-05,
"loss": 10.3362,
"step": 980
},
{
"epoch": 0.4154579142403388,
"grad_norm": 0.04069104790687561,
"learning_rate": 3.4885742410965104e-05,
"loss": 10.3347,
"step": 981
},
{
"epoch": 0.4158814187400741,
"grad_norm": 0.03851715475320816,
"learning_rate": 3.4707721104801175e-05,
"loss": 10.334,
"step": 982
},
{
"epoch": 0.41630492323980944,
"grad_norm": 0.03845444321632385,
"learning_rate": 3.4530059754412555e-05,
"loss": 10.3324,
"step": 983
},
{
"epoch": 0.4167284277395447,
"grad_norm": 0.027850644662976265,
"learning_rate": 3.435275933924487e-05,
"loss": 10.3309,
"step": 984
},
{
"epoch": 0.41715193223928004,
"grad_norm": 0.03326322138309479,
"learning_rate": 3.417582083675365e-05,
"loss": 10.3325,
"step": 985
},
{
"epoch": 0.41757543673901537,
"grad_norm": 0.027192946523427963,
"learning_rate": 3.399924522239943e-05,
"loss": 10.332,
"step": 986
},
{
"epoch": 0.41799894123875064,
"grad_norm": 0.035279251635074615,
"learning_rate": 3.382303346964209e-05,
"loss": 10.3317,
"step": 987
},
{
"epoch": 0.41842244573848597,
"grad_norm": 0.03443683683872223,
"learning_rate": 3.36471865499354e-05,
"loss": 10.3326,
"step": 988
},
{
"epoch": 0.4188459502382213,
"grad_norm": 0.030605580657720566,
"learning_rate": 3.3471705432722035e-05,
"loss": 10.3345,
"step": 989
},
{
"epoch": 0.41926945473795657,
"grad_norm": 0.032888561487197876,
"learning_rate": 3.329659108542785e-05,
"loss": 10.3265,
"step": 990
},
{
"epoch": 0.4196929592376919,
"grad_norm": 0.02829040214419365,
"learning_rate": 3.3121844473456756e-05,
"loss": 10.3325,
"step": 991
},
{
"epoch": 0.4201164637374272,
"grad_norm": 0.030971676111221313,
"learning_rate": 3.294746656018532e-05,
"loss": 10.3281,
"step": 992
},
{
"epoch": 0.4205399682371625,
"grad_norm": 0.03257730230689049,
"learning_rate": 3.2773458306957495e-05,
"loss": 10.3281,
"step": 993
},
{
"epoch": 0.4209634727368978,
"grad_norm": 0.03114408068358898,
"learning_rate": 3.259982067307928e-05,
"loss": 10.3343,
"step": 994
},
{
"epoch": 0.42138697723663315,
"grad_norm": 0.03386252745985985,
"learning_rate": 3.2426554615813484e-05,
"loss": 10.3316,
"step": 995
},
{
"epoch": 0.4218104817363684,
"grad_norm": 0.03416220098733902,
"learning_rate": 3.2253661090374396e-05,
"loss": 10.329,
"step": 996
},
{
"epoch": 0.42223398623610375,
"grad_norm": 0.031190721318125725,
"learning_rate": 3.2081141049922535e-05,
"loss": 10.3331,
"step": 997
},
{
"epoch": 0.4226574907358391,
"grad_norm": 0.03264687955379486,
"learning_rate": 3.190899544555941e-05,
"loss": 10.3313,
"step": 998
},
{
"epoch": 0.42308099523557435,
"grad_norm": 0.03346019983291626,
"learning_rate": 3.173722522632228e-05,
"loss": 10.3353,
"step": 999
},
{
"epoch": 0.4235044997353097,
"grad_norm": 0.03909333422780037,
"learning_rate": 3.156583133917884e-05,
"loss": 10.3316,
"step": 1000
},
{
"epoch": 0.423928004235045,
"grad_norm": 0.030095241963863373,
"learning_rate": 3.1394814729022235e-05,
"loss": 10.3369,
"step": 1001
},
{
"epoch": 0.4243515087347803,
"grad_norm": 0.02995004691183567,
"learning_rate": 3.1224176338665476e-05,
"loss": 10.3329,
"step": 1002
},
{
"epoch": 0.4247750132345156,
"grad_norm": 0.039438553154468536,
"learning_rate": 3.105391710883656e-05,
"loss": 10.3305,
"step": 1003
},
{
"epoch": 0.42519851773425094,
"grad_norm": 0.04090533405542374,
"learning_rate": 3.088403797817325e-05,
"loss": 10.3314,
"step": 1004
},
{
"epoch": 0.42562202223398626,
"grad_norm": 0.0377449207007885,
"learning_rate": 3.071453988321762e-05,
"loss": 10.3298,
"step": 1005
},
{
"epoch": 0.42604552673372154,
"grad_norm": 0.06536738574504852,
"learning_rate": 3.0545423758411295e-05,
"loss": 10.3276,
"step": 1006
},
{
"epoch": 0.42646903123345686,
"grad_norm": 0.0357985682785511,
"learning_rate": 3.037669053609006e-05,
"loss": 10.3334,
"step": 1007
},
{
"epoch": 0.4268925357331922,
"grad_norm": 0.03490246832370758,
"learning_rate": 3.0208341146478602e-05,
"loss": 10.3342,
"step": 1008
},
{
"epoch": 0.42731604023292746,
"grad_norm": 0.03769504651427269,
"learning_rate": 3.0040376517685764e-05,
"loss": 10.3334,
"step": 1009
},
{
"epoch": 0.4277395447326628,
"grad_norm": 0.027772339060902596,
"learning_rate": 2.9872797575699097e-05,
"loss": 10.3321,
"step": 1010
},
{
"epoch": 0.4281630492323981,
"grad_norm": 0.034188639372587204,
"learning_rate": 2.9705605244379853e-05,
"loss": 10.3324,
"step": 1011
},
{
"epoch": 0.4281630492323981,
"eval_loss": 10.331077575683594,
"eval_runtime": 3.4933,
"eval_samples_per_second": 284.832,
"eval_steps_per_second": 142.559,
"step": 1011
},
{
"epoch": 0.4285865537321334,
"grad_norm": 0.03415974974632263,
"learning_rate": 2.9538800445457946e-05,
"loss": 10.3323,
"step": 1012
},
{
"epoch": 0.4290100582318687,
"grad_norm": 0.039172153919935226,
"learning_rate": 2.9372384098526784e-05,
"loss": 10.3347,
"step": 1013
},
{
"epoch": 0.42943356273160405,
"grad_norm": 0.031853485852479935,
"learning_rate": 2.9206357121038285e-05,
"loss": 10.3338,
"step": 1014
},
{
"epoch": 0.4298570672313393,
"grad_norm": 0.04923943430185318,
"learning_rate": 2.904072042829775e-05,
"loss": 10.3323,
"step": 1015
},
{
"epoch": 0.43028057173107465,
"grad_norm": 0.03674182668328285,
"learning_rate": 2.8875474933458847e-05,
"loss": 10.3334,
"step": 1016
},
{
"epoch": 0.43070407623081,
"grad_norm": 0.030546877533197403,
"learning_rate": 2.871062154751858e-05,
"loss": 10.3296,
"step": 1017
},
{
"epoch": 0.43112758073054525,
"grad_norm": 0.030613403767347336,
"learning_rate": 2.8546161179312248e-05,
"loss": 10.3354,
"step": 1018
},
{
"epoch": 0.4315510852302806,
"grad_norm": 0.030776720494031906,
"learning_rate": 2.8382094735508457e-05,
"loss": 10.3303,
"step": 1019
},
{
"epoch": 0.4319745897300159,
"grad_norm": 0.03810757398605347,
"learning_rate": 2.821842312060409e-05,
"loss": 10.3334,
"step": 1020
},
{
"epoch": 0.4323980942297512,
"grad_norm": 0.030035821720957756,
"learning_rate": 2.8055147236919442e-05,
"loss": 10.3345,
"step": 1021
},
{
"epoch": 0.4328215987294865,
"grad_norm": 0.03650267794728279,
"learning_rate": 2.789226798459298e-05,
"loss": 10.3299,
"step": 1022
},
{
"epoch": 0.43324510322922183,
"grad_norm": 0.030346672981977463,
"learning_rate": 2.7729786261576617e-05,
"loss": 10.334,
"step": 1023
},
{
"epoch": 0.4336686077289571,
"grad_norm": 0.0330539271235466,
"learning_rate": 2.7567702963630803e-05,
"loss": 10.3316,
"step": 1024
},
{
"epoch": 0.43409211222869243,
"grad_norm": 0.03174733370542526,
"learning_rate": 2.740601898431925e-05,
"loss": 10.3278,
"step": 1025
},
{
"epoch": 0.43451561672842776,
"grad_norm": 0.03628386929631233,
"learning_rate": 2.7244735215004446e-05,
"loss": 10.3274,
"step": 1026
},
{
"epoch": 0.43493912122816303,
"grad_norm": 0.024906015023589134,
"learning_rate": 2.7083852544842436e-05,
"loss": 10.3332,
"step": 1027
},
{
"epoch": 0.43536262572789836,
"grad_norm": 0.043956976383924484,
"learning_rate": 2.692337186077791e-05,
"loss": 10.3266,
"step": 1028
},
{
"epoch": 0.4357861302276337,
"grad_norm": 0.032996706664562225,
"learning_rate": 2.67632940475396e-05,
"loss": 10.3346,
"step": 1029
},
{
"epoch": 0.43620963472736896,
"grad_norm": 0.044276829808950424,
"learning_rate": 2.6603619987635086e-05,
"loss": 10.3274,
"step": 1030
},
{
"epoch": 0.4366331392271043,
"grad_norm": 0.038449618965387344,
"learning_rate": 2.64443505613461e-05,
"loss": 10.3341,
"step": 1031
},
{
"epoch": 0.4370566437268396,
"grad_norm": 0.03220584616065025,
"learning_rate": 2.6285486646723634e-05,
"loss": 10.3324,
"step": 1032
},
{
"epoch": 0.4374801482265749,
"grad_norm": 0.03746611624956131,
"learning_rate": 2.612702911958308e-05,
"loss": 10.3354,
"step": 1033
},
{
"epoch": 0.4379036527263102,
"grad_norm": 0.04333876073360443,
"learning_rate": 2.5968978853499425e-05,
"loss": 10.329,
"step": 1034
},
{
"epoch": 0.43832715722604554,
"grad_norm": 0.03539913892745972,
"learning_rate": 2.581133671980246e-05,
"loss": 10.3324,
"step": 1035
},
{
"epoch": 0.4387506617257808,
"grad_norm": 0.04690808430314064,
"learning_rate": 2.565410358757189e-05,
"loss": 10.3316,
"step": 1036
},
{
"epoch": 0.43917416622551614,
"grad_norm": 0.038458049297332764,
"learning_rate": 2.5497280323632654e-05,
"loss": 10.3431,
"step": 1037
},
{
"epoch": 0.43959767072525147,
"grad_norm": 0.03451355919241905,
"learning_rate": 2.534086779255005e-05,
"loss": 10.3296,
"step": 1038
},
{
"epoch": 0.44002117522498674,
"grad_norm": 0.03873763233423233,
"learning_rate": 2.5184866856625023e-05,
"loss": 10.3273,
"step": 1039
},
{
"epoch": 0.44044467972472207,
"grad_norm": 0.044388849288225174,
"learning_rate": 2.5029278375889387e-05,
"loss": 10.3324,
"step": 1040
},
{
"epoch": 0.4408681842244574,
"grad_norm": 0.03534289821982384,
"learning_rate": 2.4874103208101183e-05,
"loss": 10.3343,
"step": 1041
},
{
"epoch": 0.4412916887241927,
"grad_norm": 0.0375693254172802,
"learning_rate": 2.4719342208739693e-05,
"loss": 10.3323,
"step": 1042
},
{
"epoch": 0.441715193223928,
"grad_norm": 0.03341260179877281,
"learning_rate": 2.456499623100098e-05,
"loss": 10.3318,
"step": 1043
},
{
"epoch": 0.4421386977236633,
"grad_norm": 0.04234972223639488,
"learning_rate": 2.4411066125793203e-05,
"loss": 10.3319,
"step": 1044
},
{
"epoch": 0.44256220222339865,
"grad_norm": 0.031914252787828445,
"learning_rate": 2.4257552741731592e-05,
"loss": 10.3361,
"step": 1045
},
{
"epoch": 0.4429857067231339,
"grad_norm": 0.05003447085618973,
"learning_rate": 2.41044569251342e-05,
"loss": 10.3313,
"step": 1046
},
{
"epoch": 0.44340921122286925,
"grad_norm": 0.03364928439259529,
"learning_rate": 2.3951779520016937e-05,
"loss": 10.33,
"step": 1047
},
{
"epoch": 0.4438327157226046,
"grad_norm": 0.028291532769799232,
"learning_rate": 2.379952136808903e-05,
"loss": 10.3336,
"step": 1048
},
{
"epoch": 0.44425622022233985,
"grad_norm": 0.042799290269613266,
"learning_rate": 2.3647683308748392e-05,
"loss": 10.3348,
"step": 1049
},
{
"epoch": 0.4446797247220752,
"grad_norm": 0.042522724717855453,
"learning_rate": 2.3496266179076864e-05,
"loss": 10.3288,
"step": 1050
},
{
"epoch": 0.4451032292218105,
"grad_norm": 0.02918383479118347,
"learning_rate": 2.3345270813835886e-05,
"loss": 10.3361,
"step": 1051
},
{
"epoch": 0.4455267337215458,
"grad_norm": 0.046009406447410583,
"learning_rate": 2.319469804546156e-05,
"loss": 10.3349,
"step": 1052
},
{
"epoch": 0.4459502382212811,
"grad_norm": 0.03431849181652069,
"learning_rate": 2.3044548704060288e-05,
"loss": 10.3283,
"step": 1053
},
{
"epoch": 0.44637374272101643,
"grad_norm": 0.03582574054598808,
"learning_rate": 2.2894823617404104e-05,
"loss": 10.3314,
"step": 1054
},
{
"epoch": 0.4467972472207517,
"grad_norm": 0.02972414344549179,
"learning_rate": 2.2745523610926122e-05,
"loss": 10.3289,
"step": 1055
},
{
"epoch": 0.44722075172048703,
"grad_norm": 0.03548819199204445,
"learning_rate": 2.2596649507716018e-05,
"loss": 10.3299,
"step": 1056
},
{
"epoch": 0.44764425622022236,
"grad_norm": 0.04241335019469261,
"learning_rate": 2.244820212851544e-05,
"loss": 10.3308,
"step": 1057
},
{
"epoch": 0.44806776071995763,
"grad_norm": 0.033176884055137634,
"learning_rate": 2.2300182291713513e-05,
"loss": 10.3351,
"step": 1058
},
{
"epoch": 0.44849126521969296,
"grad_norm": 0.032935190945863724,
"learning_rate": 2.2152590813342345e-05,
"loss": 10.3356,
"step": 1059
},
{
"epoch": 0.4489147697194283,
"grad_norm": 0.030969172716140747,
"learning_rate": 2.2005428507072467e-05,
"loss": 10.3307,
"step": 1060
},
{
"epoch": 0.44933827421916356,
"grad_norm": 0.036834247410297394,
"learning_rate": 2.1858696184208484e-05,
"loss": 10.3324,
"step": 1061
},
{
"epoch": 0.4497617787188989,
"grad_norm": 0.038617976009845734,
"learning_rate": 2.1712394653684344e-05,
"loss": 10.3371,
"step": 1062
},
{
"epoch": 0.4501852832186342,
"grad_norm": 0.026445934548974037,
"learning_rate": 2.15665247220592e-05,
"loss": 10.3334,
"step": 1063
},
{
"epoch": 0.4506087877183695,
"grad_norm": 0.04230870306491852,
"learning_rate": 2.1421087193512756e-05,
"loss": 10.3261,
"step": 1064
},
{
"epoch": 0.4510322922181048,
"grad_norm": 0.03189300373196602,
"learning_rate": 2.1276082869840765e-05,
"loss": 10.3297,
"step": 1065
},
{
"epoch": 0.45145579671784014,
"grad_norm": 0.03367699310183525,
"learning_rate": 2.113151255045095e-05,
"loss": 10.3308,
"step": 1066
},
{
"epoch": 0.4518793012175754,
"grad_norm": 0.032475464046001434,
"learning_rate": 2.0987377032358114e-05,
"loss": 10.339,
"step": 1067
},
{
"epoch": 0.45230280571731074,
"grad_norm": 0.04436371102929115,
"learning_rate": 2.084367711018024e-05,
"loss": 10.3301,
"step": 1068
},
{
"epoch": 0.45272631021704607,
"grad_norm": 0.037988126277923584,
"learning_rate": 2.070041357613376e-05,
"loss": 10.3309,
"step": 1069
},
{
"epoch": 0.45314981471678134,
"grad_norm": 0.03870435804128647,
"learning_rate": 2.0557587220029228e-05,
"loss": 10.3353,
"step": 1070
},
{
"epoch": 0.45357331921651667,
"grad_norm": 0.03660368546843529,
"learning_rate": 2.0415198829267212e-05,
"loss": 10.3317,
"step": 1071
},
{
"epoch": 0.453996823716252,
"grad_norm": 0.03593965247273445,
"learning_rate": 2.0273249188833654e-05,
"loss": 10.3343,
"step": 1072
},
{
"epoch": 0.45442032821598727,
"grad_norm": 0.03798775374889374,
"learning_rate": 2.013173908129573e-05,
"loss": 10.329,
"step": 1073
},
{
"epoch": 0.4548438327157226,
"grad_norm": 0.030165789648890495,
"learning_rate": 1.9990669286797438e-05,
"loss": 10.3325,
"step": 1074
},
{
"epoch": 0.4552673372154579,
"grad_norm": 0.029242129996418953,
"learning_rate": 1.985004058305535e-05,
"loss": 10.3337,
"step": 1075
},
{
"epoch": 0.45569084171519325,
"grad_norm": 0.029076050966978073,
"learning_rate": 1.9709853745354313e-05,
"loss": 10.3347,
"step": 1076
},
{
"epoch": 0.4561143462149285,
"grad_norm": 0.039899520576000214,
"learning_rate": 1.9570109546543126e-05,
"loss": 10.3334,
"step": 1077
},
{
"epoch": 0.45653785071466385,
"grad_norm": 0.03501451388001442,
"learning_rate": 1.943080875703045e-05,
"loss": 10.325,
"step": 1078
},
{
"epoch": 0.4569613552143992,
"grad_norm": 0.029382554814219475,
"learning_rate": 1.929195214478028e-05,
"loss": 10.336,
"step": 1079
},
{
"epoch": 0.45738485971413445,
"grad_norm": 0.03819538280367851,
"learning_rate": 1.915354047530791e-05,
"loss": 10.3329,
"step": 1080
},
{
"epoch": 0.4578083642138698,
"grad_norm": 0.03543626144528389,
"learning_rate": 1.901557451167578e-05,
"loss": 10.3326,
"step": 1081
},
{
"epoch": 0.4582318687136051,
"grad_norm": 0.04363977536559105,
"learning_rate": 1.887805501448896e-05,
"loss": 10.3289,
"step": 1082
},
{
"epoch": 0.4586553732133404,
"grad_norm": 0.03918329253792763,
"learning_rate": 1.8740982741891377e-05,
"loss": 10.3276,
"step": 1083
},
{
"epoch": 0.4590788777130757,
"grad_norm": 0.029666945338249207,
"learning_rate": 1.860435844956121e-05,
"loss": 10.3307,
"step": 1084
},
{
"epoch": 0.45950238221281103,
"grad_norm": 0.035329993814229965,
"learning_rate": 1.8468182890707007e-05,
"loss": 10.3336,
"step": 1085
},
{
"epoch": 0.4599258867125463,
"grad_norm": 0.040378130972385406,
"learning_rate": 1.833245681606356e-05,
"loss": 10.3296,
"step": 1086
},
{
"epoch": 0.46034939121228163,
"grad_norm": 0.04233788326382637,
"learning_rate": 1.8197180973887428e-05,
"loss": 10.3312,
"step": 1087
},
{
"epoch": 0.46077289571201696,
"grad_norm": 0.03670990467071533,
"learning_rate": 1.806235610995327e-05,
"loss": 10.3303,
"step": 1088
},
{
"epoch": 0.46119640021175223,
"grad_norm": 0.03234660625457764,
"learning_rate": 1.7927982967549384e-05,
"loss": 10.3355,
"step": 1089
},
{
"epoch": 0.46161990471148756,
"grad_norm": 0.042892660945653915,
"learning_rate": 1.7794062287473735e-05,
"loss": 10.331,
"step": 1090
},
{
"epoch": 0.4620434092112229,
"grad_norm": 0.04852224513888359,
"learning_rate": 1.7660594808029908e-05,
"loss": 10.3361,
"step": 1091
},
{
"epoch": 0.46246691371095816,
"grad_norm": 0.036822058260440826,
"learning_rate": 1.7527581265022965e-05,
"loss": 10.3364,
"step": 1092
},
{
"epoch": 0.4628904182106935,
"grad_norm": 0.03043217770755291,
"learning_rate": 1.7395022391755434e-05,
"loss": 10.335,
"step": 1093
},
{
"epoch": 0.4633139227104288,
"grad_norm": 0.027736082673072815,
"learning_rate": 1.7262918919023243e-05,
"loss": 10.3335,
"step": 1094
},
{
"epoch": 0.4637374272101641,
"grad_norm": 0.03186174854636192,
"learning_rate": 1.713127157511172e-05,
"loss": 10.3365,
"step": 1095
},
{
"epoch": 0.4641609317098994,
"grad_norm": 0.03788574039936066,
"learning_rate": 1.700008108579154e-05,
"loss": 10.3317,
"step": 1096
},
{
"epoch": 0.46458443620963474,
"grad_norm": 0.047464434057474136,
"learning_rate": 1.6869348174314738e-05,
"loss": 10.3307,
"step": 1097
},
{
"epoch": 0.46500794070937,
"grad_norm": 0.03223862871527672,
"learning_rate": 1.673907356141079e-05,
"loss": 10.3337,
"step": 1098
},
{
"epoch": 0.46543144520910534,
"grad_norm": 0.02775878831744194,
"learning_rate": 1.6609257965282453e-05,
"loss": 10.3376,
"step": 1099
},
{
"epoch": 0.46585494970884067,
"grad_norm": 0.0346621610224247,
"learning_rate": 1.647990210160204e-05,
"loss": 10.334,
"step": 1100
},
{
"epoch": 0.46627845420857594,
"grad_norm": 0.03867461159825325,
"learning_rate": 1.6351006683507297e-05,
"loss": 10.3321,
"step": 1101
},
{
"epoch": 0.46670195870831127,
"grad_norm": 0.033736009150743484,
"learning_rate": 1.622257242159756e-05,
"loss": 10.329,
"step": 1102
},
{
"epoch": 0.4671254632080466,
"grad_norm": 0.03446945920586586,
"learning_rate": 1.6094600023929884e-05,
"loss": 10.3281,
"step": 1103
},
{
"epoch": 0.46754896770778187,
"grad_norm": 0.03439204394817352,
"learning_rate": 1.59670901960149e-05,
"loss": 10.3339,
"step": 1104
},
{
"epoch": 0.4679724722075172,
"grad_norm": 0.03250345215201378,
"learning_rate": 1.5840043640813274e-05,
"loss": 10.3308,
"step": 1105
},
{
"epoch": 0.4683959767072525,
"grad_norm": 0.030219173058867455,
"learning_rate": 1.5713461058731572e-05,
"loss": 10.333,
"step": 1106
},
{
"epoch": 0.4688194812069878,
"grad_norm": 0.031828220933675766,
"learning_rate": 1.558734314761844e-05,
"loss": 10.3353,
"step": 1107
},
{
"epoch": 0.4692429857067231,
"grad_norm": 0.047410812228918076,
"learning_rate": 1.546169060276088e-05,
"loss": 10.3289,
"step": 1108
},
{
"epoch": 0.46966649020645845,
"grad_norm": 0.036803584545850754,
"learning_rate": 1.53365041168803e-05,
"loss": 10.3358,
"step": 1109
},
{
"epoch": 0.4700899947061937,
"grad_norm": 0.03534479811787605,
"learning_rate": 1.5211784380128714e-05,
"loss": 10.33,
"step": 1110
},
{
"epoch": 0.47051349920592905,
"grad_norm": 0.036183904856443405,
"learning_rate": 1.5087532080084976e-05,
"loss": 10.3289,
"step": 1111
},
{
"epoch": 0.4709370037056644,
"grad_norm": 0.033738043159246445,
"learning_rate": 1.4963747901750936e-05,
"loss": 10.3303,
"step": 1112
},
{
"epoch": 0.4713605082053997,
"grad_norm": 0.03870893269777298,
"learning_rate": 1.4840432527547732e-05,
"loss": 10.3364,
"step": 1113
},
{
"epoch": 0.471784012705135,
"grad_norm": 0.04043989256024361,
"learning_rate": 1.4717586637311943e-05,
"loss": 10.3316,
"step": 1114
},
{
"epoch": 0.4722075172048703,
"grad_norm": 0.03024929389357567,
"learning_rate": 1.4595210908291935e-05,
"loss": 10.3364,
"step": 1115
},
{
"epoch": 0.47263102170460564,
"grad_norm": 0.04411826282739639,
"learning_rate": 1.447330601514405e-05,
"loss": 10.3331,
"step": 1116
},
{
"epoch": 0.4730545262043409,
"grad_norm": 0.03368929401040077,
"learning_rate": 1.4351872629928908e-05,
"loss": 10.3323,
"step": 1117
},
{
"epoch": 0.47347803070407624,
"grad_norm": 0.038087401539087296,
"learning_rate": 1.423091142210774e-05,
"loss": 10.3295,
"step": 1118
},
{
"epoch": 0.47390153520381156,
"grad_norm": 0.03507355973124504,
"learning_rate": 1.4110423058538624e-05,
"loss": 10.3273,
"step": 1119
},
{
"epoch": 0.47432503970354684,
"grad_norm": 0.03440206125378609,
"learning_rate": 1.3990408203472938e-05,
"loss": 10.3336,
"step": 1120
},
{
"epoch": 0.47474854420328216,
"grad_norm": 0.03201809525489807,
"learning_rate": 1.387086751855149e-05,
"loss": 10.3323,
"step": 1121
},
{
"epoch": 0.4751720487030175,
"grad_norm": 0.02803219109773636,
"learning_rate": 1.3751801662801056e-05,
"loss": 10.3343,
"step": 1122
},
{
"epoch": 0.47559555320275276,
"grad_norm": 0.03642897307872772,
"learning_rate": 1.3633211292630742e-05,
"loss": 10.3309,
"step": 1123
},
{
"epoch": 0.4760190577024881,
"grad_norm": 0.04547721892595291,
"learning_rate": 1.3515097061828164e-05,
"loss": 10.3248,
"step": 1124
},
{
"epoch": 0.4764425622022234,
"grad_norm": 0.03152972459793091,
"learning_rate": 1.339745962155613e-05,
"loss": 10.3396,
"step": 1125
},
{
"epoch": 0.4768660667019587,
"grad_norm": 0.028171587735414505,
"learning_rate": 1.3280299620348846e-05,
"loss": 10.33,
"step": 1126
},
{
"epoch": 0.477289571201694,
"grad_norm": 0.03410959243774414,
"learning_rate": 1.3163617704108321e-05,
"loss": 10.3344,
"step": 1127
},
{
"epoch": 0.47771307570142935,
"grad_norm": 0.030304502695798874,
"learning_rate": 1.304741451610103e-05,
"loss": 10.3309,
"step": 1128
},
{
"epoch": 0.4781365802011646,
"grad_norm": 0.03257643058896065,
"learning_rate": 1.2931690696954135e-05,
"loss": 10.3346,
"step": 1129
},
{
"epoch": 0.47856008470089995,
"grad_norm": 0.04555933550000191,
"learning_rate": 1.2816446884652066e-05,
"loss": 10.3302,
"step": 1130
},
{
"epoch": 0.4789835892006353,
"grad_norm": 0.0384778194129467,
"learning_rate": 1.2701683714532975e-05,
"loss": 10.3317,
"step": 1131
},
{
"epoch": 0.47940709370037055,
"grad_norm": 0.03637570142745972,
"learning_rate": 1.2587401819285239e-05,
"loss": 10.3295,
"step": 1132
},
{
"epoch": 0.4798305982001059,
"grad_norm": 0.04053565487265587,
"learning_rate": 1.2473601828943949e-05,
"loss": 10.3293,
"step": 1133
},
{
"epoch": 0.4802541026998412,
"grad_norm": 0.042270079255104065,
"learning_rate": 1.236028437088751e-05,
"loss": 10.3271,
"step": 1134
},
{
"epoch": 0.4806776071995765,
"grad_norm": 0.04081670939922333,
"learning_rate": 1.2247450069834076e-05,
"loss": 10.3365,
"step": 1135
},
{
"epoch": 0.4811011116993118,
"grad_norm": 0.03796311840415001,
"learning_rate": 1.2135099547838192e-05,
"loss": 10.333,
"step": 1136
},
{
"epoch": 0.48152461619904713,
"grad_norm": 0.02851458452641964,
"learning_rate": 1.2023233424287328e-05,
"loss": 10.3304,
"step": 1137
},
{
"epoch": 0.4819481206987824,
"grad_norm": 0.03447718173265457,
"learning_rate": 1.1911852315898463e-05,
"loss": 10.3316,
"step": 1138
},
{
"epoch": 0.48237162519851773,
"grad_norm": 0.037812747061252594,
"learning_rate": 1.1800956836714682e-05,
"loss": 10.3288,
"step": 1139
},
{
"epoch": 0.48279512969825306,
"grad_norm": 0.03977108374238014,
"learning_rate": 1.1690547598101864e-05,
"loss": 10.3303,
"step": 1140
},
{
"epoch": 0.48321863419798833,
"grad_norm": 0.031228644773364067,
"learning_rate": 1.1580625208745145e-05,
"loss": 10.3294,
"step": 1141
},
{
"epoch": 0.48364213869772366,
"grad_norm": 0.0270911306142807,
"learning_rate": 1.1471190274645704e-05,
"loss": 10.3322,
"step": 1142
},
{
"epoch": 0.484065643197459,
"grad_norm": 0.03246387094259262,
"learning_rate": 1.1362243399117478e-05,
"loss": 10.3306,
"step": 1143
},
{
"epoch": 0.48448914769719426,
"grad_norm": 0.03161618486046791,
"learning_rate": 1.1253785182783572e-05,
"loss": 10.335,
"step": 1144
},
{
"epoch": 0.4849126521969296,
"grad_norm": 0.03287721052765846,
"learning_rate": 1.1145816223573259e-05,
"loss": 10.3312,
"step": 1145
},
{
"epoch": 0.4853361566966649,
"grad_norm": 0.029835056513547897,
"learning_rate": 1.1038337116718467e-05,
"loss": 10.3309,
"step": 1146
},
{
"epoch": 0.48575966119640024,
"grad_norm": 0.03465202450752258,
"learning_rate": 1.0931348454750601e-05,
"loss": 10.3336,
"step": 1147
},
{
"epoch": 0.4861831656961355,
"grad_norm": 0.03778757527470589,
"learning_rate": 1.0824850827497246e-05,
"loss": 10.3342,
"step": 1148
},
{
"epoch": 0.48660667019587084,
"grad_norm": 0.03788898512721062,
"learning_rate": 1.07188448220789e-05,
"loss": 10.3338,
"step": 1149
},
{
"epoch": 0.48703017469560617,
"grad_norm": 0.03392605856060982,
"learning_rate": 1.061333102290576e-05,
"loss": 10.3314,
"step": 1150
},
{
"epoch": 0.48745367919534144,
"grad_norm": 0.03181210905313492,
"learning_rate": 1.0508310011674516e-05,
"loss": 10.3347,
"step": 1151
},
{
"epoch": 0.48787718369507677,
"grad_norm": 0.03807486966252327,
"learning_rate": 1.0403782367365088e-05,
"loss": 10.3334,
"step": 1152
},
{
"epoch": 0.4883006881948121,
"grad_norm": 0.04221343249082565,
"learning_rate": 1.0299748666237485e-05,
"loss": 10.33,
"step": 1153
},
{
"epoch": 0.48872419269454737,
"grad_norm": 0.03662874549627304,
"learning_rate": 1.0196209481828633e-05,
"loss": 10.3337,
"step": 1154
},
{
"epoch": 0.4891476971942827,
"grad_norm": 0.03761863335967064,
"learning_rate": 1.0093165384949155e-05,
"loss": 10.3363,
"step": 1155
},
{
"epoch": 0.489571201694018,
"grad_norm": 0.03691156208515167,
"learning_rate": 9.990616943680265e-06,
"loss": 10.3355,
"step": 1156
},
{
"epoch": 0.4899947061937533,
"grad_norm": 0.03406470641493797,
"learning_rate": 9.888564723370664e-06,
"loss": 10.3348,
"step": 1157
},
{
"epoch": 0.4904182106934886,
"grad_norm": 0.03452722728252411,
"learning_rate": 9.787009286633363e-06,
"loss": 10.3332,
"step": 1158
},
{
"epoch": 0.49084171519322395,
"grad_norm": 0.03500404581427574,
"learning_rate": 9.685951193342602e-06,
"loss": 10.3328,
"step": 1159
},
{
"epoch": 0.4912652196929592,
"grad_norm": 0.034697335213422775,
"learning_rate": 9.585391000630828e-06,
"loss": 10.3292,
"step": 1160
},
{
"epoch": 0.49168872419269455,
"grad_norm": 0.028287572786211967,
"learning_rate": 9.485329262885457e-06,
"loss": 10.3337,
"step": 1161
},
{
"epoch": 0.4921122286924299,
"grad_norm": 0.0407349169254303,
"learning_rate": 9.385766531746054e-06,
"loss": 10.3314,
"step": 1162
},
{
"epoch": 0.49253573319216515,
"grad_norm": 0.03521955758333206,
"learning_rate": 9.28670335610109e-06,
"loss": 10.3313,
"step": 1163
},
{
"epoch": 0.4929592376919005,
"grad_norm": 0.038377124816179276,
"learning_rate": 9.188140282084967e-06,
"loss": 10.3295,
"step": 1164
},
{
"epoch": 0.4933827421916358,
"grad_norm": 0.037929970771074295,
"learning_rate": 9.090077853075118e-06,
"loss": 10.331,
"step": 1165
},
{
"epoch": 0.4938062466913711,
"grad_norm": 0.03767012432217598,
"learning_rate": 8.992516609688862e-06,
"loss": 10.3305,
"step": 1166
},
{
"epoch": 0.4942297511911064,
"grad_norm": 0.04114054888486862,
"learning_rate": 8.89545708978049e-06,
"loss": 10.3327,
"step": 1167
},
{
"epoch": 0.49465325569084173,
"grad_norm": 0.03139737620949745,
"learning_rate": 8.798899828438333e-06,
"loss": 10.3342,
"step": 1168
},
{
"epoch": 0.495076760190577,
"grad_norm": 0.0350373312830925,
"learning_rate": 8.70284535798168e-06,
"loss": 10.3335,
"step": 1169
},
{
"epoch": 0.49550026469031233,
"grad_norm": 0.03645787015557289,
"learning_rate": 8.607294207958073e-06,
"loss": 10.3285,
"step": 1170
},
{
"epoch": 0.49592376919004766,
"grad_norm": 0.04092005640268326,
"learning_rate": 8.512246905140165e-06,
"loss": 10.332,
"step": 1171
},
{
"epoch": 0.49634727368978293,
"grad_norm": 0.03972132131457329,
"learning_rate": 8.417703973522917e-06,
"loss": 10.3336,
"step": 1172
},
{
"epoch": 0.49677077818951826,
"grad_norm": 0.02949652262032032,
"learning_rate": 8.323665934320713e-06,
"loss": 10.3329,
"step": 1173
},
{
"epoch": 0.4971942826892536,
"grad_norm": 0.04814364016056061,
"learning_rate": 8.23013330596445e-06,
"loss": 10.3317,
"step": 1174
},
{
"epoch": 0.49761778718898886,
"grad_norm": 0.0334940031170845,
"learning_rate": 8.13710660409871e-06,
"loss": 10.3367,
"step": 1175
},
{
"epoch": 0.4980412916887242,
"grad_norm": 0.03809863701462746,
"learning_rate": 8.044586341578886e-06,
"loss": 10.3347,
"step": 1176
},
{
"epoch": 0.4984647961884595,
"grad_norm": 0.03746895492076874,
"learning_rate": 7.952573028468457e-06,
"loss": 10.3362,
"step": 1177
},
{
"epoch": 0.4988883006881948,
"grad_norm": 0.024187074974179268,
"learning_rate": 7.861067172035962e-06,
"loss": 10.3327,
"step": 1178
},
{
"epoch": 0.4993118051879301,
"grad_norm": 0.03394331783056259,
"learning_rate": 7.770069276752422e-06,
"loss": 10.3268,
"step": 1179
},
{
"epoch": 0.49973530968766544,
"grad_norm": 0.0327443964779377,
"learning_rate": 7.679579844288509e-06,
"loss": 10.332,
"step": 1180
},
{
"epoch": 0.5001588141874007,
"grad_norm": 0.027774417772889137,
"learning_rate": 7.589599373511602e-06,
"loss": 10.329,
"step": 1181
},
{
"epoch": 0.5005823186871361,
"grad_norm": 0.03464759886264801,
"learning_rate": 7.500128360483338e-06,
"loss": 10.3334,
"step": 1182
},
{
"epoch": 0.5010058231868714,
"grad_norm": 0.03733719512820244,
"learning_rate": 7.411167298456634e-06,
"loss": 10.3307,
"step": 1183
},
{
"epoch": 0.5014293276866066,
"grad_norm": 0.033785175532102585,
"learning_rate": 7.32271667787302e-06,
"loss": 10.3362,
"step": 1184
},
{
"epoch": 0.501852832186342,
"grad_norm": 0.038209252059459686,
"learning_rate": 7.234776986360059e-06,
"loss": 10.3309,
"step": 1185
},
{
"epoch": 0.5022763366860773,
"grad_norm": 0.03651139885187149,
"learning_rate": 7.147348708728507e-06,
"loss": 10.335,
"step": 1186
},
{
"epoch": 0.5026998411858126,
"grad_norm": 0.03249209746718407,
"learning_rate": 7.060432326969713e-06,
"loss": 10.3326,
"step": 1187
},
{
"epoch": 0.503123345685548,
"grad_norm": 0.049712520092725754,
"learning_rate": 6.974028320252934e-06,
"loss": 10.3269,
"step": 1188
},
{
"epoch": 0.5035468501852832,
"grad_norm": 0.03345096856355667,
"learning_rate": 6.888137164922725e-06,
"loss": 10.3273,
"step": 1189
},
{
"epoch": 0.5039703546850185,
"grad_norm": 0.028842521831393242,
"learning_rate": 6.802759334496289e-06,
"loss": 10.3299,
"step": 1190
},
{
"epoch": 0.5043938591847539,
"grad_norm": 0.02980581857264042,
"learning_rate": 6.717895299660892e-06,
"loss": 10.3337,
"step": 1191
},
{
"epoch": 0.5048173636844892,
"grad_norm": 0.032008688896894455,
"learning_rate": 6.633545528271212e-06,
"loss": 10.3275,
"step": 1192
},
{
"epoch": 0.5052408681842244,
"grad_norm": 0.03007701225578785,
"learning_rate": 6.549710485346827e-06,
"loss": 10.3319,
"step": 1193
},
{
"epoch": 0.5056643726839598,
"grad_norm": 0.03393697366118431,
"learning_rate": 6.466390633069608e-06,
"loss": 10.3292,
"step": 1194
},
{
"epoch": 0.5060878771836951,
"grad_norm": 0.04486103355884552,
"learning_rate": 6.383586430781197e-06,
"loss": 10.3289,
"step": 1195
},
{
"epoch": 0.5065113816834304,
"grad_norm": 0.03052888996899128,
"learning_rate": 6.301298334980421e-06,
"loss": 10.3374,
"step": 1196
},
{
"epoch": 0.5069348861831657,
"grad_norm": 0.030694812536239624,
"learning_rate": 6.219526799320919e-06,
"loss": 10.3308,
"step": 1197
},
{
"epoch": 0.507358390682901,
"grad_norm": 0.03446760028600693,
"learning_rate": 6.138272274608403e-06,
"loss": 10.3346,
"step": 1198
},
{
"epoch": 0.5077818951826363,
"grad_norm": 0.033587660640478134,
"learning_rate": 6.057535208798371e-06,
"loss": 10.3337,
"step": 1199
},
{
"epoch": 0.5082053996823717,
"grad_norm": 0.03484556823968887,
"learning_rate": 5.977316046993642e-06,
"loss": 10.3311,
"step": 1200
},
{
"epoch": 0.5086289041821069,
"grad_norm": 0.03142661601305008,
"learning_rate": 5.897615231441689e-06,
"loss": 10.3335,
"step": 1201
},
{
"epoch": 0.5090524086818422,
"grad_norm": 0.03492956608533859,
"learning_rate": 5.81843320153248e-06,
"loss": 10.3298,
"step": 1202
},
{
"epoch": 0.5094759131815776,
"grad_norm": 0.035875819623470306,
"learning_rate": 5.739770393795851e-06,
"loss": 10.3339,
"step": 1203
},
{
"epoch": 0.5098994176813129,
"grad_norm": 0.028575167059898376,
"learning_rate": 5.6616272418991926e-06,
"loss": 10.3306,
"step": 1204
},
{
"epoch": 0.5103229221810481,
"grad_norm": 0.034280769526958466,
"learning_rate": 5.584004176645052e-06,
"loss": 10.3339,
"step": 1205
},
{
"epoch": 0.5107464266807835,
"grad_norm": 0.03369034081697464,
"learning_rate": 5.5069016259686635e-06,
"loss": 10.3293,
"step": 1206
},
{
"epoch": 0.5111699311805188,
"grad_norm": 0.03932506591081619,
"learning_rate": 5.430320014935797e-06,
"loss": 10.3339,
"step": 1207
},
{
"epoch": 0.5115934356802541,
"grad_norm": 0.04464678466320038,
"learning_rate": 5.354259765740177e-06,
"loss": 10.3316,
"step": 1208
},
{
"epoch": 0.5120169401799894,
"grad_norm": 0.033909354358911514,
"learning_rate": 5.278721297701339e-06,
"loss": 10.3317,
"step": 1209
},
{
"epoch": 0.5124404446797247,
"grad_norm": 0.02771197073161602,
"learning_rate": 5.203705027262184e-06,
"loss": 10.3337,
"step": 1210
},
{
"epoch": 0.51286394917946,
"grad_norm": 0.03711957111954689,
"learning_rate": 5.129211367986786e-06,
"loss": 10.3374,
"step": 1211
},
{
"epoch": 0.5132874536791954,
"grad_norm": 0.04035378247499466,
"learning_rate": 5.055240730558042e-06,
"loss": 10.3278,
"step": 1212
},
{
"epoch": 0.5137109581789306,
"grad_norm": 0.037376079708337784,
"learning_rate": 4.981793522775457e-06,
"loss": 10.3354,
"step": 1213
},
{
"epoch": 0.5141344626786659,
"grad_norm": 0.033283621072769165,
"learning_rate": 4.908870149552835e-06,
"loss": 10.3304,
"step": 1214
},
{
"epoch": 0.5145579671784013,
"grad_norm": 0.04279647022485733,
"learning_rate": 4.836471012916144e-06,
"loss": 10.3317,
"step": 1215
},
{
"epoch": 0.5149814716781366,
"grad_norm": 0.026392200961709023,
"learning_rate": 4.764596512001162e-06,
"loss": 10.3338,
"step": 1216
},
{
"epoch": 0.5154049761778718,
"grad_norm": 0.038188233971595764,
"learning_rate": 4.693247043051441e-06,
"loss": 10.3363,
"step": 1217
},
{
"epoch": 0.5158284806776072,
"grad_norm": 0.03593307361006737,
"learning_rate": 4.622422999415965e-06,
"loss": 10.3302,
"step": 1218
},
{
"epoch": 0.5162519851773425,
"grad_norm": 0.03967192396521568,
"learning_rate": 4.5521247715470945e-06,
"loss": 10.33,
"step": 1219
},
{
"epoch": 0.5166754896770778,
"grad_norm": 0.0491623692214489,
"learning_rate": 4.482352746998364e-06,
"loss": 10.3386,
"step": 1220
},
{
"epoch": 0.5170989941768132,
"grad_norm": 0.0371236614882946,
"learning_rate": 4.413107310422326e-06,
"loss": 10.3336,
"step": 1221
},
{
"epoch": 0.5175224986765484,
"grad_norm": 0.027762679383158684,
"learning_rate": 4.344388843568503e-06,
"loss": 10.3282,
"step": 1222
},
{
"epoch": 0.5179460031762837,
"grad_norm": 0.03931552171707153,
"learning_rate": 4.2761977252811945e-06,
"loss": 10.3331,
"step": 1223
},
{
"epoch": 0.5183695076760191,
"grad_norm": 0.047121018171310425,
"learning_rate": 4.2085343314974715e-06,
"loss": 10.3297,
"step": 1224
},
{
"epoch": 0.5187930121757544,
"grad_norm": 0.042633168399333954,
"learning_rate": 4.141399035245052e-06,
"loss": 10.3337,
"step": 1225
},
{
"epoch": 0.5192165166754896,
"grad_norm": 0.03988894075155258,
"learning_rate": 4.07479220664021e-06,
"loss": 10.3262,
"step": 1226
},
{
"epoch": 0.519640021175225,
"grad_norm": 0.030842246487736702,
"learning_rate": 4.008714212885856e-06,
"loss": 10.3322,
"step": 1227
},
{
"epoch": 0.5200635256749603,
"grad_norm": 0.04261520504951477,
"learning_rate": 3.943165418269401e-06,
"loss": 10.328,
"step": 1228
},
{
"epoch": 0.5204870301746956,
"grad_norm": 0.030063187703490257,
"learning_rate": 3.87814618416078e-06,
"loss": 10.3345,
"step": 1229
},
{
"epoch": 0.5209105346744309,
"grad_norm": 0.030118783935904503,
"learning_rate": 3.8136568690104957e-06,
"loss": 10.3325,
"step": 1230
},
{
"epoch": 0.5213340391741662,
"grad_norm": 0.03795788437128067,
"learning_rate": 3.7496978283475648e-06,
"loss": 10.3327,
"step": 1231
},
{
"epoch": 0.5217575436739015,
"grad_norm": 0.036961231380701065,
"learning_rate": 3.686269414777643e-06,
"loss": 10.3344,
"step": 1232
},
{
"epoch": 0.5221810481736369,
"grad_norm": 0.0403430350124836,
"learning_rate": 3.623371977981027e-06,
"loss": 10.3324,
"step": 1233
},
{
"epoch": 0.5226045526733721,
"grad_norm": 0.03135257214307785,
"learning_rate": 3.5610058647107538e-06,
"loss": 10.3319,
"step": 1234
},
{
"epoch": 0.5230280571731075,
"grad_norm": 0.0364365391433239,
"learning_rate": 3.499171418790681e-06,
"loss": 10.3343,
"step": 1235
},
{
"epoch": 0.5234515616728428,
"grad_norm": 0.025732390582561493,
"learning_rate": 3.437868981113557e-06,
"loss": 10.3338,
"step": 1236
},
{
"epoch": 0.5238750661725781,
"grad_norm": 0.03495744988322258,
"learning_rate": 3.37709888963923e-06,
"loss": 10.3302,
"step": 1237
},
{
"epoch": 0.5242985706723134,
"grad_norm": 0.032097022980451584,
"learning_rate": 3.3168614793926524e-06,
"loss": 10.3356,
"step": 1238
},
{
"epoch": 0.5247220751720487,
"grad_norm": 0.029357150197029114,
"learning_rate": 3.2571570824621923e-06,
"loss": 10.3304,
"step": 1239
},
{
"epoch": 0.525145579671784,
"grad_norm": 0.03179454430937767,
"learning_rate": 3.197986027997657e-06,
"loss": 10.3311,
"step": 1240
},
{
"epoch": 0.5255690841715194,
"grad_norm": 0.038864728063344955,
"learning_rate": 3.1393486422085618e-06,
"loss": 10.3308,
"step": 1241
},
{
"epoch": 0.5259925886712546,
"grad_norm": 0.027193231508135796,
"learning_rate": 3.08124524836233e-06,
"loss": 10.3314,
"step": 1242
},
{
"epoch": 0.5264160931709899,
"grad_norm": 0.035837847739458084,
"learning_rate": 3.023676166782452e-06,
"loss": 10.3327,
"step": 1243
},
{
"epoch": 0.5268395976707253,
"grad_norm": 0.02682778798043728,
"learning_rate": 2.9666417148468072e-06,
"loss": 10.3325,
"step": 1244
},
{
"epoch": 0.5272631021704606,
"grad_norm": 0.04898487776517868,
"learning_rate": 2.910142206985833e-06,
"loss": 10.3317,
"step": 1245
},
{
"epoch": 0.5276866066701958,
"grad_norm": 0.030211864039301872,
"learning_rate": 2.8541779546808256e-06,
"loss": 10.3292,
"step": 1246
},
{
"epoch": 0.5281101111699312,
"grad_norm": 0.03472064808011055,
"learning_rate": 2.7987492664622307e-06,
"loss": 10.3324,
"step": 1247
},
{
"epoch": 0.5285336156696665,
"grad_norm": 0.03139955550432205,
"learning_rate": 2.743856447907944e-06,
"loss": 10.3309,
"step": 1248
},
{
"epoch": 0.5289571201694018,
"grad_norm": 0.02904195711016655,
"learning_rate": 2.689499801641593e-06,
"loss": 10.332,
"step": 1249
},
{
"epoch": 0.5293806246691372,
"grad_norm": 0.045261383056640625,
"learning_rate": 2.6356796273309116e-06,
"loss": 10.33,
"step": 1250
},
{
"epoch": 0.5298041291688724,
"grad_norm": 0.03183293342590332,
"learning_rate": 2.5823962216860562e-06,
"loss": 10.3297,
"step": 1251
},
{
"epoch": 0.5302276336686077,
"grad_norm": 0.04214952513575554,
"learning_rate": 2.5296498784579846e-06,
"loss": 10.3309,
"step": 1252
},
{
"epoch": 0.5306511381683431,
"grad_norm": 0.03488962724804878,
"learning_rate": 2.4774408884368215e-06,
"loss": 10.3333,
"step": 1253
},
{
"epoch": 0.5310746426680784,
"grad_norm": 0.03279737010598183,
"learning_rate": 2.4257695394503287e-06,
"loss": 10.3278,
"step": 1254
},
{
"epoch": 0.5314981471678136,
"grad_norm": 0.03219415992498398,
"learning_rate": 2.374636116362172e-06,
"loss": 10.3334,
"step": 1255
},
{
"epoch": 0.531921651667549,
"grad_norm": 0.05066683888435364,
"learning_rate": 2.32404090107049e-06,
"loss": 10.3306,
"step": 1256
},
{
"epoch": 0.5323451561672843,
"grad_norm": 0.028979485854506493,
"learning_rate": 2.2739841725062715e-06,
"loss": 10.3319,
"step": 1257
},
{
"epoch": 0.5327686606670196,
"grad_norm": 0.03191670775413513,
"learning_rate": 2.2244662066318146e-06,
"loss": 10.333,
"step": 1258
},
{
"epoch": 0.5331921651667549,
"grad_norm": 0.04911280795931816,
"learning_rate": 2.1754872764392698e-06,
"loss": 10.3313,
"step": 1259
},
{
"epoch": 0.5336156696664902,
"grad_norm": 0.039490871131420135,
"learning_rate": 2.1270476519490435e-06,
"loss": 10.3244,
"step": 1260
},
{
"epoch": 0.5340391741662255,
"grad_norm": 0.03646280616521835,
"learning_rate": 2.079147600208364e-06,
"loss": 10.3303,
"step": 1261
},
{
"epoch": 0.5344626786659609,
"grad_norm": 0.039123885333538055,
"learning_rate": 2.0317873852898518e-06,
"loss": 10.332,
"step": 1262
},
{
"epoch": 0.5348861831656961,
"grad_norm": 0.04183242470026016,
"learning_rate": 1.9849672682898944e-06,
"loss": 10.3297,
"step": 1263
},
{
"epoch": 0.5353096876654314,
"grad_norm": 0.03520303592085838,
"learning_rate": 1.9386875073274636e-06,
"loss": 10.3265,
"step": 1264
},
{
"epoch": 0.5357331921651668,
"grad_norm": 0.0325089730322361,
"learning_rate": 1.8929483575424455e-06,
"loss": 10.3345,
"step": 1265
},
{
"epoch": 0.5361566966649021,
"grad_norm": 0.029976682737469673,
"learning_rate": 1.8477500710944007e-06,
"loss": 10.3292,
"step": 1266
},
{
"epoch": 0.5365802011646373,
"grad_norm": 0.034131329506635666,
"learning_rate": 1.803092897161096e-06,
"loss": 10.3276,
"step": 1267
},
{
"epoch": 0.5370037056643727,
"grad_norm": 0.03793232887983322,
"learning_rate": 1.75897708193713e-06,
"loss": 10.3349,
"step": 1268
},
{
"epoch": 0.537427210164108,
"grad_norm": 0.025969160720705986,
"learning_rate": 1.715402868632643e-06,
"loss": 10.3325,
"step": 1269
},
{
"epoch": 0.5378507146638433,
"grad_norm": 0.04372668266296387,
"learning_rate": 1.6723704974718756e-06,
"loss": 10.33,
"step": 1270
},
{
"epoch": 0.5382742191635786,
"grad_norm": 0.03358982875943184,
"learning_rate": 1.629880205691936e-06,
"loss": 10.3321,
"step": 1271
},
{
"epoch": 0.5386977236633139,
"grad_norm": 0.045495398342609406,
"learning_rate": 1.5879322275414332e-06,
"loss": 10.3334,
"step": 1272
},
{
"epoch": 0.5391212281630492,
"grad_norm": 0.02813423052430153,
"learning_rate": 1.5465267942792127e-06,
"loss": 10.332,
"step": 1273
},
{
"epoch": 0.5395447326627846,
"grad_norm": 0.02770121954381466,
"learning_rate": 1.5056641341730903e-06,
"loss": 10.3296,
"step": 1274
},
{
"epoch": 0.5399682371625198,
"grad_norm": 0.04436861723661423,
"learning_rate": 1.465344472498531e-06,
"loss": 10.3286,
"step": 1275
},
{
"epoch": 0.5403917416622551,
"grad_norm": 0.043747782707214355,
"learning_rate": 1.4255680315375164e-06,
"loss": 10.3332,
"step": 1276
},
{
"epoch": 0.5408152461619905,
"grad_norm": 0.028111323714256287,
"learning_rate": 1.3863350305772017e-06,
"loss": 10.3319,
"step": 1277
},
{
"epoch": 0.5412387506617258,
"grad_norm": 0.03884616121649742,
"learning_rate": 1.3476456859087828e-06,
"loss": 10.3317,
"step": 1278
},
{
"epoch": 0.541662255161461,
"grad_norm": 0.04214450716972351,
"learning_rate": 1.3095002108263199e-06,
"loss": 10.3336,
"step": 1279
},
{
"epoch": 0.5420857596611964,
"grad_norm": 0.0312722884118557,
"learning_rate": 1.2718988156254607e-06,
"loss": 10.3357,
"step": 1280
},
{
"epoch": 0.5425092641609317,
"grad_norm": 0.09322332590818405,
"learning_rate": 1.2348417076023745e-06,
"loss": 10.3333,
"step": 1281
},
{
"epoch": 0.542932768660667,
"grad_norm": 0.04540476202964783,
"learning_rate": 1.198329091052608e-06,
"loss": 10.3309,
"step": 1282
},
{
"epoch": 0.5433562731604024,
"grad_norm": 0.029997704550623894,
"learning_rate": 1.1623611672698765e-06,
"loss": 10.3358,
"step": 1283
},
{
"epoch": 0.5437797776601376,
"grad_norm": 0.0350346714258194,
"learning_rate": 1.1269381345450526e-06,
"loss": 10.3306,
"step": 1284
},
{
"epoch": 0.5442032821598729,
"grad_norm": 0.04271746799349785,
"learning_rate": 1.0920601881650006e-06,
"loss": 10.3313,
"step": 1285
},
{
"epoch": 0.5446267866596083,
"grad_norm": 0.03767610713839531,
"learning_rate": 1.0577275204115444e-06,
"loss": 10.3275,
"step": 1286
},
{
"epoch": 0.5450502911593436,
"grad_norm": 0.02964678965508938,
"learning_rate": 1.0239403205604014e-06,
"loss": 10.3296,
"step": 1287
},
{
"epoch": 0.5454737956590788,
"grad_norm": 0.03278511017560959,
"learning_rate": 9.906987748800944e-07,
"loss": 10.3329,
"step": 1288
},
{
"epoch": 0.5458973001588142,
"grad_norm": 0.05790937691926956,
"learning_rate": 9.580030666309969e-07,
"loss": 10.3372,
"step": 1289
},
{
"epoch": 0.5463208046585495,
"grad_norm": 0.03746120631694794,
"learning_rate": 9.258533760642563e-07,
"loss": 10.3302,
"step": 1290
},
{
"epoch": 0.5467443091582848,
"grad_norm": 0.03203713148832321,
"learning_rate": 8.942498804208498e-07,
"loss": 10.3328,
"step": 1291
},
{
"epoch": 0.5471678136580201,
"grad_norm": 0.032408781349658966,
"learning_rate": 8.631927539305862e-07,
"loss": 10.3328,
"step": 1292
},
{
"epoch": 0.5475913181577554,
"grad_norm": 0.038404081016778946,
"learning_rate": 8.326821678111163e-07,
"loss": 10.3357,
"step": 1293
},
{
"epoch": 0.5480148226574907,
"grad_norm": 0.03704221174120903,
"learning_rate": 8.027182902670571e-07,
"loss": 10.3267,
"step": 1294
},
{
"epoch": 0.5484383271572261,
"grad_norm": 0.02777581661939621,
"learning_rate": 7.733012864890032e-07,
"loss": 10.3331,
"step": 1295
},
{
"epoch": 0.5488618316569613,
"grad_norm": 0.0339139886200428,
"learning_rate": 7.444313186526608e-07,
"loss": 10.3355,
"step": 1296
},
{
"epoch": 0.5492853361566966,
"grad_norm": 0.027996981516480446,
"learning_rate": 7.161085459178929e-07,
"loss": 10.3301,
"step": 1297
},
{
"epoch": 0.549708840656432,
"grad_norm": 0.04270913451910019,
"learning_rate": 6.88333124427909e-07,
"loss": 10.3269,
"step": 1298
},
{
"epoch": 0.5501323451561673,
"grad_norm": 0.0351426862180233,
"learning_rate": 6.611052073083768e-07,
"loss": 10.3306,
"step": 1299
},
{
"epoch": 0.5505558496559025,
"grad_norm": 0.0378975048661232,
"learning_rate": 6.344249446665674e-07,
"loss": 10.3283,
"step": 1300
},
{
"epoch": 0.5509793541556379,
"grad_norm": 0.028754916042089462,
"learning_rate": 6.082924835905446e-07,
"loss": 10.3287,
"step": 1301
},
{
"epoch": 0.5514028586553732,
"grad_norm": 0.0465865433216095,
"learning_rate": 5.827079681483438e-07,
"loss": 10.3325,
"step": 1302
},
{
"epoch": 0.5518263631551085,
"grad_norm": 0.037231337279081345,
"learning_rate": 5.576715393871613e-07,
"loss": 10.3278,
"step": 1303
},
{
"epoch": 0.5522498676548439,
"grad_norm": 0.03710845485329628,
"learning_rate": 5.331833353326432e-07,
"loss": 10.3344,
"step": 1304
},
{
"epoch": 0.5526733721545791,
"grad_norm": 0.02809790149331093,
"learning_rate": 5.092434909880317e-07,
"loss": 10.3321,
"step": 1305
},
{
"epoch": 0.5530968766543145,
"grad_norm": 0.045991264283657074,
"learning_rate": 4.858521383334868e-07,
"loss": 10.3345,
"step": 1306
},
{
"epoch": 0.5535203811540498,
"grad_norm": 0.03640573099255562,
"learning_rate": 4.630094063253321e-07,
"loss": 10.3294,
"step": 1307
},
{
"epoch": 0.553943885653785,
"grad_norm": 0.029001332819461823,
"learning_rate": 4.4071542089535454e-07,
"loss": 10.3318,
"step": 1308
},
{
"epoch": 0.5543673901535204,
"grad_norm": 0.02934233844280243,
"learning_rate": 4.18970304950117e-07,
"loss": 10.3299,
"step": 1309
},
{
"epoch": 0.5547908946532557,
"grad_norm": 0.03224503621459007,
"learning_rate": 3.977741783702471e-07,
"loss": 10.3285,
"step": 1310
},
{
"epoch": 0.555214399152991,
"grad_norm": 0.03147895634174347,
"learning_rate": 3.771271580098157e-07,
"loss": 10.3325,
"step": 1311
},
{
"epoch": 0.5556379036527264,
"grad_norm": 0.03843318298459053,
"learning_rate": 3.570293576956596e-07,
"loss": 10.3301,
"step": 1312
},
{
"epoch": 0.5560614081524616,
"grad_norm": 0.0349433533847332,
"learning_rate": 3.3748088822679325e-07,
"loss": 10.332,
"step": 1313
},
{
"epoch": 0.5564849126521969,
"grad_norm": 0.03259619325399399,
"learning_rate": 3.184818573737425e-07,
"loss": 10.3296,
"step": 1314
},
{
"epoch": 0.5569084171519323,
"grad_norm": 0.03497344255447388,
"learning_rate": 3.0003236987802274e-07,
"loss": 10.3314,
"step": 1315
},
{
"epoch": 0.5573319216516676,
"grad_norm": 0.03283681720495224,
"learning_rate": 2.821325274514952e-07,
"loss": 10.3307,
"step": 1316
},
{
"epoch": 0.5577554261514028,
"grad_norm": 0.03914149850606918,
"learning_rate": 2.6478242877583383e-07,
"loss": 10.3321,
"step": 1317
},
{
"epoch": 0.5581789306511382,
"grad_norm": 0.028979448601603508,
"learning_rate": 2.4798216950198127e-07,
"loss": 10.3295,
"step": 1318
},
{
"epoch": 0.5586024351508735,
"grad_norm": 0.0339006632566452,
"learning_rate": 2.317318422496273e-07,
"loss": 10.3326,
"step": 1319
},
{
"epoch": 0.5590259396506088,
"grad_norm": 0.027926115319132805,
"learning_rate": 2.1603153660668674e-07,
"loss": 10.3305,
"step": 1320
},
{
"epoch": 0.5594494441503441,
"grad_norm": 0.031478822231292725,
"learning_rate": 2.0088133912881113e-07,
"loss": 10.3288,
"step": 1321
},
{
"epoch": 0.5598729486500794,
"grad_norm": 0.03274491801857948,
"learning_rate": 1.862813333389113e-07,
"loss": 10.3361,
"step": 1322
},
{
"epoch": 0.5602964531498147,
"grad_norm": 0.0399165078997612,
"learning_rate": 1.722315997267021e-07,
"loss": 10.3344,
"step": 1323
},
{
"epoch": 0.5607199576495501,
"grad_norm": 0.030695218592882156,
"learning_rate": 1.5873221574822516e-07,
"loss": 10.3298,
"step": 1324
},
{
"epoch": 0.5611434621492853,
"grad_norm": 0.03967565670609474,
"learning_rate": 1.4578325582548237e-07,
"loss": 10.3305,
"step": 1325
},
{
"epoch": 0.5615669666490206,
"grad_norm": 0.03664049133658409,
"learning_rate": 1.3338479134596958e-07,
"loss": 10.3293,
"step": 1326
},
{
"epoch": 0.561990471148756,
"grad_norm": 0.03802071511745453,
"learning_rate": 1.2153689066233266e-07,
"loss": 10.3305,
"step": 1327
},
{
"epoch": 0.5624139756484913,
"grad_norm": 0.036713242530822754,
"learning_rate": 1.1023961909192304e-07,
"loss": 10.3287,
"step": 1328
},
{
"epoch": 0.5628374801482265,
"grad_norm": 0.04824815317988396,
"learning_rate": 9.949303891653161e-08,
"loss": 10.3353,
"step": 1329
},
{
"epoch": 0.5632609846479619,
"grad_norm": 0.03399055823683739,
"learning_rate": 8.929720938193331e-08,
"loss": 10.3302,
"step": 1330
},
{
"epoch": 0.5636844891476972,
"grad_norm": 0.030519891530275345,
"learning_rate": 7.965218669766516e-08,
"loss": 10.3277,
"step": 1331
},
{
"epoch": 0.5641079936474325,
"grad_norm": 0.03647278994321823,
"learning_rate": 7.05580240366488e-08,
"loss": 10.3276,
"step": 1332
},
{
"epoch": 0.5645314981471679,
"grad_norm": 0.0370662622153759,
"learning_rate": 6.201477153493506e-08,
"loss": 10.3344,
"step": 1333
},
{
"epoch": 0.5649550026469031,
"grad_norm": 0.038933202624320984,
"learning_rate": 5.402247629139323e-08,
"loss": 10.3313,
"step": 1334
},
{
"epoch": 0.5653785071466384,
"grad_norm": 0.030461156740784645,
"learning_rate": 4.658118236747777e-08,
"loss": 10.3292,
"step": 1335
},
{
"epoch": 0.5658020116463738,
"grad_norm": 0.030602607876062393,
"learning_rate": 3.9690930786995264e-08,
"loss": 10.3294,
"step": 1336
},
{
"epoch": 0.566225516146109,
"grad_norm": 0.03394312039017677,
"learning_rate": 3.335175953581571e-08,
"loss": 10.3342,
"step": 1337
},
{
"epoch": 0.5666490206458443,
"grad_norm": 0.051167815923690796,
"learning_rate": 2.756370356175042e-08,
"loss": 10.3349,
"step": 1338
},
{
"epoch": 0.5670725251455797,
"grad_norm": 0.03260042518377304,
"learning_rate": 2.232679477430777e-08,
"loss": 10.3333,
"step": 1339
},
{
"epoch": 0.567496029645315,
"grad_norm": 0.040678899735212326,
"learning_rate": 1.7641062044515544e-08,
"loss": 10.3287,
"step": 1340
},
{
"epoch": 0.5679195341450503,
"grad_norm": 0.04332433268427849,
"learning_rate": 1.350653120477663e-08,
"loss": 10.3336,
"step": 1341
},
{
"epoch": 0.5683430386447856,
"grad_norm": 0.03431249037384987,
"learning_rate": 9.923225048724671e-09,
"loss": 10.3331,
"step": 1342
},
{
"epoch": 0.5687665431445209,
"grad_norm": 0.0347750224173069,
"learning_rate": 6.891163331101957e-09,
"loss": 10.3338,
"step": 1343
},
{
"epoch": 0.5691900476442562,
"grad_norm": 0.031236495822668076,
"learning_rate": 4.410362767626186e-09,
"loss": 10.3311,
"step": 1344
},
{
"epoch": 0.5696135521439916,
"grad_norm": 0.036892782896757126,
"learning_rate": 2.4808370349460596e-09,
"loss": 10.332,
"step": 1345
},
{
"epoch": 0.5700370566437268,
"grad_norm": 0.02656089887022972,
"learning_rate": 1.1025967705080576e-09,
"loss": 10.3323,
"step": 1346
},
{
"epoch": 0.5704605611434621,
"grad_norm": 0.03345981240272522,
"learning_rate": 2.756495725342312e-10,
"loss": 10.3286,
"step": 1347
},
{
"epoch": 0.5708840656431975,
"grad_norm": 0.03345588967204094,
"learning_rate": 0.0,
"loss": 10.3287,
"step": 1348
},
{
"epoch": 0.5708840656431975,
"eval_loss": 10.330697059631348,
"eval_runtime": 3.473,
"eval_samples_per_second": 286.494,
"eval_steps_per_second": 143.391,
"step": 1348
}
],
"logging_steps": 1,
"max_steps": 1348,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 337,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 28945837916160.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}