{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992505620784412,
"eval_steps": 500,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007994004496627529,
"grad_norm": 29.08027928947176,
"learning_rate": 0.0,
"loss": 1.7209,
"step": 1
},
{
"epoch": 0.0015988008993255058,
"grad_norm": 9.836200747540412,
"learning_rate": 2.7023815442731975e-06,
"loss": 1.2157,
"step": 2
},
{
"epoch": 0.002398201348988259,
"grad_norm": 8.732062138142359,
"learning_rate": 4.2831734103139475e-06,
"loss": 1.2213,
"step": 3
},
{
"epoch": 0.0031976017986510116,
"grad_norm": 8.98196608627301,
"learning_rate": 5.404763088546395e-06,
"loss": 1.3207,
"step": 4
},
{
"epoch": 0.003997002248313765,
"grad_norm": 3.104558237084713,
"learning_rate": 6.274735630753034e-06,
"loss": 1.2009,
"step": 5
},
{
"epoch": 0.004796402697976518,
"grad_norm": 2.9678718492236587,
"learning_rate": 6.985554954587145e-06,
"loss": 1.1976,
"step": 6
},
{
"epoch": 0.0055958031476392705,
"grad_norm": 2.324032539210556,
"learning_rate": 7.586544129592991e-06,
"loss": 1.1668,
"step": 7
},
{
"epoch": 0.006395203597302023,
"grad_norm": 2.422145845478249,
"learning_rate": 8.107144632819592e-06,
"loss": 1.1056,
"step": 8
},
{
"epoch": 0.007194604046964776,
"grad_norm": 2.7795213648793236,
"learning_rate": 8.566346820627895e-06,
"loss": 1.1439,
"step": 9
},
{
"epoch": 0.00799400449662753,
"grad_norm": 2.304173813168448,
"learning_rate": 8.977117175026234e-06,
"loss": 1.0859,
"step": 10
},
{
"epoch": 0.008793404946290282,
"grad_norm": 2.531444418518243,
"learning_rate": 9.348704159880588e-06,
"loss": 1.1012,
"step": 11
},
{
"epoch": 0.009592805395953035,
"grad_norm": 2.623744403178605,
"learning_rate": 9.687936498860343e-06,
"loss": 1.1248,
"step": 12
},
{
"epoch": 0.010392205845615787,
"grad_norm": 2.174204408077499,
"learning_rate": 1e-05,
"loss": 1.0862,
"step": 13
},
{
"epoch": 0.011191606295278541,
"grad_norm": 2.1375382895043553,
"learning_rate": 1e-05,
"loss": 1.0843,
"step": 14
},
{
"epoch": 0.011991006744941295,
"grad_norm": 2.3409573740941245,
"learning_rate": 1e-05,
"loss": 1.1007,
"step": 15
},
{
"epoch": 0.012790407194604047,
"grad_norm": 2.2321265748114443,
"learning_rate": 1e-05,
"loss": 1.0199,
"step": 16
},
{
"epoch": 0.0135898076442668,
"grad_norm": 2.2607491323391997,
"learning_rate": 1e-05,
"loss": 1.1098,
"step": 17
},
{
"epoch": 0.014389208093929552,
"grad_norm": 2.1345387966971328,
"learning_rate": 1e-05,
"loss": 1.0852,
"step": 18
},
{
"epoch": 0.015188608543592306,
"grad_norm": 2.0836111411515224,
"learning_rate": 1e-05,
"loss": 1.0227,
"step": 19
},
{
"epoch": 0.01598800899325506,
"grad_norm": 2.1200221376043826,
"learning_rate": 1e-05,
"loss": 1.0764,
"step": 20
},
{
"epoch": 0.016787409442917813,
"grad_norm": 2.3277973958562947,
"learning_rate": 1e-05,
"loss": 1.0425,
"step": 21
},
{
"epoch": 0.017586809892580563,
"grad_norm": 2.4310258538885523,
"learning_rate": 1e-05,
"loss": 1.0437,
"step": 22
},
{
"epoch": 0.018386210342243317,
"grad_norm": 2.317560454038046,
"learning_rate": 1e-05,
"loss": 1.0027,
"step": 23
},
{
"epoch": 0.01918561079190607,
"grad_norm": 2.1153613214468923,
"learning_rate": 1e-05,
"loss": 1.0878,
"step": 24
},
{
"epoch": 0.019985011241568824,
"grad_norm": 2.1138684148369884,
"learning_rate": 1e-05,
"loss": 1.0797,
"step": 25
},
{
"epoch": 0.020784411691231575,
"grad_norm": 2.3869844261967765,
"learning_rate": 1e-05,
"loss": 1.1126,
"step": 26
},
{
"epoch": 0.02158381214089433,
"grad_norm": 1.9441687206265474,
"learning_rate": 1e-05,
"loss": 1.0356,
"step": 27
},
{
"epoch": 0.022383212590557082,
"grad_norm": 1.8858684427680283,
"learning_rate": 1e-05,
"loss": 1.0112,
"step": 28
},
{
"epoch": 0.023182613040219836,
"grad_norm": 2.0111908392780924,
"learning_rate": 1e-05,
"loss": 1.025,
"step": 29
},
{
"epoch": 0.02398201348988259,
"grad_norm": 2.3223850597645885,
"learning_rate": 1e-05,
"loss": 1.0608,
"step": 30
},
{
"epoch": 0.02478141393954534,
"grad_norm": 2.282704095464692,
"learning_rate": 1e-05,
"loss": 0.9884,
"step": 31
},
{
"epoch": 0.025580814389208093,
"grad_norm": 2.2485551406767392,
"learning_rate": 1e-05,
"loss": 1.1609,
"step": 32
},
{
"epoch": 0.026380214838870847,
"grad_norm": 1.9632420284716974,
"learning_rate": 1e-05,
"loss": 1.0541,
"step": 33
},
{
"epoch": 0.0271796152885336,
"grad_norm": 2.7873694225738963,
"learning_rate": 1e-05,
"loss": 0.9917,
"step": 34
},
{
"epoch": 0.027979015738196354,
"grad_norm": 2.048096411620949,
"learning_rate": 1e-05,
"loss": 1.012,
"step": 35
},
{
"epoch": 0.028778416187859104,
"grad_norm": 2.0309944076384494,
"learning_rate": 1e-05,
"loss": 1.0212,
"step": 36
},
{
"epoch": 0.029577816637521858,
"grad_norm": 2.0949849865314643,
"learning_rate": 1e-05,
"loss": 1.0659,
"step": 37
},
{
"epoch": 0.03037721708718461,
"grad_norm": 2.059202087957289,
"learning_rate": 1e-05,
"loss": 1.0168,
"step": 38
},
{
"epoch": 0.031176617536847365,
"grad_norm": 2.0975700429920923,
"learning_rate": 1e-05,
"loss": 1.0216,
"step": 39
},
{
"epoch": 0.03197601798651012,
"grad_norm": 2.0062452254349714,
"learning_rate": 1e-05,
"loss": 1.0274,
"step": 40
},
{
"epoch": 0.03277541843617287,
"grad_norm": 2.222854538118324,
"learning_rate": 1e-05,
"loss": 1.0656,
"step": 41
},
{
"epoch": 0.033574818885835626,
"grad_norm": 1.943599598185592,
"learning_rate": 1e-05,
"loss": 1.0782,
"step": 42
},
{
"epoch": 0.03437421933549838,
"grad_norm": 1.9956218218997503,
"learning_rate": 1e-05,
"loss": 1.0625,
"step": 43
},
{
"epoch": 0.03517361978516113,
"grad_norm": 2.1611198939392096,
"learning_rate": 1e-05,
"loss": 1.041,
"step": 44
},
{
"epoch": 0.035973020234823884,
"grad_norm": 1.9975085093102276,
"learning_rate": 1e-05,
"loss": 1.0046,
"step": 45
},
{
"epoch": 0.036772420684486634,
"grad_norm": 1.8691307201375191,
"learning_rate": 1e-05,
"loss": 1.0243,
"step": 46
},
{
"epoch": 0.03757182113414939,
"grad_norm": 2.1275630339366667,
"learning_rate": 1e-05,
"loss": 1.0565,
"step": 47
},
{
"epoch": 0.03837122158381214,
"grad_norm": 1.998529171481795,
"learning_rate": 1e-05,
"loss": 0.972,
"step": 48
},
{
"epoch": 0.03917062203347489,
"grad_norm": 2.039027660741352,
"learning_rate": 1e-05,
"loss": 0.9604,
"step": 49
},
{
"epoch": 0.03997002248313765,
"grad_norm": 1.8761207165317535,
"learning_rate": 1e-05,
"loss": 0.9985,
"step": 50
},
{
"epoch": 0.0407694229328004,
"grad_norm": 2.089454409239614,
"learning_rate": 1e-05,
"loss": 0.9963,
"step": 51
},
{
"epoch": 0.04156882338246315,
"grad_norm": 2.0445251187040134,
"learning_rate": 1e-05,
"loss": 1.0192,
"step": 52
},
{
"epoch": 0.042368223832125906,
"grad_norm": 2.205588684592072,
"learning_rate": 1e-05,
"loss": 0.9684,
"step": 53
},
{
"epoch": 0.04316762428178866,
"grad_norm": 2.0208537418585957,
"learning_rate": 1e-05,
"loss": 1.0063,
"step": 54
},
{
"epoch": 0.043967024731451414,
"grad_norm": 1.7869034029258606,
"learning_rate": 1e-05,
"loss": 1.0368,
"step": 55
},
{
"epoch": 0.044766425181114164,
"grad_norm": 1.8924926601293262,
"learning_rate": 1e-05,
"loss": 1.011,
"step": 56
},
{
"epoch": 0.045565825630776914,
"grad_norm": 2.151723728750191,
"learning_rate": 1e-05,
"loss": 1.0275,
"step": 57
},
{
"epoch": 0.04636522608043967,
"grad_norm": 2.388300807396013,
"learning_rate": 1e-05,
"loss": 0.996,
"step": 58
},
{
"epoch": 0.04716462653010242,
"grad_norm": 1.9793946104980729,
"learning_rate": 1e-05,
"loss": 1.028,
"step": 59
},
{
"epoch": 0.04796402697976518,
"grad_norm": 2.050014939910642,
"learning_rate": 1e-05,
"loss": 1.0109,
"step": 60
},
{
"epoch": 0.04876342742942793,
"grad_norm": 1.8842986029616882,
"learning_rate": 1e-05,
"loss": 0.9752,
"step": 61
},
{
"epoch": 0.04956282787909068,
"grad_norm": 1.7444876770795246,
"learning_rate": 1e-05,
"loss": 1.0228,
"step": 62
},
{
"epoch": 0.050362228328753436,
"grad_norm": 1.8304676501403103,
"learning_rate": 1e-05,
"loss": 0.9747,
"step": 63
},
{
"epoch": 0.051161628778416186,
"grad_norm": 2.1540039062270164,
"learning_rate": 1e-05,
"loss": 0.9955,
"step": 64
},
{
"epoch": 0.051961029228078943,
"grad_norm": 1.6953401550549316,
"learning_rate": 1e-05,
"loss": 0.9811,
"step": 65
},
{
"epoch": 0.052760429677741694,
"grad_norm": 2.1460856566454987,
"learning_rate": 1e-05,
"loss": 1.0365,
"step": 66
},
{
"epoch": 0.053559830127404444,
"grad_norm": 1.7390283863943892,
"learning_rate": 1e-05,
"loss": 1.0277,
"step": 67
},
{
"epoch": 0.0543592305770672,
"grad_norm": 2.0836221978397442,
"learning_rate": 1e-05,
"loss": 0.9953,
"step": 68
},
{
"epoch": 0.05515863102672995,
"grad_norm": 1.7905448109320714,
"learning_rate": 1e-05,
"loss": 0.9944,
"step": 69
},
{
"epoch": 0.05595803147639271,
"grad_norm": 1.9504348528444273,
"learning_rate": 1e-05,
"loss": 0.9808,
"step": 70
},
{
"epoch": 0.05675743192605546,
"grad_norm": 1.834972840275589,
"learning_rate": 1e-05,
"loss": 0.9992,
"step": 71
},
{
"epoch": 0.05755683237571821,
"grad_norm": 1.845072042104488,
"learning_rate": 1e-05,
"loss": 0.9811,
"step": 72
},
{
"epoch": 0.058356232825380966,
"grad_norm": 1.85534014854077,
"learning_rate": 1e-05,
"loss": 0.9864,
"step": 73
},
{
"epoch": 0.059155633275043716,
"grad_norm": 1.8650405189842276,
"learning_rate": 1e-05,
"loss": 0.9925,
"step": 74
},
{
"epoch": 0.05995503372470647,
"grad_norm": 1.862410414010068,
"learning_rate": 1e-05,
"loss": 1.0991,
"step": 75
},
{
"epoch": 0.06075443417436922,
"grad_norm": 2.1389193269284625,
"learning_rate": 1e-05,
"loss": 1.0228,
"step": 76
},
{
"epoch": 0.061553834624031974,
"grad_norm": 1.7408061970131428,
"learning_rate": 1e-05,
"loss": 1.0034,
"step": 77
},
{
"epoch": 0.06235323507369473,
"grad_norm": 2.0783333855212653,
"learning_rate": 1e-05,
"loss": 1.0015,
"step": 78
},
{
"epoch": 0.06315263552335748,
"grad_norm": 2.1794919181439507,
"learning_rate": 1e-05,
"loss": 1.0184,
"step": 79
},
{
"epoch": 0.06395203597302024,
"grad_norm": 1.8799556566280435,
"learning_rate": 1e-05,
"loss": 0.9807,
"step": 80
},
{
"epoch": 0.06475143642268298,
"grad_norm": 1.6068132265611528,
"learning_rate": 1e-05,
"loss": 1.0318,
"step": 81
},
{
"epoch": 0.06555083687234574,
"grad_norm": 1.8404529509039422,
"learning_rate": 1e-05,
"loss": 0.9634,
"step": 82
},
{
"epoch": 0.0663502373220085,
"grad_norm": 1.8490571137069702,
"learning_rate": 1e-05,
"loss": 0.9362,
"step": 83
},
{
"epoch": 0.06714963777167125,
"grad_norm": 2.1048586741337485,
"learning_rate": 1e-05,
"loss": 1.051,
"step": 84
},
{
"epoch": 0.067949038221334,
"grad_norm": 1.9361395487099815,
"learning_rate": 1e-05,
"loss": 0.9884,
"step": 85
},
{
"epoch": 0.06874843867099675,
"grad_norm": 1.882438664110377,
"learning_rate": 1e-05,
"loss": 1.0338,
"step": 86
},
{
"epoch": 0.06954783912065951,
"grad_norm": 1.9328301399003285,
"learning_rate": 1e-05,
"loss": 1.0123,
"step": 87
},
{
"epoch": 0.07034723957032225,
"grad_norm": 1.9592492051372121,
"learning_rate": 1e-05,
"loss": 1.015,
"step": 88
},
{
"epoch": 0.07114664001998501,
"grad_norm": 2.0637394818205035,
"learning_rate": 1e-05,
"loss": 1.0074,
"step": 89
},
{
"epoch": 0.07194604046964777,
"grad_norm": 1.875788422779308,
"learning_rate": 1e-05,
"loss": 0.966,
"step": 90
},
{
"epoch": 0.07274544091931051,
"grad_norm": 1.8409070357840667,
"learning_rate": 1e-05,
"loss": 1.0463,
"step": 91
},
{
"epoch": 0.07354484136897327,
"grad_norm": 1.9103779504623786,
"learning_rate": 1e-05,
"loss": 0.9633,
"step": 92
},
{
"epoch": 0.07434424181863603,
"grad_norm": 2.0590523934839307,
"learning_rate": 1e-05,
"loss": 1.0215,
"step": 93
},
{
"epoch": 0.07514364226829878,
"grad_norm": 2.104785750263468,
"learning_rate": 1e-05,
"loss": 1.0025,
"step": 94
},
{
"epoch": 0.07594304271796153,
"grad_norm": 2.1695447340449663,
"learning_rate": 1e-05,
"loss": 0.941,
"step": 95
},
{
"epoch": 0.07674244316762428,
"grad_norm": 2.0465650220094203,
"learning_rate": 1e-05,
"loss": 1.0093,
"step": 96
},
{
"epoch": 0.07754184361728704,
"grad_norm": 1.8941011997406154,
"learning_rate": 1e-05,
"loss": 1.064,
"step": 97
},
{
"epoch": 0.07834124406694978,
"grad_norm": 1.9987845140787637,
"learning_rate": 1e-05,
"loss": 0.9793,
"step": 98
},
{
"epoch": 0.07914064451661254,
"grad_norm": 1.8233385113626337,
"learning_rate": 1e-05,
"loss": 1.0176,
"step": 99
},
{
"epoch": 0.0799400449662753,
"grad_norm": 1.8162210777833079,
"learning_rate": 1e-05,
"loss": 0.9699,
"step": 100
},
{
"epoch": 0.08073944541593804,
"grad_norm": 1.8711808189743682,
"learning_rate": 1e-05,
"loss": 0.9865,
"step": 101
},
{
"epoch": 0.0815388458656008,
"grad_norm": 1.974561488916405,
"learning_rate": 1e-05,
"loss": 0.9806,
"step": 102
},
{
"epoch": 0.08233824631526356,
"grad_norm": 1.7095584582820083,
"learning_rate": 1e-05,
"loss": 0.9955,
"step": 103
},
{
"epoch": 0.0831376467649263,
"grad_norm": 1.8952139824297942,
"learning_rate": 1e-05,
"loss": 0.9338,
"step": 104
},
{
"epoch": 0.08393704721458906,
"grad_norm": 1.8058804845050307,
"learning_rate": 1e-05,
"loss": 1.0062,
"step": 105
},
{
"epoch": 0.08473644766425181,
"grad_norm": 1.8103680215448428,
"learning_rate": 1e-05,
"loss": 0.9872,
"step": 106
},
{
"epoch": 0.08553584811391457,
"grad_norm": 1.694736368233996,
"learning_rate": 1e-05,
"loss": 0.9359,
"step": 107
},
{
"epoch": 0.08633524856357731,
"grad_norm": 1.9235533583641018,
"learning_rate": 1e-05,
"loss": 1.0611,
"step": 108
},
{
"epoch": 0.08713464901324007,
"grad_norm": 1.619066977691127,
"learning_rate": 1e-05,
"loss": 0.9654,
"step": 109
},
{
"epoch": 0.08793404946290283,
"grad_norm": 1.8050888311534128,
"learning_rate": 1e-05,
"loss": 1.004,
"step": 110
},
{
"epoch": 0.08873344991256557,
"grad_norm": 1.9960924269335547,
"learning_rate": 1e-05,
"loss": 1.0118,
"step": 111
},
{
"epoch": 0.08953285036222833,
"grad_norm": 1.9286201089638149,
"learning_rate": 1e-05,
"loss": 1.0025,
"step": 112
},
{
"epoch": 0.09033225081189108,
"grad_norm": 2.1725480586787396,
"learning_rate": 1e-05,
"loss": 0.9558,
"step": 113
},
{
"epoch": 0.09113165126155383,
"grad_norm": 1.857962422635593,
"learning_rate": 1e-05,
"loss": 0.9772,
"step": 114
},
{
"epoch": 0.09193105171121659,
"grad_norm": 1.9166723424153935,
"learning_rate": 1e-05,
"loss": 0.9749,
"step": 115
},
{
"epoch": 0.09273045216087934,
"grad_norm": 2.0124769392114854,
"learning_rate": 1e-05,
"loss": 0.9548,
"step": 116
},
{
"epoch": 0.0935298526105421,
"grad_norm": 1.847426445728428,
"learning_rate": 1e-05,
"loss": 0.941,
"step": 117
},
{
"epoch": 0.09432925306020484,
"grad_norm": 2.163992947673654,
"learning_rate": 1e-05,
"loss": 0.9617,
"step": 118
},
{
"epoch": 0.0951286535098676,
"grad_norm": 1.8889979598709639,
"learning_rate": 1e-05,
"loss": 1.0272,
"step": 119
},
{
"epoch": 0.09592805395953036,
"grad_norm": 1.844634955046446,
"learning_rate": 1e-05,
"loss": 0.9669,
"step": 120
},
{
"epoch": 0.0967274544091931,
"grad_norm": 1.9301903181704618,
"learning_rate": 1e-05,
"loss": 0.9717,
"step": 121
},
{
"epoch": 0.09752685485885586,
"grad_norm": 1.9564195723979845,
"learning_rate": 1e-05,
"loss": 0.9527,
"step": 122
},
{
"epoch": 0.09832625530851861,
"grad_norm": 1.834090339470851,
"learning_rate": 1e-05,
"loss": 0.9794,
"step": 123
},
{
"epoch": 0.09912565575818136,
"grad_norm": 1.7936104151665677,
"learning_rate": 1e-05,
"loss": 0.9042,
"step": 124
},
{
"epoch": 0.09992505620784412,
"grad_norm": 1.7969263674080669,
"learning_rate": 1e-05,
"loss": 1.0397,
"step": 125
},
{
"epoch": 0.10072445665750687,
"grad_norm": 1.7901986458192694,
"learning_rate": 1e-05,
"loss": 1.0043,
"step": 126
},
{
"epoch": 0.10152385710716963,
"grad_norm": 1.8947234640723079,
"learning_rate": 1e-05,
"loss": 0.9761,
"step": 127
},
{
"epoch": 0.10232325755683237,
"grad_norm": 1.8487696622255145,
"learning_rate": 1e-05,
"loss": 0.9899,
"step": 128
},
{
"epoch": 0.10312265800649513,
"grad_norm": 1.8207862729527453,
"learning_rate": 1e-05,
"loss": 1.0272,
"step": 129
},
{
"epoch": 0.10392205845615789,
"grad_norm": 1.9816716753688939,
"learning_rate": 1e-05,
"loss": 0.9202,
"step": 130
},
{
"epoch": 0.10472145890582063,
"grad_norm": 1.8916365109275264,
"learning_rate": 1e-05,
"loss": 0.9629,
"step": 131
},
{
"epoch": 0.10552085935548339,
"grad_norm": 1.9863329832931071,
"learning_rate": 1e-05,
"loss": 0.9976,
"step": 132
},
{
"epoch": 0.10632025980514614,
"grad_norm": 1.9194816317308832,
"learning_rate": 1e-05,
"loss": 1.0043,
"step": 133
},
{
"epoch": 0.10711966025480889,
"grad_norm": 1.9537595846189237,
"learning_rate": 1e-05,
"loss": 0.9453,
"step": 134
},
{
"epoch": 0.10791906070447164,
"grad_norm": 2.0669579990783253,
"learning_rate": 1e-05,
"loss": 0.9865,
"step": 135
},
{
"epoch": 0.1087184611541344,
"grad_norm": 1.9760934706997628,
"learning_rate": 1e-05,
"loss": 1.017,
"step": 136
},
{
"epoch": 0.10951786160379715,
"grad_norm": 1.7260389446366302,
"learning_rate": 1e-05,
"loss": 0.963,
"step": 137
},
{
"epoch": 0.1103172620534599,
"grad_norm": 1.9203242105800193,
"learning_rate": 1e-05,
"loss": 1.0157,
"step": 138
},
{
"epoch": 0.11111666250312266,
"grad_norm": 1.9850822013474325,
"learning_rate": 1e-05,
"loss": 0.9438,
"step": 139
},
{
"epoch": 0.11191606295278542,
"grad_norm": 1.9572946605976695,
"learning_rate": 1e-05,
"loss": 1.0029,
"step": 140
},
{
"epoch": 0.11271546340244816,
"grad_norm": 1.5451741731912971,
"learning_rate": 1e-05,
"loss": 0.9225,
"step": 141
},
{
"epoch": 0.11351486385211092,
"grad_norm": 2.0070450938810707,
"learning_rate": 1e-05,
"loss": 0.922,
"step": 142
},
{
"epoch": 0.11431426430177367,
"grad_norm": 1.89832125508894,
"learning_rate": 1e-05,
"loss": 1.0401,
"step": 143
},
{
"epoch": 0.11511366475143642,
"grad_norm": 1.950327724703524,
"learning_rate": 1e-05,
"loss": 0.9279,
"step": 144
},
{
"epoch": 0.11591306520109917,
"grad_norm": 1.9700609199158468,
"learning_rate": 1e-05,
"loss": 0.9864,
"step": 145
},
{
"epoch": 0.11671246565076193,
"grad_norm": 1.6727783834574599,
"learning_rate": 1e-05,
"loss": 0.9659,
"step": 146
},
{
"epoch": 0.11751186610042468,
"grad_norm": 1.8484918243414765,
"learning_rate": 1e-05,
"loss": 0.9761,
"step": 147
},
{
"epoch": 0.11831126655008743,
"grad_norm": 2.045306713844051,
"learning_rate": 1e-05,
"loss": 0.9788,
"step": 148
},
{
"epoch": 0.11911066699975019,
"grad_norm": 1.8558407244018518,
"learning_rate": 1e-05,
"loss": 0.963,
"step": 149
},
{
"epoch": 0.11991006744941295,
"grad_norm": 1.777504348074839,
"learning_rate": 1e-05,
"loss": 0.9898,
"step": 150
},
{
"epoch": 0.12070946789907569,
"grad_norm": 1.7945306209083864,
"learning_rate": 1e-05,
"loss": 0.9475,
"step": 151
},
{
"epoch": 0.12150886834873845,
"grad_norm": 1.612635014991482,
"learning_rate": 1e-05,
"loss": 0.981,
"step": 152
},
{
"epoch": 0.1223082687984012,
"grad_norm": 1.5365653630331435,
"learning_rate": 1e-05,
"loss": 0.9336,
"step": 153
},
{
"epoch": 0.12310766924806395,
"grad_norm": 1.7728163669560009,
"learning_rate": 1e-05,
"loss": 0.9786,
"step": 154
},
{
"epoch": 0.1239070696977267,
"grad_norm": 1.6363907272750682,
"learning_rate": 1e-05,
"loss": 0.9499,
"step": 155
},
{
"epoch": 0.12470647014738946,
"grad_norm": 1.8927548789352038,
"learning_rate": 1e-05,
"loss": 0.9537,
"step": 156
},
{
"epoch": 0.1255058705970522,
"grad_norm": 1.576438438411652,
"learning_rate": 1e-05,
"loss": 0.9273,
"step": 157
},
{
"epoch": 0.12630527104671496,
"grad_norm": 1.8750460465870347,
"learning_rate": 1e-05,
"loss": 0.9687,
"step": 158
},
{
"epoch": 0.12710467149637772,
"grad_norm": 1.712737472716492,
"learning_rate": 1e-05,
"loss": 0.9981,
"step": 159
},
{
"epoch": 0.12790407194604048,
"grad_norm": 1.8944147808763965,
"learning_rate": 1e-05,
"loss": 1.0316,
"step": 160
},
{
"epoch": 0.12870347239570323,
"grad_norm": 1.6975154876149214,
"learning_rate": 1e-05,
"loss": 0.9921,
"step": 161
},
{
"epoch": 0.12950287284536596,
"grad_norm": 1.7330196261933866,
"learning_rate": 1e-05,
"loss": 0.9567,
"step": 162
},
{
"epoch": 0.13030227329502872,
"grad_norm": 2.004904627709956,
"learning_rate": 1e-05,
"loss": 0.9788,
"step": 163
},
{
"epoch": 0.13110167374469148,
"grad_norm": 1.7565329263507932,
"learning_rate": 1e-05,
"loss": 0.9461,
"step": 164
},
{
"epoch": 0.13190107419435423,
"grad_norm": 1.6976314021380359,
"learning_rate": 1e-05,
"loss": 0.9926,
"step": 165
},
{
"epoch": 0.132700474644017,
"grad_norm": 1.573182719519626,
"learning_rate": 1e-05,
"loss": 0.982,
"step": 166
},
{
"epoch": 0.13349987509367975,
"grad_norm": 1.5753994405016738,
"learning_rate": 1e-05,
"loss": 0.9745,
"step": 167
},
{
"epoch": 0.1342992755433425,
"grad_norm": 1.9199549441489088,
"learning_rate": 1e-05,
"loss": 0.9916,
"step": 168
},
{
"epoch": 0.13509867599300523,
"grad_norm": 1.7662832212098252,
"learning_rate": 1e-05,
"loss": 0.9717,
"step": 169
},
{
"epoch": 0.135898076442668,
"grad_norm": 2.1972236756007506,
"learning_rate": 1e-05,
"loss": 0.9923,
"step": 170
},
{
"epoch": 0.13669747689233075,
"grad_norm": 1.5845907178152914,
"learning_rate": 1e-05,
"loss": 1.041,
"step": 171
},
{
"epoch": 0.1374968773419935,
"grad_norm": 1.9027156433363486,
"learning_rate": 1e-05,
"loss": 0.9986,
"step": 172
},
{
"epoch": 0.13829627779165626,
"grad_norm": 1.938028025396952,
"learning_rate": 1e-05,
"loss": 0.9856,
"step": 173
},
{
"epoch": 0.13909567824131902,
"grad_norm": 1.7615271251517497,
"learning_rate": 1e-05,
"loss": 0.9879,
"step": 174
},
{
"epoch": 0.13989507869098175,
"grad_norm": 1.5753792433296703,
"learning_rate": 1e-05,
"loss": 0.9952,
"step": 175
},
{
"epoch": 0.1406944791406445,
"grad_norm": 1.8071610796834736,
"learning_rate": 1e-05,
"loss": 0.9403,
"step": 176
},
{
"epoch": 0.14149387959030726,
"grad_norm": 1.8188146399425127,
"learning_rate": 1e-05,
"loss": 0.9166,
"step": 177
},
{
"epoch": 0.14229328003997002,
"grad_norm": 1.8998134327288991,
"learning_rate": 1e-05,
"loss": 0.9307,
"step": 178
},
{
"epoch": 0.14309268048963278,
"grad_norm": 1.8148916923977343,
"learning_rate": 1e-05,
"loss": 0.964,
"step": 179
},
{
"epoch": 0.14389208093929554,
"grad_norm": 1.8025702262604992,
"learning_rate": 1e-05,
"loss": 0.9636,
"step": 180
},
{
"epoch": 0.1446914813889583,
"grad_norm": 1.8970561152549208,
"learning_rate": 1e-05,
"loss": 0.9446,
"step": 181
},
{
"epoch": 0.14549088183862102,
"grad_norm": 1.774281514717804,
"learning_rate": 1e-05,
"loss": 0.9011,
"step": 182
},
{
"epoch": 0.14629028228828378,
"grad_norm": 1.6697484592667877,
"learning_rate": 1e-05,
"loss": 0.9732,
"step": 183
},
{
"epoch": 0.14708968273794654,
"grad_norm": 1.748314198924899,
"learning_rate": 1e-05,
"loss": 0.9294,
"step": 184
},
{
"epoch": 0.1478890831876093,
"grad_norm": 1.5552333328333348,
"learning_rate": 1e-05,
"loss": 0.9207,
"step": 185
},
{
"epoch": 0.14868848363727205,
"grad_norm": 1.819375156478493,
"learning_rate": 1e-05,
"loss": 0.9667,
"step": 186
},
{
"epoch": 0.1494878840869348,
"grad_norm": 1.5853289567427034,
"learning_rate": 1e-05,
"loss": 0.9863,
"step": 187
},
{
"epoch": 0.15028728453659757,
"grad_norm": 1.7338233390104778,
"learning_rate": 1e-05,
"loss": 0.9088,
"step": 188
},
{
"epoch": 0.1510866849862603,
"grad_norm": 1.8735214816693204,
"learning_rate": 1e-05,
"loss": 0.9931,
"step": 189
},
{
"epoch": 0.15188608543592305,
"grad_norm": 1.70836070926444,
"learning_rate": 1e-05,
"loss": 0.9774,
"step": 190
},
{
"epoch": 0.1526854858855858,
"grad_norm": 1.68457840558557,
"learning_rate": 1e-05,
"loss": 0.9971,
"step": 191
},
{
"epoch": 0.15348488633524857,
"grad_norm": 1.9974046657795066,
"learning_rate": 1e-05,
"loss": 1.0525,
"step": 192
},
{
"epoch": 0.15428428678491132,
"grad_norm": 1.8637088407144724,
"learning_rate": 1e-05,
"loss": 0.9458,
"step": 193
},
{
"epoch": 0.15508368723457408,
"grad_norm": 1.5472617342282928,
"learning_rate": 1e-05,
"loss": 0.9321,
"step": 194
},
{
"epoch": 0.1558830876842368,
"grad_norm": 2.0278392859284224,
"learning_rate": 1e-05,
"loss": 0.9376,
"step": 195
},
{
"epoch": 0.15668248813389957,
"grad_norm": 1.8610095483452973,
"learning_rate": 1e-05,
"loss": 0.9921,
"step": 196
},
{
"epoch": 0.15748188858356232,
"grad_norm": 2.0375178580916016,
"learning_rate": 1e-05,
"loss": 0.9985,
"step": 197
},
{
"epoch": 0.15828128903322508,
"grad_norm": 1.8219362402276909,
"learning_rate": 1e-05,
"loss": 0.924,
"step": 198
},
{
"epoch": 0.15908068948288784,
"grad_norm": 1.4629250708658383,
"learning_rate": 1e-05,
"loss": 1.0201,
"step": 199
},
{
"epoch": 0.1598800899325506,
"grad_norm": 1.5628287370754461,
"learning_rate": 1e-05,
"loss": 1.0002,
"step": 200
},
{
"epoch": 0.16067949038221335,
"grad_norm": 1.8442311252983388,
"learning_rate": 1e-05,
"loss": 0.937,
"step": 201
},
{
"epoch": 0.16147889083187608,
"grad_norm": 7.441197607810174,
"learning_rate": 1e-05,
"loss": 0.8768,
"step": 202
},
{
"epoch": 0.16227829128153884,
"grad_norm": 1.7947899683379576,
"learning_rate": 1e-05,
"loss": 0.9524,
"step": 203
},
{
"epoch": 0.1630776917312016,
"grad_norm": 1.656507654529954,
"learning_rate": 1e-05,
"loss": 0.8953,
"step": 204
},
{
"epoch": 0.16387709218086435,
"grad_norm": 1.7462816982128921,
"learning_rate": 1e-05,
"loss": 0.9435,
"step": 205
},
{
"epoch": 0.1646764926305271,
"grad_norm": 1.7013940298273953,
"learning_rate": 1e-05,
"loss": 0.9124,
"step": 206
},
{
"epoch": 0.16547589308018987,
"grad_norm": 1.6379746843984113,
"learning_rate": 1e-05,
"loss": 0.9508,
"step": 207
},
{
"epoch": 0.1662752935298526,
"grad_norm": 1.9314822402660798,
"learning_rate": 1e-05,
"loss": 1.0272,
"step": 208
},
{
"epoch": 0.16707469397951535,
"grad_norm": 1.9961308842740637,
"learning_rate": 1e-05,
"loss": 0.9841,
"step": 209
},
{
"epoch": 0.1678740944291781,
"grad_norm": 2.0382234178726537,
"learning_rate": 1e-05,
"loss": 0.9785,
"step": 210
},
{
"epoch": 0.16867349487884087,
"grad_norm": 1.6901064034464468,
"learning_rate": 1e-05,
"loss": 0.9127,
"step": 211
},
{
"epoch": 0.16947289532850363,
"grad_norm": 1.7273747898471865,
"learning_rate": 1e-05,
"loss": 0.9583,
"step": 212
},
{
"epoch": 0.17027229577816638,
"grad_norm": 1.7457470216603739,
"learning_rate": 1e-05,
"loss": 0.9799,
"step": 213
},
{
"epoch": 0.17107169622782914,
"grad_norm": 1.7313522722535573,
"learning_rate": 1e-05,
"loss": 0.9489,
"step": 214
},
{
"epoch": 0.17187109667749187,
"grad_norm": 1.7762615948567715,
"learning_rate": 1e-05,
"loss": 0.9328,
"step": 215
},
{
"epoch": 0.17267049712715463,
"grad_norm": 1.6331422537410691,
"learning_rate": 1e-05,
"loss": 0.9446,
"step": 216
},
{
"epoch": 0.17346989757681738,
"grad_norm": 1.6778510604121997,
"learning_rate": 1e-05,
"loss": 0.9547,
"step": 217
},
{
"epoch": 0.17426929802648014,
"grad_norm": 1.9041470899144908,
"learning_rate": 1e-05,
"loss": 0.9014,
"step": 218
},
{
"epoch": 0.1750686984761429,
"grad_norm": 1.8662662755793453,
"learning_rate": 1e-05,
"loss": 0.9709,
"step": 219
},
{
"epoch": 0.17586809892580565,
"grad_norm": 1.7045357754568997,
"learning_rate": 1e-05,
"loss": 0.9433,
"step": 220
},
{
"epoch": 0.1766674993754684,
"grad_norm": 1.74409106945116,
"learning_rate": 1e-05,
"loss": 0.9153,
"step": 221
},
{
"epoch": 0.17746689982513114,
"grad_norm": 1.8132234884702887,
"learning_rate": 1e-05,
"loss": 0.8909,
"step": 222
},
{
"epoch": 0.1782663002747939,
"grad_norm": 1.6971296927642,
"learning_rate": 1e-05,
"loss": 0.9622,
"step": 223
},
{
"epoch": 0.17906570072445666,
"grad_norm": 1.781912471031092,
"learning_rate": 1e-05,
"loss": 0.954,
"step": 224
},
{
"epoch": 0.1798651011741194,
"grad_norm": 1.6629867774088771,
"learning_rate": 1e-05,
"loss": 0.96,
"step": 225
},
{
"epoch": 0.18066450162378217,
"grad_norm": 2.0699033115205614,
"learning_rate": 1e-05,
"loss": 0.9284,
"step": 226
},
{
"epoch": 0.18146390207344493,
"grad_norm": 1.7235146329911442,
"learning_rate": 1e-05,
"loss": 0.9456,
"step": 227
},
{
"epoch": 0.18226330252310766,
"grad_norm": 1.7961113577108625,
"learning_rate": 1e-05,
"loss": 0.9454,
"step": 228
},
{
"epoch": 0.1830627029727704,
"grad_norm": 1.6808904917909453,
"learning_rate": 1e-05,
"loss": 0.9524,
"step": 229
},
{
"epoch": 0.18386210342243317,
"grad_norm": 1.5865303307652885,
"learning_rate": 1e-05,
"loss": 0.9863,
"step": 230
},
{
"epoch": 0.18466150387209593,
"grad_norm": 1.6521878212504149,
"learning_rate": 1e-05,
"loss": 0.946,
"step": 231
},
{
"epoch": 0.18546090432175869,
"grad_norm": 1.5619375597824243,
"learning_rate": 1e-05,
"loss": 1.0141,
"step": 232
},
{
"epoch": 0.18626030477142144,
"grad_norm": 1.9668596679027701,
"learning_rate": 1e-05,
"loss": 0.9783,
"step": 233
},
{
"epoch": 0.1870597052210842,
"grad_norm": 1.7004515677555856,
"learning_rate": 1e-05,
"loss": 0.939,
"step": 234
},
{
"epoch": 0.18785910567074693,
"grad_norm": 1.8505586367786393,
"learning_rate": 1e-05,
"loss": 1.0186,
"step": 235
},
{
"epoch": 0.18865850612040969,
"grad_norm": 1.8794093279833084,
"learning_rate": 1e-05,
"loss": 0.9748,
"step": 236
},
{
"epoch": 0.18945790657007244,
"grad_norm": 1.970577363084186,
"learning_rate": 1e-05,
"loss": 0.9734,
"step": 237
},
{
"epoch": 0.1902573070197352,
"grad_norm": 1.9827162568725265,
"learning_rate": 1e-05,
"loss": 0.9526,
"step": 238
},
{
"epoch": 0.19105670746939796,
"grad_norm": 1.6777105787009272,
"learning_rate": 1e-05,
"loss": 1.0038,
"step": 239
},
{
"epoch": 0.19185610791906071,
"grad_norm": 1.8547665670552458,
"learning_rate": 1e-05,
"loss": 0.9425,
"step": 240
},
{
"epoch": 0.19265550836872344,
"grad_norm": 1.5739853104069792,
"learning_rate": 1e-05,
"loss": 0.9898,
"step": 241
},
{
"epoch": 0.1934549088183862,
"grad_norm": 1.7991544252885405,
"learning_rate": 1e-05,
"loss": 0.9068,
"step": 242
},
{
"epoch": 0.19425430926804896,
"grad_norm": 1.7278046505750493,
"learning_rate": 1e-05,
"loss": 0.9961,
"step": 243
},
{
"epoch": 0.19505370971771172,
"grad_norm": 1.6738018924260079,
"learning_rate": 1e-05,
"loss": 0.9269,
"step": 244
},
{
"epoch": 0.19585311016737447,
"grad_norm": 1.704113739011135,
"learning_rate": 1e-05,
"loss": 0.9384,
"step": 245
},
{
"epoch": 0.19665251061703723,
"grad_norm": 1.953642878567139,
"learning_rate": 1e-05,
"loss": 0.9003,
"step": 246
},
{
"epoch": 0.1974519110667,
"grad_norm": 1.8994714525376621,
"learning_rate": 1e-05,
"loss": 0.9384,
"step": 247
},
{
"epoch": 0.19825131151636272,
"grad_norm": 1.7335277476681896,
"learning_rate": 1e-05,
"loss": 0.9164,
"step": 248
},
{
"epoch": 0.19905071196602547,
"grad_norm": 1.8114996960442162,
"learning_rate": 1e-05,
"loss": 0.909,
"step": 249
},
{
"epoch": 0.19985011241568823,
"grad_norm": 1.8399064962789757,
"learning_rate": 1e-05,
"loss": 0.9672,
"step": 250
},
{
"epoch": 0.200649512865351,
"grad_norm": 1.8027482426913095,
"learning_rate": 1e-05,
"loss": 0.9294,
"step": 251
},
{
"epoch": 0.20144891331501374,
"grad_norm": 1.7914653808525045,
"learning_rate": 1e-05,
"loss": 0.9709,
"step": 252
},
{
"epoch": 0.2022483137646765,
"grad_norm": 1.8562700822437381,
"learning_rate": 1e-05,
"loss": 0.918,
"step": 253
},
{
"epoch": 0.20304771421433926,
"grad_norm": 1.592298158180451,
"learning_rate": 1e-05,
"loss": 0.9874,
"step": 254
},
{
"epoch": 0.203847114664002,
"grad_norm": 1.7885472103550304,
"learning_rate": 1e-05,
"loss": 0.9579,
"step": 255
},
{
"epoch": 0.20464651511366475,
"grad_norm": 1.8835318053165766,
"learning_rate": 1e-05,
"loss": 0.97,
"step": 256
},
{
"epoch": 0.2054459155633275,
"grad_norm": 2.2973670794805865,
"learning_rate": 1e-05,
"loss": 1.0196,
"step": 257
},
{
"epoch": 0.20624531601299026,
"grad_norm": 2.059759101560068,
"learning_rate": 1e-05,
"loss": 0.9051,
"step": 258
},
{
"epoch": 0.20704471646265302,
"grad_norm": 1.6379487643230517,
"learning_rate": 1e-05,
"loss": 0.9853,
"step": 259
},
{
"epoch": 0.20784411691231577,
"grad_norm": 1.7739932086505867,
"learning_rate": 1e-05,
"loss": 0.9365,
"step": 260
},
{
"epoch": 0.2086435173619785,
"grad_norm": 1.9378628413327441,
"learning_rate": 1e-05,
"loss": 0.9248,
"step": 261
},
{
"epoch": 0.20944291781164126,
"grad_norm": 1.8631208677480777,
"learning_rate": 1e-05,
"loss": 0.9417,
"step": 262
},
{
"epoch": 0.21024231826130402,
"grad_norm": 1.73049947808822,
"learning_rate": 1e-05,
"loss": 0.9039,
"step": 263
},
{
"epoch": 0.21104171871096677,
"grad_norm": 1.6873959381280914,
"learning_rate": 1e-05,
"loss": 0.945,
"step": 264
},
{
"epoch": 0.21184111916062953,
"grad_norm": 1.5105067176725349,
"learning_rate": 1e-05,
"loss": 0.9446,
"step": 265
},
{
"epoch": 0.2126405196102923,
"grad_norm": 1.8337058320691813,
"learning_rate": 1e-05,
"loss": 0.9582,
"step": 266
},
{
"epoch": 0.21343992005995505,
"grad_norm": 1.644955596385126,
"learning_rate": 1e-05,
"loss": 0.9055,
"step": 267
},
{
"epoch": 0.21423932050961778,
"grad_norm": 2.0248942495461435,
"learning_rate": 1e-05,
"loss": 1.0207,
"step": 268
},
{
"epoch": 0.21503872095928053,
"grad_norm": 1.746437687084402,
"learning_rate": 1e-05,
"loss": 1.0093,
"step": 269
},
{
"epoch": 0.2158381214089433,
"grad_norm": 1.719648906171914,
"learning_rate": 1e-05,
"loss": 0.9533,
"step": 270
},
{
"epoch": 0.21663752185860605,
"grad_norm": 1.8380592688711606,
"learning_rate": 1e-05,
"loss": 0.9275,
"step": 271
},
{
"epoch": 0.2174369223082688,
"grad_norm": 1.8205169561312367,
"learning_rate": 1e-05,
"loss": 0.9745,
"step": 272
},
{
"epoch": 0.21823632275793156,
"grad_norm": 2.326139141853857,
"learning_rate": 1e-05,
"loss": 0.9953,
"step": 273
},
{
"epoch": 0.2190357232075943,
"grad_norm": 1.6381092977636662,
"learning_rate": 1e-05,
"loss": 0.9203,
"step": 274
},
{
"epoch": 0.21983512365725705,
"grad_norm": 1.606867524589781,
"learning_rate": 1e-05,
"loss": 0.9007,
"step": 275
},
{
"epoch": 0.2206345241069198,
"grad_norm": 1.7195338383934604,
"learning_rate": 1e-05,
"loss": 0.9611,
"step": 276
},
{
"epoch": 0.22143392455658256,
"grad_norm": 1.3840546682546424,
"learning_rate": 1e-05,
"loss": 0.9614,
"step": 277
},
{
"epoch": 0.22223332500624532,
"grad_norm": 1.6306949714534276,
"learning_rate": 1e-05,
"loss": 0.9271,
"step": 278
},
{
"epoch": 0.22303272545590808,
"grad_norm": 1.5110189180438256,
"learning_rate": 1e-05,
"loss": 0.9528,
"step": 279
},
{
"epoch": 0.22383212590557083,
"grad_norm": 1.8612974867734187,
"learning_rate": 1e-05,
"loss": 0.9587,
"step": 280
},
{
"epoch": 0.22463152635523356,
"grad_norm": 1.664680974165204,
"learning_rate": 1e-05,
"loss": 0.9129,
"step": 281
},
{
"epoch": 0.22543092680489632,
"grad_norm": 1.7746255109018692,
"learning_rate": 1e-05,
"loss": 0.939,
"step": 282
},
{
"epoch": 0.22623032725455908,
"grad_norm": 1.575200440251585,
"learning_rate": 1e-05,
"loss": 0.9204,
"step": 283
},
{
"epoch": 0.22702972770422183,
"grad_norm": 1.7516406660858301,
"learning_rate": 1e-05,
"loss": 0.9537,
"step": 284
},
{
"epoch": 0.2278291281538846,
"grad_norm": 1.91803098110819,
"learning_rate": 1e-05,
"loss": 0.9363,
"step": 285
},
{
"epoch": 0.22862852860354735,
"grad_norm": 1.6613035583173086,
"learning_rate": 1e-05,
"loss": 0.9634,
"step": 286
},
{
"epoch": 0.2294279290532101,
"grad_norm": 1.5842290188976889,
"learning_rate": 1e-05,
"loss": 0.9551,
"step": 287
},
{
"epoch": 0.23022732950287284,
"grad_norm": 1.9140569815192874,
"learning_rate": 1e-05,
"loss": 0.9512,
"step": 288
},
{
"epoch": 0.2310267299525356,
"grad_norm": 1.5261307902201178,
"learning_rate": 1e-05,
"loss": 0.96,
"step": 289
},
{
"epoch": 0.23182613040219835,
"grad_norm": 1.682573363812062,
"learning_rate": 1e-05,
"loss": 0.8925,
"step": 290
},
{
"epoch": 0.2326255308518611,
"grad_norm": 1.6358092225364382,
"learning_rate": 1e-05,
"loss": 0.8815,
"step": 291
},
{
"epoch": 0.23342493130152386,
"grad_norm": 1.5670506043722536,
"learning_rate": 1e-05,
"loss": 0.9876,
"step": 292
},
{
"epoch": 0.23422433175118662,
"grad_norm": 1.6299839564753011,
"learning_rate": 1e-05,
"loss": 0.8892,
"step": 293
},
{
"epoch": 0.23502373220084935,
"grad_norm": 1.6554910310702649,
"learning_rate": 1e-05,
"loss": 0.9216,
"step": 294
},
{
"epoch": 0.2358231326505121,
"grad_norm": 1.8037159660461701,
"learning_rate": 1e-05,
"loss": 0.9575,
"step": 295
},
{
"epoch": 0.23662253310017486,
"grad_norm": 1.629165333497563,
"learning_rate": 1e-05,
"loss": 0.947,
"step": 296
},
{
"epoch": 0.23742193354983762,
"grad_norm": 1.8459614666127684,
"learning_rate": 1e-05,
"loss": 0.9263,
"step": 297
},
{
"epoch": 0.23822133399950038,
"grad_norm": 1.5508274722576894,
"learning_rate": 1e-05,
"loss": 0.9002,
"step": 298
},
{
"epoch": 0.23902073444916314,
"grad_norm": 1.6777079971899138,
"learning_rate": 1e-05,
"loss": 0.9508,
"step": 299
},
{
"epoch": 0.2398201348988259,
"grad_norm": 1.7100079727592197,
"learning_rate": 1e-05,
"loss": 0.935,
"step": 300
},
{
"epoch": 0.24061953534848862,
"grad_norm": 2.1307932039198425,
"learning_rate": 1e-05,
"loss": 0.9233,
"step": 301
},
{
"epoch": 0.24141893579815138,
"grad_norm": 1.883290916019245,
"learning_rate": 1e-05,
"loss": 0.943,
"step": 302
},
{
"epoch": 0.24221833624781414,
"grad_norm": 1.5909650854809918,
"learning_rate": 1e-05,
"loss": 0.9467,
"step": 303
},
{
"epoch": 0.2430177366974769,
"grad_norm": 1.7792900727864842,
"learning_rate": 1e-05,
"loss": 0.9342,
"step": 304
},
{
"epoch": 0.24381713714713965,
"grad_norm": 1.7111474699259361,
"learning_rate": 1e-05,
"loss": 0.9345,
"step": 305
},
{
"epoch": 0.2446165375968024,
"grad_norm": 1.7771845797925385,
"learning_rate": 1e-05,
"loss": 0.9341,
"step": 306
},
{
"epoch": 0.24541593804646514,
"grad_norm": 1.6148130323193988,
"learning_rate": 1e-05,
"loss": 0.8944,
"step": 307
},
{
"epoch": 0.2462153384961279,
"grad_norm": 1.9162065213210437,
"learning_rate": 1e-05,
"loss": 0.9519,
"step": 308
},
{
"epoch": 0.24701473894579065,
"grad_norm": 1.6110529009706316,
"learning_rate": 1e-05,
"loss": 0.8987,
"step": 309
},
{
"epoch": 0.2478141393954534,
"grad_norm": 1.7475182646170053,
"learning_rate": 1e-05,
"loss": 0.885,
"step": 310
},
{
"epoch": 0.24861353984511617,
"grad_norm": 1.8647125722982512,
"learning_rate": 1e-05,
"loss": 0.9214,
"step": 311
},
{
"epoch": 0.24941294029477892,
"grad_norm": 1.6670715424606828,
"learning_rate": 1e-05,
"loss": 0.9462,
"step": 312
},
{
"epoch": 0.25021234074444165,
"grad_norm": 1.5198974766775857,
"learning_rate": 1e-05,
"loss": 0.9632,
"step": 313
},
{
"epoch": 0.2510117411941044,
"grad_norm": 1.5581495649662924,
"learning_rate": 1e-05,
"loss": 0.9602,
"step": 314
},
{
"epoch": 0.25181114164376717,
"grad_norm": 1.5776975494668843,
"learning_rate": 1e-05,
"loss": 0.9794,
"step": 315
},
{
"epoch": 0.2526105420934299,
"grad_norm": 1.6005787401081062,
"learning_rate": 1e-05,
"loss": 0.8655,
"step": 316
},
{
"epoch": 0.2534099425430927,
"grad_norm": 1.7530297645251576,
"learning_rate": 1e-05,
"loss": 0.915,
"step": 317
},
{
"epoch": 0.25420934299275544,
"grad_norm": 1.8516146569735892,
"learning_rate": 1e-05,
"loss": 0.8734,
"step": 318
},
{
"epoch": 0.2550087434424182,
"grad_norm": 1.5925556861862051,
"learning_rate": 1e-05,
"loss": 0.9356,
"step": 319
},
{
"epoch": 0.25580814389208095,
"grad_norm": 1.7942857409055468,
"learning_rate": 1e-05,
"loss": 0.925,
"step": 320
},
{
"epoch": 0.2566075443417437,
"grad_norm": 1.7301914879145586,
"learning_rate": 1e-05,
"loss": 0.896,
"step": 321
},
{
"epoch": 0.25740694479140647,
"grad_norm": 1.5868880054016326,
"learning_rate": 1e-05,
"loss": 0.9021,
"step": 322
},
{
"epoch": 0.2582063452410692,
"grad_norm": 1.7680256022363232,
"learning_rate": 1e-05,
"loss": 0.9309,
"step": 323
},
{
"epoch": 0.2590057456907319,
"grad_norm": 1.586312615898128,
"learning_rate": 1e-05,
"loss": 1.0129,
"step": 324
},
{
"epoch": 0.2598051461403947,
"grad_norm": 1.8702172203637788,
"learning_rate": 1e-05,
"loss": 0.9423,
"step": 325
},
{
"epoch": 0.26060454659005744,
"grad_norm": 1.6231753647103917,
"learning_rate": 1e-05,
"loss": 0.9192,
"step": 326
},
{
"epoch": 0.2614039470397202,
"grad_norm": 1.6717011992423259,
"learning_rate": 1e-05,
"loss": 0.9214,
"step": 327
},
{
"epoch": 0.26220334748938295,
"grad_norm": 1.6440233759725276,
"learning_rate": 1e-05,
"loss": 0.9525,
"step": 328
},
{
"epoch": 0.2630027479390457,
"grad_norm": 1.6336229619568068,
"learning_rate": 1e-05,
"loss": 0.9072,
"step": 329
},
{
"epoch": 0.26380214838870847,
"grad_norm": 1.794138937818925,
"learning_rate": 1e-05,
"loss": 0.9081,
"step": 330
},
{
"epoch": 0.2646015488383712,
"grad_norm": 1.7000293714077805,
"learning_rate": 1e-05,
"loss": 0.9311,
"step": 331
},
{
"epoch": 0.265400949288034,
"grad_norm": 1.7629207816569556,
"learning_rate": 1e-05,
"loss": 0.8942,
"step": 332
},
{
"epoch": 0.26620034973769674,
"grad_norm": 1.7243708406916276,
"learning_rate": 1e-05,
"loss": 0.9009,
"step": 333
},
{
"epoch": 0.2669997501873595,
"grad_norm": 1.5153725886830214,
"learning_rate": 1e-05,
"loss": 0.946,
"step": 334
},
{
"epoch": 0.26779915063702225,
"grad_norm": 1.5897189873039888,
"learning_rate": 1e-05,
"loss": 0.8988,
"step": 335
},
{
"epoch": 0.268598551086685,
"grad_norm": 1.7792011474569303,
"learning_rate": 1e-05,
"loss": 0.9075,
"step": 336
},
{
"epoch": 0.2693979515363477,
"grad_norm": 1.715871716234354,
"learning_rate": 1e-05,
"loss": 0.9488,
"step": 337
},
{
"epoch": 0.27019735198601047,
"grad_norm": 1.7421673985618036,
"learning_rate": 1e-05,
"loss": 0.9265,
"step": 338
},
{
"epoch": 0.2709967524356732,
"grad_norm": 1.701591645181251,
"learning_rate": 1e-05,
"loss": 0.9134,
"step": 339
},
{
"epoch": 0.271796152885336,
"grad_norm": 1.5763851776425317,
"learning_rate": 1e-05,
"loss": 0.9059,
"step": 340
},
{
"epoch": 0.27259555333499874,
"grad_norm": 1.8860488547053122,
"learning_rate": 1e-05,
"loss": 0.9379,
"step": 341
},
{
"epoch": 0.2733949537846615,
"grad_norm": 1.6278214908005035,
"learning_rate": 1e-05,
"loss": 0.9041,
"step": 342
},
{
"epoch": 0.27419435423432426,
"grad_norm": 1.8591339922582193,
"learning_rate": 1e-05,
"loss": 0.9159,
"step": 343
},
{
"epoch": 0.274993754683987,
"grad_norm": 1.6416932855404107,
"learning_rate": 1e-05,
"loss": 0.9334,
"step": 344
},
{
"epoch": 0.27579315513364977,
"grad_norm": 1.5841499089670428,
"learning_rate": 1e-05,
"loss": 0.8758,
"step": 345
},
{
"epoch": 0.2765925555833125,
"grad_norm": 1.4885385714768005,
"learning_rate": 1e-05,
"loss": 0.9482,
"step": 346
},
{
"epoch": 0.2773919560329753,
"grad_norm": 1.652595269550327,
"learning_rate": 1e-05,
"loss": 0.9341,
"step": 347
},
{
"epoch": 0.27819135648263804,
"grad_norm": 1.569292511449757,
"learning_rate": 1e-05,
"loss": 0.9395,
"step": 348
},
{
"epoch": 0.2789907569323008,
"grad_norm": 1.8816669651120839,
"learning_rate": 1e-05,
"loss": 0.879,
"step": 349
},
{
"epoch": 0.2797901573819635,
"grad_norm": 1.8044366358437511,
"learning_rate": 1e-05,
"loss": 0.9476,
"step": 350
},
{
"epoch": 0.28058955783162626,
"grad_norm": 1.581864578938978,
"learning_rate": 1e-05,
"loss": 0.9443,
"step": 351
},
{
"epoch": 0.281388958281289,
"grad_norm": 1.719778574188113,
"learning_rate": 1e-05,
"loss": 0.9682,
"step": 352
},
{
"epoch": 0.28218835873095177,
"grad_norm": 1.7544745777196906,
"learning_rate": 1e-05,
"loss": 0.935,
"step": 353
},
{
"epoch": 0.28298775918061453,
"grad_norm": 1.529692690903228,
"learning_rate": 1e-05,
"loss": 0.899,
"step": 354
},
{
"epoch": 0.2837871596302773,
"grad_norm": 1.7002824332518707,
"learning_rate": 1e-05,
"loss": 0.9089,
"step": 355
},
{
"epoch": 0.28458656007994004,
"grad_norm": 1.6960676218935922,
"learning_rate": 1e-05,
"loss": 0.9131,
"step": 356
},
{
"epoch": 0.2853859605296028,
"grad_norm": 1.5467919520374653,
"learning_rate": 1e-05,
"loss": 0.9234,
"step": 357
},
{
"epoch": 0.28618536097926556,
"grad_norm": 1.5401712398267708,
"learning_rate": 1e-05,
"loss": 0.8821,
"step": 358
},
{
"epoch": 0.2869847614289283,
"grad_norm": 1.713197431966504,
"learning_rate": 1e-05,
"loss": 0.9755,
"step": 359
},
{
"epoch": 0.28778416187859107,
"grad_norm": 1.5846038726149987,
"learning_rate": 1e-05,
"loss": 0.9637,
"step": 360
},
{
"epoch": 0.28858356232825383,
"grad_norm": 1.9337936027301381,
"learning_rate": 1e-05,
"loss": 0.9208,
"step": 361
},
{
"epoch": 0.2893829627779166,
"grad_norm": 1.6240977396645668,
"learning_rate": 1e-05,
"loss": 0.9235,
"step": 362
},
{
"epoch": 0.29018236322757934,
"grad_norm": 1.7452206300395003,
"learning_rate": 1e-05,
"loss": 0.9553,
"step": 363
},
{
"epoch": 0.29098176367724204,
"grad_norm": 1.7207282088148232,
"learning_rate": 1e-05,
"loss": 0.9996,
"step": 364
},
{
"epoch": 0.2917811641269048,
"grad_norm": 1.6238156006165856,
"learning_rate": 1e-05,
"loss": 0.9492,
"step": 365
},
{
"epoch": 0.29258056457656756,
"grad_norm": 1.7217996073600954,
"learning_rate": 1e-05,
"loss": 0.9659,
"step": 366
},
{
"epoch": 0.2933799650262303,
"grad_norm": 1.7599545299893906,
"learning_rate": 1e-05,
"loss": 0.8954,
"step": 367
},
{
"epoch": 0.2941793654758931,
"grad_norm": 1.8392526222961474,
"learning_rate": 1e-05,
"loss": 0.9028,
"step": 368
},
{
"epoch": 0.29497876592555583,
"grad_norm": 1.4791987859922466,
"learning_rate": 1e-05,
"loss": 0.9207,
"step": 369
},
{
"epoch": 0.2957781663752186,
"grad_norm": 1.4806074723615978,
"learning_rate": 1e-05,
"loss": 0.9419,
"step": 370
},
{
"epoch": 0.29657756682488134,
"grad_norm": 1.7004917267851303,
"learning_rate": 1e-05,
"loss": 0.9354,
"step": 371
},
{
"epoch": 0.2973769672745441,
"grad_norm": 1.6234361909723023,
"learning_rate": 1e-05,
"loss": 0.8969,
"step": 372
},
{
"epoch": 0.29817636772420686,
"grad_norm": 1.5271331279708817,
"learning_rate": 1e-05,
"loss": 0.9455,
"step": 373
},
{
"epoch": 0.2989757681738696,
"grad_norm": 1.622230251696962,
"learning_rate": 1e-05,
"loss": 0.9504,
"step": 374
},
{
"epoch": 0.2997751686235324,
"grad_norm": 1.807073970989606,
"learning_rate": 1e-05,
"loss": 0.9,
"step": 375
},
{
"epoch": 0.30057456907319513,
"grad_norm": 1.4951410146162138,
"learning_rate": 1e-05,
"loss": 0.9664,
"step": 376
},
{
"epoch": 0.30137396952285783,
"grad_norm": 1.813020482613949,
"learning_rate": 1e-05,
"loss": 0.9441,
"step": 377
},
{
"epoch": 0.3021733699725206,
"grad_norm": 2.0285660578298046,
"learning_rate": 1e-05,
"loss": 0.8861,
"step": 378
},
{
"epoch": 0.30297277042218335,
"grad_norm": 1.6967916115297645,
"learning_rate": 1e-05,
"loss": 0.9321,
"step": 379
},
{
"epoch": 0.3037721708718461,
"grad_norm": 1.7022095887528572,
"learning_rate": 1e-05,
"loss": 0.9613,
"step": 380
},
{
"epoch": 0.30457157132150886,
"grad_norm": 1.719645739549248,
"learning_rate": 1e-05,
"loss": 0.9219,
"step": 381
},
{
"epoch": 0.3053709717711716,
"grad_norm": 1.6526243786903378,
"learning_rate": 1e-05,
"loss": 0.9069,
"step": 382
},
{
"epoch": 0.3061703722208344,
"grad_norm": 1.554593608182918,
"learning_rate": 1e-05,
"loss": 0.8863,
"step": 383
},
{
"epoch": 0.30696977267049713,
"grad_norm": 1.7296741561953324,
"learning_rate": 1e-05,
"loss": 0.9965,
"step": 384
},
{
"epoch": 0.3077691731201599,
"grad_norm": 1.7765959484743603,
"learning_rate": 1e-05,
"loss": 0.9024,
"step": 385
},
{
"epoch": 0.30856857356982265,
"grad_norm": 1.7444591927862072,
"learning_rate": 1e-05,
"loss": 0.9491,
"step": 386
},
{
"epoch": 0.3093679740194854,
"grad_norm": 1.6979650733135505,
"learning_rate": 1e-05,
"loss": 0.9633,
"step": 387
},
{
"epoch": 0.31016737446914816,
"grad_norm": 1.6518215838203623,
"learning_rate": 1e-05,
"loss": 0.8594,
"step": 388
},
{
"epoch": 0.3109667749188109,
"grad_norm": 1.597669753265097,
"learning_rate": 1e-05,
"loss": 0.8807,
"step": 389
},
{
"epoch": 0.3117661753684736,
"grad_norm": 1.7005856529533696,
"learning_rate": 1e-05,
"loss": 0.9136,
"step": 390
},
{
"epoch": 0.3125655758181364,
"grad_norm": 1.714793495031338,
"learning_rate": 1e-05,
"loss": 0.8969,
"step": 391
},
{
"epoch": 0.31336497626779913,
"grad_norm": 1.5558141368768388,
"learning_rate": 1e-05,
"loss": 0.9257,
"step": 392
},
{
"epoch": 0.3141643767174619,
"grad_norm": 1.5404155153049455,
"learning_rate": 1e-05,
"loss": 0.8779,
"step": 393
},
{
"epoch": 0.31496377716712465,
"grad_norm": 1.5383972642859716,
"learning_rate": 1e-05,
"loss": 0.9707,
"step": 394
},
{
"epoch": 0.3157631776167874,
"grad_norm": 1.7191998432330473,
"learning_rate": 1e-05,
"loss": 0.9126,
"step": 395
},
{
"epoch": 0.31656257806645016,
"grad_norm": 1.6051194326495044,
"learning_rate": 1e-05,
"loss": 0.8822,
"step": 396
},
{
"epoch": 0.3173619785161129,
"grad_norm": 1.6869656351879205,
"learning_rate": 1e-05,
"loss": 0.9343,
"step": 397
},
{
"epoch": 0.3181613789657757,
"grad_norm": 1.6256734963382786,
"learning_rate": 1e-05,
"loss": 0.9156,
"step": 398
},
{
"epoch": 0.31896077941543843,
"grad_norm": 1.5756449476038674,
"learning_rate": 1e-05,
"loss": 0.9807,
"step": 399
},
{
"epoch": 0.3197601798651012,
"grad_norm": 1.6188490159724278,
"learning_rate": 1e-05,
"loss": 0.9644,
"step": 400
},
{
"epoch": 0.32055958031476395,
"grad_norm": 1.94007311994945,
"learning_rate": 1e-05,
"loss": 0.9614,
"step": 401
},
{
"epoch": 0.3213589807644267,
"grad_norm": 1.659086295612128,
"learning_rate": 1e-05,
"loss": 0.944,
"step": 402
},
{
"epoch": 0.3221583812140894,
"grad_norm": 1.9235409755089947,
"learning_rate": 1e-05,
"loss": 0.9259,
"step": 403
},
{
"epoch": 0.32295778166375216,
"grad_norm": 1.5880918105995026,
"learning_rate": 1e-05,
"loss": 0.9255,
"step": 404
},
{
"epoch": 0.3237571821134149,
"grad_norm": 1.4948152435643522,
"learning_rate": 1e-05,
"loss": 0.926,
"step": 405
},
{
"epoch": 0.3245565825630777,
"grad_norm": 1.5350941186461544,
"learning_rate": 1e-05,
"loss": 0.898,
"step": 406
},
{
"epoch": 0.32535598301274044,
"grad_norm": 1.3466986686471294,
"learning_rate": 1e-05,
"loss": 0.8904,
"step": 407
},
{
"epoch": 0.3261553834624032,
"grad_norm": 1.459891336046445,
"learning_rate": 1e-05,
"loss": 0.8817,
"step": 408
},
{
"epoch": 0.32695478391206595,
"grad_norm": 1.4836727854431802,
"learning_rate": 1e-05,
"loss": 0.9216,
"step": 409
},
{
"epoch": 0.3277541843617287,
"grad_norm": 1.6446226209440065,
"learning_rate": 1e-05,
"loss": 0.9249,
"step": 410
},
{
"epoch": 0.32855358481139146,
"grad_norm": 1.5635297277867413,
"learning_rate": 1e-05,
"loss": 0.9122,
"step": 411
},
{
"epoch": 0.3293529852610542,
"grad_norm": 1.6358281167528332,
"learning_rate": 1e-05,
"loss": 0.8843,
"step": 412
},
{
"epoch": 0.330152385710717,
"grad_norm": 1.5404191221381782,
"learning_rate": 1e-05,
"loss": 0.9106,
"step": 413
},
{
"epoch": 0.33095178616037974,
"grad_norm": 1.5879004668639547,
"learning_rate": 1e-05,
"loss": 0.9211,
"step": 414
},
{
"epoch": 0.3317511866100425,
"grad_norm": 1.790797443056402,
"learning_rate": 1e-05,
"loss": 0.9211,
"step": 415
},
{
"epoch": 0.3325505870597052,
"grad_norm": 1.8179861905661685,
"learning_rate": 1e-05,
"loss": 0.9391,
"step": 416
},
{
"epoch": 0.33334998750936795,
"grad_norm": 1.4379165089707215,
"learning_rate": 1e-05,
"loss": 0.9021,
"step": 417
},
{
"epoch": 0.3341493879590307,
"grad_norm": 1.7134688617321956,
"learning_rate": 1e-05,
"loss": 0.9833,
"step": 418
},
{
"epoch": 0.33494878840869347,
"grad_norm": 1.4039431214440103,
"learning_rate": 1e-05,
"loss": 0.888,
"step": 419
},
{
"epoch": 0.3357481888583562,
"grad_norm": 1.6586329038004721,
"learning_rate": 1e-05,
"loss": 0.9088,
"step": 420
},
{
"epoch": 0.336547589308019,
"grad_norm": 1.900128933012227,
"learning_rate": 1e-05,
"loss": 0.9,
"step": 421
},
{
"epoch": 0.33734698975768174,
"grad_norm": 1.5890662573554606,
"learning_rate": 1e-05,
"loss": 0.9029,
"step": 422
},
{
"epoch": 0.3381463902073445,
"grad_norm": 1.495628306935103,
"learning_rate": 1e-05,
"loss": 0.919,
"step": 423
},
{
"epoch": 0.33894579065700725,
"grad_norm": 1.4495521814015604,
"learning_rate": 1e-05,
"loss": 0.9967,
"step": 424
},
{
"epoch": 0.33974519110667,
"grad_norm": 1.7055256640065686,
"learning_rate": 1e-05,
"loss": 0.9769,
"step": 425
},
{
"epoch": 0.34054459155633277,
"grad_norm": 1.4909741619159311,
"learning_rate": 1e-05,
"loss": 0.9152,
"step": 426
},
{
"epoch": 0.3413439920059955,
"grad_norm": 1.628227110908977,
"learning_rate": 1e-05,
"loss": 0.955,
"step": 427
},
{
"epoch": 0.3421433924556583,
"grad_norm": 1.8220036868892047,
"learning_rate": 1e-05,
"loss": 0.8972,
"step": 428
},
{
"epoch": 0.34294279290532104,
"grad_norm": 1.693415237669836,
"learning_rate": 1e-05,
"loss": 0.9063,
"step": 429
},
{
"epoch": 0.34374219335498374,
"grad_norm": 1.5346322329118909,
"learning_rate": 1e-05,
"loss": 0.8737,
"step": 430
},
{
"epoch": 0.3445415938046465,
"grad_norm": 1.523134303904886,
"learning_rate": 1e-05,
"loss": 0.9522,
"step": 431
},
{
"epoch": 0.34534099425430925,
"grad_norm": 1.8163891768400675,
"learning_rate": 1e-05,
"loss": 0.9254,
"step": 432
},
{
"epoch": 0.346140394703972,
"grad_norm": 1.6001042968512986,
"learning_rate": 1e-05,
"loss": 0.9393,
"step": 433
},
{
"epoch": 0.34693979515363477,
"grad_norm": 1.4962110538157338,
"learning_rate": 1e-05,
"loss": 0.9016,
"step": 434
},
{
"epoch": 0.3477391956032975,
"grad_norm": 1.7041821659704226,
"learning_rate": 1e-05,
"loss": 0.8586,
"step": 435
},
{
"epoch": 0.3485385960529603,
"grad_norm": 1.6883017856053422,
"learning_rate": 1e-05,
"loss": 0.9729,
"step": 436
},
{
"epoch": 0.34933799650262304,
"grad_norm": 1.6846925338485461,
"learning_rate": 1e-05,
"loss": 0.9379,
"step": 437
},
{
"epoch": 0.3501373969522858,
"grad_norm": 1.8235246867955863,
"learning_rate": 1e-05,
"loss": 0.9248,
"step": 438
},
{
"epoch": 0.35093679740194855,
"grad_norm": 1.935505500625835,
"learning_rate": 1e-05,
"loss": 0.9371,
"step": 439
},
{
"epoch": 0.3517361978516113,
"grad_norm": 1.67613124761384,
"learning_rate": 1e-05,
"loss": 0.979,
"step": 440
},
{
"epoch": 0.35253559830127407,
"grad_norm": 1.4449954490901646,
"learning_rate": 1e-05,
"loss": 0.968,
"step": 441
},
{
"epoch": 0.3533349987509368,
"grad_norm": 1.5913830352404914,
"learning_rate": 1e-05,
"loss": 0.964,
"step": 442
},
{
"epoch": 0.3541343992005995,
"grad_norm": 1.7168730495466147,
"learning_rate": 1e-05,
"loss": 0.9138,
"step": 443
},
{
"epoch": 0.3549337996502623,
"grad_norm": 1.6307072180820321,
"learning_rate": 1e-05,
"loss": 0.9472,
"step": 444
},
{
"epoch": 0.35573320009992504,
"grad_norm": 1.6118353409303823,
"learning_rate": 1e-05,
"loss": 0.9645,
"step": 445
},
{
"epoch": 0.3565326005495878,
"grad_norm": 1.6940859087140694,
"learning_rate": 1e-05,
"loss": 0.9815,
"step": 446
},
{
"epoch": 0.35733200099925055,
"grad_norm": 1.8606216696352482,
"learning_rate": 1e-05,
"loss": 0.9805,
"step": 447
},
{
"epoch": 0.3581314014489133,
"grad_norm": 1.532089096889218,
"learning_rate": 1e-05,
"loss": 0.9393,
"step": 448
},
{
"epoch": 0.35893080189857607,
"grad_norm": 1.6384290071957173,
"learning_rate": 1e-05,
"loss": 0.9459,
"step": 449
},
{
"epoch": 0.3597302023482388,
"grad_norm": 1.5244481340256106,
"learning_rate": 1e-05,
"loss": 0.977,
"step": 450
},
{
"epoch": 0.3605296027979016,
"grad_norm": 1.6337567843902518,
"learning_rate": 1e-05,
"loss": 0.8991,
"step": 451
},
{
"epoch": 0.36132900324756434,
"grad_norm": 1.7963360988533934,
"learning_rate": 1e-05,
"loss": 0.9087,
"step": 452
},
{
"epoch": 0.3621284036972271,
"grad_norm": 1.696315268595366,
"learning_rate": 1e-05,
"loss": 0.9242,
"step": 453
},
{
"epoch": 0.36292780414688985,
"grad_norm": 1.710036005807286,
"learning_rate": 1e-05,
"loss": 0.8429,
"step": 454
},
{
"epoch": 0.3637272045965526,
"grad_norm": 1.749452843653296,
"learning_rate": 1e-05,
"loss": 0.9133,
"step": 455
},
{
"epoch": 0.3645266050462153,
"grad_norm": 1.4397928987828232,
"learning_rate": 1e-05,
"loss": 0.8427,
"step": 456
},
{
"epoch": 0.36532600549587807,
"grad_norm": 1.6825466790780408,
"learning_rate": 1e-05,
"loss": 0.89,
"step": 457
},
{
"epoch": 0.3661254059455408,
"grad_norm": 1.6056927709310882,
"learning_rate": 1e-05,
"loss": 0.9421,
"step": 458
},
{
"epoch": 0.3669248063952036,
"grad_norm": 1.5861038676425987,
"learning_rate": 1e-05,
"loss": 0.9496,
"step": 459
},
{
"epoch": 0.36772420684486634,
"grad_norm": 1.6684621776248278,
"learning_rate": 1e-05,
"loss": 0.884,
"step": 460
},
{
"epoch": 0.3685236072945291,
"grad_norm": 1.758026110496432,
"learning_rate": 1e-05,
"loss": 0.9441,
"step": 461
},
{
"epoch": 0.36932300774419186,
"grad_norm": 1.4763506224586516,
"learning_rate": 1e-05,
"loss": 0.9393,
"step": 462
},
{
"epoch": 0.3701224081938546,
"grad_norm": 1.5509318071640712,
"learning_rate": 1e-05,
"loss": 0.8643,
"step": 463
},
{
"epoch": 0.37092180864351737,
"grad_norm": 1.5607192206519345,
"learning_rate": 1e-05,
"loss": 0.9165,
"step": 464
},
{
"epoch": 0.3717212090931801,
"grad_norm": 1.6511236719507991,
"learning_rate": 1e-05,
"loss": 0.9421,
"step": 465
},
{
"epoch": 0.3725206095428429,
"grad_norm": 1.6501362966399429,
"learning_rate": 1e-05,
"loss": 0.915,
"step": 466
},
{
"epoch": 0.37332000999250564,
"grad_norm": 1.5207720771291409,
"learning_rate": 1e-05,
"loss": 0.9454,
"step": 467
},
{
"epoch": 0.3741194104421684,
"grad_norm": 1.5392735956515966,
"learning_rate": 1e-05,
"loss": 0.956,
"step": 468
},
{
"epoch": 0.3749188108918311,
"grad_norm": 1.5940306759004237,
"learning_rate": 1e-05,
"loss": 0.8719,
"step": 469
},
{
"epoch": 0.37571821134149386,
"grad_norm": 1.6908424326030602,
"learning_rate": 1e-05,
"loss": 0.9255,
"step": 470
},
{
"epoch": 0.3765176117911566,
"grad_norm": 1.4928846149782238,
"learning_rate": 1e-05,
"loss": 0.9378,
"step": 471
},
{
"epoch": 0.37731701224081937,
"grad_norm": 1.7041500499453686,
"learning_rate": 1e-05,
"loss": 0.8734,
"step": 472
},
{
"epoch": 0.37811641269048213,
"grad_norm": 1.619189516937598,
"learning_rate": 1e-05,
"loss": 0.9063,
"step": 473
},
{
"epoch": 0.3789158131401449,
"grad_norm": 1.6588364324248581,
"learning_rate": 1e-05,
"loss": 0.8701,
"step": 474
},
{
"epoch": 0.37971521358980764,
"grad_norm": 1.5762727848791807,
"learning_rate": 1e-05,
"loss": 0.9497,
"step": 475
},
{
"epoch": 0.3805146140394704,
"grad_norm": 1.5363970090025982,
"learning_rate": 1e-05,
"loss": 0.9918,
"step": 476
},
{
"epoch": 0.38131401448913316,
"grad_norm": 1.6404231232106667,
"learning_rate": 1e-05,
"loss": 0.9056,
"step": 477
},
{
"epoch": 0.3821134149387959,
"grad_norm": 1.6314596845516385,
"learning_rate": 1e-05,
"loss": 0.928,
"step": 478
},
{
"epoch": 0.38291281538845867,
"grad_norm": 1.6126677835331522,
"learning_rate": 1e-05,
"loss": 0.9978,
"step": 479
},
{
"epoch": 0.38371221583812143,
"grad_norm": 1.3173664389567725,
"learning_rate": 1e-05,
"loss": 0.9158,
"step": 480
},
{
"epoch": 0.3845116162877842,
"grad_norm": 1.496540187325337,
"learning_rate": 1e-05,
"loss": 0.9378,
"step": 481
},
{
"epoch": 0.3853110167374469,
"grad_norm": 1.5062068173629883,
"learning_rate": 1e-05,
"loss": 0.9159,
"step": 482
},
{
"epoch": 0.38611041718710964,
"grad_norm": 1.529187603034289,
"learning_rate": 1e-05,
"loss": 0.951,
"step": 483
},
{
"epoch": 0.3869098176367724,
"grad_norm": 1.5635118437005366,
"learning_rate": 1e-05,
"loss": 0.9291,
"step": 484
},
{
"epoch": 0.38770921808643516,
"grad_norm": 1.6646247338291131,
"learning_rate": 1e-05,
"loss": 0.874,
"step": 485
},
{
"epoch": 0.3885086185360979,
"grad_norm": 1.6470189371191908,
"learning_rate": 1e-05,
"loss": 0.9118,
"step": 486
},
{
"epoch": 0.3893080189857607,
"grad_norm": 1.4041767343860398,
"learning_rate": 1e-05,
"loss": 0.9193,
"step": 487
},
{
"epoch": 0.39010741943542343,
"grad_norm": 1.637354519439742,
"learning_rate": 1e-05,
"loss": 0.9622,
"step": 488
},
{
"epoch": 0.3909068198850862,
"grad_norm": 1.6793914337693705,
"learning_rate": 1e-05,
"loss": 0.8591,
"step": 489
},
{
"epoch": 0.39170622033474894,
"grad_norm": 1.631823843080509,
"learning_rate": 1e-05,
"loss": 0.9061,
"step": 490
},
{
"epoch": 0.3925056207844117,
"grad_norm": 1.4551068376984746,
"learning_rate": 1e-05,
"loss": 0.886,
"step": 491
},
{
"epoch": 0.39330502123407446,
"grad_norm": 1.843148583217912,
"learning_rate": 1e-05,
"loss": 0.8748,
"step": 492
},
{
"epoch": 0.3941044216837372,
"grad_norm": 1.503885142875128,
"learning_rate": 1e-05,
"loss": 0.9208,
"step": 493
},
{
"epoch": 0.3949038221334,
"grad_norm": 1.7406094685573732,
"learning_rate": 1e-05,
"loss": 0.8786,
"step": 494
},
{
"epoch": 0.39570322258306273,
"grad_norm": 1.730586930891903,
"learning_rate": 1e-05,
"loss": 0.9157,
"step": 495
},
{
"epoch": 0.39650262303272543,
"grad_norm": 1.5528810488930866,
"learning_rate": 1e-05,
"loss": 0.9142,
"step": 496
},
{
"epoch": 0.3973020234823882,
"grad_norm": 1.5307301129466364,
"learning_rate": 1e-05,
"loss": 0.9415,
"step": 497
},
{
"epoch": 0.39810142393205095,
"grad_norm": 1.5706393811203467,
"learning_rate": 1e-05,
"loss": 0.8912,
"step": 498
},
{
"epoch": 0.3989008243817137,
"grad_norm": 1.6199448054984131,
"learning_rate": 1e-05,
"loss": 0.9175,
"step": 499
},
{
"epoch": 0.39970022483137646,
"grad_norm": 1.4945708663613873,
"learning_rate": 1e-05,
"loss": 0.8961,
"step": 500
},
{
"epoch": 0.4004996252810392,
"grad_norm": 1.5533154327294227,
"learning_rate": 1e-05,
"loss": 0.9356,
"step": 501
},
{
"epoch": 0.401299025730702,
"grad_norm": 1.5325963522620767,
"learning_rate": 1e-05,
"loss": 0.965,
"step": 502
},
{
"epoch": 0.40209842618036473,
"grad_norm": 1.671999510186726,
"learning_rate": 1e-05,
"loss": 0.8271,
"step": 503
},
{
"epoch": 0.4028978266300275,
"grad_norm": 1.5355767548245969,
"learning_rate": 1e-05,
"loss": 0.9497,
"step": 504
},
{
"epoch": 0.40369722707969025,
"grad_norm": 1.6030539969868434,
"learning_rate": 1e-05,
"loss": 0.9291,
"step": 505
},
{
"epoch": 0.404496627529353,
"grad_norm": 1.6407538986876247,
"learning_rate": 1e-05,
"loss": 0.9878,
"step": 506
},
{
"epoch": 0.40529602797901576,
"grad_norm": 1.7688356573735502,
"learning_rate": 1e-05,
"loss": 0.9665,
"step": 507
},
{
"epoch": 0.4060954284286785,
"grad_norm": 1.5607970481443443,
"learning_rate": 1e-05,
"loss": 0.9488,
"step": 508
},
{
"epoch": 0.4068948288783412,
"grad_norm": 1.6161754040719796,
"learning_rate": 1e-05,
"loss": 0.858,
"step": 509
},
{
"epoch": 0.407694229328004,
"grad_norm": 1.5793085315204543,
"learning_rate": 1e-05,
"loss": 0.8956,
"step": 510
},
{
"epoch": 0.40849362977766673,
"grad_norm": 1.5936599885814402,
"learning_rate": 1e-05,
"loss": 0.9348,
"step": 511
},
{
"epoch": 0.4092930302273295,
"grad_norm": 1.5658605524297327,
"learning_rate": 1e-05,
"loss": 0.9389,
"step": 512
},
{
"epoch": 0.41009243067699225,
"grad_norm": 1.5921115812648192,
"learning_rate": 1e-05,
"loss": 0.8438,
"step": 513
},
{
"epoch": 0.410891831126655,
"grad_norm": 1.8163231036582868,
"learning_rate": 1e-05,
"loss": 0.9064,
"step": 514
},
{
"epoch": 0.41169123157631776,
"grad_norm": 1.5950813731389535,
"learning_rate": 1e-05,
"loss": 0.9172,
"step": 515
},
{
"epoch": 0.4124906320259805,
"grad_norm": 1.689588168520015,
"learning_rate": 1e-05,
"loss": 0.9265,
"step": 516
},
{
"epoch": 0.4132900324756433,
"grad_norm": 1.540041600561803,
"learning_rate": 1e-05,
"loss": 0.9189,
"step": 517
},
{
"epoch": 0.41408943292530603,
"grad_norm": 1.6662920193878612,
"learning_rate": 1e-05,
"loss": 0.9155,
"step": 518
},
{
"epoch": 0.4148888333749688,
"grad_norm": 1.6860065883672692,
"learning_rate": 1e-05,
"loss": 0.9608,
"step": 519
},
{
"epoch": 0.41568823382463155,
"grad_norm": 1.7503429857603447,
"learning_rate": 1e-05,
"loss": 0.8936,
"step": 520
},
{
"epoch": 0.4164876342742943,
"grad_norm": 1.4349809774745903,
"learning_rate": 1e-05,
"loss": 0.9076,
"step": 521
},
{
"epoch": 0.417287034723957,
"grad_norm": 1.6525870899508948,
"learning_rate": 1e-05,
"loss": 0.951,
"step": 522
},
{
"epoch": 0.41808643517361976,
"grad_norm": 1.305941403451334,
"learning_rate": 1e-05,
"loss": 0.9039,
"step": 523
},
{
"epoch": 0.4188858356232825,
"grad_norm": 1.5535189677415364,
"learning_rate": 1e-05,
"loss": 0.9101,
"step": 524
},
{
"epoch": 0.4196852360729453,
"grad_norm": 1.390869042188358,
"learning_rate": 1e-05,
"loss": 0.8821,
"step": 525
},
{
"epoch": 0.42048463652260804,
"grad_norm": 1.6086764868308612,
"learning_rate": 1e-05,
"loss": 0.9494,
"step": 526
},
{
"epoch": 0.4212840369722708,
"grad_norm": 1.5277453444137763,
"learning_rate": 1e-05,
"loss": 0.9042,
"step": 527
},
{
"epoch": 0.42208343742193355,
"grad_norm": 1.5037652064794895,
"learning_rate": 1e-05,
"loss": 0.8973,
"step": 528
},
{
"epoch": 0.4228828378715963,
"grad_norm": 1.630788946234423,
"learning_rate": 1e-05,
"loss": 0.8868,
"step": 529
},
{
"epoch": 0.42368223832125906,
"grad_norm": 1.476128500837339,
"learning_rate": 1e-05,
"loss": 0.9264,
"step": 530
},
{
"epoch": 0.4244816387709218,
"grad_norm": 1.4082525457129158,
"learning_rate": 1e-05,
"loss": 0.9194,
"step": 531
},
{
"epoch": 0.4252810392205846,
"grad_norm": 1.60560804137754,
"learning_rate": 1e-05,
"loss": 0.8596,
"step": 532
},
{
"epoch": 0.42608043967024734,
"grad_norm": 1.5292853895222724,
"learning_rate": 1e-05,
"loss": 0.8933,
"step": 533
},
{
"epoch": 0.4268798401199101,
"grad_norm": 1.6276199503503024,
"learning_rate": 1e-05,
"loss": 0.8905,
"step": 534
},
{
"epoch": 0.4276792405695728,
"grad_norm": 1.6143026040200776,
"learning_rate": 1e-05,
"loss": 0.9017,
"step": 535
},
{
"epoch": 0.42847864101923555,
"grad_norm": 1.492638575870208,
"learning_rate": 1e-05,
"loss": 0.8445,
"step": 536
},
{
"epoch": 0.4292780414688983,
"grad_norm": 1.5992856689061312,
"learning_rate": 1e-05,
"loss": 0.8747,
"step": 537
},
{
"epoch": 0.43007744191856107,
"grad_norm": 1.8376302395541704,
"learning_rate": 1e-05,
"loss": 0.8744,
"step": 538
},
{
"epoch": 0.4308768423682238,
"grad_norm": 1.5083175238496622,
"learning_rate": 1e-05,
"loss": 0.8831,
"step": 539
},
{
"epoch": 0.4316762428178866,
"grad_norm": 1.6391308804501599,
"learning_rate": 1e-05,
"loss": 0.9093,
"step": 540
},
{
"epoch": 0.43247564326754934,
"grad_norm": 1.587896265231209,
"learning_rate": 1e-05,
"loss": 0.931,
"step": 541
},
{
"epoch": 0.4332750437172121,
"grad_norm": 1.5174662595552115,
"learning_rate": 1e-05,
"loss": 0.9176,
"step": 542
},
{
"epoch": 0.43407444416687485,
"grad_norm": 1.6000443436491891,
"learning_rate": 1e-05,
"loss": 0.8983,
"step": 543
},
{
"epoch": 0.4348738446165376,
"grad_norm": 1.6311375389076388,
"learning_rate": 1e-05,
"loss": 0.9358,
"step": 544
},
{
"epoch": 0.43567324506620037,
"grad_norm": 1.5311673613481407,
"learning_rate": 1e-05,
"loss": 0.9248,
"step": 545
},
{
"epoch": 0.4364726455158631,
"grad_norm": 1.527296520797819,
"learning_rate": 1e-05,
"loss": 0.8941,
"step": 546
},
{
"epoch": 0.4372720459655259,
"grad_norm": 1.3849530231908453,
"learning_rate": 1e-05,
"loss": 0.9206,
"step": 547
},
{
"epoch": 0.4380714464151886,
"grad_norm": 1.6041978636707703,
"learning_rate": 1e-05,
"loss": 0.8933,
"step": 548
},
{
"epoch": 0.43887084686485134,
"grad_norm": 1.5449273405092985,
"learning_rate": 1e-05,
"loss": 0.9157,
"step": 549
},
{
"epoch": 0.4396702473145141,
"grad_norm": 1.5864452967308555,
"learning_rate": 1e-05,
"loss": 0.8443,
"step": 550
},
{
"epoch": 0.44046964776417685,
"grad_norm": 1.4728488192211566,
"learning_rate": 1e-05,
"loss": 0.9108,
"step": 551
},
{
"epoch": 0.4412690482138396,
"grad_norm": 1.4823924024202317,
"learning_rate": 1e-05,
"loss": 0.9336,
"step": 552
},
{
"epoch": 0.44206844866350237,
"grad_norm": 1.4382359303688308,
"learning_rate": 1e-05,
"loss": 0.9271,
"step": 553
},
{
"epoch": 0.4428678491131651,
"grad_norm": 1.5676768234957863,
"learning_rate": 1e-05,
"loss": 0.9705,
"step": 554
},
{
"epoch": 0.4436672495628279,
"grad_norm": 1.5423184321680976,
"learning_rate": 1e-05,
"loss": 0.8464,
"step": 555
},
{
"epoch": 0.44446665001249064,
"grad_norm": 1.6045659880625645,
"learning_rate": 1e-05,
"loss": 0.9303,
"step": 556
},
{
"epoch": 0.4452660504621534,
"grad_norm": 1.9872755202696784,
"learning_rate": 1e-05,
"loss": 0.8687,
"step": 557
},
{
"epoch": 0.44606545091181615,
"grad_norm": 1.4834070914943105,
"learning_rate": 1e-05,
"loss": 0.951,
"step": 558
},
{
"epoch": 0.4468648513614789,
"grad_norm": 1.5310211273825027,
"learning_rate": 1e-05,
"loss": 0.9233,
"step": 559
},
{
"epoch": 0.44766425181114167,
"grad_norm": 1.5815996536549406,
"learning_rate": 1e-05,
"loss": 0.9767,
"step": 560
},
{
"epoch": 0.4484636522608044,
"grad_norm": 1.7688239075887118,
"learning_rate": 1e-05,
"loss": 0.8879,
"step": 561
},
{
"epoch": 0.4492630527104671,
"grad_norm": 1.6482560554808632,
"learning_rate": 1e-05,
"loss": 0.9124,
"step": 562
},
{
"epoch": 0.4500624531601299,
"grad_norm": 1.5404021166963555,
"learning_rate": 1e-05,
"loss": 0.9027,
"step": 563
},
{
"epoch": 0.45086185360979264,
"grad_norm": 1.5195520813189534,
"learning_rate": 1e-05,
"loss": 0.9112,
"step": 564
},
{
"epoch": 0.4516612540594554,
"grad_norm": 1.5192783031055126,
"learning_rate": 1e-05,
"loss": 0.8971,
"step": 565
},
{
"epoch": 0.45246065450911815,
"grad_norm": 1.5618653033074856,
"learning_rate": 1e-05,
"loss": 0.9054,
"step": 566
},
{
"epoch": 0.4532600549587809,
"grad_norm": 1.6064016663059253,
"learning_rate": 1e-05,
"loss": 0.9391,
"step": 567
},
{
"epoch": 0.45405945540844367,
"grad_norm": 1.7240615287273162,
"learning_rate": 1e-05,
"loss": 0.8906,
"step": 568
},
{
"epoch": 0.4548588558581064,
"grad_norm": 1.7149945179624295,
"learning_rate": 1e-05,
"loss": 0.8621,
"step": 569
},
{
"epoch": 0.4556582563077692,
"grad_norm": 1.4856328376378898,
"learning_rate": 1e-05,
"loss": 0.8694,
"step": 570
},
{
"epoch": 0.45645765675743194,
"grad_norm": 1.4702642174922036,
"learning_rate": 1e-05,
"loss": 0.9025,
"step": 571
},
{
"epoch": 0.4572570572070947,
"grad_norm": 1.6088556169851551,
"learning_rate": 1e-05,
"loss": 0.868,
"step": 572
},
{
"epoch": 0.45805645765675745,
"grad_norm": 1.5509844332733922,
"learning_rate": 1e-05,
"loss": 0.9513,
"step": 573
},
{
"epoch": 0.4588558581064202,
"grad_norm": 1.5292949122902217,
"learning_rate": 1e-05,
"loss": 0.8845,
"step": 574
},
{
"epoch": 0.4596552585560829,
"grad_norm": 1.6381076297979584,
"learning_rate": 1e-05,
"loss": 0.9386,
"step": 575
},
{
"epoch": 0.46045465900574567,
"grad_norm": 1.6267004497668505,
"learning_rate": 1e-05,
"loss": 0.8987,
"step": 576
},
{
"epoch": 0.4612540594554084,
"grad_norm": 1.5456142322307922,
"learning_rate": 1e-05,
"loss": 0.9121,
"step": 577
},
{
"epoch": 0.4620534599050712,
"grad_norm": 1.5522043742149023,
"learning_rate": 1e-05,
"loss": 0.8914,
"step": 578
},
{
"epoch": 0.46285286035473394,
"grad_norm": 1.633867715589152,
"learning_rate": 1e-05,
"loss": 0.8741,
"step": 579
},
{
"epoch": 0.4636522608043967,
"grad_norm": 1.614894631262607,
"learning_rate": 1e-05,
"loss": 0.9171,
"step": 580
},
{
"epoch": 0.46445166125405946,
"grad_norm": 1.386145144430922,
"learning_rate": 1e-05,
"loss": 0.8693,
"step": 581
},
{
"epoch": 0.4652510617037222,
"grad_norm": 1.484841140261494,
"learning_rate": 1e-05,
"loss": 0.8966,
"step": 582
},
{
"epoch": 0.46605046215338497,
"grad_norm": 1.6068617064880517,
"learning_rate": 1e-05,
"loss": 0.8818,
"step": 583
},
{
"epoch": 0.4668498626030477,
"grad_norm": 1.6096786496184112,
"learning_rate": 1e-05,
"loss": 0.9123,
"step": 584
},
{
"epoch": 0.4676492630527105,
"grad_norm": 1.4602535645871833,
"learning_rate": 1e-05,
"loss": 0.9143,
"step": 585
},
{
"epoch": 0.46844866350237324,
"grad_norm": 1.7447912274361523,
"learning_rate": 1e-05,
"loss": 0.8966,
"step": 586
},
{
"epoch": 0.469248063952036,
"grad_norm": 1.5775439912332734,
"learning_rate": 1e-05,
"loss": 0.8994,
"step": 587
},
{
"epoch": 0.4700474644016987,
"grad_norm": 1.4031832068470533,
"learning_rate": 1e-05,
"loss": 0.9055,
"step": 588
},
{
"epoch": 0.47084686485136146,
"grad_norm": 1.5789430313417314,
"learning_rate": 1e-05,
"loss": 0.9393,
"step": 589
},
{
"epoch": 0.4716462653010242,
"grad_norm": 1.4655734741114497,
"learning_rate": 1e-05,
"loss": 0.8889,
"step": 590
},
{
"epoch": 0.47244566575068697,
"grad_norm": 1.752804541715281,
"learning_rate": 1e-05,
"loss": 0.9166,
"step": 591
},
{
"epoch": 0.47324506620034973,
"grad_norm": 1.6906678527664594,
"learning_rate": 1e-05,
"loss": 0.8673,
"step": 592
},
{
"epoch": 0.4740444666500125,
"grad_norm": 1.5985802845452706,
"learning_rate": 1e-05,
"loss": 0.9435,
"step": 593
},
{
"epoch": 0.47484386709967524,
"grad_norm": 1.6997316043068198,
"learning_rate": 1e-05,
"loss": 0.9112,
"step": 594
},
{
"epoch": 0.475643267549338,
"grad_norm": 1.3896008701013607,
"learning_rate": 1e-05,
"loss": 0.8884,
"step": 595
},
{
"epoch": 0.47644266799900076,
"grad_norm": 1.4232134469996818,
"learning_rate": 1e-05,
"loss": 0.8537,
"step": 596
},
{
"epoch": 0.4772420684486635,
"grad_norm": 1.4962294604199373,
"learning_rate": 1e-05,
"loss": 0.8599,
"step": 597
},
{
"epoch": 0.47804146889832627,
"grad_norm": 1.3445821960864492,
"learning_rate": 1e-05,
"loss": 0.8719,
"step": 598
},
{
"epoch": 0.47884086934798903,
"grad_norm": 1.5426225615913305,
"learning_rate": 1e-05,
"loss": 0.9097,
"step": 599
},
{
"epoch": 0.4796402697976518,
"grad_norm": 1.4650349809263883,
"learning_rate": 1e-05,
"loss": 0.8933,
"step": 600
},
{
"epoch": 0.4804396702473145,
"grad_norm": 1.5753170073693514,
"learning_rate": 1e-05,
"loss": 0.9461,
"step": 601
},
{
"epoch": 0.48123907069697724,
"grad_norm": 1.6207854665284498,
"learning_rate": 1e-05,
"loss": 0.8332,
"step": 602
},
{
"epoch": 0.48203847114664,
"grad_norm": 1.6847020603077485,
"learning_rate": 1e-05,
"loss": 0.8902,
"step": 603
},
{
"epoch": 0.48283787159630276,
"grad_norm": 1.746631687170473,
"learning_rate": 1e-05,
"loss": 0.852,
"step": 604
},
{
"epoch": 0.4836372720459655,
"grad_norm": 1.5812097478750036,
"learning_rate": 1e-05,
"loss": 0.8875,
"step": 605
},
{
"epoch": 0.4844366724956283,
"grad_norm": 1.664501332749721,
"learning_rate": 1e-05,
"loss": 0.9623,
"step": 606
},
{
"epoch": 0.48523607294529103,
"grad_norm": 1.3887624769518734,
"learning_rate": 1e-05,
"loss": 0.8815,
"step": 607
},
{
"epoch": 0.4860354733949538,
"grad_norm": 1.4487321005360188,
"learning_rate": 1e-05,
"loss": 0.8985,
"step": 608
},
{
"epoch": 0.48683487384461654,
"grad_norm": 1.4789561071530237,
"learning_rate": 1e-05,
"loss": 0.9061,
"step": 609
},
{
"epoch": 0.4876342742942793,
"grad_norm": 1.5069409156312008,
"learning_rate": 1e-05,
"loss": 0.9286,
"step": 610
},
{
"epoch": 0.48843367474394206,
"grad_norm": 1.4663884880855809,
"learning_rate": 1e-05,
"loss": 0.9382,
"step": 611
},
{
"epoch": 0.4892330751936048,
"grad_norm": 1.4795946008795262,
"learning_rate": 1e-05,
"loss": 0.9013,
"step": 612
},
{
"epoch": 0.4900324756432676,
"grad_norm": 1.6550390075160482,
"learning_rate": 1e-05,
"loss": 0.8981,
"step": 613
},
{
"epoch": 0.4908318760929303,
"grad_norm": 1.5252370570410794,
"learning_rate": 1e-05,
"loss": 0.9399,
"step": 614
},
{
"epoch": 0.49163127654259303,
"grad_norm": 1.55342646595899,
"learning_rate": 1e-05,
"loss": 0.9369,
"step": 615
},
{
"epoch": 0.4924306769922558,
"grad_norm": 1.3945867465343513,
"learning_rate": 1e-05,
"loss": 0.9739,
"step": 616
},
{
"epoch": 0.49323007744191855,
"grad_norm": 1.8084042523739312,
"learning_rate": 1e-05,
"loss": 0.8568,
"step": 617
},
{
"epoch": 0.4940294778915813,
"grad_norm": 1.3957730664102426,
"learning_rate": 1e-05,
"loss": 0.9212,
"step": 618
},
{
"epoch": 0.49482887834124406,
"grad_norm": 1.576073681260172,
"learning_rate": 1e-05,
"loss": 0.9323,
"step": 619
},
{
"epoch": 0.4956282787909068,
"grad_norm": 1.478562229589502,
"learning_rate": 1e-05,
"loss": 0.8784,
"step": 620
},
{
"epoch": 0.4964276792405696,
"grad_norm": 1.5742856570618204,
"learning_rate": 1e-05,
"loss": 0.8949,
"step": 621
},
{
"epoch": 0.49722707969023233,
"grad_norm": 1.7717496405831807,
"learning_rate": 1e-05,
"loss": 0.8882,
"step": 622
},
{
"epoch": 0.4980264801398951,
"grad_norm": 1.512802542889935,
"learning_rate": 1e-05,
"loss": 0.9555,
"step": 623
},
{
"epoch": 0.49882588058955785,
"grad_norm": 1.44305014112251,
"learning_rate": 1e-05,
"loss": 0.9058,
"step": 624
},
{
"epoch": 0.4996252810392206,
"grad_norm": 1.6065628841661808,
"learning_rate": 1e-05,
"loss": 0.8697,
"step": 625
},
{
"epoch": 0.5004246814888833,
"grad_norm": 1.5176075034291314,
"learning_rate": 1e-05,
"loss": 0.8774,
"step": 626
},
{
"epoch": 0.5012240819385461,
"grad_norm": 1.4385224047152578,
"learning_rate": 1e-05,
"loss": 0.9092,
"step": 627
},
{
"epoch": 0.5020234823882088,
"grad_norm": 1.5846911793271963,
"learning_rate": 1e-05,
"loss": 0.9333,
"step": 628
},
{
"epoch": 0.5028228828378716,
"grad_norm": 1.6455364602527989,
"learning_rate": 1e-05,
"loss": 0.9412,
"step": 629
},
{
"epoch": 0.5036222832875343,
"grad_norm": 1.6062811152199334,
"learning_rate": 1e-05,
"loss": 0.9091,
"step": 630
},
{
"epoch": 0.5044216837371971,
"grad_norm": 1.4354611082735989,
"learning_rate": 1e-05,
"loss": 0.907,
"step": 631
},
{
"epoch": 0.5052210841868598,
"grad_norm": 1.52829754540632,
"learning_rate": 1e-05,
"loss": 0.9195,
"step": 632
},
{
"epoch": 0.5060204846365226,
"grad_norm": 1.6184765917993094,
"learning_rate": 1e-05,
"loss": 0.9452,
"step": 633
},
{
"epoch": 0.5068198850861854,
"grad_norm": 1.5257888577090237,
"learning_rate": 1e-05,
"loss": 0.8772,
"step": 634
},
{
"epoch": 0.5076192855358481,
"grad_norm": 1.4539514346389641,
"learning_rate": 1e-05,
"loss": 0.8898,
"step": 635
},
{
"epoch": 0.5084186859855109,
"grad_norm": 1.6554813398137607,
"learning_rate": 1e-05,
"loss": 0.8987,
"step": 636
},
{
"epoch": 0.5092180864351736,
"grad_norm": 1.4575833122082418,
"learning_rate": 1e-05,
"loss": 0.9455,
"step": 637
},
{
"epoch": 0.5100174868848364,
"grad_norm": 1.651253682354515,
"learning_rate": 1e-05,
"loss": 0.8731,
"step": 638
},
{
"epoch": 0.5108168873344991,
"grad_norm": 1.608007000762813,
"learning_rate": 1e-05,
"loss": 0.9068,
"step": 639
},
{
"epoch": 0.5116162877841619,
"grad_norm": 1.4451823786722864,
"learning_rate": 1e-05,
"loss": 0.9363,
"step": 640
},
{
"epoch": 0.5124156882338247,
"grad_norm": 1.585132398185237,
"learning_rate": 1e-05,
"loss": 0.8394,
"step": 641
},
{
"epoch": 0.5132150886834874,
"grad_norm": 1.5460763577114784,
"learning_rate": 1e-05,
"loss": 0.8782,
"step": 642
},
{
"epoch": 0.5140144891331502,
"grad_norm": 1.5336894539869739,
"learning_rate": 1e-05,
"loss": 0.8913,
"step": 643
},
{
"epoch": 0.5148138895828129,
"grad_norm": 1.5563638706418883,
"learning_rate": 1e-05,
"loss": 0.891,
"step": 644
},
{
"epoch": 0.5156132900324757,
"grad_norm": 1.6781793712825763,
"learning_rate": 1e-05,
"loss": 0.9243,
"step": 645
},
{
"epoch": 0.5164126904821384,
"grad_norm": 1.555015563156278,
"learning_rate": 1e-05,
"loss": 0.8733,
"step": 646
},
{
"epoch": 0.5172120909318011,
"grad_norm": 1.4725706386221917,
"learning_rate": 1e-05,
"loss": 0.8681,
"step": 647
},
{
"epoch": 0.5180114913814639,
"grad_norm": 1.3268747875477092,
"learning_rate": 1e-05,
"loss": 0.8807,
"step": 648
},
{
"epoch": 0.5188108918311266,
"grad_norm": 1.5451710380595707,
"learning_rate": 1e-05,
"loss": 0.9226,
"step": 649
},
{
"epoch": 0.5196102922807894,
"grad_norm": 1.5573419054386046,
"learning_rate": 1e-05,
"loss": 0.9044,
"step": 650
},
{
"epoch": 0.5204096927304521,
"grad_norm": 1.2989474126701601,
"learning_rate": 1e-05,
"loss": 0.8532,
"step": 651
},
{
"epoch": 0.5212090931801149,
"grad_norm": 1.696305481260023,
"learning_rate": 1e-05,
"loss": 0.8595,
"step": 652
},
{
"epoch": 0.5220084936297776,
"grad_norm": 1.4451028681658686,
"learning_rate": 1e-05,
"loss": 0.933,
"step": 653
},
{
"epoch": 0.5228078940794404,
"grad_norm": 1.5925002414772222,
"learning_rate": 1e-05,
"loss": 0.9286,
"step": 654
},
{
"epoch": 0.5236072945291032,
"grad_norm": 1.4716559853454252,
"learning_rate": 1e-05,
"loss": 0.8943,
"step": 655
},
{
"epoch": 0.5244066949787659,
"grad_norm": 1.7475122640309384,
"learning_rate": 1e-05,
"loss": 0.9523,
"step": 656
},
{
"epoch": 0.5252060954284287,
"grad_norm": 1.645603067269987,
"learning_rate": 1e-05,
"loss": 0.9075,
"step": 657
},
{
"epoch": 0.5260054958780914,
"grad_norm": 1.6726736025945501,
"learning_rate": 1e-05,
"loss": 0.9039,
"step": 658
},
{
"epoch": 0.5268048963277542,
"grad_norm": 1.518938137250405,
"learning_rate": 1e-05,
"loss": 0.8761,
"step": 659
},
{
"epoch": 0.5276042967774169,
"grad_norm": 1.7083121838298914,
"learning_rate": 1e-05,
"loss": 0.8697,
"step": 660
},
{
"epoch": 0.5284036972270797,
"grad_norm": 1.5499463775414077,
"learning_rate": 1e-05,
"loss": 0.8976,
"step": 661
},
{
"epoch": 0.5292030976767425,
"grad_norm": 1.4098782407183605,
"learning_rate": 1e-05,
"loss": 0.8753,
"step": 662
},
{
"epoch": 0.5300024981264052,
"grad_norm": 1.7641918962063994,
"learning_rate": 1e-05,
"loss": 0.9194,
"step": 663
},
{
"epoch": 0.530801898576068,
"grad_norm": 1.5469637110527181,
"learning_rate": 1e-05,
"loss": 0.9059,
"step": 664
},
{
"epoch": 0.5316012990257307,
"grad_norm": 1.6487062365426841,
"learning_rate": 1e-05,
"loss": 0.8897,
"step": 665
},
{
"epoch": 0.5324006994753935,
"grad_norm": 1.5205006643304535,
"learning_rate": 1e-05,
"loss": 0.9216,
"step": 666
},
{
"epoch": 0.5332000999250562,
"grad_norm": 1.5325976583230465,
"learning_rate": 1e-05,
"loss": 0.8957,
"step": 667
},
{
"epoch": 0.533999500374719,
"grad_norm": 1.720042040656152,
"learning_rate": 1e-05,
"loss": 0.8832,
"step": 668
},
{
"epoch": 0.5347989008243818,
"grad_norm": 1.4435271985771057,
"learning_rate": 1e-05,
"loss": 0.8587,
"step": 669
},
{
"epoch": 0.5355983012740445,
"grad_norm": 1.7309862738667545,
"learning_rate": 1e-05,
"loss": 0.8801,
"step": 670
},
{
"epoch": 0.5363977017237073,
"grad_norm": 1.51553026472629,
"learning_rate": 1e-05,
"loss": 0.8948,
"step": 671
},
{
"epoch": 0.53719710217337,
"grad_norm": 1.5034966185821361,
"learning_rate": 1e-05,
"loss": 0.8825,
"step": 672
},
{
"epoch": 0.5379965026230328,
"grad_norm": 1.5085135625486585,
"learning_rate": 1e-05,
"loss": 0.8758,
"step": 673
},
{
"epoch": 0.5387959030726954,
"grad_norm": 1.5419185376449267,
"learning_rate": 1e-05,
"loss": 0.9952,
"step": 674
},
{
"epoch": 0.5395953035223582,
"grad_norm": 1.516360272741118,
"learning_rate": 1e-05,
"loss": 0.8744,
"step": 675
},
{
"epoch": 0.5403947039720209,
"grad_norm": 1.6057277324687185,
"learning_rate": 1e-05,
"loss": 0.9379,
"step": 676
},
{
"epoch": 0.5411941044216837,
"grad_norm": 1.5074825505125475,
"learning_rate": 1e-05,
"loss": 0.8687,
"step": 677
},
{
"epoch": 0.5419935048713465,
"grad_norm": 1.564461484690962,
"learning_rate": 1e-05,
"loss": 0.928,
"step": 678
},
{
"epoch": 0.5427929053210092,
"grad_norm": 1.5052766213063988,
"learning_rate": 1e-05,
"loss": 0.909,
"step": 679
},
{
"epoch": 0.543592305770672,
"grad_norm": 1.3946507047858405,
"learning_rate": 1e-05,
"loss": 0.8984,
"step": 680
},
{
"epoch": 0.5443917062203347,
"grad_norm": 1.524550146914044,
"learning_rate": 1e-05,
"loss": 0.9103,
"step": 681
},
{
"epoch": 0.5451911066699975,
"grad_norm": 1.743015450167898,
"learning_rate": 1e-05,
"loss": 0.8817,
"step": 682
},
{
"epoch": 0.5459905071196602,
"grad_norm": 1.2727179347293005,
"learning_rate": 1e-05,
"loss": 0.9565,
"step": 683
},
{
"epoch": 0.546789907569323,
"grad_norm": 1.4218645212985512,
"learning_rate": 1e-05,
"loss": 0.9361,
"step": 684
},
{
"epoch": 0.5475893080189858,
"grad_norm": 1.5827671331667068,
"learning_rate": 1e-05,
"loss": 0.9195,
"step": 685
},
{
"epoch": 0.5483887084686485,
"grad_norm": 1.5111024964279403,
"learning_rate": 1e-05,
"loss": 0.8975,
"step": 686
},
{
"epoch": 0.5491881089183113,
"grad_norm": 1.698526384803921,
"learning_rate": 1e-05,
"loss": 0.8693,
"step": 687
},
{
"epoch": 0.549987509367974,
"grad_norm": 1.4499111433077698,
"learning_rate": 1e-05,
"loss": 0.9304,
"step": 688
},
{
"epoch": 0.5507869098176368,
"grad_norm": 1.6198855701994876,
"learning_rate": 1e-05,
"loss": 0.9071,
"step": 689
},
{
"epoch": 0.5515863102672995,
"grad_norm": 1.447799249815993,
"learning_rate": 1e-05,
"loss": 0.8577,
"step": 690
},
{
"epoch": 0.5523857107169623,
"grad_norm": 1.4643912062350883,
"learning_rate": 1e-05,
"loss": 0.8841,
"step": 691
},
{
"epoch": 0.553185111166625,
"grad_norm": 1.2539155232355081,
"learning_rate": 1e-05,
"loss": 0.902,
"step": 692
},
{
"epoch": 0.5539845116162878,
"grad_norm": 1.547551258731981,
"learning_rate": 1e-05,
"loss": 0.9678,
"step": 693
},
{
"epoch": 0.5547839120659506,
"grad_norm": 1.4727625062306167,
"learning_rate": 1e-05,
"loss": 0.8882,
"step": 694
},
{
"epoch": 0.5555833125156133,
"grad_norm": 1.4776645587359942,
"learning_rate": 1e-05,
"loss": 0.8921,
"step": 695
},
{
"epoch": 0.5563827129652761,
"grad_norm": 1.4858467571616956,
"learning_rate": 1e-05,
"loss": 0.9032,
"step": 696
},
{
"epoch": 0.5571821134149388,
"grad_norm": 1.6272094570109954,
"learning_rate": 1e-05,
"loss": 0.8754,
"step": 697
},
{
"epoch": 0.5579815138646016,
"grad_norm": 1.5209165879169078,
"learning_rate": 1e-05,
"loss": 0.8892,
"step": 698
},
{
"epoch": 0.5587809143142644,
"grad_norm": 1.5534555948764655,
"learning_rate": 1e-05,
"loss": 0.8518,
"step": 699
},
{
"epoch": 0.559580314763927,
"grad_norm": 1.7199439771989053,
"learning_rate": 1e-05,
"loss": 0.9232,
"step": 700
},
{
"epoch": 0.5603797152135898,
"grad_norm": 1.3598124896967667,
"learning_rate": 1e-05,
"loss": 0.9486,
"step": 701
},
{
"epoch": 0.5611791156632525,
"grad_norm": 1.612574738886904,
"learning_rate": 1e-05,
"loss": 0.8794,
"step": 702
},
{
"epoch": 0.5619785161129153,
"grad_norm": 1.3832262396852995,
"learning_rate": 1e-05,
"loss": 0.9321,
"step": 703
},
{
"epoch": 0.562777916562578,
"grad_norm": 1.5923071651772416,
"learning_rate": 1e-05,
"loss": 0.8793,
"step": 704
},
{
"epoch": 0.5635773170122408,
"grad_norm": 1.5489614624229269,
"learning_rate": 1e-05,
"loss": 0.8701,
"step": 705
},
{
"epoch": 0.5643767174619035,
"grad_norm": 1.4485859965980266,
"learning_rate": 1e-05,
"loss": 0.8911,
"step": 706
},
{
"epoch": 0.5651761179115663,
"grad_norm": 1.3786799015631879,
"learning_rate": 1e-05,
"loss": 0.9122,
"step": 707
},
{
"epoch": 0.5659755183612291,
"grad_norm": 1.5342699233246582,
"learning_rate": 1e-05,
"loss": 0.8847,
"step": 708
},
{
"epoch": 0.5667749188108918,
"grad_norm": 1.5871051701796994,
"learning_rate": 1e-05,
"loss": 0.884,
"step": 709
},
{
"epoch": 0.5675743192605546,
"grad_norm": 1.5660069308536273,
"learning_rate": 1e-05,
"loss": 0.8551,
"step": 710
},
{
"epoch": 0.5683737197102173,
"grad_norm": 1.429596069400543,
"learning_rate": 1e-05,
"loss": 0.8957,
"step": 711
},
{
"epoch": 0.5691731201598801,
"grad_norm": 1.4491964477267238,
"learning_rate": 1e-05,
"loss": 0.8531,
"step": 712
},
{
"epoch": 0.5699725206095428,
"grad_norm": 1.4365928694753973,
"learning_rate": 1e-05,
"loss": 0.8761,
"step": 713
},
{
"epoch": 0.5707719210592056,
"grad_norm": 1.5578122539795014,
"learning_rate": 1e-05,
"loss": 0.8804,
"step": 714
},
{
"epoch": 0.5715713215088684,
"grad_norm": 1.5327160301768794,
"learning_rate": 1e-05,
"loss": 0.8559,
"step": 715
},
{
"epoch": 0.5723707219585311,
"grad_norm": 1.530442187113109,
"learning_rate": 1e-05,
"loss": 0.8689,
"step": 716
},
{
"epoch": 0.5731701224081939,
"grad_norm": 1.5680317221543405,
"learning_rate": 1e-05,
"loss": 0.8969,
"step": 717
},
{
"epoch": 0.5739695228578566,
"grad_norm": 1.5241288570093494,
"learning_rate": 1e-05,
"loss": 0.9161,
"step": 718
},
{
"epoch": 0.5747689233075194,
"grad_norm": 1.3731985273369733,
"learning_rate": 1e-05,
"loss": 0.8568,
"step": 719
},
{
"epoch": 0.5755683237571821,
"grad_norm": 1.4645302822523454,
"learning_rate": 1e-05,
"loss": 0.899,
"step": 720
},
{
"epoch": 0.5763677242068449,
"grad_norm": 1.429554718936312,
"learning_rate": 1e-05,
"loss": 0.9161,
"step": 721
},
{
"epoch": 0.5771671246565077,
"grad_norm": 1.3621850244930958,
"learning_rate": 1e-05,
"loss": 0.9169,
"step": 722
},
{
"epoch": 0.5779665251061704,
"grad_norm": 1.485846183303666,
"learning_rate": 1e-05,
"loss": 0.9811,
"step": 723
},
{
"epoch": 0.5787659255558332,
"grad_norm": 1.4036480667947844,
"learning_rate": 1e-05,
"loss": 0.8841,
"step": 724
},
{
"epoch": 0.5795653260054959,
"grad_norm": 1.3680437907081195,
"learning_rate": 1e-05,
"loss": 0.861,
"step": 725
},
{
"epoch": 0.5803647264551587,
"grad_norm": 1.4902900528640177,
"learning_rate": 1e-05,
"loss": 0.9022,
"step": 726
},
{
"epoch": 0.5811641269048213,
"grad_norm": 1.367169701352056,
"learning_rate": 1e-05,
"loss": 0.9091,
"step": 727
},
{
"epoch": 0.5819635273544841,
"grad_norm": 1.6487586565871948,
"learning_rate": 1e-05,
"loss": 0.9328,
"step": 728
},
{
"epoch": 0.5827629278041468,
"grad_norm": 1.6567920316755664,
"learning_rate": 1e-05,
"loss": 0.8662,
"step": 729
},
{
"epoch": 0.5835623282538096,
"grad_norm": 1.3391698664356693,
"learning_rate": 1e-05,
"loss": 0.8993,
"step": 730
},
{
"epoch": 0.5843617287034724,
"grad_norm": 1.3695456445124472,
"learning_rate": 1e-05,
"loss": 0.8371,
"step": 731
},
{
"epoch": 0.5851611291531351,
"grad_norm": 1.418306336363921,
"learning_rate": 1e-05,
"loss": 0.9092,
"step": 732
},
{
"epoch": 0.5859605296027979,
"grad_norm": 1.7580509988769806,
"learning_rate": 1e-05,
"loss": 0.9117,
"step": 733
},
{
"epoch": 0.5867599300524606,
"grad_norm": 1.4969730064494027,
"learning_rate": 1e-05,
"loss": 0.8777,
"step": 734
},
{
"epoch": 0.5875593305021234,
"grad_norm": 1.5311661672699555,
"learning_rate": 1e-05,
"loss": 0.8633,
"step": 735
},
{
"epoch": 0.5883587309517861,
"grad_norm": 1.4485040495772017,
"learning_rate": 1e-05,
"loss": 0.9358,
"step": 736
},
{
"epoch": 0.5891581314014489,
"grad_norm": 1.4826514641684152,
"learning_rate": 1e-05,
"loss": 0.852,
"step": 737
},
{
"epoch": 0.5899575318511117,
"grad_norm": 1.48791832285035,
"learning_rate": 1e-05,
"loss": 0.8782,
"step": 738
},
{
"epoch": 0.5907569323007744,
"grad_norm": 1.6057041771896603,
"learning_rate": 1e-05,
"loss": 0.8316,
"step": 739
},
{
"epoch": 0.5915563327504372,
"grad_norm": 2.1038688962784593,
"learning_rate": 1e-05,
"loss": 0.8521,
"step": 740
},
{
"epoch": 0.5923557332000999,
"grad_norm": 1.621521996919619,
"learning_rate": 1e-05,
"loss": 0.8843,
"step": 741
},
{
"epoch": 0.5931551336497627,
"grad_norm": 1.471365198038119,
"learning_rate": 1e-05,
"loss": 0.8784,
"step": 742
},
{
"epoch": 0.5939545340994254,
"grad_norm": 1.4957177407162774,
"learning_rate": 1e-05,
"loss": 0.9049,
"step": 743
},
{
"epoch": 0.5947539345490882,
"grad_norm": 1.4767129920631528,
"learning_rate": 1e-05,
"loss": 0.8556,
"step": 744
},
{
"epoch": 0.595553334998751,
"grad_norm": 1.4751091109435195,
"learning_rate": 1e-05,
"loss": 0.8525,
"step": 745
},
{
"epoch": 0.5963527354484137,
"grad_norm": 1.5180861867428592,
"learning_rate": 1e-05,
"loss": 0.8986,
"step": 746
},
{
"epoch": 0.5971521358980765,
"grad_norm": 1.716833225193397,
"learning_rate": 1e-05,
"loss": 0.9396,
"step": 747
},
{
"epoch": 0.5979515363477392,
"grad_norm": 1.662846684061582,
"learning_rate": 1e-05,
"loss": 0.8806,
"step": 748
},
{
"epoch": 0.598750936797402,
"grad_norm": 1.5397292686479351,
"learning_rate": 1e-05,
"loss": 0.9085,
"step": 749
},
{
"epoch": 0.5995503372470647,
"grad_norm": 1.3571032049534457,
"learning_rate": 1e-05,
"loss": 0.9406,
"step": 750
},
{
"epoch": 0.6003497376967275,
"grad_norm": 1.384922018598161,
"learning_rate": 1e-05,
"loss": 0.8956,
"step": 751
},
{
"epoch": 0.6011491381463903,
"grad_norm": 1.496498809863047,
"learning_rate": 1e-05,
"loss": 0.8918,
"step": 752
},
{
"epoch": 0.6019485385960529,
"grad_norm": 1.4830953787172334,
"learning_rate": 1e-05,
"loss": 0.8431,
"step": 753
},
{
"epoch": 0.6027479390457157,
"grad_norm": 1.6829833333195696,
"learning_rate": 1e-05,
"loss": 0.8561,
"step": 754
},
{
"epoch": 0.6035473394953784,
"grad_norm": 1.3654050365320536,
"learning_rate": 1e-05,
"loss": 0.9101,
"step": 755
},
{
"epoch": 0.6043467399450412,
"grad_norm": 1.5240419337473992,
"learning_rate": 1e-05,
"loss": 0.8338,
"step": 756
},
{
"epoch": 0.6051461403947039,
"grad_norm": 1.5491861960420192,
"learning_rate": 1e-05,
"loss": 0.8921,
"step": 757
},
{
"epoch": 0.6059455408443667,
"grad_norm": 1.4148317529647148,
"learning_rate": 1e-05,
"loss": 0.8677,
"step": 758
},
{
"epoch": 0.6067449412940294,
"grad_norm": 1.469343003903587,
"learning_rate": 1e-05,
"loss": 0.8734,
"step": 759
},
{
"epoch": 0.6075443417436922,
"grad_norm": 1.319737814833517,
"learning_rate": 1e-05,
"loss": 0.8688,
"step": 760
},
{
"epoch": 0.608343742193355,
"grad_norm": 1.4581636035714403,
"learning_rate": 1e-05,
"loss": 0.8753,
"step": 761
},
{
"epoch": 0.6091431426430177,
"grad_norm": 1.8427778000120836,
"learning_rate": 1e-05,
"loss": 0.9185,
"step": 762
},
{
"epoch": 0.6099425430926805,
"grad_norm": 1.4013027241862714,
"learning_rate": 1e-05,
"loss": 0.9376,
"step": 763
},
{
"epoch": 0.6107419435423432,
"grad_norm": 1.5267045554235308,
"learning_rate": 1e-05,
"loss": 0.8835,
"step": 764
},
{
"epoch": 0.611541343992006,
"grad_norm": 1.4715893506156257,
"learning_rate": 1e-05,
"loss": 0.8676,
"step": 765
},
{
"epoch": 0.6123407444416687,
"grad_norm": 1.4577005776877618,
"learning_rate": 1e-05,
"loss": 0.8796,
"step": 766
},
{
"epoch": 0.6131401448913315,
"grad_norm": 1.4934814897272444,
"learning_rate": 1e-05,
"loss": 0.8458,
"step": 767
},
{
"epoch": 0.6139395453409943,
"grad_norm": 1.5364809951583207,
"learning_rate": 1e-05,
"loss": 0.8316,
"step": 768
},
{
"epoch": 0.614738945790657,
"grad_norm": 1.4992439555873935,
"learning_rate": 1e-05,
"loss": 0.9177,
"step": 769
},
{
"epoch": 0.6155383462403198,
"grad_norm": 1.4324130065382474,
"learning_rate": 1e-05,
"loss": 0.9105,
"step": 770
},
{
"epoch": 0.6163377466899825,
"grad_norm": 1.372488633970353,
"learning_rate": 1e-05,
"loss": 0.9365,
"step": 771
},
{
"epoch": 0.6171371471396453,
"grad_norm": 1.3430055625087858,
"learning_rate": 1e-05,
"loss": 0.887,
"step": 772
},
{
"epoch": 0.617936547589308,
"grad_norm": 1.4070687341497352,
"learning_rate": 1e-05,
"loss": 0.918,
"step": 773
},
{
"epoch": 0.6187359480389708,
"grad_norm": 1.4126858378429896,
"learning_rate": 1e-05,
"loss": 0.8249,
"step": 774
},
{
"epoch": 0.6195353484886336,
"grad_norm": 1.5659156867498283,
"learning_rate": 1e-05,
"loss": 0.8313,
"step": 775
},
{
"epoch": 0.6203347489382963,
"grad_norm": 1.4546097055174756,
"learning_rate": 1e-05,
"loss": 0.8701,
"step": 776
},
{
"epoch": 0.6211341493879591,
"grad_norm": 1.4487557061202467,
"learning_rate": 1e-05,
"loss": 0.9272,
"step": 777
},
{
"epoch": 0.6219335498376218,
"grad_norm": 1.6276489271011279,
"learning_rate": 1e-05,
"loss": 0.9506,
"step": 778
},
{
"epoch": 0.6227329502872845,
"grad_norm": 1.6078488944139557,
"learning_rate": 1e-05,
"loss": 0.9327,
"step": 779
},
{
"epoch": 0.6235323507369472,
"grad_norm": 1.4508829251993478,
"learning_rate": 1e-05,
"loss": 0.9369,
"step": 780
},
{
"epoch": 0.62433175118661,
"grad_norm": 1.612776765629144,
"learning_rate": 1e-05,
"loss": 0.9124,
"step": 781
},
{
"epoch": 0.6251311516362728,
"grad_norm": 1.5608370989668476,
"learning_rate": 1e-05,
"loss": 0.8758,
"step": 782
},
{
"epoch": 0.6259305520859355,
"grad_norm": 1.5129857913859477,
"learning_rate": 1e-05,
"loss": 0.891,
"step": 783
},
{
"epoch": 0.6267299525355983,
"grad_norm": 1.4321443280452155,
"learning_rate": 1e-05,
"loss": 0.865,
"step": 784
},
{
"epoch": 0.627529352985261,
"grad_norm": 1.5058564295604038,
"learning_rate": 1e-05,
"loss": 0.8721,
"step": 785
},
{
"epoch": 0.6283287534349238,
"grad_norm": 1.3807849349968864,
"learning_rate": 1e-05,
"loss": 0.886,
"step": 786
},
{
"epoch": 0.6291281538845865,
"grad_norm": 1.544883025432354,
"learning_rate": 1e-05,
"loss": 0.9102,
"step": 787
},
{
"epoch": 0.6299275543342493,
"grad_norm": 1.4150356335689325,
"learning_rate": 1e-05,
"loss": 0.9361,
"step": 788
},
{
"epoch": 0.630726954783912,
"grad_norm": 1.5188112447723208,
"learning_rate": 1e-05,
"loss": 0.8874,
"step": 789
},
{
"epoch": 0.6315263552335748,
"grad_norm": 1.5441059644669919,
"learning_rate": 1e-05,
"loss": 0.9105,
"step": 790
},
{
"epoch": 0.6323257556832376,
"grad_norm": 1.7469333936594207,
"learning_rate": 1e-05,
"loss": 0.8572,
"step": 791
},
{
"epoch": 0.6331251561329003,
"grad_norm": 1.6602103078622925,
"learning_rate": 1e-05,
"loss": 0.9294,
"step": 792
},
{
"epoch": 0.6339245565825631,
"grad_norm": 1.5925807734316682,
"learning_rate": 1e-05,
"loss": 0.9744,
"step": 793
},
{
"epoch": 0.6347239570322258,
"grad_norm": 1.5394065631369533,
"learning_rate": 1e-05,
"loss": 0.9164,
"step": 794
},
{
"epoch": 0.6355233574818886,
"grad_norm": 1.5935047510060332,
"learning_rate": 1e-05,
"loss": 0.8769,
"step": 795
},
{
"epoch": 0.6363227579315514,
"grad_norm": 1.344142047079821,
"learning_rate": 1e-05,
"loss": 0.9317,
"step": 796
},
{
"epoch": 0.6371221583812141,
"grad_norm": 1.6200454224138392,
"learning_rate": 1e-05,
"loss": 0.8334,
"step": 797
},
{
"epoch": 0.6379215588308769,
"grad_norm": 1.5204016202631034,
"learning_rate": 1e-05,
"loss": 0.9006,
"step": 798
},
{
"epoch": 0.6387209592805396,
"grad_norm": 1.4920314496701772,
"learning_rate": 1e-05,
"loss": 0.8501,
"step": 799
},
{
"epoch": 0.6395203597302024,
"grad_norm": 1.3209265560951622,
"learning_rate": 1e-05,
"loss": 0.9025,
"step": 800
},
{
"epoch": 0.6403197601798651,
"grad_norm": 1.5701927388007535,
"learning_rate": 1e-05,
"loss": 0.8747,
"step": 801
},
{
"epoch": 0.6411191606295279,
"grad_norm": 1.3344795038412969,
"learning_rate": 1e-05,
"loss": 0.9104,
"step": 802
},
{
"epoch": 0.6419185610791907,
"grad_norm": 1.3938320762656133,
"learning_rate": 1e-05,
"loss": 0.8409,
"step": 803
},
{
"epoch": 0.6427179615288534,
"grad_norm": 1.4249626741383923,
"learning_rate": 1e-05,
"loss": 0.8727,
"step": 804
},
{
"epoch": 0.6435173619785162,
"grad_norm": 1.6691646244578324,
"learning_rate": 1e-05,
"loss": 0.8903,
"step": 805
},
{
"epoch": 0.6443167624281788,
"grad_norm": 1.665931296408499,
"learning_rate": 1e-05,
"loss": 0.8787,
"step": 806
},
{
"epoch": 0.6451161628778416,
"grad_norm": 1.693200235102736,
"learning_rate": 1e-05,
"loss": 0.8462,
"step": 807
},
{
"epoch": 0.6459155633275043,
"grad_norm": 1.4005335152598601,
"learning_rate": 1e-05,
"loss": 0.8637,
"step": 808
},
{
"epoch": 0.6467149637771671,
"grad_norm": 1.5270196926285917,
"learning_rate": 1e-05,
"loss": 0.86,
"step": 809
},
{
"epoch": 0.6475143642268298,
"grad_norm": 1.4150346179433293,
"learning_rate": 1e-05,
"loss": 0.8734,
"step": 810
},
{
"epoch": 0.6483137646764926,
"grad_norm": 1.53091696763508,
"learning_rate": 1e-05,
"loss": 0.8754,
"step": 811
},
{
"epoch": 0.6491131651261554,
"grad_norm": 1.474027558315905,
"learning_rate": 1e-05,
"loss": 0.9586,
"step": 812
},
{
"epoch": 0.6499125655758181,
"grad_norm": 1.485859581480546,
"learning_rate": 1e-05,
"loss": 0.9106,
"step": 813
},
{
"epoch": 0.6507119660254809,
"grad_norm": 1.568460720361032,
"learning_rate": 1e-05,
"loss": 0.8803,
"step": 814
},
{
"epoch": 0.6515113664751436,
"grad_norm": 1.5563031313131295,
"learning_rate": 1e-05,
"loss": 0.9097,
"step": 815
},
{
"epoch": 0.6523107669248064,
"grad_norm": 1.5440917854626373,
"learning_rate": 1e-05,
"loss": 0.9062,
"step": 816
},
{
"epoch": 0.6531101673744691,
"grad_norm": 1.5083755089979098,
"learning_rate": 1e-05,
"loss": 0.8674,
"step": 817
},
{
"epoch": 0.6539095678241319,
"grad_norm": 1.508645000565019,
"learning_rate": 1e-05,
"loss": 0.8815,
"step": 818
},
{
"epoch": 0.6547089682737947,
"grad_norm": 1.6098529049906811,
"learning_rate": 1e-05,
"loss": 0.8344,
"step": 819
},
{
"epoch": 0.6555083687234574,
"grad_norm": 1.711843405154856,
"learning_rate": 1e-05,
"loss": 0.9035,
"step": 820
},
{
"epoch": 0.6563077691731202,
"grad_norm": 1.4578793644862615,
"learning_rate": 1e-05,
"loss": 0.8953,
"step": 821
},
{
"epoch": 0.6571071696227829,
"grad_norm": 1.5916969602134543,
"learning_rate": 1e-05,
"loss": 0.8868,
"step": 822
},
{
"epoch": 0.6579065700724457,
"grad_norm": 1.7747741238079355,
"learning_rate": 1e-05,
"loss": 0.8762,
"step": 823
},
{
"epoch": 0.6587059705221084,
"grad_norm": 1.610938375922778,
"learning_rate": 1e-05,
"loss": 0.9062,
"step": 824
},
{
"epoch": 0.6595053709717712,
"grad_norm": 1.6873519485834756,
"learning_rate": 1e-05,
"loss": 0.8631,
"step": 825
},
{
"epoch": 0.660304771421434,
"grad_norm": 1.430821156429654,
"learning_rate": 1e-05,
"loss": 0.9604,
"step": 826
},
{
"epoch": 0.6611041718710967,
"grad_norm": 1.457720171628577,
"learning_rate": 1e-05,
"loss": 0.8823,
"step": 827
},
{
"epoch": 0.6619035723207595,
"grad_norm": 1.3817461766649617,
"learning_rate": 1e-05,
"loss": 0.9294,
"step": 828
},
{
"epoch": 0.6627029727704222,
"grad_norm": 1.4095998527286095,
"learning_rate": 1e-05,
"loss": 0.8562,
"step": 829
},
{
"epoch": 0.663502373220085,
"grad_norm": 1.4396424977428872,
"learning_rate": 1e-05,
"loss": 0.8256,
"step": 830
},
{
"epoch": 0.6643017736697477,
"grad_norm": 1.38822130860778,
"learning_rate": 1e-05,
"loss": 0.8717,
"step": 831
},
{
"epoch": 0.6651011741194104,
"grad_norm": 1.4057148558281964,
"learning_rate": 1e-05,
"loss": 0.9041,
"step": 832
},
{
"epoch": 0.6659005745690731,
"grad_norm": 1.4772530181187606,
"learning_rate": 1e-05,
"loss": 0.9316,
"step": 833
},
{
"epoch": 0.6666999750187359,
"grad_norm": 1.5248374759511425,
"learning_rate": 1e-05,
"loss": 0.8771,
"step": 834
},
{
"epoch": 0.6674993754683987,
"grad_norm": 1.5352948925732954,
"learning_rate": 1e-05,
"loss": 0.9223,
"step": 835
},
{
"epoch": 0.6682987759180614,
"grad_norm": 1.7695375410960146,
"learning_rate": 1e-05,
"loss": 0.8801,
"step": 836
},
{
"epoch": 0.6690981763677242,
"grad_norm": 1.3579372966834742,
"learning_rate": 1e-05,
"loss": 0.8714,
"step": 837
},
{
"epoch": 0.6698975768173869,
"grad_norm": 1.5174930728786662,
"learning_rate": 1e-05,
"loss": 0.8513,
"step": 838
},
{
"epoch": 0.6706969772670497,
"grad_norm": 1.5225177134174273,
"learning_rate": 1e-05,
"loss": 0.8947,
"step": 839
},
{
"epoch": 0.6714963777167124,
"grad_norm": 1.773009293174373,
"learning_rate": 1e-05,
"loss": 0.8279,
"step": 840
},
{
"epoch": 0.6722957781663752,
"grad_norm": 1.5784630095216696,
"learning_rate": 1e-05,
"loss": 0.8764,
"step": 841
},
{
"epoch": 0.673095178616038,
"grad_norm": 1.4708285523723468,
"learning_rate": 1e-05,
"loss": 0.8816,
"step": 842
},
{
"epoch": 0.6738945790657007,
"grad_norm": 1.43983591742943,
"learning_rate": 1e-05,
"loss": 0.9482,
"step": 843
},
{
"epoch": 0.6746939795153635,
"grad_norm": 1.4485915743374498,
"learning_rate": 1e-05,
"loss": 0.9028,
"step": 844
},
{
"epoch": 0.6754933799650262,
"grad_norm": 1.5016530521995441,
"learning_rate": 1e-05,
"loss": 0.8731,
"step": 845
},
{
"epoch": 0.676292780414689,
"grad_norm": 1.3809441111375442,
"learning_rate": 1e-05,
"loss": 0.9214,
"step": 846
},
{
"epoch": 0.6770921808643517,
"grad_norm": 1.7240170055604878,
"learning_rate": 1e-05,
"loss": 0.8947,
"step": 847
},
{
"epoch": 0.6778915813140145,
"grad_norm": 1.3301304611766438,
"learning_rate": 1e-05,
"loss": 0.9231,
"step": 848
},
{
"epoch": 0.6786909817636773,
"grad_norm": 1.4218727212100182,
"learning_rate": 1e-05,
"loss": 0.8962,
"step": 849
},
{
"epoch": 0.67949038221334,
"grad_norm": 1.63010423786957,
"learning_rate": 1e-05,
"loss": 0.8939,
"step": 850
},
{
"epoch": 0.6802897826630028,
"grad_norm": 1.4495140324549352,
"learning_rate": 1e-05,
"loss": 0.8875,
"step": 851
},
{
"epoch": 0.6810891831126655,
"grad_norm": 1.5626000543974294,
"learning_rate": 1e-05,
"loss": 0.8814,
"step": 852
},
{
"epoch": 0.6818885835623283,
"grad_norm": 1.5909709047210767,
"learning_rate": 1e-05,
"loss": 0.883,
"step": 853
},
{
"epoch": 0.682687984011991,
"grad_norm": 1.388722303171786,
"learning_rate": 1e-05,
"loss": 0.8748,
"step": 854
},
{
"epoch": 0.6834873844616538,
"grad_norm": 1.385369830792288,
"learning_rate": 1e-05,
"loss": 0.8989,
"step": 855
},
{
"epoch": 0.6842867849113166,
"grad_norm": 1.4882389241813443,
"learning_rate": 1e-05,
"loss": 0.8844,
"step": 856
},
{
"epoch": 0.6850861853609793,
"grad_norm": 1.5186240399620652,
"learning_rate": 1e-05,
"loss": 0.8171,
"step": 857
},
{
"epoch": 0.6858855858106421,
"grad_norm": 1.6078033804533332,
"learning_rate": 1e-05,
"loss": 0.8521,
"step": 858
},
{
"epoch": 0.6866849862603047,
"grad_norm": 1.5272879309131646,
"learning_rate": 1e-05,
"loss": 0.8721,
"step": 859
},
{
"epoch": 0.6874843867099675,
"grad_norm": 1.3931816328350173,
"learning_rate": 1e-05,
"loss": 0.8932,
"step": 860
},
{
"epoch": 0.6882837871596302,
"grad_norm": 1.907029791689304,
"learning_rate": 1e-05,
"loss": 0.8905,
"step": 861
},
{
"epoch": 0.689083187609293,
"grad_norm": 1.556416461497499,
"learning_rate": 1e-05,
"loss": 0.8986,
"step": 862
},
{
"epoch": 0.6898825880589557,
"grad_norm": 1.4869019645563188,
"learning_rate": 1e-05,
"loss": 0.877,
"step": 863
},
{
"epoch": 0.6906819885086185,
"grad_norm": 1.3740940835208075,
"learning_rate": 1e-05,
"loss": 0.9277,
"step": 864
},
{
"epoch": 0.6914813889582813,
"grad_norm": 1.4834340760108946,
"learning_rate": 1e-05,
"loss": 0.9176,
"step": 865
},
{
"epoch": 0.692280789407944,
"grad_norm": 1.4499951936894326,
"learning_rate": 1e-05,
"loss": 0.8522,
"step": 866
},
{
"epoch": 0.6930801898576068,
"grad_norm": 1.3889909352429337,
"learning_rate": 1e-05,
"loss": 0.8675,
"step": 867
},
{
"epoch": 0.6938795903072695,
"grad_norm": 1.3995933987812776,
"learning_rate": 1e-05,
"loss": 0.8729,
"step": 868
},
{
"epoch": 0.6946789907569323,
"grad_norm": 1.4764512256041193,
"learning_rate": 1e-05,
"loss": 0.8209,
"step": 869
},
{
"epoch": 0.695478391206595,
"grad_norm": 1.465237648051072,
"learning_rate": 1e-05,
"loss": 0.8008,
"step": 870
},
{
"epoch": 0.6962777916562578,
"grad_norm": 1.3466097106594175,
"learning_rate": 1e-05,
"loss": 0.8931,
"step": 871
},
{
"epoch": 0.6970771921059206,
"grad_norm": 1.5104958792040775,
"learning_rate": 1e-05,
"loss": 0.8828,
"step": 872
},
{
"epoch": 0.6978765925555833,
"grad_norm": 1.5720653267427949,
"learning_rate": 1e-05,
"loss": 0.9531,
"step": 873
},
{
"epoch": 0.6986759930052461,
"grad_norm": 1.5492614550562422,
"learning_rate": 1e-05,
"loss": 0.9313,
"step": 874
},
{
"epoch": 0.6994753934549088,
"grad_norm": 1.303038024217404,
"learning_rate": 1e-05,
"loss": 0.9034,
"step": 875
},
{
"epoch": 0.7002747939045716,
"grad_norm": 1.4497112842693025,
"learning_rate": 1e-05,
"loss": 0.8917,
"step": 876
},
{
"epoch": 0.7010741943542343,
"grad_norm": 1.530596911055762,
"learning_rate": 1e-05,
"loss": 0.8814,
"step": 877
},
{
"epoch": 0.7018735948038971,
"grad_norm": 1.5261791959543383,
"learning_rate": 1e-05,
"loss": 0.8853,
"step": 878
},
{
"epoch": 0.7026729952535599,
"grad_norm": 1.527060521262994,
"learning_rate": 1e-05,
"loss": 0.8882,
"step": 879
},
{
"epoch": 0.7034723957032226,
"grad_norm": 1.4906207672568565,
"learning_rate": 1e-05,
"loss": 0.8723,
"step": 880
},
{
"epoch": 0.7042717961528854,
"grad_norm": 1.480851718176504,
"learning_rate": 1e-05,
"loss": 0.8692,
"step": 881
},
{
"epoch": 0.7050711966025481,
"grad_norm": 1.47139179353177,
"learning_rate": 1e-05,
"loss": 0.888,
"step": 882
},
{
"epoch": 0.7058705970522109,
"grad_norm": 1.4278237515234393,
"learning_rate": 1e-05,
"loss": 0.9221,
"step": 883
},
{
"epoch": 0.7066699975018736,
"grad_norm": 1.573532967010904,
"learning_rate": 1e-05,
"loss": 0.9087,
"step": 884
},
{
"epoch": 0.7074693979515363,
"grad_norm": 1.590669913446065,
"learning_rate": 1e-05,
"loss": 0.8771,
"step": 885
},
{
"epoch": 0.708268798401199,
"grad_norm": 1.5285176052901992,
"learning_rate": 1e-05,
"loss": 0.8884,
"step": 886
},
{
"epoch": 0.7090681988508618,
"grad_norm": 1.5768609209939375,
"learning_rate": 1e-05,
"loss": 0.8715,
"step": 887
},
{
"epoch": 0.7098675993005246,
"grad_norm": 1.7625584577995699,
"learning_rate": 1e-05,
"loss": 0.936,
"step": 888
},
{
"epoch": 0.7106669997501873,
"grad_norm": 1.6615792785808772,
"learning_rate": 1e-05,
"loss": 0.8865,
"step": 889
},
{
"epoch": 0.7114664001998501,
"grad_norm": 1.3836071347408263,
"learning_rate": 1e-05,
"loss": 0.861,
"step": 890
},
{
"epoch": 0.7122658006495128,
"grad_norm": 1.5374171878390779,
"learning_rate": 1e-05,
"loss": 0.8533,
"step": 891
},
{
"epoch": 0.7130652010991756,
"grad_norm": 1.4960191138124015,
"learning_rate": 1e-05,
"loss": 0.8971,
"step": 892
},
{
"epoch": 0.7138646015488384,
"grad_norm": 1.3462286304870854,
"learning_rate": 1e-05,
"loss": 0.9002,
"step": 893
},
{
"epoch": 0.7146640019985011,
"grad_norm": 1.516533149153394,
"learning_rate": 1e-05,
"loss": 0.8495,
"step": 894
},
{
"epoch": 0.7154634024481639,
"grad_norm": 1.4741671333939332,
"learning_rate": 1e-05,
"loss": 0.8702,
"step": 895
},
{
"epoch": 0.7162628028978266,
"grad_norm": 1.412230967356979,
"learning_rate": 1e-05,
"loss": 0.8839,
"step": 896
},
{
"epoch": 0.7170622033474894,
"grad_norm": 1.508657424433702,
"learning_rate": 1e-05,
"loss": 0.9207,
"step": 897
},
{
"epoch": 0.7178616037971521,
"grad_norm": 1.5335780024625871,
"learning_rate": 1e-05,
"loss": 0.9414,
"step": 898
},
{
"epoch": 0.7186610042468149,
"grad_norm": 1.522192545285303,
"learning_rate": 1e-05,
"loss": 0.85,
"step": 899
},
{
"epoch": 0.7194604046964777,
"grad_norm": 1.433190511112366,
"learning_rate": 1e-05,
"loss": 0.8603,
"step": 900
},
{
"epoch": 0.7202598051461404,
"grad_norm": 1.505735858560805,
"learning_rate": 1e-05,
"loss": 0.9305,
"step": 901
},
{
"epoch": 0.7210592055958032,
"grad_norm": 1.3709122596783658,
"learning_rate": 1e-05,
"loss": 0.9035,
"step": 902
},
{
"epoch": 0.7218586060454659,
"grad_norm": 1.4784407355636868,
"learning_rate": 1e-05,
"loss": 0.8133,
"step": 903
},
{
"epoch": 0.7226580064951287,
"grad_norm": 1.4139431509162406,
"learning_rate": 1e-05,
"loss": 0.8757,
"step": 904
},
{
"epoch": 0.7234574069447914,
"grad_norm": 1.483920166289949,
"learning_rate": 1e-05,
"loss": 0.8908,
"step": 905
},
{
"epoch": 0.7242568073944542,
"grad_norm": 1.355169839026166,
"learning_rate": 1e-05,
"loss": 0.8778,
"step": 906
},
{
"epoch": 0.725056207844117,
"grad_norm": 1.5849754730542471,
"learning_rate": 1e-05,
"loss": 0.8126,
"step": 907
},
{
"epoch": 0.7258556082937797,
"grad_norm": 1.4415392226295947,
"learning_rate": 1e-05,
"loss": 0.9533,
"step": 908
},
{
"epoch": 0.7266550087434425,
"grad_norm": 1.423271400925077,
"learning_rate": 1e-05,
"loss": 0.8991,
"step": 909
},
{
"epoch": 0.7274544091931052,
"grad_norm": 1.2581118411370464,
"learning_rate": 1e-05,
"loss": 0.8691,
"step": 910
},
{
"epoch": 0.7282538096427679,
"grad_norm": 1.6042455117982117,
"learning_rate": 1e-05,
"loss": 0.9323,
"step": 911
},
{
"epoch": 0.7290532100924306,
"grad_norm": 1.7219536250131735,
"learning_rate": 1e-05,
"loss": 0.9108,
"step": 912
},
{
"epoch": 0.7298526105420934,
"grad_norm": 1.39448532764431,
"learning_rate": 1e-05,
"loss": 0.8465,
"step": 913
},
{
"epoch": 0.7306520109917561,
"grad_norm": 1.3967526960492356,
"learning_rate": 1e-05,
"loss": 0.8673,
"step": 914
},
{
"epoch": 0.7314514114414189,
"grad_norm": 1.6077994734490668,
"learning_rate": 1e-05,
"loss": 0.8955,
"step": 915
},
{
"epoch": 0.7322508118910817,
"grad_norm": 1.3203640300504973,
"learning_rate": 1e-05,
"loss": 0.7997,
"step": 916
},
{
"epoch": 0.7330502123407444,
"grad_norm": 1.4566518226470033,
"learning_rate": 1e-05,
"loss": 0.8296,
"step": 917
},
{
"epoch": 0.7338496127904072,
"grad_norm": 1.7293187013351636,
"learning_rate": 1e-05,
"loss": 0.9021,
"step": 918
},
{
"epoch": 0.7346490132400699,
"grad_norm": 1.5383747305896551,
"learning_rate": 1e-05,
"loss": 0.8973,
"step": 919
},
{
"epoch": 0.7354484136897327,
"grad_norm": 1.4275975245981607,
"learning_rate": 1e-05,
"loss": 0.8612,
"step": 920
},
{
"epoch": 0.7362478141393954,
"grad_norm": 1.472214485322947,
"learning_rate": 1e-05,
"loss": 0.9005,
"step": 921
},
{
"epoch": 0.7370472145890582,
"grad_norm": 1.4170406969180516,
"learning_rate": 1e-05,
"loss": 0.8952,
"step": 922
},
{
"epoch": 0.737846615038721,
"grad_norm": 1.4134994732170305,
"learning_rate": 1e-05,
"loss": 0.8427,
"step": 923
},
{
"epoch": 0.7386460154883837,
"grad_norm": 1.5810245176397593,
"learning_rate": 1e-05,
"loss": 0.8873,
"step": 924
},
{
"epoch": 0.7394454159380465,
"grad_norm": 1.572493026866151,
"learning_rate": 1e-05,
"loss": 0.8999,
"step": 925
},
{
"epoch": 0.7402448163877092,
"grad_norm": 1.4558846312035074,
"learning_rate": 1e-05,
"loss": 0.9221,
"step": 926
},
{
"epoch": 0.741044216837372,
"grad_norm": 1.41669477168302,
"learning_rate": 1e-05,
"loss": 0.8994,
"step": 927
},
{
"epoch": 0.7418436172870347,
"grad_norm": 1.433461160216514,
"learning_rate": 1e-05,
"loss": 0.8749,
"step": 928
},
{
"epoch": 0.7426430177366975,
"grad_norm": 1.5673163590141157,
"learning_rate": 1e-05,
"loss": 0.8586,
"step": 929
},
{
"epoch": 0.7434424181863603,
"grad_norm": 1.4736635147050137,
"learning_rate": 1e-05,
"loss": 0.9211,
"step": 930
},
{
"epoch": 0.744241818636023,
"grad_norm": 1.4647228645746486,
"learning_rate": 1e-05,
"loss": 0.8332,
"step": 931
},
{
"epoch": 0.7450412190856858,
"grad_norm": 1.3876657153509906,
"learning_rate": 1e-05,
"loss": 0.8481,
"step": 932
},
{
"epoch": 0.7458406195353485,
"grad_norm": 1.469313389155329,
"learning_rate": 1e-05,
"loss": 0.9234,
"step": 933
},
{
"epoch": 0.7466400199850113,
"grad_norm": 1.415959193503077,
"learning_rate": 1e-05,
"loss": 0.8794,
"step": 934
},
{
"epoch": 0.747439420434674,
"grad_norm": 1.4597571617980725,
"learning_rate": 1e-05,
"loss": 0.8565,
"step": 935
},
{
"epoch": 0.7482388208843368,
"grad_norm": 1.5271437643331571,
"learning_rate": 1e-05,
"loss": 0.8826,
"step": 936
},
{
"epoch": 0.7490382213339996,
"grad_norm": 1.4956114964893394,
"learning_rate": 1e-05,
"loss": 0.9085,
"step": 937
},
{
"epoch": 0.7498376217836622,
"grad_norm": 1.4732612528806723,
"learning_rate": 1e-05,
"loss": 0.8247,
"step": 938
},
{
"epoch": 0.750637022233325,
"grad_norm": 1.4787986640658028,
"learning_rate": 1e-05,
"loss": 0.9317,
"step": 939
},
{
"epoch": 0.7514364226829877,
"grad_norm": 1.7252017457319206,
"learning_rate": 1e-05,
"loss": 0.8741,
"step": 940
},
{
"epoch": 0.7522358231326505,
"grad_norm": 1.4487217007150137,
"learning_rate": 1e-05,
"loss": 0.8629,
"step": 941
},
{
"epoch": 0.7530352235823132,
"grad_norm": 1.5157039585564798,
"learning_rate": 1e-05,
"loss": 0.897,
"step": 942
},
{
"epoch": 0.753834624031976,
"grad_norm": 1.611412160953887,
"learning_rate": 1e-05,
"loss": 0.9021,
"step": 943
},
{
"epoch": 0.7546340244816387,
"grad_norm": 1.4394146060850934,
"learning_rate": 1e-05,
"loss": 0.8281,
"step": 944
},
{
"epoch": 0.7554334249313015,
"grad_norm": 1.453348907195491,
"learning_rate": 1e-05,
"loss": 0.8928,
"step": 945
},
{
"epoch": 0.7562328253809643,
"grad_norm": 1.4907250315835585,
"learning_rate": 1e-05,
"loss": 0.7856,
"step": 946
},
{
"epoch": 0.757032225830627,
"grad_norm": 1.49481328462233,
"learning_rate": 1e-05,
"loss": 0.9155,
"step": 947
},
{
"epoch": 0.7578316262802898,
"grad_norm": 1.3751698030196142,
"learning_rate": 1e-05,
"loss": 0.9301,
"step": 948
},
{
"epoch": 0.7586310267299525,
"grad_norm": 1.4444725328440537,
"learning_rate": 1e-05,
"loss": 0.8655,
"step": 949
},
{
"epoch": 0.7594304271796153,
"grad_norm": 1.5456042887758088,
"learning_rate": 1e-05,
"loss": 0.8901,
"step": 950
},
{
"epoch": 0.760229827629278,
"grad_norm": 1.4949432017846453,
"learning_rate": 1e-05,
"loss": 0.916,
"step": 951
},
{
"epoch": 0.7610292280789408,
"grad_norm": 1.6011034169877894,
"learning_rate": 1e-05,
"loss": 0.891,
"step": 952
},
{
"epoch": 0.7618286285286036,
"grad_norm": 1.3042950526088992,
"learning_rate": 1e-05,
"loss": 0.9446,
"step": 953
},
{
"epoch": 0.7626280289782663,
"grad_norm": 1.5837586059992244,
"learning_rate": 1e-05,
"loss": 0.91,
"step": 954
},
{
"epoch": 0.7634274294279291,
"grad_norm": 1.4339719484551816,
"learning_rate": 1e-05,
"loss": 0.9016,
"step": 955
},
{
"epoch": 0.7642268298775918,
"grad_norm": 1.5025550156701537,
"learning_rate": 1e-05,
"loss": 0.879,
"step": 956
},
{
"epoch": 0.7650262303272546,
"grad_norm": 1.5378372958159126,
"learning_rate": 1e-05,
"loss": 0.9063,
"step": 957
},
{
"epoch": 0.7658256307769173,
"grad_norm": 1.5230827569900542,
"learning_rate": 1e-05,
"loss": 0.8989,
"step": 958
},
{
"epoch": 0.7666250312265801,
"grad_norm": 1.5291223084053325,
"learning_rate": 1e-05,
"loss": 0.8616,
"step": 959
},
{
"epoch": 0.7674244316762429,
"grad_norm": 1.4773136700451888,
"learning_rate": 1e-05,
"loss": 0.8424,
"step": 960
},
{
"epoch": 0.7682238321259056,
"grad_norm": 1.2093245102672463,
"learning_rate": 1e-05,
"loss": 0.8848,
"step": 961
},
{
"epoch": 0.7690232325755684,
"grad_norm": 1.8571716921307402,
"learning_rate": 1e-05,
"loss": 0.8495,
"step": 962
},
{
"epoch": 0.7698226330252311,
"grad_norm": 1.4472646694433717,
"learning_rate": 1e-05,
"loss": 0.867,
"step": 963
},
{
"epoch": 0.7706220334748938,
"grad_norm": 1.4580828263402077,
"learning_rate": 1e-05,
"loss": 0.9002,
"step": 964
},
{
"epoch": 0.7714214339245565,
"grad_norm": 1.499450946544706,
"learning_rate": 1e-05,
"loss": 0.9073,
"step": 965
},
{
"epoch": 0.7722208343742193,
"grad_norm": 1.4461364940439836,
"learning_rate": 1e-05,
"loss": 0.8778,
"step": 966
},
{
"epoch": 0.773020234823882,
"grad_norm": 1.506316728494387,
"learning_rate": 1e-05,
"loss": 0.845,
"step": 967
},
{
"epoch": 0.7738196352735448,
"grad_norm": 1.3561052135711964,
"learning_rate": 1e-05,
"loss": 0.8722,
"step": 968
},
{
"epoch": 0.7746190357232076,
"grad_norm": 1.4017997594585556,
"learning_rate": 1e-05,
"loss": 0.8602,
"step": 969
},
{
"epoch": 0.7754184361728703,
"grad_norm": 1.4673374430145514,
"learning_rate": 1e-05,
"loss": 0.8503,
"step": 970
},
{
"epoch": 0.7762178366225331,
"grad_norm": 1.6150949805416606,
"learning_rate": 1e-05,
"loss": 0.8194,
"step": 971
},
{
"epoch": 0.7770172370721958,
"grad_norm": 1.4293495610183653,
"learning_rate": 1e-05,
"loss": 0.8719,
"step": 972
},
{
"epoch": 0.7778166375218586,
"grad_norm": 1.34711255646197,
"learning_rate": 1e-05,
"loss": 0.9134,
"step": 973
},
{
"epoch": 0.7786160379715213,
"grad_norm": 1.486875510521667,
"learning_rate": 1e-05,
"loss": 0.8282,
"step": 974
},
{
"epoch": 0.7794154384211841,
"grad_norm": 1.3975406817023381,
"learning_rate": 1e-05,
"loss": 0.8696,
"step": 975
},
{
"epoch": 0.7802148388708469,
"grad_norm": 1.590401419774706,
"learning_rate": 1e-05,
"loss": 0.8639,
"step": 976
},
{
"epoch": 0.7810142393205096,
"grad_norm": 1.6348383990486186,
"learning_rate": 1e-05,
"loss": 0.8747,
"step": 977
},
{
"epoch": 0.7818136397701724,
"grad_norm": 1.316266061656018,
"learning_rate": 1e-05,
"loss": 0.8912,
"step": 978
},
{
"epoch": 0.7826130402198351,
"grad_norm": 1.4071917948886756,
"learning_rate": 1e-05,
"loss": 0.8258,
"step": 979
},
{
"epoch": 0.7834124406694979,
"grad_norm": 1.4469880221919649,
"learning_rate": 1e-05,
"loss": 0.8354,
"step": 980
},
{
"epoch": 0.7842118411191606,
"grad_norm": 1.2870843290387057,
"learning_rate": 1e-05,
"loss": 0.895,
"step": 981
},
{
"epoch": 0.7850112415688234,
"grad_norm": 1.3149196281524491,
"learning_rate": 1e-05,
"loss": 0.8512,
"step": 982
},
{
"epoch": 0.7858106420184862,
"grad_norm": 1.4704086234102491,
"learning_rate": 1e-05,
"loss": 0.9012,
"step": 983
},
{
"epoch": 0.7866100424681489,
"grad_norm": 1.3828361638550721,
"learning_rate": 1e-05,
"loss": 0.8971,
"step": 984
},
{
"epoch": 0.7874094429178117,
"grad_norm": 1.6052079287723495,
"learning_rate": 1e-05,
"loss": 0.8577,
"step": 985
},
{
"epoch": 0.7882088433674744,
"grad_norm": 1.6793204061607632,
"learning_rate": 1e-05,
"loss": 0.876,
"step": 986
},
{
"epoch": 0.7890082438171372,
"grad_norm": 1.4036184553448683,
"learning_rate": 1e-05,
"loss": 0.8983,
"step": 987
},
{
"epoch": 0.7898076442668,
"grad_norm": 1.3858819411819097,
"learning_rate": 1e-05,
"loss": 0.8535,
"step": 988
},
{
"epoch": 0.7906070447164627,
"grad_norm": 1.549784899074943,
"learning_rate": 1e-05,
"loss": 0.8849,
"step": 989
},
{
"epoch": 0.7914064451661255,
"grad_norm": 1.4777960666446712,
"learning_rate": 1e-05,
"loss": 0.8666,
"step": 990
},
{
"epoch": 0.7922058456157881,
"grad_norm": 1.9201257825258455,
"learning_rate": 1e-05,
"loss": 0.8409,
"step": 991
},
{
"epoch": 0.7930052460654509,
"grad_norm": 1.5498417440527896,
"learning_rate": 1e-05,
"loss": 0.8805,
"step": 992
},
{
"epoch": 0.7938046465151136,
"grad_norm": 1.4317539743714072,
"learning_rate": 1e-05,
"loss": 0.8635,
"step": 993
},
{
"epoch": 0.7946040469647764,
"grad_norm": 1.5358225135776136,
"learning_rate": 1e-05,
"loss": 0.8708,
"step": 994
},
{
"epoch": 0.7954034474144391,
"grad_norm": 1.4138952398073754,
"learning_rate": 1e-05,
"loss": 0.9169,
"step": 995
},
{
"epoch": 0.7962028478641019,
"grad_norm": 1.306882525453356,
"learning_rate": 1e-05,
"loss": 0.8839,
"step": 996
},
{
"epoch": 0.7970022483137646,
"grad_norm": 1.4151969180638062,
"learning_rate": 1e-05,
"loss": 0.9003,
"step": 997
},
{
"epoch": 0.7978016487634274,
"grad_norm": 1.275880598076204,
"learning_rate": 1e-05,
"loss": 0.8549,
"step": 998
},
{
"epoch": 0.7986010492130902,
"grad_norm": 1.4001477420212065,
"learning_rate": 1e-05,
"loss": 0.8505,
"step": 999
},
{
"epoch": 0.7994004496627529,
"grad_norm": 1.4186294121350504,
"learning_rate": 1e-05,
"loss": 0.9215,
"step": 1000
},
{
"epoch": 0.8001998501124157,
"grad_norm": 1.2933825079861516,
"learning_rate": 1e-05,
"loss": 0.851,
"step": 1001
},
{
"epoch": 0.8009992505620784,
"grad_norm": 1.3844838762102727,
"learning_rate": 1e-05,
"loss": 0.8491,
"step": 1002
},
{
"epoch": 0.8017986510117412,
"grad_norm": 1.424315745643642,
"learning_rate": 1e-05,
"loss": 0.8568,
"step": 1003
},
{
"epoch": 0.802598051461404,
"grad_norm": 1.536779666402137,
"learning_rate": 1e-05,
"loss": 0.8616,
"step": 1004
},
{
"epoch": 0.8033974519110667,
"grad_norm": 1.5821296355398455,
"learning_rate": 1e-05,
"loss": 0.8527,
"step": 1005
},
{
"epoch": 0.8041968523607295,
"grad_norm": 1.4455856617071001,
"learning_rate": 1e-05,
"loss": 0.8713,
"step": 1006
},
{
"epoch": 0.8049962528103922,
"grad_norm": 1.2354619055674243,
"learning_rate": 1e-05,
"loss": 0.8993,
"step": 1007
},
{
"epoch": 0.805795653260055,
"grad_norm": 1.6070375123923897,
"learning_rate": 1e-05,
"loss": 0.8743,
"step": 1008
},
{
"epoch": 0.8065950537097177,
"grad_norm": 1.4541848794736738,
"learning_rate": 1e-05,
"loss": 0.8581,
"step": 1009
},
{
"epoch": 0.8073944541593805,
"grad_norm": 1.4226597509638712,
"learning_rate": 1e-05,
"loss": 0.8627,
"step": 1010
},
{
"epoch": 0.8081938546090432,
"grad_norm": 1.4541863541400335,
"learning_rate": 1e-05,
"loss": 0.895,
"step": 1011
},
{
"epoch": 0.808993255058706,
"grad_norm": 1.5062531375485146,
"learning_rate": 1e-05,
"loss": 0.8703,
"step": 1012
},
{
"epoch": 0.8097926555083688,
"grad_norm": 1.4276851237794737,
"learning_rate": 1e-05,
"loss": 0.9105,
"step": 1013
},
{
"epoch": 0.8105920559580315,
"grad_norm": 1.4788273876522071,
"learning_rate": 1e-05,
"loss": 0.8367,
"step": 1014
},
{
"epoch": 0.8113914564076943,
"grad_norm": 1.407145240499365,
"learning_rate": 1e-05,
"loss": 0.8145,
"step": 1015
},
{
"epoch": 0.812190856857357,
"grad_norm": 1.8014438432061057,
"learning_rate": 1e-05,
"loss": 0.8819,
"step": 1016
},
{
"epoch": 0.8129902573070197,
"grad_norm": 1.372209038359735,
"learning_rate": 1e-05,
"loss": 0.868,
"step": 1017
},
{
"epoch": 0.8137896577566824,
"grad_norm": 1.5495856867435909,
"learning_rate": 1e-05,
"loss": 0.8626,
"step": 1018
},
{
"epoch": 0.8145890582063452,
"grad_norm": 1.617836176498916,
"learning_rate": 1e-05,
"loss": 0.802,
"step": 1019
},
{
"epoch": 0.815388458656008,
"grad_norm": 1.4721686410259016,
"learning_rate": 1e-05,
"loss": 0.8835,
"step": 1020
},
{
"epoch": 0.8161878591056707,
"grad_norm": 1.5756457858641464,
"learning_rate": 1e-05,
"loss": 0.903,
"step": 1021
},
{
"epoch": 0.8169872595553335,
"grad_norm": 1.5229101192600658,
"learning_rate": 1e-05,
"loss": 0.9111,
"step": 1022
},
{
"epoch": 0.8177866600049962,
"grad_norm": 1.3993305196243857,
"learning_rate": 1e-05,
"loss": 0.8218,
"step": 1023
},
{
"epoch": 0.818586060454659,
"grad_norm": 1.760678360532871,
"learning_rate": 1e-05,
"loss": 0.8942,
"step": 1024
},
{
"epoch": 0.8193854609043217,
"grad_norm": 1.5000785334067135,
"learning_rate": 1e-05,
"loss": 0.8791,
"step": 1025
},
{
"epoch": 0.8201848613539845,
"grad_norm": 1.6674955223352013,
"learning_rate": 1e-05,
"loss": 0.8626,
"step": 1026
},
{
"epoch": 0.8209842618036473,
"grad_norm": 1.6318940473847319,
"learning_rate": 1e-05,
"loss": 0.9127,
"step": 1027
},
{
"epoch": 0.82178366225331,
"grad_norm": 1.3538540553221186,
"learning_rate": 1e-05,
"loss": 0.8845,
"step": 1028
},
{
"epoch": 0.8225830627029728,
"grad_norm": 1.6241141376741397,
"learning_rate": 1e-05,
"loss": 0.9048,
"step": 1029
},
{
"epoch": 0.8233824631526355,
"grad_norm": 1.6076201928103848,
"learning_rate": 1e-05,
"loss": 0.911,
"step": 1030
},
{
"epoch": 0.8241818636022983,
"grad_norm": 1.6514943246242055,
"learning_rate": 1e-05,
"loss": 0.9161,
"step": 1031
},
{
"epoch": 0.824981264051961,
"grad_norm": 1.4757470594102153,
"learning_rate": 1e-05,
"loss": 0.8522,
"step": 1032
},
{
"epoch": 0.8257806645016238,
"grad_norm": 1.6550497620549556,
"learning_rate": 1e-05,
"loss": 0.8821,
"step": 1033
},
{
"epoch": 0.8265800649512866,
"grad_norm": 1.416433208923937,
"learning_rate": 1e-05,
"loss": 0.8757,
"step": 1034
},
{
"epoch": 0.8273794654009493,
"grad_norm": 1.2778570567318692,
"learning_rate": 1e-05,
"loss": 0.9259,
"step": 1035
},
{
"epoch": 0.8281788658506121,
"grad_norm": 1.5852330201994453,
"learning_rate": 1e-05,
"loss": 0.833,
"step": 1036
},
{
"epoch": 0.8289782663002748,
"grad_norm": 1.5240272977432132,
"learning_rate": 1e-05,
"loss": 0.8842,
"step": 1037
},
{
"epoch": 0.8297776667499376,
"grad_norm": 1.2798050681953308,
"learning_rate": 1e-05,
"loss": 0.8338,
"step": 1038
},
{
"epoch": 0.8305770671996003,
"grad_norm": 1.5489648653630288,
"learning_rate": 1e-05,
"loss": 0.8501,
"step": 1039
},
{
"epoch": 0.8313764676492631,
"grad_norm": 1.435906811134758,
"learning_rate": 1e-05,
"loss": 0.867,
"step": 1040
},
{
"epoch": 0.8321758680989259,
"grad_norm": 1.4702097962874583,
"learning_rate": 1e-05,
"loss": 0.8378,
"step": 1041
},
{
"epoch": 0.8329752685485886,
"grad_norm": 1.594108179583473,
"learning_rate": 1e-05,
"loss": 0.8765,
"step": 1042
},
{
"epoch": 0.8337746689982513,
"grad_norm": 1.5200902751808993,
"learning_rate": 1e-05,
"loss": 0.877,
"step": 1043
},
{
"epoch": 0.834574069447914,
"grad_norm": 1.438298229451603,
"learning_rate": 1e-05,
"loss": 0.8663,
"step": 1044
},
{
"epoch": 0.8353734698975768,
"grad_norm": 1.454018494694832,
"learning_rate": 1e-05,
"loss": 0.8918,
"step": 1045
},
{
"epoch": 0.8361728703472395,
"grad_norm": 1.5005502708015002,
"learning_rate": 1e-05,
"loss": 0.8373,
"step": 1046
},
{
"epoch": 0.8369722707969023,
"grad_norm": 1.291500552799284,
"learning_rate": 1e-05,
"loss": 0.8418,
"step": 1047
},
{
"epoch": 0.837771671246565,
"grad_norm": 1.464971545003121,
"learning_rate": 1e-05,
"loss": 0.8986,
"step": 1048
},
{
"epoch": 0.8385710716962278,
"grad_norm": 1.3970450556504503,
"learning_rate": 1e-05,
"loss": 0.8829,
"step": 1049
},
{
"epoch": 0.8393704721458906,
"grad_norm": 1.4873155145975965,
"learning_rate": 1e-05,
"loss": 0.9039,
"step": 1050
},
{
"epoch": 0.8401698725955533,
"grad_norm": 1.4116749220099283,
"learning_rate": 1e-05,
"loss": 0.8392,
"step": 1051
},
{
"epoch": 0.8409692730452161,
"grad_norm": 1.375469878009426,
"learning_rate": 1e-05,
"loss": 0.8463,
"step": 1052
},
{
"epoch": 0.8417686734948788,
"grad_norm": 1.2984722488631455,
"learning_rate": 1e-05,
"loss": 0.8578,
"step": 1053
},
{
"epoch": 0.8425680739445416,
"grad_norm": 1.377137242087716,
"learning_rate": 1e-05,
"loss": 0.8753,
"step": 1054
},
{
"epoch": 0.8433674743942043,
"grad_norm": 1.513617649555769,
"learning_rate": 1e-05,
"loss": 0.9161,
"step": 1055
},
{
"epoch": 0.8441668748438671,
"grad_norm": 1.3943343336302483,
"learning_rate": 1e-05,
"loss": 0.8984,
"step": 1056
},
{
"epoch": 0.8449662752935299,
"grad_norm": 1.4829273034743662,
"learning_rate": 1e-05,
"loss": 0.9456,
"step": 1057
},
{
"epoch": 0.8457656757431926,
"grad_norm": 1.300813156773205,
"learning_rate": 1e-05,
"loss": 0.8848,
"step": 1058
},
{
"epoch": 0.8465650761928554,
"grad_norm": 1.6171681830366456,
"learning_rate": 1e-05,
"loss": 0.8825,
"step": 1059
},
{
"epoch": 0.8473644766425181,
"grad_norm": 1.4746785824924948,
"learning_rate": 1e-05,
"loss": 0.8782,
"step": 1060
},
{
"epoch": 0.8481638770921809,
"grad_norm": 1.4773421322909972,
"learning_rate": 1e-05,
"loss": 0.8511,
"step": 1061
},
{
"epoch": 0.8489632775418436,
"grad_norm": 1.2712734121324603,
"learning_rate": 1e-05,
"loss": 0.8512,
"step": 1062
},
{
"epoch": 0.8497626779915064,
"grad_norm": 1.4673347511942942,
"learning_rate": 1e-05,
"loss": 0.9082,
"step": 1063
},
{
"epoch": 0.8505620784411692,
"grad_norm": 1.3648492502704213,
"learning_rate": 1e-05,
"loss": 0.8857,
"step": 1064
},
{
"epoch": 0.8513614788908319,
"grad_norm": 1.4139141501508512,
"learning_rate": 1e-05,
"loss": 0.862,
"step": 1065
},
{
"epoch": 0.8521608793404947,
"grad_norm": 1.477130924813934,
"learning_rate": 1e-05,
"loss": 0.9706,
"step": 1066
},
{
"epoch": 0.8529602797901574,
"grad_norm": 1.4482250453004122,
"learning_rate": 1e-05,
"loss": 0.906,
"step": 1067
},
{
"epoch": 0.8537596802398202,
"grad_norm": 1.522992168116749,
"learning_rate": 1e-05,
"loss": 0.9545,
"step": 1068
},
{
"epoch": 0.8545590806894829,
"grad_norm": 1.5261726249592624,
"learning_rate": 1e-05,
"loss": 0.9238,
"step": 1069
},
{
"epoch": 0.8553584811391456,
"grad_norm": 1.4267309300725217,
"learning_rate": 1e-05,
"loss": 0.8925,
"step": 1070
},
{
"epoch": 0.8561578815888083,
"grad_norm": 1.4527959506992734,
"learning_rate": 1e-05,
"loss": 0.8994,
"step": 1071
},
{
"epoch": 0.8569572820384711,
"grad_norm": 1.5030190804190187,
"learning_rate": 1e-05,
"loss": 0.9301,
"step": 1072
},
{
"epoch": 0.8577566824881339,
"grad_norm": 1.2235265071589685,
"learning_rate": 1e-05,
"loss": 0.8509,
"step": 1073
},
{
"epoch": 0.8585560829377966,
"grad_norm": 1.4074654499219896,
"learning_rate": 1e-05,
"loss": 0.889,
"step": 1074
},
{
"epoch": 0.8593554833874594,
"grad_norm": 1.2944008535061877,
"learning_rate": 1e-05,
"loss": 0.8966,
"step": 1075
},
{
"epoch": 0.8601548838371221,
"grad_norm": 1.3310895995556136,
"learning_rate": 1e-05,
"loss": 0.8138,
"step": 1076
},
{
"epoch": 0.8609542842867849,
"grad_norm": 1.427408631037752,
"learning_rate": 1e-05,
"loss": 0.8852,
"step": 1077
},
{
"epoch": 0.8617536847364476,
"grad_norm": 1.4692371711172514,
"learning_rate": 1e-05,
"loss": 0.8797,
"step": 1078
},
{
"epoch": 0.8625530851861104,
"grad_norm": 1.541692282374257,
"learning_rate": 1e-05,
"loss": 0.8589,
"step": 1079
},
{
"epoch": 0.8633524856357732,
"grad_norm": 1.5815910403079887,
"learning_rate": 1e-05,
"loss": 0.8625,
"step": 1080
},
{
"epoch": 0.8641518860854359,
"grad_norm": 1.4959179824929254,
"learning_rate": 1e-05,
"loss": 0.9189,
"step": 1081
},
{
"epoch": 0.8649512865350987,
"grad_norm": 1.5319580336293697,
"learning_rate": 1e-05,
"loss": 0.8267,
"step": 1082
},
{
"epoch": 0.8657506869847614,
"grad_norm": 1.486040602833083,
"learning_rate": 1e-05,
"loss": 0.8625,
"step": 1083
},
{
"epoch": 0.8665500874344242,
"grad_norm": 1.424254094608181,
"learning_rate": 1e-05,
"loss": 0.8943,
"step": 1084
},
{
"epoch": 0.867349487884087,
"grad_norm": 1.643630623556634,
"learning_rate": 1e-05,
"loss": 0.9188,
"step": 1085
},
{
"epoch": 0.8681488883337497,
"grad_norm": 1.452534027382345,
"learning_rate": 1e-05,
"loss": 0.88,
"step": 1086
},
{
"epoch": 0.8689482887834125,
"grad_norm": 1.5479604306192913,
"learning_rate": 1e-05,
"loss": 0.7943,
"step": 1087
},
{
"epoch": 0.8697476892330752,
"grad_norm": 1.363070121645927,
"learning_rate": 1e-05,
"loss": 0.8416,
"step": 1088
},
{
"epoch": 0.870547089682738,
"grad_norm": 1.4551924760921788,
"learning_rate": 1e-05,
"loss": 0.8748,
"step": 1089
},
{
"epoch": 0.8713464901324007,
"grad_norm": 1.8072081219985316,
"learning_rate": 1e-05,
"loss": 0.8955,
"step": 1090
},
{
"epoch": 0.8721458905820635,
"grad_norm": 1.480345916799882,
"learning_rate": 1e-05,
"loss": 0.8984,
"step": 1091
},
{
"epoch": 0.8729452910317262,
"grad_norm": 1.513951237735827,
"learning_rate": 1e-05,
"loss": 0.8573,
"step": 1092
},
{
"epoch": 0.873744691481389,
"grad_norm": 1.4012284095364107,
"learning_rate": 1e-05,
"loss": 0.8448,
"step": 1093
},
{
"epoch": 0.8745440919310518,
"grad_norm": 1.4316182193855909,
"learning_rate": 1e-05,
"loss": 0.8778,
"step": 1094
},
{
"epoch": 0.8753434923807145,
"grad_norm": 1.4767564148326937,
"learning_rate": 1e-05,
"loss": 0.8769,
"step": 1095
},
{
"epoch": 0.8761428928303772,
"grad_norm": 1.4996522495810245,
"learning_rate": 1e-05,
"loss": 0.8928,
"step": 1096
},
{
"epoch": 0.8769422932800399,
"grad_norm": 1.4836247078704627,
"learning_rate": 1e-05,
"loss": 0.8657,
"step": 1097
},
{
"epoch": 0.8777416937297027,
"grad_norm": 1.4148846725052078,
"learning_rate": 1e-05,
"loss": 0.8763,
"step": 1098
},
{
"epoch": 0.8785410941793654,
"grad_norm": 1.8564930284795111,
"learning_rate": 1e-05,
"loss": 0.8362,
"step": 1099
},
{
"epoch": 0.8793404946290282,
"grad_norm": 1.3887316999375894,
"learning_rate": 1e-05,
"loss": 0.8692,
"step": 1100
},
{
"epoch": 0.880139895078691,
"grad_norm": 4.721320996449426,
"learning_rate": 1e-05,
"loss": 0.8951,
"step": 1101
},
{
"epoch": 0.8809392955283537,
"grad_norm": 1.801959602001512,
"learning_rate": 1e-05,
"loss": 0.9127,
"step": 1102
},
{
"epoch": 0.8817386959780165,
"grad_norm": 1.4999059990761596,
"learning_rate": 1e-05,
"loss": 0.8277,
"step": 1103
},
{
"epoch": 0.8825380964276792,
"grad_norm": 1.580749043430391,
"learning_rate": 1e-05,
"loss": 0.8532,
"step": 1104
},
{
"epoch": 0.883337496877342,
"grad_norm": 4.723585804015321,
"learning_rate": 1e-05,
"loss": 0.8846,
"step": 1105
},
{
"epoch": 0.8841368973270047,
"grad_norm": 10.533804682370834,
"learning_rate": 1e-05,
"loss": 0.915,
"step": 1106
},
{
"epoch": 0.8849362977766675,
"grad_norm": 2.7074849652786948,
"learning_rate": 1e-05,
"loss": 0.8588,
"step": 1107
},
{
"epoch": 0.8857356982263302,
"grad_norm": 1.8291875456761892,
"learning_rate": 1e-05,
"loss": 0.8274,
"step": 1108
},
{
"epoch": 0.886535098675993,
"grad_norm": 4.390203546717027,
"learning_rate": 1e-05,
"loss": 0.908,
"step": 1109
},
{
"epoch": 0.8873344991256558,
"grad_norm": 3.8103014314112156,
"learning_rate": 1e-05,
"loss": 0.9094,
"step": 1110
},
{
"epoch": 0.8881338995753185,
"grad_norm": 455.28682152314866,
"learning_rate": 1e-05,
"loss": 0.8881,
"step": 1111
},
{
"epoch": 0.8889333000249813,
"grad_norm": 4.4477324217626295,
"learning_rate": 1e-05,
"loss": 0.891,
"step": 1112
},
{
"epoch": 0.889732700474644,
"grad_norm": 1.3884130302591122,
"learning_rate": 1e-05,
"loss": 0.8485,
"step": 1113
},
{
"epoch": 0.8905321009243068,
"grad_norm": 1.4938176798235159,
"learning_rate": 1e-05,
"loss": 0.8438,
"step": 1114
},
{
"epoch": 0.8913315013739695,
"grad_norm": 1.5434085929606869,
"learning_rate": 1e-05,
"loss": 0.8977,
"step": 1115
},
{
"epoch": 0.8921309018236323,
"grad_norm": 1.3286197641197046,
"learning_rate": 1e-05,
"loss": 0.8355,
"step": 1116
},
{
"epoch": 0.8929303022732951,
"grad_norm": 1.4646146883912168,
"learning_rate": 1e-05,
"loss": 0.8735,
"step": 1117
},
{
"epoch": 0.8937297027229578,
"grad_norm": 8.122892577298567,
"learning_rate": 1e-05,
"loss": 0.9016,
"step": 1118
},
{
"epoch": 0.8945291031726206,
"grad_norm": 1.4376773752975496,
"learning_rate": 1e-05,
"loss": 0.913,
"step": 1119
},
{
"epoch": 0.8953285036222833,
"grad_norm": 2.146749128485352,
"learning_rate": 1e-05,
"loss": 0.8643,
"step": 1120
},
{
"epoch": 0.8961279040719461,
"grad_norm": 1.7549423766927372,
"learning_rate": 1e-05,
"loss": 0.8559,
"step": 1121
},
{
"epoch": 0.8969273045216088,
"grad_norm": 1.6784215753386844,
"learning_rate": 1e-05,
"loss": 0.8467,
"step": 1122
},
{
"epoch": 0.8977267049712715,
"grad_norm": 1.5205155953208587,
"learning_rate": 1e-05,
"loss": 0.8918,
"step": 1123
},
{
"epoch": 0.8985261054209343,
"grad_norm": 1.439844948580554,
"learning_rate": 1e-05,
"loss": 0.846,
"step": 1124
},
{
"epoch": 0.899325505870597,
"grad_norm": 1.3494413427515104,
"learning_rate": 1e-05,
"loss": 0.8701,
"step": 1125
},
{
"epoch": 0.9001249063202598,
"grad_norm": 1.4785482334232822,
"learning_rate": 1e-05,
"loss": 0.8762,
"step": 1126
},
{
"epoch": 0.9009243067699225,
"grad_norm": 1.6204723133056338,
"learning_rate": 1e-05,
"loss": 0.8618,
"step": 1127
},
{
"epoch": 0.9017237072195853,
"grad_norm": 1.5410838002577578,
"learning_rate": 1e-05,
"loss": 0.8352,
"step": 1128
},
{
"epoch": 0.902523107669248,
"grad_norm": 1.408368948793772,
"learning_rate": 1e-05,
"loss": 0.8143,
"step": 1129
},
{
"epoch": 0.9033225081189108,
"grad_norm": 1.3840905876298821,
"learning_rate": 1e-05,
"loss": 0.8497,
"step": 1130
},
{
"epoch": 0.9041219085685736,
"grad_norm": 1.7756397607717793,
"learning_rate": 1e-05,
"loss": 0.9065,
"step": 1131
},
{
"epoch": 0.9049213090182363,
"grad_norm": 1.3699433150113711,
"learning_rate": 1e-05,
"loss": 0.8455,
"step": 1132
},
{
"epoch": 0.9057207094678991,
"grad_norm": 1.4303100795006611,
"learning_rate": 1e-05,
"loss": 0.8574,
"step": 1133
},
{
"epoch": 0.9065201099175618,
"grad_norm": 1.3913259705586178,
"learning_rate": 1e-05,
"loss": 0.8615,
"step": 1134
},
{
"epoch": 0.9073195103672246,
"grad_norm": 1.4143231716945688,
"learning_rate": 1e-05,
"loss": 0.9084,
"step": 1135
},
{
"epoch": 0.9081189108168873,
"grad_norm": 1.3947073651825206,
"learning_rate": 1e-05,
"loss": 0.8926,
"step": 1136
},
{
"epoch": 0.9089183112665501,
"grad_norm": 1.415175153929991,
"learning_rate": 1e-05,
"loss": 0.8153,
"step": 1137
},
{
"epoch": 0.9097177117162129,
"grad_norm": 1.3554176947555092,
"learning_rate": 1e-05,
"loss": 0.9018,
"step": 1138
},
{
"epoch": 0.9105171121658756,
"grad_norm": 1.3676437829569341,
"learning_rate": 1e-05,
"loss": 0.8339,
"step": 1139
},
{
"epoch": 0.9113165126155384,
"grad_norm": 1.2248182189476722,
"learning_rate": 1e-05,
"loss": 0.8865,
"step": 1140
},
{
"epoch": 0.9121159130652011,
"grad_norm": 1.5759658066895652,
"learning_rate": 1e-05,
"loss": 0.8736,
"step": 1141
},
{
"epoch": 0.9129153135148639,
"grad_norm": 1.2978962092251058,
"learning_rate": 1e-05,
"loss": 0.9114,
"step": 1142
},
{
"epoch": 0.9137147139645266,
"grad_norm": 1.3829867839638308,
"learning_rate": 1e-05,
"loss": 0.8636,
"step": 1143
},
{
"epoch": 0.9145141144141894,
"grad_norm": 1.4786554185436886,
"learning_rate": 1e-05,
"loss": 0.9103,
"step": 1144
},
{
"epoch": 0.9153135148638522,
"grad_norm": 1.5247139561552725,
"learning_rate": 1e-05,
"loss": 0.8403,
"step": 1145
},
{
"epoch": 0.9161129153135149,
"grad_norm": 1.3888872413761024,
"learning_rate": 1e-05,
"loss": 0.8457,
"step": 1146
},
{
"epoch": 0.9169123157631777,
"grad_norm": 1.426097741347822,
"learning_rate": 1e-05,
"loss": 0.8769,
"step": 1147
},
{
"epoch": 0.9177117162128404,
"grad_norm": 1.5560059286195493,
"learning_rate": 1e-05,
"loss": 0.864,
"step": 1148
},
{
"epoch": 0.9185111166625031,
"grad_norm": 1.4406955993681905,
"learning_rate": 1e-05,
"loss": 0.8668,
"step": 1149
},
{
"epoch": 0.9193105171121658,
"grad_norm": 1.489597707567999,
"learning_rate": 1e-05,
"loss": 0.855,
"step": 1150
},
{
"epoch": 0.9201099175618286,
"grad_norm": 1.379157014673917,
"learning_rate": 1e-05,
"loss": 0.935,
"step": 1151
},
{
"epoch": 0.9209093180114913,
"grad_norm": 1.4949181541382415,
"learning_rate": 1e-05,
"loss": 0.8664,
"step": 1152
},
{
"epoch": 0.9217087184611541,
"grad_norm": 1.4980233869730157,
"learning_rate": 1e-05,
"loss": 0.8224,
"step": 1153
},
{
"epoch": 0.9225081189108169,
"grad_norm": 1.4050924624234455,
"learning_rate": 1e-05,
"loss": 0.891,
"step": 1154
},
{
"epoch": 0.9233075193604796,
"grad_norm": 1.6467240441672264,
"learning_rate": 1e-05,
"loss": 0.834,
"step": 1155
},
{
"epoch": 0.9241069198101424,
"grad_norm": 1.3421364569781595,
"learning_rate": 1e-05,
"loss": 0.8436,
"step": 1156
},
{
"epoch": 0.9249063202598051,
"grad_norm": 1.246062501997166,
"learning_rate": 1e-05,
"loss": 0.869,
"step": 1157
},
{
"epoch": 0.9257057207094679,
"grad_norm": 1.4858806518325938,
"learning_rate": 1e-05,
"loss": 0.9168,
"step": 1158
},
{
"epoch": 0.9265051211591306,
"grad_norm": 1.4777896246461322,
"learning_rate": 1e-05,
"loss": 0.8593,
"step": 1159
},
{
"epoch": 0.9273045216087934,
"grad_norm": 1.4266934813336434,
"learning_rate": 1e-05,
"loss": 0.8943,
"step": 1160
},
{
"epoch": 0.9281039220584562,
"grad_norm": 1.4729608886697982,
"learning_rate": 1e-05,
"loss": 0.8981,
"step": 1161
},
{
"epoch": 0.9289033225081189,
"grad_norm": 1.4856931952636183,
"learning_rate": 1e-05,
"loss": 0.8623,
"step": 1162
},
{
"epoch": 0.9297027229577817,
"grad_norm": 1.4313714774475765,
"learning_rate": 1e-05,
"loss": 0.8441,
"step": 1163
},
{
"epoch": 0.9305021234074444,
"grad_norm": 1.3975576369260547,
"learning_rate": 1e-05,
"loss": 0.8337,
"step": 1164
},
{
"epoch": 0.9313015238571072,
"grad_norm": 1.4706185165998424,
"learning_rate": 1e-05,
"loss": 0.8336,
"step": 1165
},
{
"epoch": 0.9321009243067699,
"grad_norm": 1.3837948320627937,
"learning_rate": 1e-05,
"loss": 0.8741,
"step": 1166
},
{
"epoch": 0.9329003247564327,
"grad_norm": 1.3855675072168605,
"learning_rate": 1e-05,
"loss": 0.9235,
"step": 1167
},
{
"epoch": 0.9336997252060955,
"grad_norm": 1.5034589343394933,
"learning_rate": 1e-05,
"loss": 0.8267,
"step": 1168
},
{
"epoch": 0.9344991256557582,
"grad_norm": 1.5081619715031618,
"learning_rate": 1e-05,
"loss": 0.8912,
"step": 1169
},
{
"epoch": 0.935298526105421,
"grad_norm": 1.570365541340616,
"learning_rate": 1e-05,
"loss": 0.8589,
"step": 1170
},
{
"epoch": 0.9360979265550837,
"grad_norm": 1.368058151600139,
"learning_rate": 1e-05,
"loss": 0.8669,
"step": 1171
},
{
"epoch": 0.9368973270047465,
"grad_norm": 1.4254090126900538,
"learning_rate": 1e-05,
"loss": 0.8997,
"step": 1172
},
{
"epoch": 0.9376967274544092,
"grad_norm": 1.4563379308659208,
"learning_rate": 1e-05,
"loss": 0.8378,
"step": 1173
},
{
"epoch": 0.938496127904072,
"grad_norm": 1.480841767300247,
"learning_rate": 1e-05,
"loss": 0.8428,
"step": 1174
},
{
"epoch": 0.9392955283537348,
"grad_norm": 1.4441769797776909,
"learning_rate": 1e-05,
"loss": 0.8308,
"step": 1175
},
{
"epoch": 0.9400949288033974,
"grad_norm": 1.5331892724720704,
"learning_rate": 1e-05,
"loss": 0.8733,
"step": 1176
},
{
"epoch": 0.9408943292530602,
"grad_norm": 1.3897737412131999,
"learning_rate": 1e-05,
"loss": 0.8255,
"step": 1177
},
{
"epoch": 0.9416937297027229,
"grad_norm": 1.4328437965242162,
"learning_rate": 1e-05,
"loss": 0.842,
"step": 1178
},
{
"epoch": 0.9424931301523857,
"grad_norm": 1.1601633219334695,
"learning_rate": 1e-05,
"loss": 0.8528,
"step": 1179
},
{
"epoch": 0.9432925306020484,
"grad_norm": 1.3663250801686486,
"learning_rate": 1e-05,
"loss": 0.8325,
"step": 1180
},
{
"epoch": 0.9440919310517112,
"grad_norm": 1.4998097326159285,
"learning_rate": 1e-05,
"loss": 0.9421,
"step": 1181
},
{
"epoch": 0.9448913315013739,
"grad_norm": 1.4608832729340682,
"learning_rate": 1e-05,
"loss": 0.8508,
"step": 1182
},
{
"epoch": 0.9456907319510367,
"grad_norm": 1.562661791032361,
"learning_rate": 1e-05,
"loss": 0.9003,
"step": 1183
},
{
"epoch": 0.9464901324006995,
"grad_norm": 1.4455704359698196,
"learning_rate": 1e-05,
"loss": 0.964,
"step": 1184
},
{
"epoch": 0.9472895328503622,
"grad_norm": 1.5776281729460202,
"learning_rate": 1e-05,
"loss": 0.8575,
"step": 1185
},
{
"epoch": 0.948088933300025,
"grad_norm": 1.549921877625713,
"learning_rate": 1e-05,
"loss": 0.8504,
"step": 1186
},
{
"epoch": 0.9488883337496877,
"grad_norm": 1.3389592770549843,
"learning_rate": 1e-05,
"loss": 0.8488,
"step": 1187
},
{
"epoch": 0.9496877341993505,
"grad_norm": 1.4429492254870946,
"learning_rate": 1e-05,
"loss": 0.8246,
"step": 1188
},
{
"epoch": 0.9504871346490132,
"grad_norm": 1.4238306426926814,
"learning_rate": 1e-05,
"loss": 0.8696,
"step": 1189
},
{
"epoch": 0.951286535098676,
"grad_norm": 1.4646305779731619,
"learning_rate": 1e-05,
"loss": 0.8502,
"step": 1190
},
{
"epoch": 0.9520859355483388,
"grad_norm": 1.494347031973423,
"learning_rate": 1e-05,
"loss": 0.8642,
"step": 1191
},
{
"epoch": 0.9528853359980015,
"grad_norm": 1.963685019515452,
"learning_rate": 1e-05,
"loss": 0.8507,
"step": 1192
},
{
"epoch": 0.9536847364476643,
"grad_norm": 1.3925026944755527,
"learning_rate": 1e-05,
"loss": 0.833,
"step": 1193
},
{
"epoch": 0.954484136897327,
"grad_norm": 1.4062902940189372,
"learning_rate": 1e-05,
"loss": 0.9028,
"step": 1194
},
{
"epoch": 0.9552835373469898,
"grad_norm": 1.2343971080574194,
"learning_rate": 1e-05,
"loss": 0.8522,
"step": 1195
},
{
"epoch": 0.9560829377966525,
"grad_norm": 1.4221098313944995,
"learning_rate": 1e-05,
"loss": 0.8577,
"step": 1196
},
{
"epoch": 0.9568823382463153,
"grad_norm": 1.5290533732550755,
"learning_rate": 1e-05,
"loss": 0.8093,
"step": 1197
},
{
"epoch": 0.9576817386959781,
"grad_norm": 1.3961174339920084,
"learning_rate": 1e-05,
"loss": 0.8647,
"step": 1198
},
{
"epoch": 0.9584811391456408,
"grad_norm": 1.4151475464959773,
"learning_rate": 1e-05,
"loss": 0.8868,
"step": 1199
},
{
"epoch": 0.9592805395953036,
"grad_norm": 1.513441275615894,
"learning_rate": 1e-05,
"loss": 0.8647,
"step": 1200
},
{
"epoch": 0.9600799400449663,
"grad_norm": 1.3820417006090109,
"learning_rate": 1e-05,
"loss": 0.8477,
"step": 1201
},
{
"epoch": 0.960879340494629,
"grad_norm": 1.4387974434664792,
"learning_rate": 1e-05,
"loss": 0.8536,
"step": 1202
},
{
"epoch": 0.9616787409442917,
"grad_norm": 1.5784176967006853,
"learning_rate": 1e-05,
"loss": 0.8778,
"step": 1203
},
{
"epoch": 0.9624781413939545,
"grad_norm": 1.4269915386314171,
"learning_rate": 1e-05,
"loss": 0.8572,
"step": 1204
},
{
"epoch": 0.9632775418436172,
"grad_norm": 1.3866388696845584,
"learning_rate": 1e-05,
"loss": 0.8086,
"step": 1205
},
{
"epoch": 0.96407694229328,
"grad_norm": 1.432076302146608,
"learning_rate": 1e-05,
"loss": 0.8454,
"step": 1206
},
{
"epoch": 0.9648763427429428,
"grad_norm": 1.4992577974774581,
"learning_rate": 1e-05,
"loss": 0.7908,
"step": 1207
},
{
"epoch": 0.9656757431926055,
"grad_norm": 1.497039314194387,
"learning_rate": 1e-05,
"loss": 0.8544,
"step": 1208
},
{
"epoch": 0.9664751436422683,
"grad_norm": 1.3007974080201803,
"learning_rate": 1e-05,
"loss": 0.8477,
"step": 1209
},
{
"epoch": 0.967274544091931,
"grad_norm": 1.5618516258742383,
"learning_rate": 1e-05,
"loss": 0.835,
"step": 1210
},
{
"epoch": 0.9680739445415938,
"grad_norm": 1.4210670398569833,
"learning_rate": 1e-05,
"loss": 0.832,
"step": 1211
},
{
"epoch": 0.9688733449912565,
"grad_norm": 1.5510313623384935,
"learning_rate": 1e-05,
"loss": 0.8602,
"step": 1212
},
{
"epoch": 0.9696727454409193,
"grad_norm": 1.521288522133268,
"learning_rate": 1e-05,
"loss": 0.8861,
"step": 1213
},
{
"epoch": 0.9704721458905821,
"grad_norm": 1.5884079297863427,
"learning_rate": 1e-05,
"loss": 0.8258,
"step": 1214
},
{
"epoch": 0.9712715463402448,
"grad_norm": 1.3385008591661527,
"learning_rate": 1e-05,
"loss": 0.8272,
"step": 1215
},
{
"epoch": 0.9720709467899076,
"grad_norm": 1.3382297608246647,
"learning_rate": 1e-05,
"loss": 0.8984,
"step": 1216
},
{
"epoch": 0.9728703472395703,
"grad_norm": 1.548407496139496,
"learning_rate": 1e-05,
"loss": 0.8649,
"step": 1217
},
{
"epoch": 0.9736697476892331,
"grad_norm": 1.336053175129197,
"learning_rate": 1e-05,
"loss": 0.8958,
"step": 1218
},
{
"epoch": 0.9744691481388958,
"grad_norm": 1.3748017255834115,
"learning_rate": 1e-05,
"loss": 0.8486,
"step": 1219
},
{
"epoch": 0.9752685485885586,
"grad_norm": 1.5234383744628233,
"learning_rate": 1e-05,
"loss": 0.8617,
"step": 1220
},
{
"epoch": 0.9760679490382214,
"grad_norm": 1.4764432977833921,
"learning_rate": 1e-05,
"loss": 0.9367,
"step": 1221
},
{
"epoch": 0.9768673494878841,
"grad_norm": 1.3631292544649363,
"learning_rate": 1e-05,
"loss": 0.8714,
"step": 1222
},
{
"epoch": 0.9776667499375469,
"grad_norm": 1.3171008529103865,
"learning_rate": 1e-05,
"loss": 0.8285,
"step": 1223
},
{
"epoch": 0.9784661503872096,
"grad_norm": 1.4354745441705121,
"learning_rate": 1e-05,
"loss": 0.9037,
"step": 1224
},
{
"epoch": 0.9792655508368724,
"grad_norm": 1.3919378193960412,
"learning_rate": 1e-05,
"loss": 0.9309,
"step": 1225
},
{
"epoch": 0.9800649512865351,
"grad_norm": 1.4461454394492737,
"learning_rate": 1e-05,
"loss": 0.8928,
"step": 1226
},
{
"epoch": 0.9808643517361979,
"grad_norm": 1.3724038374747247,
"learning_rate": 1e-05,
"loss": 0.9014,
"step": 1227
},
{
"epoch": 0.9816637521858605,
"grad_norm": 1.351928124821094,
"learning_rate": 1e-05,
"loss": 0.8343,
"step": 1228
},
{
"epoch": 0.9824631526355233,
"grad_norm": 1.3143104444611924,
"learning_rate": 1e-05,
"loss": 0.8804,
"step": 1229
},
{
"epoch": 0.9832625530851861,
"grad_norm": 1.5074208283788533,
"learning_rate": 1e-05,
"loss": 0.8708,
"step": 1230
},
{
"epoch": 0.9840619535348488,
"grad_norm": 1.4675362219576862,
"learning_rate": 1e-05,
"loss": 0.8545,
"step": 1231
},
{
"epoch": 0.9848613539845116,
"grad_norm": 1.4044134991072301,
"learning_rate": 1e-05,
"loss": 0.852,
"step": 1232
},
{
"epoch": 0.9856607544341743,
"grad_norm": 1.4731748400546958,
"learning_rate": 1e-05,
"loss": 0.9222,
"step": 1233
},
{
"epoch": 0.9864601548838371,
"grad_norm": 1.4128661942086913,
"learning_rate": 1e-05,
"loss": 0.8997,
"step": 1234
},
{
"epoch": 0.9872595553334998,
"grad_norm": 1.4368853581391632,
"learning_rate": 1e-05,
"loss": 0.8672,
"step": 1235
},
{
"epoch": 0.9880589557831626,
"grad_norm": 1.453673257213547,
"learning_rate": 1e-05,
"loss": 0.8779,
"step": 1236
},
{
"epoch": 0.9888583562328254,
"grad_norm": 1.7470099861196207,
"learning_rate": 1e-05,
"loss": 0.9028,
"step": 1237
},
{
"epoch": 0.9896577566824881,
"grad_norm": 1.2697243063535835,
"learning_rate": 1e-05,
"loss": 0.8677,
"step": 1238
},
{
"epoch": 0.9904571571321509,
"grad_norm": 1.5282634647109214,
"learning_rate": 1e-05,
"loss": 0.8824,
"step": 1239
},
{
"epoch": 0.9912565575818136,
"grad_norm": 1.5236456464951182,
"learning_rate": 1e-05,
"loss": 0.875,
"step": 1240
},
{
"epoch": 0.9920559580314764,
"grad_norm": 1.2831857679108445,
"learning_rate": 1e-05,
"loss": 0.9118,
"step": 1241
},
{
"epoch": 0.9928553584811391,
"grad_norm": 1.4427270743757334,
"learning_rate": 1e-05,
"loss": 0.8488,
"step": 1242
},
{
"epoch": 0.9936547589308019,
"grad_norm": 1.6145144060086711,
"learning_rate": 1e-05,
"loss": 0.8594,
"step": 1243
},
{
"epoch": 0.9944541593804647,
"grad_norm": 1.5536788191330388,
"learning_rate": 1e-05,
"loss": 0.8736,
"step": 1244
},
{
"epoch": 0.9952535598301274,
"grad_norm": 1.488891430752203,
"learning_rate": 1e-05,
"loss": 0.8824,
"step": 1245
},
{
"epoch": 0.9960529602797902,
"grad_norm": 1.7670913427025423,
"learning_rate": 1e-05,
"loss": 0.8481,
"step": 1246
},
{
"epoch": 0.9968523607294529,
"grad_norm": 1.4017507511502658,
"learning_rate": 1e-05,
"loss": 0.8422,
"step": 1247
},
{
"epoch": 0.9976517611791157,
"grad_norm": 1.3372936110607956,
"learning_rate": 1e-05,
"loss": 0.842,
"step": 1248
},
{
"epoch": 0.9984511616287784,
"grad_norm": 1.3328353321262152,
"learning_rate": 1e-05,
"loss": 0.8982,
"step": 1249
},
{
"epoch": 0.9992505620784412,
"grad_norm": 1.4055115515472896,
"learning_rate": 1e-05,
"loss": 0.8433,
"step": 1250
},
{
"epoch": 0.9992505620784412,
"step": 1250,
"total_flos": 826404337876992.0,
"train_loss": 0.9163284823417663,
"train_runtime": 166824.9366,
"train_samples_per_second": 0.48,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1.0,
"max_steps": 1250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 826404337876992.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}