{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9987217724755006,
"eval_steps": 500,
"global_step": 2346,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008521516829995739,
"grad_norm": 0.6813573837280273,
"learning_rate": 5e-05,
"loss": 1.7024,
"step": 10
},
{
"epoch": 0.017043033659991477,
"grad_norm": 0.6651060581207275,
"learning_rate": 4.978595890410959e-05,
"loss": 1.4561,
"step": 20
},
{
"epoch": 0.02556455048998722,
"grad_norm": 1.1946938037872314,
"learning_rate": 4.957191780821918e-05,
"loss": 1.4456,
"step": 30
},
{
"epoch": 0.034086067319982954,
"grad_norm": 1.7951576709747314,
"learning_rate": 4.9357876712328774e-05,
"loss": 1.7177,
"step": 40
},
{
"epoch": 0.04260758414997869,
"grad_norm": 2.754934549331665,
"learning_rate": 4.914383561643836e-05,
"loss": 1.9612,
"step": 50
},
{
"epoch": 0.05112910097997444,
"grad_norm": 0.9254423975944519,
"learning_rate": 4.892979452054795e-05,
"loss": 1.3975,
"step": 60
},
{
"epoch": 0.05965061780997018,
"grad_norm": 0.9828574657440186,
"learning_rate": 4.871575342465753e-05,
"loss": 1.2984,
"step": 70
},
{
"epoch": 0.06817213463996591,
"grad_norm": 1.291460394859314,
"learning_rate": 4.850171232876712e-05,
"loss": 1.5636,
"step": 80
},
{
"epoch": 0.07669365146996165,
"grad_norm": 2.218918800354004,
"learning_rate": 4.8287671232876716e-05,
"loss": 1.765,
"step": 90
},
{
"epoch": 0.08521516829995739,
"grad_norm": 2.730905771255493,
"learning_rate": 4.8073630136986304e-05,
"loss": 1.3425,
"step": 100
},
{
"epoch": 0.09373668512995313,
"grad_norm": 0.9995436668395996,
"learning_rate": 4.785958904109589e-05,
"loss": 1.2667,
"step": 110
},
{
"epoch": 0.10225820195994888,
"grad_norm": 0.877986490726471,
"learning_rate": 4.764554794520548e-05,
"loss": 1.2679,
"step": 120
},
{
"epoch": 0.11077971878994461,
"grad_norm": 1.7854726314544678,
"learning_rate": 4.743150684931507e-05,
"loss": 1.6378,
"step": 130
},
{
"epoch": 0.11930123561994035,
"grad_norm": 2.3612940311431885,
"learning_rate": 4.7217465753424664e-05,
"loss": 1.7922,
"step": 140
},
{
"epoch": 0.1278227524499361,
"grad_norm": 2.323775053024292,
"learning_rate": 4.700342465753425e-05,
"loss": 1.2947,
"step": 150
},
{
"epoch": 0.13634426927993182,
"grad_norm": 1.2070249319076538,
"learning_rate": 4.678938356164384e-05,
"loss": 1.269,
"step": 160
},
{
"epoch": 0.14486578610992756,
"grad_norm": 1.1411925554275513,
"learning_rate": 4.657534246575342e-05,
"loss": 1.3021,
"step": 170
},
{
"epoch": 0.1533873029399233,
"grad_norm": 1.3789585828781128,
"learning_rate": 4.636130136986302e-05,
"loss": 1.3572,
"step": 180
},
{
"epoch": 0.16190881976991905,
"grad_norm": 2.145219326019287,
"learning_rate": 4.6147260273972605e-05,
"loss": 1.6292,
"step": 190
},
{
"epoch": 0.17043033659991477,
"grad_norm": 3.3627982139587402,
"learning_rate": 4.5933219178082194e-05,
"loss": 1.4358,
"step": 200
},
{
"epoch": 0.17895185342991052,
"grad_norm": 1.116105318069458,
"learning_rate": 4.571917808219178e-05,
"loss": 1.2568,
"step": 210
},
{
"epoch": 0.18747337025990626,
"grad_norm": 1.2009385824203491,
"learning_rate": 4.550513698630137e-05,
"loss": 1.1746,
"step": 220
},
{
"epoch": 0.195994887089902,
"grad_norm": 2.0525238513946533,
"learning_rate": 4.529109589041096e-05,
"loss": 1.5463,
"step": 230
},
{
"epoch": 0.20451640391989775,
"grad_norm": 2.649242639541626,
"learning_rate": 4.5077054794520553e-05,
"loss": 1.7533,
"step": 240
},
{
"epoch": 0.21303792074989347,
"grad_norm": 3.1068289279937744,
"learning_rate": 4.486301369863014e-05,
"loss": 1.3994,
"step": 250
},
{
"epoch": 0.22155943757988922,
"grad_norm": 1.2716392278671265,
"learning_rate": 4.464897260273973e-05,
"loss": 1.2382,
"step": 260
},
{
"epoch": 0.23008095440988496,
"grad_norm": 1.20390784740448,
"learning_rate": 4.443493150684932e-05,
"loss": 1.1434,
"step": 270
},
{
"epoch": 0.2386024712398807,
"grad_norm": 1.7929986715316772,
"learning_rate": 4.422089041095891e-05,
"loss": 1.5121,
"step": 280
},
{
"epoch": 0.24712398806987643,
"grad_norm": 2.645399570465088,
"learning_rate": 4.4006849315068495e-05,
"loss": 1.4523,
"step": 290
},
{
"epoch": 0.2556455048998722,
"grad_norm": 3.460209846496582,
"learning_rate": 4.379280821917808e-05,
"loss": 1.3166,
"step": 300
},
{
"epoch": 0.2641670217298679,
"grad_norm": 1.2120425701141357,
"learning_rate": 4.357876712328767e-05,
"loss": 1.2265,
"step": 310
},
{
"epoch": 0.27268853855986364,
"grad_norm": 1.2804960012435913,
"learning_rate": 4.336472602739726e-05,
"loss": 1.15,
"step": 320
},
{
"epoch": 0.2812100553898594,
"grad_norm": 2.300981044769287,
"learning_rate": 4.3150684931506855e-05,
"loss": 1.5719,
"step": 330
},
{
"epoch": 0.2897315722198551,
"grad_norm": 2.3201756477355957,
"learning_rate": 4.293664383561644e-05,
"loss": 1.5963,
"step": 340
},
{
"epoch": 0.2982530890498509,
"grad_norm": 3.4875683784484863,
"learning_rate": 4.272260273972603e-05,
"loss": 1.1126,
"step": 350
},
{
"epoch": 0.3067746058798466,
"grad_norm": 1.2687772512435913,
"learning_rate": 4.250856164383562e-05,
"loss": 1.1099,
"step": 360
},
{
"epoch": 0.31529612270984236,
"grad_norm": 1.367256760597229,
"learning_rate": 4.229452054794521e-05,
"loss": 1.1648,
"step": 370
},
{
"epoch": 0.3238176395398381,
"grad_norm": 1.5560370683670044,
"learning_rate": 4.2080479452054796e-05,
"loss": 1.3107,
"step": 380
},
{
"epoch": 0.33233915636983385,
"grad_norm": 2.6812076568603516,
"learning_rate": 4.1866438356164385e-05,
"loss": 1.4966,
"step": 390
},
{
"epoch": 0.34086067319982954,
"grad_norm": 4.098489284515381,
"learning_rate": 4.165239726027397e-05,
"loss": 1.2831,
"step": 400
},
{
"epoch": 0.3493821900298253,
"grad_norm": 1.2586500644683838,
"learning_rate": 4.143835616438356e-05,
"loss": 1.152,
"step": 410
},
{
"epoch": 0.35790370685982104,
"grad_norm": 1.3851690292358398,
"learning_rate": 4.122431506849315e-05,
"loss": 1.2035,
"step": 420
},
{
"epoch": 0.3664252236898168,
"grad_norm": 1.9357365369796753,
"learning_rate": 4.1010273972602745e-05,
"loss": 1.4145,
"step": 430
},
{
"epoch": 0.3749467405198125,
"grad_norm": 2.84523344039917,
"learning_rate": 4.079623287671233e-05,
"loss": 1.5173,
"step": 440
},
{
"epoch": 0.3834682573498083,
"grad_norm": 2.4506237506866455,
"learning_rate": 4.058219178082192e-05,
"loss": 1.1763,
"step": 450
},
{
"epoch": 0.391989774179804,
"grad_norm": 1.1524243354797363,
"learning_rate": 4.036815068493151e-05,
"loss": 1.316,
"step": 460
},
{
"epoch": 0.40051129100979976,
"grad_norm": 1.289616346359253,
"learning_rate": 4.01541095890411e-05,
"loss": 1.102,
"step": 470
},
{
"epoch": 0.4090328078397955,
"grad_norm": 1.482630968093872,
"learning_rate": 3.9940068493150686e-05,
"loss": 1.3212,
"step": 480
},
{
"epoch": 0.4175543246697912,
"grad_norm": 2.3927276134490967,
"learning_rate": 3.9726027397260274e-05,
"loss": 1.6163,
"step": 490
},
{
"epoch": 0.42607584149978694,
"grad_norm": 3.26775860786438,
"learning_rate": 3.951198630136986e-05,
"loss": 1.1924,
"step": 500
},
{
"epoch": 0.4345973583297827,
"grad_norm": 1.2278869152069092,
"learning_rate": 3.929794520547945e-05,
"loss": 1.1865,
"step": 510
},
{
"epoch": 0.44311887515977844,
"grad_norm": 1.3176724910736084,
"learning_rate": 3.908390410958904e-05,
"loss": 1.2366,
"step": 520
},
{
"epoch": 0.4516403919897742,
"grad_norm": 4.769649505615234,
"learning_rate": 3.8869863013698634e-05,
"loss": 1.4883,
"step": 530
},
{
"epoch": 0.4601619088197699,
"grad_norm": 3.0388917922973633,
"learning_rate": 3.865582191780822e-05,
"loss": 1.4851,
"step": 540
},
{
"epoch": 0.4686834256497657,
"grad_norm": 3.100656270980835,
"learning_rate": 3.844178082191781e-05,
"loss": 1.0437,
"step": 550
},
{
"epoch": 0.4772049424797614,
"grad_norm": 3.1867423057556152,
"learning_rate": 3.82277397260274e-05,
"loss": 1.1181,
"step": 560
},
{
"epoch": 0.48572645930975716,
"grad_norm": 1.2426332235336304,
"learning_rate": 3.801369863013699e-05,
"loss": 1.1024,
"step": 570
},
{
"epoch": 0.49424797613975285,
"grad_norm": 1.2760354280471802,
"learning_rate": 3.779965753424658e-05,
"loss": 1.1686,
"step": 580
},
{
"epoch": 0.5027694929697486,
"grad_norm": 2.6802961826324463,
"learning_rate": 3.7585616438356164e-05,
"loss": 1.4582,
"step": 590
},
{
"epoch": 0.5112910097997444,
"grad_norm": 3.5295393466949463,
"learning_rate": 3.737157534246575e-05,
"loss": 1.1565,
"step": 600
},
{
"epoch": 0.5198125266297401,
"grad_norm": 1.3644486665725708,
"learning_rate": 3.715753424657534e-05,
"loss": 1.0841,
"step": 610
},
{
"epoch": 0.5283340434597358,
"grad_norm": 1.268506407737732,
"learning_rate": 3.6943493150684936e-05,
"loss": 1.0859,
"step": 620
},
{
"epoch": 0.5368555602897316,
"grad_norm": 2.1879653930664062,
"learning_rate": 3.6729452054794524e-05,
"loss": 1.415,
"step": 630
},
{
"epoch": 0.5453770771197273,
"grad_norm": 2.482619047164917,
"learning_rate": 3.651541095890411e-05,
"loss": 1.5743,
"step": 640
},
{
"epoch": 0.5538985939497231,
"grad_norm": 3.3282759189605713,
"learning_rate": 3.63013698630137e-05,
"loss": 1.203,
"step": 650
},
{
"epoch": 0.5624201107797188,
"grad_norm": 1.288682460784912,
"learning_rate": 3.608732876712329e-05,
"loss": 1.1415,
"step": 660
},
{
"epoch": 0.5709416276097146,
"grad_norm": 1.232367753982544,
"learning_rate": 3.587328767123288e-05,
"loss": 1.0992,
"step": 670
},
{
"epoch": 0.5794631444397103,
"grad_norm": 2.4475719928741455,
"learning_rate": 3.565924657534247e-05,
"loss": 1.6888,
"step": 680
},
{
"epoch": 0.587984661269706,
"grad_norm": 2.75734543800354,
"learning_rate": 3.5445205479452054e-05,
"loss": 1.479,
"step": 690
},
{
"epoch": 0.5965061780997017,
"grad_norm": 3.1947202682495117,
"learning_rate": 3.523116438356164e-05,
"loss": 1.0327,
"step": 700
},
{
"epoch": 0.6050276949296974,
"grad_norm": 1.2636553049087524,
"learning_rate": 3.501712328767123e-05,
"loss": 1.0756,
"step": 710
},
{
"epoch": 0.6135492117596932,
"grad_norm": 1.2693008184432983,
"learning_rate": 3.4803082191780825e-05,
"loss": 1.2125,
"step": 720
},
{
"epoch": 0.6220707285896889,
"grad_norm": 2.964484691619873,
"learning_rate": 3.4589041095890414e-05,
"loss": 1.3954,
"step": 730
},
{
"epoch": 0.6305922454196847,
"grad_norm": 2.6809933185577393,
"learning_rate": 3.4375e-05,
"loss": 1.3599,
"step": 740
},
{
"epoch": 0.6391137622496804,
"grad_norm": 3.059370994567871,
"learning_rate": 3.416095890410959e-05,
"loss": 1.2158,
"step": 750
},
{
"epoch": 0.6476352790796762,
"grad_norm": 1.3807661533355713,
"learning_rate": 3.394691780821918e-05,
"loss": 1.0973,
"step": 760
},
{
"epoch": 0.6561567959096719,
"grad_norm": 1.525608777999878,
"learning_rate": 3.373287671232877e-05,
"loss": 1.1659,
"step": 770
},
{
"epoch": 0.6646783127396677,
"grad_norm": 2.267289161682129,
"learning_rate": 3.351883561643836e-05,
"loss": 1.6013,
"step": 780
},
{
"epoch": 0.6731998295696634,
"grad_norm": 2.4192111492156982,
"learning_rate": 3.330479452054795e-05,
"loss": 1.5129,
"step": 790
},
{
"epoch": 0.6817213463996591,
"grad_norm": 4.045669078826904,
"learning_rate": 3.309075342465753e-05,
"loss": 1.1527,
"step": 800
},
{
"epoch": 0.6902428632296549,
"grad_norm": 1.4586423635482788,
"learning_rate": 3.287671232876712e-05,
"loss": 1.0746,
"step": 810
},
{
"epoch": 0.6987643800596506,
"grad_norm": 1.3621727228164673,
"learning_rate": 3.2662671232876715e-05,
"loss": 1.0083,
"step": 820
},
{
"epoch": 0.7072858968896464,
"grad_norm": 2.002037763595581,
"learning_rate": 3.2448630136986303e-05,
"loss": 1.4296,
"step": 830
},
{
"epoch": 0.7158074137196421,
"grad_norm": 24.786273956298828,
"learning_rate": 3.223458904109589e-05,
"loss": 1.5759,
"step": 840
},
{
"epoch": 0.7243289305496379,
"grad_norm": 3.176041841506958,
"learning_rate": 3.202054794520548e-05,
"loss": 1.34,
"step": 850
},
{
"epoch": 0.7328504473796336,
"grad_norm": 1.4174227714538574,
"learning_rate": 3.180650684931507e-05,
"loss": 1.0625,
"step": 860
},
{
"epoch": 0.7413719642096294,
"grad_norm": 1.2621195316314697,
"learning_rate": 3.1592465753424663e-05,
"loss": 1.0369,
"step": 870
},
{
"epoch": 0.749893481039625,
"grad_norm": 2.297351360321045,
"learning_rate": 3.137842465753425e-05,
"loss": 1.4817,
"step": 880
},
{
"epoch": 0.7584149978696207,
"grad_norm": 2.9399614334106445,
"learning_rate": 3.116438356164384e-05,
"loss": 1.4127,
"step": 890
},
{
"epoch": 0.7669365146996165,
"grad_norm": 3.5914995670318604,
"learning_rate": 3.095034246575342e-05,
"loss": 1.0546,
"step": 900
},
{
"epoch": 0.7754580315296122,
"grad_norm": 1.2652535438537598,
"learning_rate": 3.073630136986301e-05,
"loss": 1.1101,
"step": 910
},
{
"epoch": 0.783979548359608,
"grad_norm": 1.644640564918518,
"learning_rate": 3.0522260273972605e-05,
"loss": 1.1791,
"step": 920
},
{
"epoch": 0.7925010651896037,
"grad_norm": 2.482304096221924,
"learning_rate": 3.0308219178082193e-05,
"loss": 1.512,
"step": 930
},
{
"epoch": 0.8010225820195995,
"grad_norm": 2.763471841812134,
"learning_rate": 3.009417808219178e-05,
"loss": 1.477,
"step": 940
},
{
"epoch": 0.8095440988495952,
"grad_norm": 3.675570011138916,
"learning_rate": 2.988013698630137e-05,
"loss": 1.1064,
"step": 950
},
{
"epoch": 0.818065615679591,
"grad_norm": 1.302946925163269,
"learning_rate": 2.966609589041096e-05,
"loss": 1.0189,
"step": 960
},
{
"epoch": 0.8265871325095867,
"grad_norm": 1.492253303527832,
"learning_rate": 2.945205479452055e-05,
"loss": 1.0817,
"step": 970
},
{
"epoch": 0.8351086493395824,
"grad_norm": 3.072866439819336,
"learning_rate": 2.923801369863014e-05,
"loss": 1.4999,
"step": 980
},
{
"epoch": 0.8436301661695782,
"grad_norm": 2.7219836711883545,
"learning_rate": 2.902397260273973e-05,
"loss": 1.4183,
"step": 990
},
{
"epoch": 0.8521516829995739,
"grad_norm": 3.425384044647217,
"learning_rate": 2.8809931506849318e-05,
"loss": 1.0067,
"step": 1000
},
{
"epoch": 0.8606731998295697,
"grad_norm": 1.6666113138198853,
"learning_rate": 2.8595890410958903e-05,
"loss": 1.1709,
"step": 1010
},
{
"epoch": 0.8691947166595654,
"grad_norm": 1.3914527893066406,
"learning_rate": 2.838184931506849e-05,
"loss": 1.0248,
"step": 1020
},
{
"epoch": 0.8777162334895612,
"grad_norm": 2.082874298095703,
"learning_rate": 2.8167808219178083e-05,
"loss": 1.2266,
"step": 1030
},
{
"epoch": 0.8862377503195569,
"grad_norm": 2.5057151317596436,
"learning_rate": 2.795376712328767e-05,
"loss": 1.4517,
"step": 1040
},
{
"epoch": 0.8947592671495527,
"grad_norm": 3.3986401557922363,
"learning_rate": 2.7739726027397263e-05,
"loss": 1.1483,
"step": 1050
},
{
"epoch": 0.9032807839795484,
"grad_norm": 1.3847638368606567,
"learning_rate": 2.752568493150685e-05,
"loss": 1.0509,
"step": 1060
},
{
"epoch": 0.911802300809544,
"grad_norm": 1.515759825706482,
"learning_rate": 2.731164383561644e-05,
"loss": 1.1062,
"step": 1070
},
{
"epoch": 0.9203238176395399,
"grad_norm": 1.9089744091033936,
"learning_rate": 2.709760273972603e-05,
"loss": 1.2536,
"step": 1080
},
{
"epoch": 0.9288453344695355,
"grad_norm": 3.3122973442077637,
"learning_rate": 2.688356164383562e-05,
"loss": 1.6613,
"step": 1090
},
{
"epoch": 0.9373668512995313,
"grad_norm": 4.078606128692627,
"learning_rate": 2.6669520547945208e-05,
"loss": 1.1644,
"step": 1100
},
{
"epoch": 0.945888368129527,
"grad_norm": 1.3247941732406616,
"learning_rate": 2.6455479452054793e-05,
"loss": 0.999,
"step": 1110
},
{
"epoch": 0.9544098849595228,
"grad_norm": 1.617491364479065,
"learning_rate": 2.6241438356164384e-05,
"loss": 1.0226,
"step": 1120
},
{
"epoch": 0.9629314017895185,
"grad_norm": 2.2223851680755615,
"learning_rate": 2.6027397260273973e-05,
"loss": 1.5095,
"step": 1130
},
{
"epoch": 0.9714529186195143,
"grad_norm": 2.7325618267059326,
"learning_rate": 2.581335616438356e-05,
"loss": 1.5794,
"step": 1140
},
{
"epoch": 0.97997443544951,
"grad_norm": 3.270749568939209,
"learning_rate": 2.5599315068493153e-05,
"loss": 1.2044,
"step": 1150
},
{
"epoch": 0.9884959522795057,
"grad_norm": 1.6444586515426636,
"learning_rate": 2.538527397260274e-05,
"loss": 1.136,
"step": 1160
},
{
"epoch": 0.9970174691095015,
"grad_norm": 3.6478381156921387,
"learning_rate": 2.517123287671233e-05,
"loss": 1.3373,
"step": 1170
},
{
"epoch": 1.0051129100979974,
"grad_norm": 1.398633599281311,
"learning_rate": 2.495719178082192e-05,
"loss": 0.9056,
"step": 1180
},
{
"epoch": 1.013634426927993,
"grad_norm": 1.3859210014343262,
"learning_rate": 2.4743150684931506e-05,
"loss": 0.9212,
"step": 1190
},
{
"epoch": 1.022155943757989,
"grad_norm": 2.0104875564575195,
"learning_rate": 2.4529109589041097e-05,
"loss": 1.0748,
"step": 1200
},
{
"epoch": 1.0306774605879847,
"grad_norm": 3.378269672393799,
"learning_rate": 2.4315068493150686e-05,
"loss": 1.1882,
"step": 1210
},
{
"epoch": 1.0391989774179804,
"grad_norm": 3.0901708602905273,
"learning_rate": 2.4101027397260274e-05,
"loss": 0.8874,
"step": 1220
},
{
"epoch": 1.047720494247976,
"grad_norm": 1.6910735368728638,
"learning_rate": 2.3886986301369866e-05,
"loss": 0.888,
"step": 1230
},
{
"epoch": 1.0562420110779718,
"grad_norm": 1.5407036542892456,
"learning_rate": 2.367294520547945e-05,
"loss": 0.9438,
"step": 1240
},
{
"epoch": 1.0647635279079677,
"grad_norm": 1.9005461931228638,
"learning_rate": 2.3458904109589042e-05,
"loss": 0.9209,
"step": 1250
},
{
"epoch": 1.0732850447379634,
"grad_norm": 2.394400119781494,
"learning_rate": 2.324486301369863e-05,
"loss": 1.2207,
"step": 1260
},
{
"epoch": 1.081806561567959,
"grad_norm": 2.9217231273651123,
"learning_rate": 2.3030821917808222e-05,
"loss": 1.0554,
"step": 1270
},
{
"epoch": 1.0903280783979548,
"grad_norm": 1.6549851894378662,
"learning_rate": 2.281678082191781e-05,
"loss": 0.9341,
"step": 1280
},
{
"epoch": 1.0988495952279507,
"grad_norm": 1.7708053588867188,
"learning_rate": 2.2602739726027396e-05,
"loss": 0.9768,
"step": 1290
},
{
"epoch": 1.1073711120579464,
"grad_norm": 1.953326940536499,
"learning_rate": 2.2388698630136987e-05,
"loss": 0.9628,
"step": 1300
},
{
"epoch": 1.115892628887942,
"grad_norm": 3.4446678161621094,
"learning_rate": 2.2174657534246575e-05,
"loss": 1.2085,
"step": 1310
},
{
"epoch": 1.1244141457179377,
"grad_norm": 3.5323126316070557,
"learning_rate": 2.1960616438356167e-05,
"loss": 1.124,
"step": 1320
},
{
"epoch": 1.1329356625479337,
"grad_norm": 1.861324429512024,
"learning_rate": 2.1746575342465755e-05,
"loss": 0.8991,
"step": 1330
},
{
"epoch": 1.1414571793779293,
"grad_norm": 1.6703088283538818,
"learning_rate": 2.1532534246575344e-05,
"loss": 0.9776,
"step": 1340
},
{
"epoch": 1.149978696207925,
"grad_norm": 1.834876537322998,
"learning_rate": 2.1318493150684932e-05,
"loss": 0.9773,
"step": 1350
},
{
"epoch": 1.1585002130379207,
"grad_norm": 3.600705862045288,
"learning_rate": 2.110445205479452e-05,
"loss": 1.2392,
"step": 1360
},
{
"epoch": 1.1670217298679164,
"grad_norm": 3.9731180667877197,
"learning_rate": 2.0890410958904112e-05,
"loss": 0.9947,
"step": 1370
},
{
"epoch": 1.1755432466979123,
"grad_norm": 2.0116519927978516,
"learning_rate": 2.06763698630137e-05,
"loss": 0.9025,
"step": 1380
},
{
"epoch": 1.184064763527908,
"grad_norm": 1.8098217248916626,
"learning_rate": 2.046232876712329e-05,
"loss": 0.8956,
"step": 1390
},
{
"epoch": 1.1925862803579037,
"grad_norm": 2.4669318199157715,
"learning_rate": 2.0248287671232877e-05,
"loss": 1.0979,
"step": 1400
},
{
"epoch": 1.2011077971878994,
"grad_norm": 3.1824028491973877,
"learning_rate": 2.0034246575342465e-05,
"loss": 1.2918,
"step": 1410
},
{
"epoch": 1.2096293140178953,
"grad_norm": 3.60017991065979,
"learning_rate": 1.9820205479452057e-05,
"loss": 0.9279,
"step": 1420
},
{
"epoch": 1.218150830847891,
"grad_norm": 6.929809093475342,
"learning_rate": 1.9606164383561645e-05,
"loss": 0.8606,
"step": 1430
},
{
"epoch": 1.2266723476778867,
"grad_norm": 2.342393636703491,
"learning_rate": 1.9392123287671233e-05,
"loss": 0.963,
"step": 1440
},
{
"epoch": 1.2351938645078824,
"grad_norm": 2.6814820766448975,
"learning_rate": 1.9178082191780822e-05,
"loss": 1.0806,
"step": 1450
},
{
"epoch": 1.243715381337878,
"grad_norm": 4.10792350769043,
"learning_rate": 1.896404109589041e-05,
"loss": 1.1623,
"step": 1460
},
{
"epoch": 1.2522368981678738,
"grad_norm": 3.0713534355163574,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.8909,
"step": 1470
},
{
"epoch": 1.2607584149978697,
"grad_norm": 1.9831465482711792,
"learning_rate": 1.853595890410959e-05,
"loss": 0.8631,
"step": 1480
},
{
"epoch": 1.2692799318278654,
"grad_norm": 1.8906506299972534,
"learning_rate": 1.832191780821918e-05,
"loss": 0.9484,
"step": 1490
},
{
"epoch": 1.277801448657861,
"grad_norm": 2.1991984844207764,
"learning_rate": 1.8107876712328767e-05,
"loss": 1.0584,
"step": 1500
},
{
"epoch": 1.286322965487857,
"grad_norm": 3.851630449295044,
"learning_rate": 1.7893835616438355e-05,
"loss": 1.3179,
"step": 1510
},
{
"epoch": 1.2948444823178527,
"grad_norm": 4.507850646972656,
"learning_rate": 1.7679794520547947e-05,
"loss": 1.0782,
"step": 1520
},
{
"epoch": 1.3033659991478483,
"grad_norm": 1.7349963188171387,
"learning_rate": 1.7465753424657535e-05,
"loss": 0.8217,
"step": 1530
},
{
"epoch": 1.311887515977844,
"grad_norm": 1.8865768909454346,
"learning_rate": 1.7251712328767127e-05,
"loss": 0.9012,
"step": 1540
},
{
"epoch": 1.3204090328078397,
"grad_norm": 1.9257206916809082,
"learning_rate": 1.703767123287671e-05,
"loss": 0.8876,
"step": 1550
},
{
"epoch": 1.3289305496378354,
"grad_norm": 3.2287306785583496,
"learning_rate": 1.68236301369863e-05,
"loss": 1.2605,
"step": 1560
},
{
"epoch": 1.3374520664678313,
"grad_norm": 3.4038004875183105,
"learning_rate": 1.660958904109589e-05,
"loss": 0.9881,
"step": 1570
},
{
"epoch": 1.345973583297827,
"grad_norm": 1.9077774286270142,
"learning_rate": 1.639554794520548e-05,
"loss": 0.913,
"step": 1580
},
{
"epoch": 1.3544951001278227,
"grad_norm": 2.548600196838379,
"learning_rate": 1.618150684931507e-05,
"loss": 0.9404,
"step": 1590
},
{
"epoch": 1.3630166169578186,
"grad_norm": 2.9931490421295166,
"learning_rate": 1.596746575342466e-05,
"loss": 1.0795,
"step": 1600
},
{
"epoch": 1.3715381337878143,
"grad_norm": 4.5465216636657715,
"learning_rate": 1.5753424657534248e-05,
"loss": 1.2289,
"step": 1610
},
{
"epoch": 1.38005965061781,
"grad_norm": 4.3074631690979,
"learning_rate": 1.5539383561643836e-05,
"loss": 0.8165,
"step": 1620
},
{
"epoch": 1.3885811674478057,
"grad_norm": 1.9459553956985474,
"learning_rate": 1.5325342465753425e-05,
"loss": 0.8087,
"step": 1630
},
{
"epoch": 1.3971026842778014,
"grad_norm": 1.8323142528533936,
"learning_rate": 1.5111301369863015e-05,
"loss": 0.8715,
"step": 1640
},
{
"epoch": 1.405624201107797,
"grad_norm": 2.3462753295898438,
"learning_rate": 1.4897260273972605e-05,
"loss": 0.8655,
"step": 1650
},
{
"epoch": 1.414145717937793,
"grad_norm": 3.0974302291870117,
"learning_rate": 1.4683219178082191e-05,
"loss": 1.1704,
"step": 1660
},
{
"epoch": 1.4226672347677887,
"grad_norm": 3.1823673248291016,
"learning_rate": 1.4469178082191781e-05,
"loss": 0.9444,
"step": 1670
},
{
"epoch": 1.4311887515977844,
"grad_norm": 2.3283894062042236,
"learning_rate": 1.4255136986301371e-05,
"loss": 0.8847,
"step": 1680
},
{
"epoch": 1.4397102684277803,
"grad_norm": 1.8633939027786255,
"learning_rate": 1.404109589041096e-05,
"loss": 0.8843,
"step": 1690
},
{
"epoch": 1.448231785257776,
"grad_norm": 2.870725393295288,
"learning_rate": 1.382705479452055e-05,
"loss": 0.9102,
"step": 1700
},
{
"epoch": 1.4567533020877717,
"grad_norm": 3.6491854190826416,
"learning_rate": 1.3613013698630136e-05,
"loss": 1.1743,
"step": 1710
},
{
"epoch": 1.4652748189177673,
"grad_norm": 4.350872039794922,
"learning_rate": 1.3398972602739726e-05,
"loss": 0.9763,
"step": 1720
},
{
"epoch": 1.473796335747763,
"grad_norm": 2.080191135406494,
"learning_rate": 1.3184931506849316e-05,
"loss": 0.8528,
"step": 1730
},
{
"epoch": 1.4823178525777587,
"grad_norm": 2.2544877529144287,
"learning_rate": 1.2970890410958906e-05,
"loss": 0.8987,
"step": 1740
},
{
"epoch": 1.4908393694077546,
"grad_norm": 2.3497424125671387,
"learning_rate": 1.2756849315068494e-05,
"loss": 1.0171,
"step": 1750
},
{
"epoch": 1.4993608862377503,
"grad_norm": 3.6866824626922607,
"learning_rate": 1.2542808219178081e-05,
"loss": 1.1774,
"step": 1760
},
{
"epoch": 1.507882403067746,
"grad_norm": 4.5758891105651855,
"learning_rate": 1.2328767123287671e-05,
"loss": 0.9464,
"step": 1770
},
{
"epoch": 1.516403919897742,
"grad_norm": 2.159677743911743,
"learning_rate": 1.2114726027397261e-05,
"loss": 0.8089,
"step": 1780
},
{
"epoch": 1.5249254367277376,
"grad_norm": 2.345613479614258,
"learning_rate": 1.1900684931506851e-05,
"loss": 0.9699,
"step": 1790
},
{
"epoch": 1.5334469535577333,
"grad_norm": 2.187382936477661,
"learning_rate": 1.168664383561644e-05,
"loss": 1.0533,
"step": 1800
},
{
"epoch": 1.541968470387729,
"grad_norm": 6.11014986038208,
"learning_rate": 1.1472602739726027e-05,
"loss": 1.3963,
"step": 1810
},
{
"epoch": 1.5504899872177247,
"grad_norm": 4.699113368988037,
"learning_rate": 1.1258561643835617e-05,
"loss": 0.8968,
"step": 1820
},
{
"epoch": 1.5590115040477204,
"grad_norm": 1.8584600687026978,
"learning_rate": 1.1044520547945206e-05,
"loss": 0.8661,
"step": 1830
},
{
"epoch": 1.5675330208777163,
"grad_norm": 2.1716010570526123,
"learning_rate": 1.0830479452054796e-05,
"loss": 0.9446,
"step": 1840
},
{
"epoch": 1.576054537707712,
"grad_norm": 2.649498701095581,
"learning_rate": 1.0616438356164384e-05,
"loss": 0.9392,
"step": 1850
},
{
"epoch": 1.5845760545377077,
"grad_norm": 5.051261901855469,
"learning_rate": 1.0402397260273972e-05,
"loss": 1.4026,
"step": 1860
},
{
"epoch": 1.5930975713677036,
"grad_norm": 4.0868425369262695,
"learning_rate": 1.0188356164383562e-05,
"loss": 1.0371,
"step": 1870
},
{
"epoch": 1.6016190881976993,
"grad_norm": 2.242595672607422,
"learning_rate": 9.97431506849315e-06,
"loss": 0.7793,
"step": 1880
},
{
"epoch": 1.610140605027695,
"grad_norm": 2.0395429134368896,
"learning_rate": 9.76027397260274e-06,
"loss": 0.9659,
"step": 1890
},
{
"epoch": 1.6186621218576907,
"grad_norm": 3.6967060565948486,
"learning_rate": 9.54623287671233e-06,
"loss": 1.3609,
"step": 1900
},
{
"epoch": 1.6271836386876863,
"grad_norm": 3.8495938777923584,
"learning_rate": 9.332191780821919e-06,
"loss": 0.9838,
"step": 1910
},
{
"epoch": 1.635705155517682,
"grad_norm": 4.630374431610107,
"learning_rate": 9.118150684931507e-06,
"loss": 0.9471,
"step": 1920
},
{
"epoch": 1.6442266723476777,
"grad_norm": 2.2313318252563477,
"learning_rate": 8.904109589041095e-06,
"loss": 0.8377,
"step": 1930
},
{
"epoch": 1.6527481891776736,
"grad_norm": 2.623538017272949,
"learning_rate": 8.690068493150685e-06,
"loss": 0.9168,
"step": 1940
},
{
"epoch": 1.6612697060076693,
"grad_norm": 2.4369919300079346,
"learning_rate": 8.476027397260275e-06,
"loss": 1.1145,
"step": 1950
},
{
"epoch": 1.6697912228376652,
"grad_norm": 3.5387771129608154,
"learning_rate": 8.261986301369864e-06,
"loss": 1.2709,
"step": 1960
},
{
"epoch": 1.678312739667661,
"grad_norm": 3.519103527069092,
"learning_rate": 8.047945205479452e-06,
"loss": 0.8001,
"step": 1970
},
{
"epoch": 1.6868342564976566,
"grad_norm": 2.0191633701324463,
"learning_rate": 7.83390410958904e-06,
"loss": 0.9001,
"step": 1980
},
{
"epoch": 1.6953557733276523,
"grad_norm": 2.1604294776916504,
"learning_rate": 7.61986301369863e-06,
"loss": 0.8404,
"step": 1990
},
{
"epoch": 1.703877290157648,
"grad_norm": 2.4887239933013916,
"learning_rate": 7.40582191780822e-06,
"loss": 1.0109,
"step": 2000
},
{
"epoch": 1.7123988069876437,
"grad_norm": 3.2208356857299805,
"learning_rate": 7.191780821917809e-06,
"loss": 1.135,
"step": 2010
},
{
"epoch": 1.7209203238176394,
"grad_norm": 3.4077229499816895,
"learning_rate": 6.977739726027398e-06,
"loss": 1.0031,
"step": 2020
},
{
"epoch": 1.7294418406476353,
"grad_norm": 1.9562416076660156,
"learning_rate": 6.763698630136987e-06,
"loss": 0.8598,
"step": 2030
},
{
"epoch": 1.737963357477631,
"grad_norm": 2.2411820888519287,
"learning_rate": 6.549657534246575e-06,
"loss": 0.8489,
"step": 2040
},
{
"epoch": 1.7464848743076269,
"grad_norm": 3.026580333709717,
"learning_rate": 6.335616438356165e-06,
"loss": 1.0326,
"step": 2050
},
{
"epoch": 1.7550063911376226,
"grad_norm": 4.257705211639404,
"learning_rate": 6.121575342465754e-06,
"loss": 1.2792,
"step": 2060
},
{
"epoch": 1.7635279079676183,
"grad_norm": 2.8991031646728516,
"learning_rate": 5.907534246575343e-06,
"loss": 0.9655,
"step": 2070
},
{
"epoch": 1.772049424797614,
"grad_norm": 2.1200027465820312,
"learning_rate": 5.693493150684932e-06,
"loss": 0.9003,
"step": 2080
},
{
"epoch": 1.7805709416276096,
"grad_norm": 2.8092291355133057,
"learning_rate": 5.479452054794521e-06,
"loss": 0.9486,
"step": 2090
},
{
"epoch": 1.7890924584576053,
"grad_norm": 2.481590747833252,
"learning_rate": 5.26541095890411e-06,
"loss": 0.8927,
"step": 2100
},
{
"epoch": 1.797613975287601,
"grad_norm": 4.080635070800781,
"learning_rate": 5.051369863013699e-06,
"loss": 1.0988,
"step": 2110
},
{
"epoch": 1.806135492117597,
"grad_norm": 5.39976692199707,
"learning_rate": 4.8373287671232874e-06,
"loss": 0.923,
"step": 2120
},
{
"epoch": 1.8146570089475926,
"grad_norm": 2.044147253036499,
"learning_rate": 4.623287671232877e-06,
"loss": 0.914,
"step": 2130
},
{
"epoch": 1.8231785257775885,
"grad_norm": 2.0422821044921875,
"learning_rate": 4.4092465753424666e-06,
"loss": 0.8518,
"step": 2140
},
{
"epoch": 1.8317000426075842,
"grad_norm": 2.9914565086364746,
"learning_rate": 4.195205479452055e-06,
"loss": 1.1411,
"step": 2150
},
{
"epoch": 1.84022155943758,
"grad_norm": 3.7009713649749756,
"learning_rate": 3.981164383561644e-06,
"loss": 1.1454,
"step": 2160
},
{
"epoch": 1.8487430762675756,
"grad_norm": 5.263967514038086,
"learning_rate": 3.7671232876712327e-06,
"loss": 0.8432,
"step": 2170
},
{
"epoch": 1.8572645930975713,
"grad_norm": 2.337414026260376,
"learning_rate": 3.5530821917808223e-06,
"loss": 0.8604,
"step": 2180
},
{
"epoch": 1.865786109927567,
"grad_norm": 2.2294089794158936,
"learning_rate": 3.3390410958904114e-06,
"loss": 0.933,
"step": 2190
},
{
"epoch": 1.8743076267575627,
"grad_norm": 2.332831621170044,
"learning_rate": 3.125e-06,
"loss": 0.9133,
"step": 2200
},
{
"epoch": 1.8828291435875586,
"grad_norm": 3.756347179412842,
"learning_rate": 2.910958904109589e-06,
"loss": 1.3043,
"step": 2210
},
{
"epoch": 1.8913506604175543,
"grad_norm": 4.380275249481201,
"learning_rate": 2.6969178082191784e-06,
"loss": 0.9552,
"step": 2220
},
{
"epoch": 1.8998721772475502,
"grad_norm": 2.2383973598480225,
"learning_rate": 2.482876712328767e-06,
"loss": 0.8783,
"step": 2230
},
{
"epoch": 1.9083936940775459,
"grad_norm": 2.0851542949676514,
"learning_rate": 2.2688356164383563e-06,
"loss": 0.8612,
"step": 2240
},
{
"epoch": 1.9169152109075416,
"grad_norm": 2.3459975719451904,
"learning_rate": 2.054794520547945e-06,
"loss": 0.9008,
"step": 2250
},
{
"epoch": 1.9254367277375373,
"grad_norm": 4.169739246368408,
"learning_rate": 1.8407534246575344e-06,
"loss": 1.2114,
"step": 2260
},
{
"epoch": 1.933958244567533,
"grad_norm": 4.250594615936279,
"learning_rate": 1.6267123287671233e-06,
"loss": 1.0262,
"step": 2270
},
{
"epoch": 1.9424797613975286,
"grad_norm": 2.187901020050049,
"learning_rate": 1.4126712328767122e-06,
"loss": 0.8497,
"step": 2280
},
{
"epoch": 1.9510012782275243,
"grad_norm": 2.139758348464966,
"learning_rate": 1.1986301369863014e-06,
"loss": 0.8855,
"step": 2290
},
{
"epoch": 1.9595227950575203,
"grad_norm": 3.2545394897460938,
"learning_rate": 9.845890410958905e-07,
"loss": 1.0541,
"step": 2300
},
{
"epoch": 1.968044311887516,
"grad_norm": 4.898044586181641,
"learning_rate": 7.705479452054794e-07,
"loss": 1.1848,
"step": 2310
},
{
"epoch": 1.9765658287175119,
"grad_norm": 5.535640716552734,
"learning_rate": 5.565068493150685e-07,
"loss": 0.8679,
"step": 2320
},
{
"epoch": 1.9850873455475075,
"grad_norm": 1.8863285779953003,
"learning_rate": 3.4246575342465755e-07,
"loss": 0.7762,
"step": 2330
},
{
"epoch": 1.9936088623775032,
"grad_norm": 4.670785427093506,
"learning_rate": 1.2842465753424656e-07,
"loss": 0.9951,
"step": 2340
}
],
"logging_steps": 10,
"max_steps": 2346,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4355734448273408e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}