Qwen-7B-v2-16384r / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.366013071895425,
"eval_steps": 10.0,
"global_step": 80,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.10457516339869281,
"grad_norm": 6.079821586608887,
"learning_rate": 5.000000000000001e-07,
"loss": 0.5536,
"step": 1
},
{
"epoch": 0.20915032679738563,
"grad_norm": 6.397408485412598,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.5528,
"step": 2
},
{
"epoch": 0.3137254901960784,
"grad_norm": 5.950789451599121,
"learning_rate": 1.5e-06,
"loss": 0.5438,
"step": 3
},
{
"epoch": 0.41830065359477125,
"grad_norm": 5.743692874908447,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.5351,
"step": 4
},
{
"epoch": 0.5228758169934641,
"grad_norm": 3.9552488327026367,
"learning_rate": 2.5e-06,
"loss": 0.5425,
"step": 5
},
{
"epoch": 0.6274509803921569,
"grad_norm": 5.907255172729492,
"learning_rate": 3e-06,
"loss": 0.5432,
"step": 6
},
{
"epoch": 0.7320261437908496,
"grad_norm": 6.004484176635742,
"learning_rate": 3.5e-06,
"loss": 0.544,
"step": 7
},
{
"epoch": 0.8366013071895425,
"grad_norm": 5.635789394378662,
"learning_rate": 4.000000000000001e-06,
"loss": 0.5056,
"step": 8
},
{
"epoch": 0.9411764705882353,
"grad_norm": 2.8101253509521484,
"learning_rate": 4.5e-06,
"loss": 0.4968,
"step": 9
},
{
"epoch": 1.0457516339869282,
"grad_norm": 2.6935434341430664,
"learning_rate": 5e-06,
"loss": 0.4755,
"step": 10
},
{
"epoch": 1.1503267973856208,
"grad_norm": 3.1984055042266846,
"learning_rate": 4.99847706754774e-06,
"loss": 0.4559,
"step": 11
},
{
"epoch": 1.2549019607843137,
"grad_norm": 3.488842248916626,
"learning_rate": 4.993910125649561e-06,
"loss": 0.4511,
"step": 12
},
{
"epoch": 1.3594771241830066,
"grad_norm": 2.507657051086426,
"learning_rate": 4.986304738420684e-06,
"loss": 0.4032,
"step": 13
},
{
"epoch": 1.4640522875816995,
"grad_norm": 2.3815805912017822,
"learning_rate": 4.975670171853926e-06,
"loss": 0.3984,
"step": 14
},
{
"epoch": 1.5686274509803921,
"grad_norm": 3.52946138381958,
"learning_rate": 4.962019382530521e-06,
"loss": 0.394,
"step": 15
},
{
"epoch": 1.673202614379085,
"grad_norm": 2.6642520427703857,
"learning_rate": 4.9453690018345144e-06,
"loss": 0.3778,
"step": 16
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.9417625665664673,
"learning_rate": 4.925739315689991e-06,
"loss": 0.3839,
"step": 17
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.8939223289489746,
"learning_rate": 4.903154239845798e-06,
"loss": 0.3599,
"step": 18
},
{
"epoch": 1.9869281045751634,
"grad_norm": 0.9599626064300537,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.3494,
"step": 19
},
{
"epoch": 2.0915032679738563,
"grad_norm": 1.0300042629241943,
"learning_rate": 4.849231551964771e-06,
"loss": 0.3329,
"step": 20
},
{
"epoch": 2.196078431372549,
"grad_norm": 1.173368215560913,
"learning_rate": 4.817959636416969e-06,
"loss": 0.324,
"step": 21
},
{
"epoch": 2.3006535947712417,
"grad_norm": 0.5053654313087463,
"learning_rate": 4.783863644106502e-06,
"loss": 0.3192,
"step": 22
},
{
"epoch": 2.4052287581699345,
"grad_norm": 0.7185565233230591,
"learning_rate": 4.746985115747918e-06,
"loss": 0.3214,
"step": 23
},
{
"epoch": 2.5098039215686274,
"grad_norm": 0.5996802449226379,
"learning_rate": 4.707368982147318e-06,
"loss": 0.3074,
"step": 24
},
{
"epoch": 2.6143790849673203,
"grad_norm": 0.5362175703048706,
"learning_rate": 4.665063509461098e-06,
"loss": 0.3108,
"step": 25
},
{
"epoch": 2.718954248366013,
"grad_norm": 1.5828744173049927,
"learning_rate": 4.620120240391065e-06,
"loss": 0.303,
"step": 26
},
{
"epoch": 2.8235294117647056,
"grad_norm": 1.048822045326233,
"learning_rate": 4.572593931387604e-06,
"loss": 0.2971,
"step": 27
},
{
"epoch": 2.928104575163399,
"grad_norm": 0.37474843859672546,
"learning_rate": 4.522542485937369e-06,
"loss": 0.3041,
"step": 28
},
{
"epoch": 3.0326797385620914,
"grad_norm": 0.37631165981292725,
"learning_rate": 4.470026884016805e-06,
"loss": 0.2925,
"step": 29
},
{
"epoch": 3.1372549019607843,
"grad_norm": 0.5196499824523926,
"learning_rate": 4.415111107797445e-06,
"loss": 0.2967,
"step": 30
},
{
"epoch": 3.241830065359477,
"grad_norm": 2.151287794113159,
"learning_rate": 4.357862063693486e-06,
"loss": 0.2867,
"step": 31
},
{
"epoch": 3.34640522875817,
"grad_norm": 0.7420333027839661,
"learning_rate": 4.2983495008466285e-06,
"loss": 0.2867,
"step": 32
},
{
"epoch": 3.450980392156863,
"grad_norm": 0.9488831162452698,
"learning_rate": 4.236645926147493e-06,
"loss": 0.2812,
"step": 33
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.4171179533004761,
"learning_rate": 4.172826515897146e-06,
"loss": 0.2813,
"step": 34
},
{
"epoch": 3.6601307189542482,
"grad_norm": 0.3109152317047119,
"learning_rate": 4.106969024216348e-06,
"loss": 0.2856,
"step": 35
},
{
"epoch": 3.764705882352941,
"grad_norm": 1.511412501335144,
"learning_rate": 4.039153688314146e-06,
"loss": 0.2762,
"step": 36
},
{
"epoch": 3.869281045751634,
"grad_norm": 0.2636289894580841,
"learning_rate": 3.969463130731183e-06,
"loss": 0.2661,
"step": 37
},
{
"epoch": 3.973856209150327,
"grad_norm": 0.36717239022254944,
"learning_rate": 3.897982258676867e-06,
"loss": 0.2697,
"step": 38
},
{
"epoch": 4.078431372549019,
"grad_norm": 0.31323152780532837,
"learning_rate": 3.824798160583012e-06,
"loss": 0.2681,
"step": 39
},
{
"epoch": 4.183006535947713,
"grad_norm": 0.28556039929389954,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.2688,
"step": 40
},
{
"epoch": 4.287581699346405,
"grad_norm": 0.25580766797065735,
"learning_rate": 3.6736789069647273e-06,
"loss": 0.2587,
"step": 41
},
{
"epoch": 4.392156862745098,
"grad_norm": 0.28098028898239136,
"learning_rate": 3.595927866972694e-06,
"loss": 0.2619,
"step": 42
},
{
"epoch": 4.496732026143791,
"grad_norm": 0.3076843023300171,
"learning_rate": 3.516841607689501e-06,
"loss": 0.2632,
"step": 43
},
{
"epoch": 4.601307189542483,
"grad_norm": 0.37901487946510315,
"learning_rate": 3.436516483539781e-06,
"loss": 0.2705,
"step": 44
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.3699696362018585,
"learning_rate": 3.3550503583141726e-06,
"loss": 0.2573,
"step": 45
},
{
"epoch": 4.810457516339869,
"grad_norm": 0.21686607599258423,
"learning_rate": 3.272542485937369e-06,
"loss": 0.2597,
"step": 46
},
{
"epoch": 4.915032679738562,
"grad_norm": 0.253548800945282,
"learning_rate": 3.189093389542498e-06,
"loss": 0.2579,
"step": 47
},
{
"epoch": 5.019607843137255,
"grad_norm": 0.2619287073612213,
"learning_rate": 3.1048047389991693e-06,
"loss": 0.2557,
"step": 48
},
{
"epoch": 5.124183006535947,
"grad_norm": 0.22800114750862122,
"learning_rate": 3.019779227044398e-06,
"loss": 0.2499,
"step": 49
},
{
"epoch": 5.228758169934641,
"grad_norm": 0.2334267795085907,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.257,
"step": 50
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.21543391048908234,
"learning_rate": 2.847932752400164e-06,
"loss": 0.2587,
"step": 51
},
{
"epoch": 5.437908496732026,
"grad_norm": 0.23680457472801208,
"learning_rate": 2.761321158169134e-06,
"loss": 0.244,
"step": 52
},
{
"epoch": 5.542483660130719,
"grad_norm": 0.23751680552959442,
"learning_rate": 2.6743911843603134e-06,
"loss": 0.2516,
"step": 53
},
{
"epoch": 5.647058823529412,
"grad_norm": 0.22354498505592346,
"learning_rate": 2.587248741756253e-06,
"loss": 0.251,
"step": 54
},
{
"epoch": 5.751633986928105,
"grad_norm": 0.2219904214143753,
"learning_rate": 2.5e-06,
"loss": 0.2513,
"step": 55
},
{
"epoch": 5.856209150326797,
"grad_norm": 0.24870158731937408,
"learning_rate": 2.4127512582437486e-06,
"loss": 0.2478,
"step": 56
},
{
"epoch": 5.96078431372549,
"grad_norm": 0.2249930202960968,
"learning_rate": 2.325608815639687e-06,
"loss": 0.2448,
"step": 57
},
{
"epoch": 6.065359477124183,
"grad_norm": 0.2350921630859375,
"learning_rate": 2.238678841830867e-06,
"loss": 0.2436,
"step": 58
},
{
"epoch": 6.169934640522876,
"grad_norm": 0.2143002599477768,
"learning_rate": 2.1520672475998374e-06,
"loss": 0.2447,
"step": 59
},
{
"epoch": 6.2745098039215685,
"grad_norm": 0.23206955194473267,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.2472,
"step": 60
},
{
"epoch": 6.379084967320262,
"grad_norm": 0.2509792447090149,
"learning_rate": 1.9802207729556023e-06,
"loss": 0.2444,
"step": 61
},
{
"epoch": 6.483660130718954,
"grad_norm": 0.21777762472629547,
"learning_rate": 1.895195261000831e-06,
"loss": 0.2406,
"step": 62
},
{
"epoch": 6.588235294117647,
"grad_norm": 0.214972585439682,
"learning_rate": 1.8109066104575023e-06,
"loss": 0.2403,
"step": 63
},
{
"epoch": 6.69281045751634,
"grad_norm": 0.2152736335992813,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.2407,
"step": 64
},
{
"epoch": 6.7973856209150325,
"grad_norm": 0.21374449133872986,
"learning_rate": 1.6449496416858285e-06,
"loss": 0.2354,
"step": 65
},
{
"epoch": 6.901960784313726,
"grad_norm": 0.23214764893054962,
"learning_rate": 1.56348351646022e-06,
"loss": 0.2441,
"step": 66
},
{
"epoch": 7.006535947712418,
"grad_norm": 0.21843403577804565,
"learning_rate": 1.4831583923105e-06,
"loss": 0.242,
"step": 67
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.22605258226394653,
"learning_rate": 1.4040721330273063e-06,
"loss": 0.2439,
"step": 68
},
{
"epoch": 7.215686274509804,
"grad_norm": 0.21626867353916168,
"learning_rate": 1.3263210930352737e-06,
"loss": 0.2413,
"step": 69
},
{
"epoch": 7.3202614379084965,
"grad_norm": 0.21916088461875916,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.2361,
"step": 70
},
{
"epoch": 7.42483660130719,
"grad_norm": 0.212180495262146,
"learning_rate": 1.1752018394169882e-06,
"loss": 0.2329,
"step": 71
},
{
"epoch": 7.529411764705882,
"grad_norm": 0.25572070479393005,
"learning_rate": 1.1020177413231334e-06,
"loss": 0.2365,
"step": 72
},
{
"epoch": 7.633986928104575,
"grad_norm": 0.24046508967876434,
"learning_rate": 1.0305368692688175e-06,
"loss": 0.2372,
"step": 73
},
{
"epoch": 7.738562091503268,
"grad_norm": 0.21979837119579315,
"learning_rate": 9.608463116858544e-07,
"loss": 0.238,
"step": 74
},
{
"epoch": 7.8431372549019605,
"grad_norm": 0.2575077712535858,
"learning_rate": 8.930309757836517e-07,
"loss": 0.2382,
"step": 75
},
{
"epoch": 7.947712418300654,
"grad_norm": 0.2223196178674698,
"learning_rate": 8.271734841028553e-07,
"loss": 0.2352,
"step": 76
},
{
"epoch": 8.052287581699346,
"grad_norm": 0.23885323107242584,
"learning_rate": 7.633540738525066e-07,
"loss": 0.2347,
"step": 77
},
{
"epoch": 8.156862745098039,
"grad_norm": 0.22449229657649994,
"learning_rate": 7.016504991533727e-07,
"loss": 0.241,
"step": 78
},
{
"epoch": 8.261437908496733,
"grad_norm": 0.21523602306842804,
"learning_rate": 6.421379363065142e-07,
"loss": 0.2344,
"step": 79
},
{
"epoch": 8.366013071895425,
"grad_norm": 0.22689399123191833,
"learning_rate": 5.848888922025553e-07,
"loss": 0.24,
"step": 80
}
],
"logging_steps": 1.0,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.116079112262779e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}