{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.99867197875166,
"eval_steps": 94,
"global_step": 94,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010624169986719787,
"grad_norm": 2.473472145582703,
"learning_rate": 2.5e-06,
"loss": 0.4101,
"step": 1
},
{
"epoch": 0.010624169986719787,
"eval_loss": 1.6489976644515991,
"eval_runtime": 38.1546,
"eval_samples_per_second": 63.033,
"eval_steps_per_second": 1.992,
"step": 1
},
{
"epoch": 0.021248339973439574,
"grad_norm": 2.473600838217233,
"learning_rate": 5e-06,
"loss": 0.4273,
"step": 2
},
{
"epoch": 0.03187250996015936,
"grad_norm": 2.5043053838666367,
"learning_rate": 7.5e-06,
"loss": 0.4244,
"step": 3
},
{
"epoch": 0.04249667994687915,
"grad_norm": 2.4185276106954317,
"learning_rate": 1e-05,
"loss": 0.4166,
"step": 4
},
{
"epoch": 0.05312084993359894,
"grad_norm": 2.101313294150849,
"learning_rate": 1.25e-05,
"loss": 0.3973,
"step": 5
},
{
"epoch": 0.06374501992031872,
"grad_norm": 2.32474051391718,
"learning_rate": 1.5e-05,
"loss": 0.364,
"step": 6
},
{
"epoch": 0.07436918990703852,
"grad_norm": 2.1736886206661663,
"learning_rate": 1.75e-05,
"loss": 0.3353,
"step": 7
},
{
"epoch": 0.0849933598937583,
"grad_norm": 1.7213633107027813,
"learning_rate": 2e-05,
"loss": 0.3195,
"step": 8
},
{
"epoch": 0.09561752988047809,
"grad_norm": 3.2596713587035007,
"learning_rate": 2.25e-05,
"loss": 0.3083,
"step": 9
},
{
"epoch": 0.10624169986719788,
"grad_norm": 7.208305473873824,
"learning_rate": 2.5e-05,
"loss": 0.3097,
"step": 10
},
{
"epoch": 0.11686586985391766,
"grad_norm": 2.0792264874760193,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.2968,
"step": 11
},
{
"epoch": 0.12749003984063745,
"grad_norm": 7.712252689421062,
"learning_rate": 3e-05,
"loss": 0.2859,
"step": 12
},
{
"epoch": 0.13811420982735723,
"grad_norm": 0.9497642120720764,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.2974,
"step": 13
},
{
"epoch": 0.14873837981407703,
"grad_norm": 0.9147721663929693,
"learning_rate": 3.5e-05,
"loss": 0.2803,
"step": 14
},
{
"epoch": 0.1593625498007968,
"grad_norm": 0.4984711347806819,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.2721,
"step": 15
},
{
"epoch": 0.1699867197875166,
"grad_norm": 0.471047244087332,
"learning_rate": 4e-05,
"loss": 0.262,
"step": 16
},
{
"epoch": 0.1806108897742364,
"grad_norm": 0.472154205638306,
"learning_rate": 4.25e-05,
"loss": 0.2638,
"step": 17
},
{
"epoch": 0.19123505976095617,
"grad_norm": 0.4625092675068189,
"learning_rate": 4.5e-05,
"loss": 0.2591,
"step": 18
},
{
"epoch": 0.20185922974767595,
"grad_norm": 0.4996392904070069,
"learning_rate": 4.75e-05,
"loss": 0.2571,
"step": 19
},
{
"epoch": 0.21248339973439576,
"grad_norm": 1.0428146077253466,
"learning_rate": 5e-05,
"loss": 0.2487,
"step": 20
},
{
"epoch": 0.22310756972111553,
"grad_norm": 0.45426198900774195,
"learning_rate": 4.9809160305343514e-05,
"loss": 0.2523,
"step": 21
},
{
"epoch": 0.2337317397078353,
"grad_norm": 0.3580171632863205,
"learning_rate": 4.9618320610687025e-05,
"loss": 0.2435,
"step": 22
},
{
"epoch": 0.24435590969455512,
"grad_norm": 0.3537567495590767,
"learning_rate": 4.9427480916030536e-05,
"loss": 0.2411,
"step": 23
},
{
"epoch": 0.2549800796812749,
"grad_norm": 0.3233498171110322,
"learning_rate": 4.923664122137405e-05,
"loss": 0.2319,
"step": 24
},
{
"epoch": 0.2656042496679947,
"grad_norm": 0.26847478370421396,
"learning_rate": 4.904580152671756e-05,
"loss": 0.2401,
"step": 25
},
{
"epoch": 0.27622841965471445,
"grad_norm": 0.22622668691998796,
"learning_rate": 4.885496183206107e-05,
"loss": 0.2339,
"step": 26
},
{
"epoch": 0.2868525896414343,
"grad_norm": 0.17447662466704217,
"learning_rate": 4.866412213740458e-05,
"loss": 0.2334,
"step": 27
},
{
"epoch": 0.29747675962815406,
"grad_norm": 0.1406444973947631,
"learning_rate": 4.847328244274809e-05,
"loss": 0.2325,
"step": 28
},
{
"epoch": 0.30810092961487384,
"grad_norm": 0.15551008923115225,
"learning_rate": 4.82824427480916e-05,
"loss": 0.2312,
"step": 29
},
{
"epoch": 0.3187250996015936,
"grad_norm": 0.1363375689910322,
"learning_rate": 4.809160305343512e-05,
"loss": 0.231,
"step": 30
},
{
"epoch": 0.3293492695883134,
"grad_norm": 0.1701602959944274,
"learning_rate": 4.7900763358778626e-05,
"loss": 0.2293,
"step": 31
},
{
"epoch": 0.3399734395750332,
"grad_norm": 0.13622900372630573,
"learning_rate": 4.7709923664122144e-05,
"loss": 0.2248,
"step": 32
},
{
"epoch": 0.350597609561753,
"grad_norm": 0.15128152543667112,
"learning_rate": 4.751908396946565e-05,
"loss": 0.2306,
"step": 33
},
{
"epoch": 0.3612217795484728,
"grad_norm": 0.15811875642520323,
"learning_rate": 4.7328244274809166e-05,
"loss": 0.2321,
"step": 34
},
{
"epoch": 0.37184594953519257,
"grad_norm": 0.12645178389747586,
"learning_rate": 4.713740458015267e-05,
"loss": 0.2208,
"step": 35
},
{
"epoch": 0.38247011952191234,
"grad_norm": 0.12862564367197982,
"learning_rate": 4.694656488549619e-05,
"loss": 0.223,
"step": 36
},
{
"epoch": 0.3930942895086321,
"grad_norm": 0.1369002410151976,
"learning_rate": 4.675572519083969e-05,
"loss": 0.2271,
"step": 37
},
{
"epoch": 0.4037184594953519,
"grad_norm": 0.14767255168536392,
"learning_rate": 4.656488549618321e-05,
"loss": 0.2252,
"step": 38
},
{
"epoch": 0.41434262948207173,
"grad_norm": 0.13483775474592635,
"learning_rate": 4.637404580152672e-05,
"loss": 0.2256,
"step": 39
},
{
"epoch": 0.4249667994687915,
"grad_norm": 0.14826458237724738,
"learning_rate": 4.618320610687023e-05,
"loss": 0.2191,
"step": 40
},
{
"epoch": 0.4355909694555113,
"grad_norm": 0.12640771132223902,
"learning_rate": 4.5992366412213745e-05,
"loss": 0.2222,
"step": 41
},
{
"epoch": 0.44621513944223107,
"grad_norm": 0.12735423736878446,
"learning_rate": 4.5801526717557256e-05,
"loss": 0.215,
"step": 42
},
{
"epoch": 0.45683930942895085,
"grad_norm": 0.12856119123995918,
"learning_rate": 4.561068702290077e-05,
"loss": 0.2229,
"step": 43
},
{
"epoch": 0.4674634794156706,
"grad_norm": 0.10294046565924951,
"learning_rate": 4.541984732824428e-05,
"loss": 0.2176,
"step": 44
},
{
"epoch": 0.47808764940239046,
"grad_norm": 0.15050659721187953,
"learning_rate": 4.522900763358779e-05,
"loss": 0.221,
"step": 45
},
{
"epoch": 0.48871181938911024,
"grad_norm": 0.11127436955612696,
"learning_rate": 4.5038167938931294e-05,
"loss": 0.2202,
"step": 46
},
{
"epoch": 0.49933598937583,
"grad_norm": 0.1369522376895646,
"learning_rate": 4.484732824427481e-05,
"loss": 0.2133,
"step": 47
},
{
"epoch": 0.5099601593625498,
"grad_norm": 0.13836598803647265,
"learning_rate": 4.465648854961832e-05,
"loss": 0.2155,
"step": 48
},
{
"epoch": 0.5205843293492696,
"grad_norm": 0.11643639323464662,
"learning_rate": 4.4465648854961834e-05,
"loss": 0.2152,
"step": 49
},
{
"epoch": 0.5312084993359893,
"grad_norm": 0.13132395588523518,
"learning_rate": 4.4274809160305345e-05,
"loss": 0.2107,
"step": 50
},
{
"epoch": 0.5418326693227091,
"grad_norm": 0.11238359379346577,
"learning_rate": 4.408396946564886e-05,
"loss": 0.2137,
"step": 51
},
{
"epoch": 0.5524568393094289,
"grad_norm": 0.17138814733946134,
"learning_rate": 4.389312977099237e-05,
"loss": 0.2154,
"step": 52
},
{
"epoch": 0.5630810092961488,
"grad_norm": 0.1092235798281431,
"learning_rate": 4.370229007633588e-05,
"loss": 0.2112,
"step": 53
},
{
"epoch": 0.5737051792828686,
"grad_norm": 0.11289137255732332,
"learning_rate": 4.351145038167939e-05,
"loss": 0.2122,
"step": 54
},
{
"epoch": 0.5843293492695883,
"grad_norm": 0.1025053581338601,
"learning_rate": 4.332061068702291e-05,
"loss": 0.2168,
"step": 55
},
{
"epoch": 0.5949535192563081,
"grad_norm": 0.11102992105384238,
"learning_rate": 4.312977099236641e-05,
"loss": 0.2135,
"step": 56
},
{
"epoch": 0.6055776892430279,
"grad_norm": 0.12567758664033069,
"learning_rate": 4.293893129770993e-05,
"loss": 0.211,
"step": 57
},
{
"epoch": 0.6162018592297477,
"grad_norm": 0.11349920644602123,
"learning_rate": 4.2748091603053435e-05,
"loss": 0.2115,
"step": 58
},
{
"epoch": 0.6268260292164675,
"grad_norm": 0.12916314836096926,
"learning_rate": 4.255725190839695e-05,
"loss": 0.2208,
"step": 59
},
{
"epoch": 0.6374501992031872,
"grad_norm": 0.09752976352934666,
"learning_rate": 4.236641221374046e-05,
"loss": 0.2082,
"step": 60
},
{
"epoch": 0.648074369189907,
"grad_norm": 0.13511089875393836,
"learning_rate": 4.2175572519083975e-05,
"loss": 0.2081,
"step": 61
},
{
"epoch": 0.6586985391766268,
"grad_norm": 0.10709804722840903,
"learning_rate": 4.198473282442748e-05,
"loss": 0.2089,
"step": 62
},
{
"epoch": 0.6693227091633466,
"grad_norm": 0.11164046629150956,
"learning_rate": 4.1793893129771e-05,
"loss": 0.212,
"step": 63
},
{
"epoch": 0.6799468791500664,
"grad_norm": 0.11504623760720377,
"learning_rate": 4.160305343511451e-05,
"loss": 0.2104,
"step": 64
},
{
"epoch": 0.6905710491367862,
"grad_norm": 0.1056951553011598,
"learning_rate": 4.1412213740458014e-05,
"loss": 0.2095,
"step": 65
},
{
"epoch": 0.701195219123506,
"grad_norm": 0.11902910166095262,
"learning_rate": 4.122137404580153e-05,
"loss": 0.21,
"step": 66
},
{
"epoch": 0.7118193891102258,
"grad_norm": 0.11888528499779406,
"learning_rate": 4.1030534351145036e-05,
"loss": 0.2074,
"step": 67
},
{
"epoch": 0.7224435590969456,
"grad_norm": 0.11792907862935477,
"learning_rate": 4.0839694656488554e-05,
"loss": 0.2074,
"step": 68
},
{
"epoch": 0.7330677290836654,
"grad_norm": 0.12123246786138824,
"learning_rate": 4.064885496183206e-05,
"loss": 0.2117,
"step": 69
},
{
"epoch": 0.7436918990703851,
"grad_norm": 0.11548221744093704,
"learning_rate": 4.0458015267175576e-05,
"loss": 0.2152,
"step": 70
},
{
"epoch": 0.7543160690571049,
"grad_norm": 0.11744546618963372,
"learning_rate": 4.026717557251908e-05,
"loss": 0.2102,
"step": 71
},
{
"epoch": 0.7649402390438247,
"grad_norm": 0.1129425013506569,
"learning_rate": 4.00763358778626e-05,
"loss": 0.2084,
"step": 72
},
{
"epoch": 0.7755644090305445,
"grad_norm": 0.11061564445479172,
"learning_rate": 3.988549618320611e-05,
"loss": 0.2077,
"step": 73
},
{
"epoch": 0.7861885790172642,
"grad_norm": 0.12747716190819824,
"learning_rate": 3.969465648854962e-05,
"loss": 0.2065,
"step": 74
},
{
"epoch": 0.796812749003984,
"grad_norm": 0.11144908828621163,
"learning_rate": 3.950381679389313e-05,
"loss": 0.2106,
"step": 75
},
{
"epoch": 0.8074369189907038,
"grad_norm": 0.10477648755637319,
"learning_rate": 3.9312977099236644e-05,
"loss": 0.2126,
"step": 76
},
{
"epoch": 0.8180610889774237,
"grad_norm": 0.12819412448665685,
"learning_rate": 3.9122137404580155e-05,
"loss": 0.2079,
"step": 77
},
{
"epoch": 0.8286852589641435,
"grad_norm": 0.11200327130647701,
"learning_rate": 3.8931297709923666e-05,
"loss": 0.2092,
"step": 78
},
{
"epoch": 0.8393094289508632,
"grad_norm": 0.10297552490298816,
"learning_rate": 3.874045801526718e-05,
"loss": 0.2024,
"step": 79
},
{
"epoch": 0.849933598937583,
"grad_norm": 0.11622087800076612,
"learning_rate": 3.854961832061069e-05,
"loss": 0.2041,
"step": 80
},
{
"epoch": 0.8605577689243028,
"grad_norm": 0.12063464037416519,
"learning_rate": 3.83587786259542e-05,
"loss": 0.2139,
"step": 81
},
{
"epoch": 0.8711819389110226,
"grad_norm": 0.10545414252113708,
"learning_rate": 3.816793893129771e-05,
"loss": 0.207,
"step": 82
},
{
"epoch": 0.8818061088977424,
"grad_norm": 0.11873399276296229,
"learning_rate": 3.797709923664122e-05,
"loss": 0.2114,
"step": 83
},
{
"epoch": 0.8924302788844621,
"grad_norm": 0.12163297981289363,
"learning_rate": 3.778625954198473e-05,
"loss": 0.2088,
"step": 84
},
{
"epoch": 0.9030544488711819,
"grad_norm": 0.11583886378919572,
"learning_rate": 3.7595419847328244e-05,
"loss": 0.2097,
"step": 85
},
{
"epoch": 0.9136786188579017,
"grad_norm": 0.11870676027662425,
"learning_rate": 3.7404580152671756e-05,
"loss": 0.1982,
"step": 86
},
{
"epoch": 0.9243027888446215,
"grad_norm": 0.1188545663258694,
"learning_rate": 3.721374045801527e-05,
"loss": 0.2006,
"step": 87
},
{
"epoch": 0.9349269588313412,
"grad_norm": 0.11078800000682179,
"learning_rate": 3.702290076335878e-05,
"loss": 0.2011,
"step": 88
},
{
"epoch": 0.9455511288180611,
"grad_norm": 0.11154286597122227,
"learning_rate": 3.683206106870229e-05,
"loss": 0.2011,
"step": 89
},
{
"epoch": 0.9561752988047809,
"grad_norm": 0.10658131888410093,
"learning_rate": 3.66412213740458e-05,
"loss": 0.2048,
"step": 90
},
{
"epoch": 0.9667994687915007,
"grad_norm": 0.09993801688648761,
"learning_rate": 3.645038167938932e-05,
"loss": 0.2049,
"step": 91
},
{
"epoch": 0.9774236387782205,
"grad_norm": 0.11005954464067705,
"learning_rate": 3.625954198473282e-05,
"loss": 0.208,
"step": 92
},
{
"epoch": 0.9880478087649402,
"grad_norm": 0.10245543326903217,
"learning_rate": 3.606870229007634e-05,
"loss": 0.204,
"step": 93
},
{
"epoch": 0.99867197875166,
"grad_norm": 0.1163495155338574,
"learning_rate": 3.5877862595419845e-05,
"loss": 0.2037,
"step": 94
},
{
"epoch": 0.99867197875166,
"eval_loss": 1.6727522611618042,
"eval_runtime": 38.3479,
"eval_samples_per_second": 62.715,
"eval_steps_per_second": 1.982,
"step": 94
}
],
"logging_steps": 1,
"max_steps": 282,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 94,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.8221787935132877e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}