|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 119, |
|
"global_step": 475, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.434104859828949, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8629, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 0.9369699358940125, |
|
"eval_runtime": 61.2717, |
|
"eval_samples_per_second": 1.632, |
|
"eval_steps_per_second": 1.632, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.517804741859436, |
|
"learning_rate": 4e-05, |
|
"loss": 0.9794, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.3174158334732056, |
|
"learning_rate": 6e-05, |
|
"loss": 1.5473, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.6236504912376404, |
|
"learning_rate": 8e-05, |
|
"loss": 0.6592, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.49080607295036316, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1104, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.43938764929771423, |
|
"learning_rate": 0.00012, |
|
"loss": 0.9446, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.8905380368232727, |
|
"learning_rate": 0.00014, |
|
"loss": 0.6719, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9044575095176697, |
|
"learning_rate": 0.00016, |
|
"loss": 0.6471, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.541214108467102, |
|
"learning_rate": 0.00018, |
|
"loss": 0.8339, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.075484275817871, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9616, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3413633406162262, |
|
"learning_rate": 0.0001999998618515421, |
|
"loss": 0.8251, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7642049193382263, |
|
"learning_rate": 0.00019999944740655014, |
|
"loss": 1.3883, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.5759286284446716, |
|
"learning_rate": 0.00019999875666616918, |
|
"loss": 0.9162, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.16646409034729, |
|
"learning_rate": 0.00019999778963230775, |
|
"loss": 1.1735, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.5447037220001221, |
|
"learning_rate": 0.0001999965463076377, |
|
"loss": 1.2769, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.510498046875, |
|
"learning_rate": 0.00019999502669559432, |
|
"loss": 1.1439, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.514502227306366, |
|
"learning_rate": 0.00019999323080037624, |
|
"loss": 0.9929, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.2014903873205185, |
|
"learning_rate": 0.00019999115862694546, |
|
"loss": 0.6706, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6941856741905212, |
|
"learning_rate": 0.00019998881018102737, |
|
"loss": 0.7576, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.29424500465393066, |
|
"learning_rate": 0.00019998618546911056, |
|
"loss": 1.5048, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.431168556213379, |
|
"learning_rate": 0.00019998328449844714, |
|
"loss": 0.761, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.672117233276367, |
|
"learning_rate": 0.00019998010727705236, |
|
"loss": 0.9434, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.40351876616477966, |
|
"learning_rate": 0.00019997665381370477, |
|
"loss": 1.2708, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.36313024163246155, |
|
"learning_rate": 0.00019997292411794618, |
|
"loss": 0.6931, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7412884831428528, |
|
"learning_rate": 0.00019996891820008164, |
|
"loss": 0.827, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7299063205718994, |
|
"learning_rate": 0.00019996463607117935, |
|
"loss": 0.7135, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5188469290733337, |
|
"learning_rate": 0.00019996007774307075, |
|
"loss": 0.574, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.23084047436714172, |
|
"learning_rate": 0.00019995524322835034, |
|
"loss": 1.0736, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7209138870239258, |
|
"learning_rate": 0.00019995013254037574, |
|
"loss": 0.9087, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.2266876697540283, |
|
"learning_rate": 0.00019994474569326757, |
|
"loss": 1.2464, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6613232493400574, |
|
"learning_rate": 0.0001999390827019096, |
|
"loss": 0.9568, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.0048911571502686, |
|
"learning_rate": 0.00019993314358194843, |
|
"loss": 1.1935, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.9797032475471497, |
|
"learning_rate": 0.00019992692834979372, |
|
"loss": 1.1969, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.4784688353538513, |
|
"learning_rate": 0.00019992043702261793, |
|
"loss": 0.8674, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.863600015640259, |
|
"learning_rate": 0.00019991366961835642, |
|
"loss": 1.3276, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.4809297025203705, |
|
"learning_rate": 0.0001999066261557073, |
|
"loss": 1.0116, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.6229730844497681, |
|
"learning_rate": 0.00019989930665413147, |
|
"loss": 1.2794, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5807106494903564, |
|
"learning_rate": 0.0001998917111338525, |
|
"loss": 1.0074, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.5811257362365723, |
|
"learning_rate": 0.00019988383961585645, |
|
"loss": 1.2491, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.46693727374076843, |
|
"learning_rate": 0.00019987569212189224, |
|
"loss": 0.8432, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.9792713522911072, |
|
"learning_rate": 0.00019986726867447107, |
|
"loss": 0.8716, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3240562081336975, |
|
"learning_rate": 0.00019985856929686667, |
|
"loss": 0.9934, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7358172535896301, |
|
"learning_rate": 0.0001998495940131152, |
|
"loss": 0.7247, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.40497535467147827, |
|
"learning_rate": 0.00019984034284801502, |
|
"loss": 0.8262, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.36835265159606934, |
|
"learning_rate": 0.00019983081582712685, |
|
"loss": 1.0898, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7027626633644104, |
|
"learning_rate": 0.0001998210129767735, |
|
"loss": 1.2949, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.35624387860298157, |
|
"learning_rate": 0.00019981093432404006, |
|
"loss": 0.9734, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.4896808862686157, |
|
"learning_rate": 0.00019980057989677345, |
|
"loss": 1.2297, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.9656649827957153, |
|
"learning_rate": 0.00019978994972358265, |
|
"loss": 0.8358, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.644644021987915, |
|
"learning_rate": 0.0001997790438338385, |
|
"loss": 0.7373, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.0634011030197144, |
|
"learning_rate": 0.00019976786225767365, |
|
"loss": 1.3792, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.9041422605514526, |
|
"learning_rate": 0.00019975640502598244, |
|
"loss": 0.856, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5682844519615173, |
|
"learning_rate": 0.00019974467217042085, |
|
"loss": 0.6686, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7197520136833191, |
|
"learning_rate": 0.00019973266372340639, |
|
"loss": 0.6806, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.2542258501052856, |
|
"learning_rate": 0.00019972037971811802, |
|
"loss": 1.353, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.6802650094032288, |
|
"learning_rate": 0.0001997078201884961, |
|
"loss": 1.0014, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.4886693060398102, |
|
"learning_rate": 0.0001996949851692422, |
|
"loss": 0.8613, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.0226856470108032, |
|
"learning_rate": 0.0001996818746958191, |
|
"loss": 1.0443, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5514834523200989, |
|
"learning_rate": 0.00019966848880445062, |
|
"loss": 0.8816, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.4890052378177643, |
|
"learning_rate": 0.00019965482753212156, |
|
"loss": 0.8541, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.9011398553848267, |
|
"learning_rate": 0.0001996408909165776, |
|
"loss": 1.1158, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.7809276580810547, |
|
"learning_rate": 0.00019962667899632518, |
|
"loss": 0.6702, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.604097843170166, |
|
"learning_rate": 0.00019961219181063142, |
|
"loss": 1.1875, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.9003333449363708, |
|
"learning_rate": 0.00019959742939952392, |
|
"loss": 1.2126, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.59239661693573, |
|
"learning_rate": 0.0001995823918037908, |
|
"loss": 1.1083, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6981655955314636, |
|
"learning_rate": 0.00019956707906498044, |
|
"loss": 0.9634, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.4635857045650482, |
|
"learning_rate": 0.00019955149122540152, |
|
"loss": 1.0345, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.35098958015441895, |
|
"learning_rate": 0.00019953562832812272, |
|
"loss": 0.8871, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.4510989189147949, |
|
"learning_rate": 0.00019951949041697274, |
|
"loss": 0.6967, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.39661434292793274, |
|
"learning_rate": 0.00019950307753654017, |
|
"loss": 1.0786, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5836047530174255, |
|
"learning_rate": 0.00019948638973217323, |
|
"loss": 0.9265, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.4589115381240845, |
|
"learning_rate": 0.00019946942704997982, |
|
"loss": 0.6476, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.4495834410190582, |
|
"learning_rate": 0.00019945218953682734, |
|
"loss": 0.8693, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.35905730724334717, |
|
"learning_rate": 0.00019943467724034252, |
|
"loss": 1.0325, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.235016345977783, |
|
"learning_rate": 0.0001994168902089112, |
|
"loss": 1.3103, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.36725524067878723, |
|
"learning_rate": 0.00019939882849167852, |
|
"loss": 0.908, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.66635662317276, |
|
"learning_rate": 0.0001993804921385484, |
|
"loss": 0.682, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.121004819869995, |
|
"learning_rate": 0.0001993618812001836, |
|
"loss": 0.8462, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.44895172119140625, |
|
"learning_rate": 0.00019934299572800556, |
|
"loss": 0.9625, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5769445300102234, |
|
"learning_rate": 0.00019932383577419432, |
|
"loss": 0.7278, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.3807710111141205, |
|
"learning_rate": 0.00019930440139168817, |
|
"loss": 0.657, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.212838813662529, |
|
"learning_rate": 0.00019928469263418374, |
|
"loss": 0.3094, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.8039274215698242, |
|
"learning_rate": 0.0001992647095561357, |
|
"loss": 0.8898, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7184070348739624, |
|
"learning_rate": 0.00019924445221275675, |
|
"loss": 0.8613, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.4697589874267578, |
|
"learning_rate": 0.00019922392066001722, |
|
"loss": 0.9533, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7024903297424316, |
|
"learning_rate": 0.00019920311495464518, |
|
"loss": 0.7188, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5008754730224609, |
|
"learning_rate": 0.00019918203515412617, |
|
"loss": 0.8329, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6406499743461609, |
|
"learning_rate": 0.00019916068131670302, |
|
"loss": 1.4259, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.39489614963531494, |
|
"learning_rate": 0.00019913905350137573, |
|
"loss": 0.5831, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7018424272537231, |
|
"learning_rate": 0.0001991171517679013, |
|
"loss": 0.9808, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5011338591575623, |
|
"learning_rate": 0.00019909497617679348, |
|
"loss": 0.6806, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2412053793668747, |
|
"learning_rate": 0.0001990725267893228, |
|
"loss": 1.0299, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.4145759046077728, |
|
"learning_rate": 0.00019904980366751624, |
|
"loss": 1.4344, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5789082050323486, |
|
"learning_rate": 0.00019902680687415705, |
|
"loss": 0.4662, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.28986847400665283, |
|
"learning_rate": 0.00019900353647278466, |
|
"loss": 1.296, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.33722007274627686, |
|
"learning_rate": 0.00019897999252769448, |
|
"loss": 0.8011, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.6796722412109375, |
|
"learning_rate": 0.00019895617510393772, |
|
"loss": 0.972, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5858548879623413, |
|
"learning_rate": 0.00019893208426732115, |
|
"loss": 1.0073, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5766484141349792, |
|
"learning_rate": 0.00019890772008440704, |
|
"loss": 0.7884, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.37647876143455505, |
|
"learning_rate": 0.00019888308262251285, |
|
"loss": 0.6407, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.5475112199783325, |
|
"learning_rate": 0.00019885817194971117, |
|
"loss": 1.1401, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.1082801818847656, |
|
"learning_rate": 0.00019883298813482938, |
|
"loss": 1.392, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.3952051103115082, |
|
"learning_rate": 0.00019880753124744963, |
|
"loss": 1.0498, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.19289755821228027, |
|
"learning_rate": 0.00019878180135790845, |
|
"loss": 0.4145, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5658400654792786, |
|
"learning_rate": 0.00019875579853729676, |
|
"loss": 1.0984, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8976437449455261, |
|
"learning_rate": 0.00019872952285745959, |
|
"loss": 0.6919, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5265024900436401, |
|
"learning_rate": 0.00019870297439099577, |
|
"loss": 1.2932, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.8367021083831787, |
|
"learning_rate": 0.00019867615321125795, |
|
"loss": 1.4497, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5223955512046814, |
|
"learning_rate": 0.00019864905939235214, |
|
"loss": 1.0325, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.408779501914978, |
|
"learning_rate": 0.00019862169300913785, |
|
"loss": 0.9026, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.24817530810832977, |
|
"learning_rate": 0.00019859405413722746, |
|
"loss": 0.826, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.450430154800415, |
|
"learning_rate": 0.0001985661428529863, |
|
"loss": 0.9791, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.4882315993309021, |
|
"learning_rate": 0.0001985379592335325, |
|
"loss": 0.7889, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.42783451080322266, |
|
"learning_rate": 0.00019850950335673643, |
|
"loss": 1.1608, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.0337790250778198, |
|
"learning_rate": 0.00019848077530122083, |
|
"loss": 1.045, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.3064819872379303, |
|
"learning_rate": 0.00019845177514636042, |
|
"loss": 0.6474, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.29662173986434937, |
|
"learning_rate": 0.00019842250297228176, |
|
"loss": 0.9493, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.515562891960144, |
|
"learning_rate": 0.00019839295885986296, |
|
"loss": 1.0605, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.4514832496643066, |
|
"learning_rate": 0.0001983631428907335, |
|
"loss": 0.6917, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.8804921507835388, |
|
"eval_runtime": 61.5233, |
|
"eval_samples_per_second": 1.625, |
|
"eval_steps_per_second": 1.625, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.30757004022598267, |
|
"learning_rate": 0.00019833305514727395, |
|
"loss": 0.9722, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5162855386734009, |
|
"learning_rate": 0.00019830269571261583, |
|
"loss": 1.2197, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5095639824867249, |
|
"learning_rate": 0.00019827206467064133, |
|
"loss": 0.8676, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.4804045557975769, |
|
"learning_rate": 0.00019824116210598306, |
|
"loss": 0.8565, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.28008362650871277, |
|
"learning_rate": 0.0001982099881040239, |
|
"loss": 0.9001, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.6209085583686829, |
|
"learning_rate": 0.0001981785427508966, |
|
"loss": 0.7188, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.32877278327941895, |
|
"learning_rate": 0.0001981468261334837, |
|
"loss": 0.6749, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.4256601631641388, |
|
"learning_rate": 0.00019811483833941728, |
|
"loss": 0.8086, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1572288274765015, |
|
"learning_rate": 0.0001980825794570786, |
|
"loss": 0.8554, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.4987819194793701, |
|
"learning_rate": 0.00019805004957559793, |
|
"loss": 0.6999, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6852537393569946, |
|
"learning_rate": 0.00019801724878485438, |
|
"loss": 0.8759, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.7970736622810364, |
|
"learning_rate": 0.00019798417717547552, |
|
"loss": 0.7471, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5638220310211182, |
|
"learning_rate": 0.00019795083483883715, |
|
"loss": 1.0391, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5482009649276733, |
|
"learning_rate": 0.00019791722186706317, |
|
"loss": 0.8363, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.23791633546352386, |
|
"learning_rate": 0.0001978833383530251, |
|
"loss": 0.725, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5339345335960388, |
|
"learning_rate": 0.00019784918439034216, |
|
"loss": 0.9828, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.24769064784049988, |
|
"learning_rate": 0.00019781476007338058, |
|
"loss": 0.9496, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.46634215116500854, |
|
"learning_rate": 0.00019778006549725375, |
|
"loss": 1.0973, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.8007522821426392, |
|
"learning_rate": 0.00019774510075782172, |
|
"loss": 0.6847, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5393804907798767, |
|
"learning_rate": 0.00019770986595169096, |
|
"loss": 0.6461, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.2891620695590973, |
|
"learning_rate": 0.00019767436117621413, |
|
"loss": 0.2937, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.9463545680046082, |
|
"learning_rate": 0.0001976385865294899, |
|
"loss": 0.4934, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.25647807121276855, |
|
"learning_rate": 0.00019760254211036244, |
|
"loss": 0.7446, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.49435535073280334, |
|
"learning_rate": 0.00019756622801842143, |
|
"loss": 0.3544, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7826042175292969, |
|
"learning_rate": 0.00019752964435400155, |
|
"loss": 0.6972, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7160178422927856, |
|
"learning_rate": 0.00019749279121818235, |
|
"loss": 0.9655, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.3925221264362335, |
|
"learning_rate": 0.00019745566871278794, |
|
"loss": 0.9041, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5669321417808533, |
|
"learning_rate": 0.0001974182769403866, |
|
"loss": 0.9093, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5025343298912048, |
|
"learning_rate": 0.00019738061600429064, |
|
"loss": 0.6226, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.1127972602844238, |
|
"learning_rate": 0.0001973426860085561, |
|
"loss": 0.7431, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.4064362049102783, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 0.8444, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9272475242614746, |
|
"learning_rate": 0.00019726601925811204, |
|
"loss": 0.836, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6594715118408203, |
|
"learning_rate": 0.00019722728271523034, |
|
"loss": 0.9031, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9399688839912415, |
|
"learning_rate": 0.00019718827753636522, |
|
"loss": 0.7959, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.4452652633190155, |
|
"learning_rate": 0.00019714900382928675, |
|
"loss": 0.5638, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.47481146454811096, |
|
"learning_rate": 0.000197109461702507, |
|
"loss": 0.8291, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.1962246149778366, |
|
"learning_rate": 0.00019706965126527963, |
|
"loss": 0.7894, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.366571307182312, |
|
"learning_rate": 0.00019702957262759965, |
|
"loss": 1.1808, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.3261445760726929, |
|
"learning_rate": 0.00019698922590020312, |
|
"loss": 0.8769, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5637160539627075, |
|
"learning_rate": 0.00019694861119456679, |
|
"loss": 0.882, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.4508800208568573, |
|
"learning_rate": 0.0001969077286229078, |
|
"loss": 1.2723, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.41292956471443176, |
|
"learning_rate": 0.0001968665782981835, |
|
"loss": 0.7919, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6105634570121765, |
|
"learning_rate": 0.00019682516033409092, |
|
"loss": 1.0901, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6460319757461548, |
|
"learning_rate": 0.00019678347484506669, |
|
"loss": 1.0425, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.8627430200576782, |
|
"learning_rate": 0.00019674152194628638, |
|
"loss": 0.8019, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.3218872547149658, |
|
"learning_rate": 0.00019669930175366472, |
|
"loss": 0.8345, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6773053407669067, |
|
"learning_rate": 0.00019665681438385473, |
|
"loss": 1.3567, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.3802971839904785, |
|
"learning_rate": 0.0001966140599542477, |
|
"loss": 0.7315, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.9038891196250916, |
|
"learning_rate": 0.0001965710385829728, |
|
"loss": 0.6807, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7831525802612305, |
|
"learning_rate": 0.00019652775038889674, |
|
"loss": 1.2796, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.3705346882343292, |
|
"learning_rate": 0.00019648419549162348, |
|
"loss": 0.8275, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7794845104217529, |
|
"learning_rate": 0.0001964403740114939, |
|
"loss": 0.7539, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.2621815800666809, |
|
"learning_rate": 0.00019639628606958533, |
|
"loss": 0.976, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6929745674133301, |
|
"learning_rate": 0.00019635193178771143, |
|
"loss": 0.6198, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.543230414390564, |
|
"learning_rate": 0.0001963073112884217, |
|
"loss": 0.9319, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6732174158096313, |
|
"learning_rate": 0.0001962624246950012, |
|
"loss": 0.804, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.25452062487602234, |
|
"learning_rate": 0.00019621727213147027, |
|
"loss": 0.7632, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6591973304748535, |
|
"learning_rate": 0.00019617185372258392, |
|
"loss": 0.9745, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6275454163551331, |
|
"learning_rate": 0.0001961261695938319, |
|
"loss": 0.3411, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6691128611564636, |
|
"learning_rate": 0.00019608021987143804, |
|
"loss": 0.9564, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.3190310299396515, |
|
"learning_rate": 0.00019603400468235998, |
|
"loss": 1.3002, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.4648153781890869, |
|
"learning_rate": 0.0001959875241542889, |
|
"loss": 0.9507, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5921639800071716, |
|
"learning_rate": 0.00019594077841564907, |
|
"loss": 0.9397, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5769446492195129, |
|
"learning_rate": 0.00019589376759559745, |
|
"loss": 0.9958, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8454503417015076, |
|
"learning_rate": 0.00019584649182402357, |
|
"loss": 1.189, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.2865101099014282, |
|
"learning_rate": 0.0001957989512315489, |
|
"loss": 0.6747, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.3642055094242096, |
|
"learning_rate": 0.0001957511459495266, |
|
"loss": 0.5196, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.4965610206127167, |
|
"learning_rate": 0.00019570307611004124, |
|
"loss": 0.9448, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5694214105606079, |
|
"learning_rate": 0.00019565474184590826, |
|
"loss": 0.868, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6402484774589539, |
|
"learning_rate": 0.00019560614329067378, |
|
"loss": 0.8872, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.37722048163414, |
|
"learning_rate": 0.0001955572805786141, |
|
"loss": 0.9253, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.9157966375350952, |
|
"learning_rate": 0.00019550815384473534, |
|
"loss": 1.6508, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.33376675844192505, |
|
"learning_rate": 0.0001954587632247732, |
|
"loss": 0.9109, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.2680880129337311, |
|
"learning_rate": 0.00019540910885519242, |
|
"loss": 1.0693, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.7726811766624451, |
|
"learning_rate": 0.00019535919087318652, |
|
"loss": 0.9574, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8604207634925842, |
|
"learning_rate": 0.0001953090094166773, |
|
"loss": 0.9475, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.3954675197601318, |
|
"learning_rate": 0.0001952585646243146, |
|
"loss": 1.5094, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.39931145310401917, |
|
"learning_rate": 0.00019520785663547586, |
|
"loss": 0.9915, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.772156298160553, |
|
"learning_rate": 0.00019515688559026563, |
|
"loss": 1.4155, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.48633861541748047, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 0.9607, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.4661516845226288, |
|
"learning_rate": 0.0001950541548947829, |
|
"loss": 1.0283, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.8846752047538757, |
|
"learning_rate": 0.00019500239552835215, |
|
"loss": 0.756, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.9870714545249939, |
|
"learning_rate": 0.00019495037367323262, |
|
"loss": 0.7688, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7435501217842102, |
|
"learning_rate": 0.00019489808947315915, |
|
"loss": 0.4752, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6509325504302979, |
|
"learning_rate": 0.0001948455430725913, |
|
"loss": 0.9053, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.30190637707710266, |
|
"learning_rate": 0.0001947927346167132, |
|
"loss": 0.9323, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.420055627822876, |
|
"learning_rate": 0.00019473966425143292, |
|
"loss": 0.6446, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.49513018131256104, |
|
"learning_rate": 0.00019468633212338233, |
|
"loss": 0.9022, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.4812709391117096, |
|
"learning_rate": 0.00019463273837991643, |
|
"loss": 0.6835, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.2101246416568756, |
|
"learning_rate": 0.00019457888316911306, |
|
"loss": 0.5991, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.3539298176765442, |
|
"learning_rate": 0.00019452476663977248, |
|
"loss": 0.7323, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.29954612255096436, |
|
"learning_rate": 0.00019447038894141705, |
|
"loss": 0.6868, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.4053567349910736, |
|
"learning_rate": 0.00019441575022429065, |
|
"loss": 1.0805, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7733739614486694, |
|
"learning_rate": 0.00019436085063935835, |
|
"loss": 1.3524, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6096423864364624, |
|
"learning_rate": 0.00019430569033830605, |
|
"loss": 1.0183, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.1940584182739258, |
|
"learning_rate": 0.00019425026947353992, |
|
"loss": 1.0919, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.4030895233154297, |
|
"learning_rate": 0.00019419458819818614, |
|
"loss": 0.7642, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.4116997718811035, |
|
"learning_rate": 0.00019413864666609034, |
|
"loss": 0.6112, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.4545953869819641, |
|
"learning_rate": 0.00019408244503181724, |
|
"loss": 0.7328, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.9334838390350342, |
|
"learning_rate": 0.0001940259834506502, |
|
"loss": 1.0518, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.2695348858833313, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 0.987, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.3967281579971313, |
|
"learning_rate": 0.00019391228107235858, |
|
"loss": 1.0819, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.0220236778259277, |
|
"learning_rate": 0.00019385504058939024, |
|
"loss": 0.9621, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.5694682598114014, |
|
"learning_rate": 0.00019379754078783937, |
|
"loss": 1.0647, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6181725263595581, |
|
"learning_rate": 0.00019373978182657625, |
|
"loss": 1.0991, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.508532702922821, |
|
"learning_rate": 0.0001936817638651871, |
|
"loss": 1.0276, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.3763074278831482, |
|
"learning_rate": 0.00019362348706397373, |
|
"loss": 0.7447, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.9533042311668396, |
|
"learning_rate": 0.00019356495158395315, |
|
"loss": 1.1979, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.43593689799308777, |
|
"learning_rate": 0.00019350615758685708, |
|
"loss": 1.0028, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7646205425262451, |
|
"learning_rate": 0.00019344710523513156, |
|
"loss": 1.463, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.29402196407318115, |
|
"learning_rate": 0.00019338779469193639, |
|
"loss": 1.2726, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5773300528526306, |
|
"learning_rate": 0.00019332822612114475, |
|
"loss": 0.4847, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.0580178499221802, |
|
"learning_rate": 0.00019326839968734279, |
|
"loss": 1.0639, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.6212771534919739, |
|
"learning_rate": 0.00019320831555582908, |
|
"loss": 0.7302, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.1953450441360474, |
|
"learning_rate": 0.00019314797389261424, |
|
"loss": 0.9873, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.856995940208435, |
|
"learning_rate": 0.00019308737486442045, |
|
"loss": 0.9573, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.36539939045906067, |
|
"learning_rate": 0.00019302651863868092, |
|
"loss": 0.6884, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3269266188144684, |
|
"learning_rate": 0.0001929654053835395, |
|
"loss": 0.9445, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.46403074264526367, |
|
"learning_rate": 0.00019290403526785025, |
|
"loss": 0.9783, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.8782849311828613, |
|
"eval_runtime": 61.3598, |
|
"eval_samples_per_second": 1.63, |
|
"eval_steps_per_second": 1.63, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6627680659294128, |
|
"learning_rate": 0.00019284240846117697, |
|
"loss": 0.9527, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.866802990436554, |
|
"learning_rate": 0.00019278052513379255, |
|
"loss": 0.6096, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5304962396621704, |
|
"learning_rate": 0.00019271838545667876, |
|
"loss": 0.8335, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.076063632965088, |
|
"learning_rate": 0.00019265598960152555, |
|
"loss": 1.3308, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.491516351699829, |
|
"learning_rate": 0.00019259333774073083, |
|
"loss": 1.4458, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.3771064281463623, |
|
"learning_rate": 0.00019253043004739968, |
|
"loss": 1.4581, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.24413131177425385, |
|
"learning_rate": 0.00019246726669534415, |
|
"loss": 0.7537, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.02517831325531, |
|
"learning_rate": 0.00019240384785908265, |
|
"loss": 1.0646, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.4848421514034271, |
|
"learning_rate": 0.00019234017371383945, |
|
"loss": 0.6972, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8870792388916016, |
|
"learning_rate": 0.00019227624443554425, |
|
"loss": 1.2114, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.5171313285827637, |
|
"learning_rate": 0.00019221206020083166, |
|
"loss": 0.7243, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.5975112915039062, |
|
"learning_rate": 0.00019214762118704076, |
|
"loss": 0.964, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.0921701192855835, |
|
"learning_rate": 0.0001920829275722146, |
|
"loss": 1.1413, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6540035009384155, |
|
"learning_rate": 0.00019201797953509955, |
|
"loss": 0.9732, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.137863278388977, |
|
"learning_rate": 0.0001919527772551451, |
|
"loss": 1.3374, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.4139158725738525, |
|
"learning_rate": 0.00019188732091250307, |
|
"loss": 1.1147, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.5039550065994263, |
|
"learning_rate": 0.00019182161068802741, |
|
"loss": 0.7832, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.567670464515686, |
|
"learning_rate": 0.00019175564676327339, |
|
"loss": 0.6684, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.4372114837169647, |
|
"learning_rate": 0.0001916894293204973, |
|
"loss": 0.7285, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.4466225206851959, |
|
"learning_rate": 0.00019162295854265594, |
|
"loss": 0.5705, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.7975250482559204, |
|
"learning_rate": 0.00019155623461340594, |
|
"loss": 1.4155, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6310514211654663, |
|
"learning_rate": 0.00019148925771710347, |
|
"loss": 0.7388, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5273220539093018, |
|
"learning_rate": 0.0001914220280388037, |
|
"loss": 0.9241, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.8354101181030273, |
|
"learning_rate": 0.0001913545457642601, |
|
"loss": 0.8085, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7362698316574097, |
|
"learning_rate": 0.00019128681107992415, |
|
"loss": 0.953, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5334580540657043, |
|
"learning_rate": 0.00019121882417294462, |
|
"loss": 0.4416, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6351854205131531, |
|
"learning_rate": 0.00019115058523116733, |
|
"loss": 0.6414, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.28386977314949036, |
|
"learning_rate": 0.00019108209444313433, |
|
"loss": 1.0273, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5504246354103088, |
|
"learning_rate": 0.00019101335199808354, |
|
"loss": 1.1191, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7449864149093628, |
|
"learning_rate": 0.00019094435808594823, |
|
"loss": 1.1073, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.6302490830421448, |
|
"learning_rate": 0.00019087511289735644, |
|
"loss": 1.2092, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5618910789489746, |
|
"learning_rate": 0.0001908056166236305, |
|
"loss": 1.1966, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.46393775939941406, |
|
"learning_rate": 0.0001907358694567865, |
|
"loss": 0.7148, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.34640607237815857, |
|
"learning_rate": 0.00019066587158953366, |
|
"loss": 1.1297, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.3277580738067627, |
|
"learning_rate": 0.00019059562321527396, |
|
"loss": 1.0978, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.8730579018592834, |
|
"learning_rate": 0.0001905251245281015, |
|
"loss": 0.9732, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.32950034737586975, |
|
"learning_rate": 0.00019045437572280194, |
|
"loss": 1.0795, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.48170116543769836, |
|
"learning_rate": 0.00019038337699485208, |
|
"loss": 0.8124, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.858323335647583, |
|
"learning_rate": 0.00019031212854041918, |
|
"loss": 0.813, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.9366027116775513, |
|
"learning_rate": 0.00019024063055636057, |
|
"loss": 1.5074, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.4378308653831482, |
|
"learning_rate": 0.00019016888324022296, |
|
"loss": 0.8387, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5781106948852539, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 1.0496, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.834186851978302, |
|
"learning_rate": 0.00019002464140534147, |
|
"loss": 1.2684, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.752008855342865, |
|
"learning_rate": 0.00018995214728513343, |
|
"loss": 1.069, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.3941871225833893, |
|
"learning_rate": 0.0001898794046299167, |
|
"loss": 0.942, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.4069131314754486, |
|
"learning_rate": 0.0001898064136406771, |
|
"loss": 0.7116, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6478765606880188, |
|
"learning_rate": 0.00018973317451908642, |
|
"loss": 0.9494, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.8658535480499268, |
|
"learning_rate": 0.0001896596874675021, |
|
"loss": 0.7592, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.8622011542320251, |
|
"learning_rate": 0.0001895859526889666, |
|
"loss": 0.9392, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.8127020001411438, |
|
"learning_rate": 0.00018951197038720688, |
|
"loss": 1.3309, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.5042945146560669, |
|
"learning_rate": 0.0001894377407666337, |
|
"loss": 0.7607, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.7252426743507385, |
|
"learning_rate": 0.00018936326403234125, |
|
"loss": 1.1221, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.7334456443786621, |
|
"learning_rate": 0.0001892885403901064, |
|
"loss": 0.4738, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.6204662322998047, |
|
"learning_rate": 0.00018921357004638835, |
|
"loss": 1.2511, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5708286762237549, |
|
"learning_rate": 0.00018913835320832778, |
|
"loss": 1.0887, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0324314832687378, |
|
"learning_rate": 0.00018906289008374655, |
|
"loss": 1.1019, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.3663407862186432, |
|
"learning_rate": 0.0001889871808811469, |
|
"loss": 1.0333, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7219849824905396, |
|
"learning_rate": 0.00018891122580971098, |
|
"loss": 0.858, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7850363850593567, |
|
"learning_rate": 0.00018883502507930042, |
|
"loss": 0.9503, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.28012195229530334, |
|
"learning_rate": 0.00018875857890045543, |
|
"loss": 0.8068, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7574068307876587, |
|
"learning_rate": 0.00018868188748439444, |
|
"loss": 0.7557, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.9131019711494446, |
|
"learning_rate": 0.00018860495104301345, |
|
"loss": 1.1462, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.24085545539855957, |
|
"learning_rate": 0.00018852776978888551, |
|
"loss": 0.8286, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.4502617418766022, |
|
"learning_rate": 0.00018845034393526005, |
|
"loss": 1.0052, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7258254289627075, |
|
"learning_rate": 0.00018837267369606228, |
|
"loss": 0.9703, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6078888773918152, |
|
"learning_rate": 0.00018829475928589271, |
|
"loss": 0.8479, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5912296772003174, |
|
"learning_rate": 0.00018821660092002641, |
|
"loss": 1.0336, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.3440995216369629, |
|
"learning_rate": 0.0001881381988144126, |
|
"loss": 0.7629, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5613306164741516, |
|
"learning_rate": 0.0001880595531856738, |
|
"loss": 1.0355, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5265874862670898, |
|
"learning_rate": 0.0001879806642511055, |
|
"loss": 0.9046, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.37300053238868713, |
|
"learning_rate": 0.0001879015322286754, |
|
"loss": 0.578, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.7948945164680481, |
|
"learning_rate": 0.00018782215733702286, |
|
"loss": 0.5693, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5222792625427246, |
|
"learning_rate": 0.0001877425397954582, |
|
"loss": 0.812, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6407319903373718, |
|
"learning_rate": 0.00018766267982396224, |
|
"loss": 0.7317, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.36041396856307983, |
|
"learning_rate": 0.00018758257764318567, |
|
"loss": 0.3617, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6465966105461121, |
|
"learning_rate": 0.00018750223347444828, |
|
"loss": 0.6037, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.4281207025051117, |
|
"learning_rate": 0.00018742164753973855, |
|
"loss": 0.5269, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.3671799898147583, |
|
"learning_rate": 0.00018734082006171299, |
|
"loss": 0.66, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.4369129240512848, |
|
"learning_rate": 0.00018725975126369535, |
|
"loss": 1.1395, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.4631548523902893, |
|
"learning_rate": 0.00018717844136967624, |
|
"loss": 0.7871, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.4736942946910858, |
|
"learning_rate": 0.00018709689060431242, |
|
"loss": 1.2983, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.7346480488777161, |
|
"learning_rate": 0.00018701509919292613, |
|
"loss": 0.9507, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5298660397529602, |
|
"learning_rate": 0.00018693306736150444, |
|
"loss": 0.6621, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5501769781112671, |
|
"learning_rate": 0.0001868507953366989, |
|
"loss": 0.6954, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.565510630607605, |
|
"learning_rate": 0.0001867682833458245, |
|
"loss": 1.2279, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.2679019570350647, |
|
"learning_rate": 0.00018668553161685933, |
|
"loss": 0.6207, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0185893774032593, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 1.1179, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.400493323802948, |
|
"learning_rate": 0.00018651930985988036, |
|
"loss": 0.5947, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7746186256408691, |
|
"learning_rate": 0.00018643584029113215, |
|
"loss": 1.0365, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5792235136032104, |
|
"learning_rate": 0.0001863521319028231, |
|
"loss": 0.7102, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.35895833373069763, |
|
"learning_rate": 0.00018626818492623688, |
|
"loss": 0.5571, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.41158926486968994, |
|
"learning_rate": 0.0001861839995933164, |
|
"loss": 0.9009, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5845640301704407, |
|
"learning_rate": 0.00018609957613666315, |
|
"loss": 0.3317, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.4458400309085846, |
|
"learning_rate": 0.00018601491478953657, |
|
"loss": 1.0094, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6415822505950928, |
|
"learning_rate": 0.00018593001578585326, |
|
"loss": 0.9772, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.616220474243164, |
|
"learning_rate": 0.00018584487936018661, |
|
"loss": 0.6879, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.4885902404785156, |
|
"learning_rate": 0.00018575950574776595, |
|
"loss": 0.9627, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.2818461060523987, |
|
"learning_rate": 0.0001856738951844759, |
|
"loss": 0.9156, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.2286068201065063, |
|
"learning_rate": 0.00018558804790685588, |
|
"loss": 2.6577, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.7086435556411743, |
|
"learning_rate": 0.00018550196415209914, |
|
"loss": 0.8172, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.0317937135696411, |
|
"learning_rate": 0.00018541564415805258, |
|
"loss": 1.3381, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.693418562412262, |
|
"learning_rate": 0.00018532908816321558, |
|
"loss": 1.1259, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.25714910030365, |
|
"learning_rate": 0.00018524229640673974, |
|
"loss": 0.7892, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6042699813842773, |
|
"learning_rate": 0.00018515526912842796, |
|
"loss": 0.8982, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.3453720211982727, |
|
"learning_rate": 0.00018506800656873398, |
|
"loss": 0.9424, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7436335682868958, |
|
"learning_rate": 0.0001849805089687615, |
|
"loss": 0.7121, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.8308970928192139, |
|
"learning_rate": 0.00018489277657026375, |
|
"loss": 1.1099, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.6892271637916565, |
|
"learning_rate": 0.0001848048096156426, |
|
"loss": 0.6814, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.30851200222969055, |
|
"learning_rate": 0.00018471660834794805, |
|
"loss": 0.283, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.2706887722015381, |
|
"learning_rate": 0.00018462817301087748, |
|
"loss": 0.6258, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.9876924157142639, |
|
"learning_rate": 0.00018453950384877504, |
|
"loss": 0.6983, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.3037252128124237, |
|
"learning_rate": 0.0001844506011066308, |
|
"loss": 0.9854, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.0091379880905151, |
|
"learning_rate": 0.00018436146503008035, |
|
"loss": 0.9871, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5219744443893433, |
|
"learning_rate": 0.0001842720958654039, |
|
"loss": 0.3771, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.49409008026123047, |
|
"learning_rate": 0.00018418249385952575, |
|
"loss": 1.0838, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.29014095664024353, |
|
"learning_rate": 0.00018409265926001343, |
|
"loss": 0.9922, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.3307441771030426, |
|
"learning_rate": 0.00018400259231507717, |
|
"loss": 1.0458, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.3356322646141052, |
|
"learning_rate": 0.00018391229327356916, |
|
"loss": 0.9891, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.3707556426525116, |
|
"learning_rate": 0.00018382176238498286, |
|
"loss": 0.9578, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.8826732635498047, |
|
"eval_runtime": 61.8974, |
|
"eval_samples_per_second": 1.616, |
|
"eval_steps_per_second": 1.616, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.7507327198982239, |
|
"learning_rate": 0.00018373099989945236, |
|
"loss": 0.8916, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.3686985373497009, |
|
"learning_rate": 0.00018364000606775155, |
|
"loss": 0.9855, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.34240958094596863, |
|
"learning_rate": 0.00018354878114129367, |
|
"loss": 1.0874, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.2911188304424286, |
|
"learning_rate": 0.00018345732537213027, |
|
"loss": 1.2217, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5415646433830261, |
|
"learning_rate": 0.0001833656390129509, |
|
"loss": 0.6675, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.36682239174842834, |
|
"learning_rate": 0.00018327372231708212, |
|
"loss": 0.8702, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5462591648101807, |
|
"learning_rate": 0.0001831815755384869, |
|
"loss": 0.9005, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5059930682182312, |
|
"learning_rate": 0.00018308919893176396, |
|
"loss": 0.8994, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6344266533851624, |
|
"learning_rate": 0.00018299659275214706, |
|
"loss": 1.1571, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.2552272081375122, |
|
"learning_rate": 0.00018290375725550417, |
|
"loss": 1.2492, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5543289184570312, |
|
"learning_rate": 0.00018281069269833692, |
|
"loss": 1.0141, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.3686586618423462, |
|
"learning_rate": 0.0001827173993377798, |
|
"loss": 0.8264, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5549390912055969, |
|
"learning_rate": 0.0001826238774315995, |
|
"loss": 1.0753, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.8563418388366699, |
|
"learning_rate": 0.00018253012723819416, |
|
"loss": 0.4458, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.4292491376399994, |
|
"learning_rate": 0.00018243614901659264, |
|
"loss": 1.1994, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.37186571955680847, |
|
"learning_rate": 0.00018234194302645394, |
|
"loss": 0.9811, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6655788421630859, |
|
"learning_rate": 0.00018224750952806624, |
|
"loss": 0.5048, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7731723785400391, |
|
"learning_rate": 0.00018215284878234642, |
|
"loss": 0.9481, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.36243554949760437, |
|
"learning_rate": 0.00018205796105083915, |
|
"loss": 1.0048, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.08484947681427, |
|
"learning_rate": 0.00018196284659571639, |
|
"loss": 1.0245, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.128653883934021, |
|
"learning_rate": 0.00018186750567977637, |
|
"loss": 0.9997, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6685619950294495, |
|
"learning_rate": 0.00018177193856644316, |
|
"loss": 1.3555, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.30426543951034546, |
|
"learning_rate": 0.00018167614551976567, |
|
"loss": 1.1209, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6189528107643127, |
|
"learning_rate": 0.00018158012680441723, |
|
"loss": 1.0321, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6775807738304138, |
|
"learning_rate": 0.00018148388268569453, |
|
"loss": 0.7826, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.4594517946243286, |
|
"learning_rate": 0.00018138741342951705, |
|
"loss": 0.6422, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.537011444568634, |
|
"learning_rate": 0.00018129071930242648, |
|
"loss": 0.9219, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.43772855401039124, |
|
"learning_rate": 0.00018119380057158568, |
|
"loss": 1.1737, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.7221130132675171, |
|
"learning_rate": 0.00018109665750477806, |
|
"loss": 0.8636, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.3437989354133606, |
|
"learning_rate": 0.00018099929037040694, |
|
"loss": 0.9238, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.47244492173194885, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.7715, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.7109631299972534, |
|
"learning_rate": 0.0001808038849756822, |
|
"loss": 0.4109, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.27005669474601746, |
|
"learning_rate": 0.00018070584725522762, |
|
"loss": 0.7158, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.4006590247154236, |
|
"learning_rate": 0.00018060758654700622, |
|
"loss": 1.0167, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5627204179763794, |
|
"learning_rate": 0.00018050910312250931, |
|
"loss": 0.8679, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5019241571426392, |
|
"learning_rate": 0.00018041039725384352, |
|
"loss": 0.9163, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.00431227684021, |
|
"learning_rate": 0.00018031146921373018, |
|
"loss": 0.676, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.7062071561813354, |
|
"learning_rate": 0.0001802123192755044, |
|
"loss": 1.2407, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.6554285287857056, |
|
"learning_rate": 0.00018011294771311435, |
|
"loss": 1.1187, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.08072829246521, |
|
"learning_rate": 0.00018001335480112064, |
|
"loss": 0.4878, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.3923906981945038, |
|
"learning_rate": 0.00017991354081469538, |
|
"loss": 0.7836, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.20446747541427612, |
|
"learning_rate": 0.0001798135060296216, |
|
"loss": 0.4597, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5178759098052979, |
|
"learning_rate": 0.00017971325072229226, |
|
"loss": 1.7021, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5159180164337158, |
|
"learning_rate": 0.0001796127751697097, |
|
"loss": 0.6037, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.9448319673538208, |
|
"learning_rate": 0.0001795120796494848, |
|
"loss": 0.8965, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.0035223960876465, |
|
"learning_rate": 0.00017941116443983613, |
|
"loss": 0.9786, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.26040011644363403, |
|
"learning_rate": 0.00017931002981958933, |
|
"loss": 0.8624, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.518144965171814, |
|
"learning_rate": 0.00017920867606817625, |
|
"loss": 1.0095, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5256940722465515, |
|
"learning_rate": 0.00017910710346563416, |
|
"loss": 0.7392, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.8347258567810059, |
|
"learning_rate": 0.000179005312292605, |
|
"loss": 0.8081, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.8221095204353333, |
|
"learning_rate": 0.00017890330283033468, |
|
"loss": 1.1406, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.8048923015594482, |
|
"learning_rate": 0.00017880107536067218, |
|
"loss": 1.4362, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.9037342071533203, |
|
"learning_rate": 0.0001786986301660689, |
|
"loss": 1.2935, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5521582961082458, |
|
"learning_rate": 0.00017859596752957768, |
|
"loss": 1.0742, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.052284598350525, |
|
"learning_rate": 0.00017849308773485226, |
|
"loss": 0.7661, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.43000859022140503, |
|
"learning_rate": 0.00017838999106614632, |
|
"loss": 0.812, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.7804751396179199, |
|
"learning_rate": 0.00017828667780831278, |
|
"loss": 0.7995, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5827552080154419, |
|
"learning_rate": 0.000178183148246803, |
|
"loss": 0.6489, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.3453142642974854, |
|
"learning_rate": 0.00017807940266766593, |
|
"loss": 0.7154, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.24924832582473755, |
|
"learning_rate": 0.00017797544135754744, |
|
"loss": 0.8061, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.4459979236125946, |
|
"learning_rate": 0.0001778712646036894, |
|
"loss": 1.1167, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.6095878481864929, |
|
"learning_rate": 0.000177766872693929, |
|
"loss": 0.8344, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.43662723898887634, |
|
"learning_rate": 0.00017766226591669785, |
|
"loss": 1.0373, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.8759774565696716, |
|
"learning_rate": 0.00017755744456102122, |
|
"loss": 1.0988, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.1800742149353027, |
|
"learning_rate": 0.00017745240891651735, |
|
"loss": 0.7385, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.5820197463035583, |
|
"learning_rate": 0.0001773471592733964, |
|
"loss": 0.9363, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.6128491759300232, |
|
"learning_rate": 0.00017724169592245995, |
|
"loss": 0.7762, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.5693449378013611, |
|
"learning_rate": 0.0001771360191551, |
|
"loss": 0.7526, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.7725418210029602, |
|
"learning_rate": 0.00017703012926329815, |
|
"loss": 0.7019, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5068923234939575, |
|
"learning_rate": 0.0001769240265396249, |
|
"loss": 0.8308, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.34859699010849, |
|
"learning_rate": 0.0001768177112772388, |
|
"loss": 0.9593, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.34673023223876953, |
|
"learning_rate": 0.00017671118376988573, |
|
"loss": 1.0334, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5354735851287842, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 1.2355, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.2567592859268188, |
|
"learning_rate": 0.0001764974931981929, |
|
"loss": 0.8935, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.4151657521724701, |
|
"learning_rate": 0.00017639033072427366, |
|
"loss": 1.1042, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.4307219386100769, |
|
"learning_rate": 0.00017628295718622665, |
|
"loss": 1.2273, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6330164074897766, |
|
"learning_rate": 0.0001761753728807217, |
|
"loss": 1.2027, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.47434625029563904, |
|
"learning_rate": 0.00017606757810501088, |
|
"loss": 0.9242, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.0463887453079224, |
|
"learning_rate": 0.00017595957315692782, |
|
"loss": 1.151, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.7210713028907776, |
|
"learning_rate": 0.00017585135833488692, |
|
"loss": 0.8223, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5121049284934998, |
|
"learning_rate": 0.00017574293393788235, |
|
"loss": 0.6994, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.8933761119842529, |
|
"learning_rate": 0.00017563430026548734, |
|
"loss": 0.846, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5270050764083862, |
|
"learning_rate": 0.0001755254576178535, |
|
"loss": 0.7119, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.37028369307518005, |
|
"learning_rate": 0.0001754164062957096, |
|
"loss": 0.8623, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.6245588660240173, |
|
"learning_rate": 0.00017530714660036112, |
|
"loss": 1.0409, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.878105640411377, |
|
"learning_rate": 0.0001751976788336892, |
|
"loss": 0.6867, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.3765283226966858, |
|
"learning_rate": 0.00017508800329814995, |
|
"loss": 1.2251, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.4110933840274811, |
|
"learning_rate": 0.00017497812029677344, |
|
"loss": 0.8676, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.3986817598342896, |
|
"learning_rate": 0.000174868030133163, |
|
"loss": 1.6298, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.4310443103313446, |
|
"learning_rate": 0.0001747577331114945, |
|
"loss": 0.7328, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.5922423601150513, |
|
"learning_rate": 0.00017464722953651504, |
|
"loss": 0.6629, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.27004075050354, |
|
"learning_rate": 0.00017453651971354264, |
|
"loss": 1.4748, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.31181880831718445, |
|
"learning_rate": 0.00017442560394846516, |
|
"loss": 1.0477, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.1180263757705688, |
|
"learning_rate": 0.00017431448254773944, |
|
"loss": 0.9225, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.7490403056144714, |
|
"learning_rate": 0.00017420315581839044, |
|
"loss": 0.7847, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.138551115989685, |
|
"learning_rate": 0.0001740916240680105, |
|
"loss": 1.2782, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.9375423789024353, |
|
"learning_rate": 0.0001739798876047584, |
|
"loss": 0.8316, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.0941681861877441, |
|
"learning_rate": 0.0001738679467373586, |
|
"loss": 0.9702, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.2845444083213806, |
|
"learning_rate": 0.00017375580177510016, |
|
"loss": 0.8563, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.7341310381889343, |
|
"learning_rate": 0.0001736434530278362, |
|
"loss": 0.7102, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.886854350566864, |
|
"learning_rate": 0.0001735309008059829, |
|
"loss": 1.4872, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.44623202085494995, |
|
"learning_rate": 0.00017341814542051845, |
|
"loss": 0.8142, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.8813387155532837, |
|
"learning_rate": 0.00017330518718298264, |
|
"loss": 1.0869, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.8468006253242493, |
|
"learning_rate": 0.0001731920264054755, |
|
"loss": 1.2859, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.6797575354576111, |
|
"learning_rate": 0.00017307866340065685, |
|
"loss": 1.0288, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.25991517305374146, |
|
"learning_rate": 0.00017296509848174508, |
|
"loss": 0.7996, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.557244598865509, |
|
"learning_rate": 0.00017285133196251663, |
|
"loss": 0.6877, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.49986258149147034, |
|
"learning_rate": 0.00017273736415730488, |
|
"loss": 0.8285, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5839115977287292, |
|
"learning_rate": 0.0001726231953809993, |
|
"loss": 0.8617, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.1967806816101074, |
|
"learning_rate": 0.0001725088259490448, |
|
"loss": 0.719, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.7830839157104492, |
|
"learning_rate": 0.00017239425617744048, |
|
"loss": 0.623, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.0930089950561523, |
|
"learning_rate": 0.00017227948638273916, |
|
"loss": 1.1768, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.562641441822052, |
|
"learning_rate": 0.0001721645168820462, |
|
"loss": 0.5235, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5656068325042725, |
|
"learning_rate": 0.00017204934799301883, |
|
"loss": 0.9211, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.934866726398468, |
|
"learning_rate": 0.0001719339800338651, |
|
"loss": 0.7897, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.30100950598716736, |
|
"learning_rate": 0.00017181841332334318, |
|
"loss": 1.0436, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5494408011436462, |
|
"learning_rate": 0.00017170264818076026, |
|
"loss": 0.6412, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.6607212424278259, |
|
"learning_rate": 0.00017158668492597186, |
|
"loss": 1.3501, |
|
"step": 475 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 475, |
|
"total_flos": 3.489377344094208e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|