|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.04583651642475172,
  "eval_steps": 50,
  "global_step": 150,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00030557677616501144,
      "grad_norm": 2.4270410537719727,
      "learning_rate": 2e-05,
      "loss": 4.3691,
      "step": 1
    },
    {
      "epoch": 0.00030557677616501144,
      "eval_loss": 3.825514316558838,
      "eval_runtime": 44.62,
      "eval_samples_per_second": 30.883,
      "eval_steps_per_second": 15.442,
      "step": 1
    },
    {
      "epoch": 0.0006111535523300229,
      "grad_norm": 1.158055305480957,
      "learning_rate": 4e-05,
      "loss": 1.9265,
      "step": 2
    },
    {
      "epoch": 0.0009167303284950344,
      "grad_norm": 1.5009372234344482,
      "learning_rate": 6e-05,
      "loss": 4.1948,
      "step": 3
    },
    {
      "epoch": 0.0012223071046600458,
      "grad_norm": 1.4214242696762085,
      "learning_rate": 8e-05,
      "loss": 3.1941,
      "step": 4
    },
    {
      "epoch": 0.0015278838808250573,
      "grad_norm": 1.7152305841445923,
      "learning_rate": 0.0001,
      "loss": 3.3993,
      "step": 5
    },
    {
      "epoch": 0.0018334606569900688,
      "grad_norm": 1.6367850303649902,
      "learning_rate": 0.00012,
      "loss": 4.2454,
      "step": 6
    },
    {
      "epoch": 0.0021390374331550803,
      "grad_norm": 1.2402284145355225,
      "learning_rate": 0.00014,
      "loss": 2.3504,
      "step": 7
    },
    {
      "epoch": 0.0024446142093200915,
      "grad_norm": 1.5380349159240723,
      "learning_rate": 0.00016,
      "loss": 3.6695,
      "step": 8
    },
    {
      "epoch": 0.0027501909854851033,
      "grad_norm": 2.2115941047668457,
      "learning_rate": 0.00018,
      "loss": 3.8667,
      "step": 9
    },
    {
      "epoch": 0.0030557677616501145,
      "grad_norm": 2.173429489135742,
      "learning_rate": 0.0002,
      "loss": 3.0004,
      "step": 10
    },
    {
      "epoch": 0.0033613445378151263,
      "grad_norm": 2.2001826763153076,
      "learning_rate": 0.0001999863304992469,
      "loss": 2.6077,
      "step": 11
    },
    {
      "epoch": 0.0036669213139801375,
      "grad_norm": 2.234281539916992,
      "learning_rate": 0.00019994532573409262,
      "loss": 3.6697,
      "step": 12
    },
    {
      "epoch": 0.003972498090145149,
      "grad_norm": 2.308614492416382,
      "learning_rate": 0.00019987699691483048,
      "loss": 2.7118,
      "step": 13
    },
    {
      "epoch": 0.0042780748663101605,
      "grad_norm": 2.648648500442505,
      "learning_rate": 0.00019978136272187747,
      "loss": 3.2435,
      "step": 14
    },
    {
      "epoch": 0.004583651642475172,
      "grad_norm": 1.7936925888061523,
      "learning_rate": 0.000199658449300667,
      "loss": 2.1392,
      "step": 15
    },
    {
      "epoch": 0.004889228418640183,
      "grad_norm": 2.6367146968841553,
      "learning_rate": 0.00019950829025450114,
      "loss": 3.592,
      "step": 16
    },
    {
      "epoch": 0.005194805194805195,
      "grad_norm": 2.5174176692962646,
      "learning_rate": 0.00019933092663536382,
      "loss": 2.977,
      "step": 17
    },
    {
      "epoch": 0.0055003819709702065,
      "grad_norm": 2.296748399734497,
      "learning_rate": 0.00019912640693269752,
      "loss": 3.093,
      "step": 18
    },
    {
      "epoch": 0.005805958747135217,
      "grad_norm": 3.448976993560791,
      "learning_rate": 0.00019889478706014687,
      "loss": 3.0724,
      "step": 19
    },
    {
      "epoch": 0.006111535523300229,
      "grad_norm": 2.230823278427124,
      "learning_rate": 0.00019863613034027224,
      "loss": 2.7141,
      "step": 20
    },
    {
      "epoch": 0.006417112299465241,
      "grad_norm": 2.3661162853240967,
      "learning_rate": 0.00019835050748723824,
      "loss": 3.6851,
      "step": 21
    },
    {
      "epoch": 0.0067226890756302525,
      "grad_norm": 1.993086338043213,
      "learning_rate": 0.00019803799658748094,
      "loss": 3.8929,
      "step": 22
    },
    {
      "epoch": 0.007028265851795263,
      "grad_norm": 1.727256417274475,
      "learning_rate": 0.00019769868307835994,
      "loss": 2.7433,
      "step": 23
    },
    {
      "epoch": 0.007333842627960275,
      "grad_norm": 2.6137921810150146,
      "learning_rate": 0.0001973326597248006,
      "loss": 3.9081,
      "step": 24
    },
    {
      "epoch": 0.007639419404125287,
      "grad_norm": 1.9214521646499634,
      "learning_rate": 0.00019694002659393305,
      "loss": 3.451,
      "step": 25
    },
    {
      "epoch": 0.007944996180290298,
      "grad_norm": 2.5544991493225098,
      "learning_rate": 0.00019652089102773488,
      "loss": 2.4955,
      "step": 26
    },
    {
      "epoch": 0.00825057295645531,
      "grad_norm": 3.0405311584472656,
      "learning_rate": 0.00019607536761368484,
      "loss": 3.6993,
      "step": 27
    },
    {
      "epoch": 0.008556149732620321,
      "grad_norm": 1.412721037864685,
      "learning_rate": 0.00019560357815343577,
      "loss": 3.4813,
      "step": 28
    },
    {
      "epoch": 0.008861726508785332,
      "grad_norm": 1.844821572303772,
      "learning_rate": 0.00019510565162951537,
      "loss": 2.4911,
      "step": 29
    },
    {
      "epoch": 0.009167303284950344,
      "grad_norm": 2.185194730758667,
      "learning_rate": 0.00019458172417006347,
      "loss": 3.0409,
      "step": 30
    },
    {
      "epoch": 0.009472880061115355,
      "grad_norm": 2.528438091278076,
      "learning_rate": 0.00019403193901161613,
      "loss": 4.4194,
      "step": 31
    },
    {
      "epoch": 0.009778456837280366,
      "grad_norm": 2.3545053005218506,
      "learning_rate": 0.0001934564464599461,
      "loss": 3.9219,
      "step": 32
    },
    {
      "epoch": 0.010084033613445379,
      "grad_norm": 2.4586379528045654,
      "learning_rate": 0.00019285540384897073,
      "loss": 3.8094,
      "step": 33
    },
    {
      "epoch": 0.01038961038961039,
      "grad_norm": 2.5422775745391846,
      "learning_rate": 0.00019222897549773848,
      "loss": 1.9784,
      "step": 34
    },
    {
      "epoch": 0.0106951871657754,
      "grad_norm": 1.6043715476989746,
      "learning_rate": 0.00019157733266550575,
      "loss": 2.5754,
      "step": 35
    },
    {
      "epoch": 0.011000763941940413,
      "grad_norm": 2.428220510482788,
      "learning_rate": 0.00019090065350491626,
      "loss": 3.7738,
      "step": 36
    },
    {
      "epoch": 0.011306340718105424,
      "grad_norm": 2.145481586456299,
      "learning_rate": 0.00019019912301329592,
      "loss": 3.2982,
      "step": 37
    },
    {
      "epoch": 0.011611917494270435,
      "grad_norm": 1.5569844245910645,
      "learning_rate": 0.00018947293298207635,
      "loss": 1.7218,
      "step": 38
    },
    {
      "epoch": 0.011917494270435447,
      "grad_norm": 2.1777217388153076,
      "learning_rate": 0.0001887222819443612,
      "loss": 2.581,
      "step": 39
    },
    {
      "epoch": 0.012223071046600458,
      "grad_norm": 2.7405147552490234,
      "learning_rate": 0.0001879473751206489,
      "loss": 3.8181,
      "step": 40
    },
    {
      "epoch": 0.012528647822765469,
      "grad_norm": 1.6006349325180054,
      "learning_rate": 0.00018714842436272773,
      "loss": 2.0009,
      "step": 41
    },
    {
      "epoch": 0.012834224598930482,
      "grad_norm": 1.8577955961227417,
      "learning_rate": 0.00018632564809575742,
      "loss": 3.3277,
      "step": 42
    },
    {
      "epoch": 0.013139801375095492,
      "grad_norm": 1.7019318342208862,
      "learning_rate": 0.0001854792712585539,
      "loss": 3.6416,
      "step": 43
    },
    {
      "epoch": 0.013445378151260505,
      "grad_norm": 2.172180414199829,
      "learning_rate": 0.00018460952524209355,
      "loss": 3.5653,
      "step": 44
    },
    {
      "epoch": 0.013750954927425516,
      "grad_norm": 1.173954963684082,
      "learning_rate": 0.00018371664782625287,
      "loss": 2.4792,
      "step": 45
    },
    {
      "epoch": 0.014056531703590527,
      "grad_norm": 2.2119481563568115,
      "learning_rate": 0.00018280088311480201,
      "loss": 3.7178,
      "step": 46
    },
    {
      "epoch": 0.01436210847975554,
      "grad_norm": 1.043997883796692,
      "learning_rate": 0.00018186248146866927,
      "loss": 2.3083,
      "step": 47
    },
    {
      "epoch": 0.01466768525592055,
      "grad_norm": 1.6460810899734497,
      "learning_rate": 0.00018090169943749476,
      "loss": 2.4319,
      "step": 48
    },
    {
      "epoch": 0.014973262032085561,
      "grad_norm": 1.5691938400268555,
      "learning_rate": 0.0001799187996894925,
      "loss": 2.5601,
      "step": 49
    },
    {
      "epoch": 0.015278838808250574,
      "grad_norm": 2.165229320526123,
      "learning_rate": 0.00017891405093963938,
      "loss": 3.1488,
      "step": 50
    },
    {
      "epoch": 0.015278838808250574,
      "eval_loss": 3.244551658630371,
      "eval_runtime": 43.5009,
      "eval_samples_per_second": 31.678,
      "eval_steps_per_second": 15.839,
      "step": 50
    },
    {
      "epoch": 0.015584415584415584,
      "grad_norm": 2.4259822368621826,
      "learning_rate": 0.00017788772787621126,
      "loss": 4.0701,
      "step": 51
    },
    {
      "epoch": 0.015889992360580595,
      "grad_norm": 1.828935146331787,
      "learning_rate": 0.00017684011108568592,
      "loss": 3.8272,
      "step": 52
    },
    {
      "epoch": 0.016195569136745608,
      "grad_norm": 1.6015702486038208,
      "learning_rate": 0.0001757714869760335,
      "loss": 4.0941,
      "step": 53
    },
    {
      "epoch": 0.01650114591291062,
      "grad_norm": 1.4622135162353516,
      "learning_rate": 0.0001746821476984154,
      "loss": 2.8427,
      "step": 54
    },
    {
      "epoch": 0.01680672268907563,
      "grad_norm": 2.106966018676758,
      "learning_rate": 0.00017357239106731317,
      "loss": 2.3866,
      "step": 55
    },
    {
      "epoch": 0.017112299465240642,
      "grad_norm": 2.9907472133636475,
      "learning_rate": 0.00017244252047910892,
      "loss": 3.2555,
      "step": 56
    },
    {
      "epoch": 0.017417876241405655,
      "grad_norm": 1.2781822681427002,
      "learning_rate": 0.00017129284482913972,
      "loss": 2.8657,
      "step": 57
    },
    {
      "epoch": 0.017723453017570664,
      "grad_norm": 1.5655877590179443,
      "learning_rate": 0.00017012367842724887,
      "loss": 2.6976,
      "step": 58
    },
    {
      "epoch": 0.018029029793735676,
      "grad_norm": 1.7124484777450562,
      "learning_rate": 0.0001689353409118566,
      "loss": 2.5161,
      "step": 59
    },
    {
      "epoch": 0.01833460656990069,
      "grad_norm": 2.1622180938720703,
      "learning_rate": 0.00016772815716257412,
      "loss": 2.9102,
      "step": 60
    },
    {
      "epoch": 0.018640183346065698,
      "grad_norm": 1.4135297536849976,
      "learning_rate": 0.0001665024572113848,
      "loss": 1.7485,
      "step": 61
    },
    {
      "epoch": 0.01894576012223071,
      "grad_norm": 1.8781421184539795,
      "learning_rate": 0.00016525857615241687,
      "loss": 3.8095,
      "step": 62
    },
    {
      "epoch": 0.019251336898395723,
      "grad_norm": 1.5626355409622192,
      "learning_rate": 0.00016399685405033167,
      "loss": 1.9366,
      "step": 63
    },
    {
      "epoch": 0.019556913674560732,
      "grad_norm": 1.412752389907837,
      "learning_rate": 0.0001627176358473537,
      "loss": 1.9336,
      "step": 64
    },
    {
      "epoch": 0.019862490450725745,
      "grad_norm": 5.998400688171387,
      "learning_rate": 0.0001614212712689668,
      "loss": 3.1919,
      "step": 65
    },
    {
      "epoch": 0.020168067226890758,
      "grad_norm": 1.5334243774414062,
      "learning_rate": 0.00016010811472830252,
      "loss": 2.7113,
      "step": 66
    },
    {
      "epoch": 0.020473644003055767,
      "grad_norm": 2.1354057788848877,
      "learning_rate": 0.00015877852522924732,
      "loss": 2.988,
      "step": 67
    },
    {
      "epoch": 0.02077922077922078,
      "grad_norm": 1.9327161312103271,
      "learning_rate": 0.00015743286626829437,
      "loss": 2.9572,
      "step": 68
    },
    {
      "epoch": 0.021084797555385792,
      "grad_norm": 2.1718881130218506,
      "learning_rate": 0.0001560715057351673,
      "loss": 3.5648,
      "step": 69
    },
    {
      "epoch": 0.0213903743315508,
      "grad_norm": 1.8955811262130737,
      "learning_rate": 0.00015469481581224272,
      "loss": 3.1244,
      "step": 70
    },
    {
      "epoch": 0.021695951107715813,
      "grad_norm": 1.5830830335617065,
      "learning_rate": 0.0001533031728727994,
      "loss": 2.7921,
      "step": 71
    },
    {
      "epoch": 0.022001527883880826,
      "grad_norm": 2.2739856243133545,
      "learning_rate": 0.00015189695737812152,
      "loss": 3.5455,
      "step": 72
    },
    {
      "epoch": 0.022307104660045835,
      "grad_norm": 0.9828822612762451,
      "learning_rate": 0.0001504765537734844,
      "loss": 1.6882,
      "step": 73
    },
    {
      "epoch": 0.022612681436210848,
      "grad_norm": 1.9325013160705566,
      "learning_rate": 0.00014904235038305083,
      "loss": 2.1146,
      "step": 74
    },
    {
      "epoch": 0.02291825821237586,
      "grad_norm": 1.3537631034851074,
      "learning_rate": 0.00014759473930370736,
      "loss": 2.2816,
      "step": 75
    },
    {
      "epoch": 0.02322383498854087,
      "grad_norm": 1.826690673828125,
      "learning_rate": 0.0001461341162978688,
      "loss": 1.5235,
      "step": 76
    },
    {
      "epoch": 0.023529411764705882,
      "grad_norm": 1.8681014776229858,
      "learning_rate": 0.00014466088068528068,
      "loss": 4.0703,
      "step": 77
    },
    {
      "epoch": 0.023834988540870895,
      "grad_norm": 4.881453514099121,
      "learning_rate": 0.00014317543523384928,
      "loss": 3.2547,
      "step": 78
    },
    {
      "epoch": 0.024140565317035904,
      "grad_norm": 2.301090955734253,
      "learning_rate": 0.00014167818604952906,
      "loss": 2.3378,
      "step": 79
    },
    {
      "epoch": 0.024446142093200916,
      "grad_norm": 1.1395305395126343,
      "learning_rate": 0.00014016954246529696,
      "loss": 1.5157,
      "step": 80
    },
    {
      "epoch": 0.02475171886936593,
      "grad_norm": 1.7658803462982178,
      "learning_rate": 0.00013864991692924523,
      "loss": 4.2361,
      "step": 81
    },
    {
      "epoch": 0.025057295645530938,
      "grad_norm": 1.827609896659851,
      "learning_rate": 0.00013711972489182208,
      "loss": 3.0601,
      "step": 82
    },
    {
      "epoch": 0.02536287242169595,
      "grad_norm": 1.9651907682418823,
      "learning_rate": 0.00013557938469225167,
      "loss": 3.6306,
      "step": 83
    },
    {
      "epoch": 0.025668449197860963,
      "grad_norm": 2.074267625808716,
      "learning_rate": 0.00013402931744416433,
      "loss": 2.9667,
      "step": 84
    },
    {
      "epoch": 0.025974025974025976,
      "grad_norm": 1.3315553665161133,
      "learning_rate": 0.00013246994692046836,
      "loss": 1.88,
      "step": 85
    },
    {
      "epoch": 0.026279602750190985,
      "grad_norm": 1.5968420505523682,
      "learning_rate": 0.00013090169943749476,
      "loss": 3.7276,
      "step": 86
    },
    {
      "epoch": 0.026585179526355997,
      "grad_norm": 1.9459774494171143,
      "learning_rate": 0.0001293250037384465,
      "loss": 2.5366,
      "step": 87
    },
    {
      "epoch": 0.02689075630252101,
      "grad_norm": 1.9473600387573242,
      "learning_rate": 0.00012774029087618446,
      "loss": 3.8513,
      "step": 88
    },
    {
      "epoch": 0.02719633307868602,
      "grad_norm": 1.4431513547897339,
      "learning_rate": 0.00012614799409538198,
      "loss": 2.5023,
      "step": 89
    },
    {
      "epoch": 0.02750190985485103,
      "grad_norm": 3.223552703857422,
      "learning_rate": 0.00012454854871407994,
      "loss": 2.0943,
      "step": 90
    },
    {
      "epoch": 0.027807486631016044,
      "grad_norm": 2.679762363433838,
      "learning_rate": 0.00012294239200467516,
      "loss": 3.3053,
      "step": 91
    },
    {
      "epoch": 0.028113063407181053,
      "grad_norm": 2.0697975158691406,
      "learning_rate": 0.0001213299630743747,
      "loss": 3.5176,
      "step": 92
    },
    {
      "epoch": 0.028418640183346066,
      "grad_norm": 2.661999464035034,
      "learning_rate": 0.00011971170274514802,
      "loss": 3.7355,
      "step": 93
    },
    {
      "epoch": 0.02872421695951108,
      "grad_norm": 1.8615680932998657,
      "learning_rate": 0.000118088053433211,
      "loss": 2.0932,
      "step": 94
    },
    {
      "epoch": 0.029029793735676088,
      "grad_norm": 2.583749532699585,
      "learning_rate": 0.00011645945902807341,
      "loss": 4.0713,
      "step": 95
    },
    {
      "epoch": 0.0293353705118411,
      "grad_norm": 1.8530974388122559,
      "learning_rate": 0.0001148263647711842,
      "loss": 1.5702,
      "step": 96
    },
    {
      "epoch": 0.029640947288006113,
      "grad_norm": 1.7810598611831665,
      "learning_rate": 0.00011318921713420691,
      "loss": 3.71,
      "step": 97
    },
    {
      "epoch": 0.029946524064171122,
      "grad_norm": 2.1363232135772705,
      "learning_rate": 0.00011154846369695863,
      "loss": 3.1915,
      "step": 98
    },
    {
      "epoch": 0.030252100840336135,
      "grad_norm": 1.8341383934020996,
      "learning_rate": 0.0001099045530250463,
      "loss": 2.4543,
      "step": 99
    },
    {
      "epoch": 0.030557677616501147,
      "grad_norm": 1.8934003114700317,
      "learning_rate": 0.00010825793454723325,
      "loss": 2.6079,
      "step": 100
    },
    {
      "epoch": 0.030557677616501147,
      "eval_loss": 3.186122417449951,
      "eval_runtime": 43.6356,
      "eval_samples_per_second": 31.58,
      "eval_steps_per_second": 15.79,
      "step": 100
    },
    {
      "epoch": 0.030863254392666156,
      "grad_norm": 1.059015154838562,
      "learning_rate": 0.00010660905843256994,
      "loss": 2.2306,
      "step": 101
    },
    {
      "epoch": 0.03116883116883117,
      "grad_norm": 1.7741599082946777,
      "learning_rate": 0.00010495837546732224,
      "loss": 3.5765,
      "step": 102
    },
    {
      "epoch": 0.03147440794499618,
      "grad_norm": 2.162186622619629,
      "learning_rate": 0.00010330633693173082,
      "loss": 3.4345,
      "step": 103
    },
    {
      "epoch": 0.03177998472116119,
      "grad_norm": 2.5586822032928467,
      "learning_rate": 0.00010165339447663587,
      "loss": 2.9903,
      "step": 104
    },
    {
      "epoch": 0.03208556149732621,
      "grad_norm": 2.0206334590911865,
      "learning_rate": 0.0001,
      "loss": 2.1015,
      "step": 105
    },
    {
      "epoch": 0.032391138273491216,
      "grad_norm": 1.485046148300171,
      "learning_rate": 9.834660552336415e-05,
      "loss": 3.0347,
      "step": 106
    },
    {
      "epoch": 0.032696715049656225,
      "grad_norm": 1.54109787940979,
      "learning_rate": 9.669366306826919e-05,
      "loss": 3.4548,
      "step": 107
    },
    {
      "epoch": 0.03300229182582124,
      "grad_norm": 2.197310209274292,
      "learning_rate": 9.504162453267777e-05,
      "loss": 3.259,
      "step": 108
    },
    {
      "epoch": 0.03330786860198625,
      "grad_norm": 1.7704206705093384,
      "learning_rate": 9.339094156743007e-05,
      "loss": 3.6315,
      "step": 109
    },
    {
      "epoch": 0.03361344537815126,
      "grad_norm": 1.5361251831054688,
      "learning_rate": 9.174206545276677e-05,
      "loss": 3.565,
      "step": 110
    },
    {
      "epoch": 0.033919022154316275,
      "grad_norm": 1.6352933645248413,
      "learning_rate": 9.009544697495374e-05,
      "loss": 4.2866,
      "step": 111
    },
    {
      "epoch": 0.034224598930481284,
      "grad_norm": 1.5935004949569702,
      "learning_rate": 8.845153630304139e-05,
      "loss": 1.9851,
      "step": 112
    },
    {
      "epoch": 0.03453017570664629,
      "grad_norm": 1.665504813194275,
      "learning_rate": 8.681078286579311e-05,
      "loss": 2.6306,
      "step": 113
    },
    {
      "epoch": 0.03483575248281131,
      "grad_norm": 1.937706470489502,
      "learning_rate": 8.517363522881579e-05,
      "loss": 3.863,
      "step": 114
    },
    {
      "epoch": 0.03514132925897632,
      "grad_norm": 1.2224433422088623,
      "learning_rate": 8.35405409719266e-05,
      "loss": 2.8868,
      "step": 115
    },
    {
      "epoch": 0.03544690603514133,
      "grad_norm": 1.5066522359848022,
      "learning_rate": 8.191194656678904e-05,
      "loss": 2.3325,
      "step": 116
    },
    {
      "epoch": 0.035752482811306344,
      "grad_norm": 1.7514053583145142,
      "learning_rate": 8.028829725485199e-05,
      "loss": 3.0919,
      "step": 117
    },
    {
      "epoch": 0.03605805958747135,
      "grad_norm": 2.9330379962921143,
      "learning_rate": 7.867003692562534e-05,
      "loss": 3.5218,
      "step": 118
    },
    {
      "epoch": 0.03636363636363636,
      "grad_norm": 1.7792319059371948,
      "learning_rate": 7.705760799532485e-05,
      "loss": 2.6203,
      "step": 119
    },
    {
      "epoch": 0.03666921313980138,
      "grad_norm": 1.3247697353363037,
      "learning_rate": 7.54514512859201e-05,
      "loss": 3.0596,
      "step": 120
    },
    {
      "epoch": 0.03697478991596639,
      "grad_norm": 1.2648736238479614,
      "learning_rate": 7.385200590461803e-05,
      "loss": 1.8237,
      "step": 121
    },
    {
      "epoch": 0.037280366692131396,
      "grad_norm": 1.992238163948059,
      "learning_rate": 7.225970912381556e-05,
      "loss": 3.7792,
      "step": 122
    },
    {
      "epoch": 0.03758594346829641,
      "grad_norm": 4.09771203994751,
      "learning_rate": 7.067499626155354e-05,
      "loss": 2.6822,
      "step": 123
    },
    {
      "epoch": 0.03789152024446142,
      "grad_norm": 1.820434331893921,
      "learning_rate": 6.909830056250527e-05,
      "loss": 2.6724,
      "step": 124
    },
    {
      "epoch": 0.03819709702062643,
      "grad_norm": 1.6424354314804077,
      "learning_rate": 6.753005307953167e-05,
      "loss": 3.4209,
      "step": 125
    },
    {
      "epoch": 0.038502673796791446,
      "grad_norm": 2.45306134223938,
      "learning_rate": 6.59706825558357e-05,
      "loss": 3.502,
      "step": 126
    },
    {
      "epoch": 0.038808250572956456,
      "grad_norm": 1.5133211612701416,
      "learning_rate": 6.442061530774834e-05,
      "loss": 2.9659,
      "step": 127
    },
    {
      "epoch": 0.039113827349121465,
      "grad_norm": 2.0427510738372803,
      "learning_rate": 6.28802751081779e-05,
      "loss": 4.4254,
      "step": 128
    },
    {
      "epoch": 0.03941940412528648,
      "grad_norm": 1.8973280191421509,
      "learning_rate": 6.135008307075481e-05,
      "loss": 2.7392,
      "step": 129
    },
    {
      "epoch": 0.03972498090145149,
      "grad_norm": 1.3709115982055664,
      "learning_rate": 5.983045753470308e-05,
      "loss": 1.846,
      "step": 130
    },
    {
      "epoch": 0.0400305576776165,
      "grad_norm": 0.9821792244911194,
      "learning_rate": 5.832181395047098e-05,
      "loss": 2.6255,
      "step": 131
    },
    {
      "epoch": 0.040336134453781515,
      "grad_norm": 1.0302985906600952,
      "learning_rate": 5.6824564766150726e-05,
      "loss": 1.7235,
      "step": 132
    },
    {
      "epoch": 0.040641711229946524,
      "grad_norm": 1.5651659965515137,
      "learning_rate": 5.533911931471936e-05,
      "loss": 3.025,
      "step": 133
    },
    {
      "epoch": 0.04094728800611153,
      "grad_norm": 1.6474765539169312,
      "learning_rate": 5.386588370213124e-05,
      "loss": 2.6187,
      "step": 134
    },
    {
      "epoch": 0.04125286478227655,
      "grad_norm": 1.3223847150802612,
      "learning_rate": 5.240526069629265e-05,
      "loss": 2.7047,
      "step": 135
    },
    {
      "epoch": 0.04155844155844156,
      "grad_norm": 1.139236569404602,
      "learning_rate": 5.095764961694922e-05,
      "loss": 1.2376,
      "step": 136
    },
    {
      "epoch": 0.04186401833460657,
      "grad_norm": 1.7920728921890259,
      "learning_rate": 4.952344622651566e-05,
      "loss": 3.7943,
      "step": 137
    },
    {
      "epoch": 0.042169595110771584,
      "grad_norm": 1.287239909172058,
      "learning_rate": 4.810304262187852e-05,
      "loss": 3.2348,
      "step": 138
    },
    {
      "epoch": 0.04247517188693659,
      "grad_norm": 1.7391269207000732,
      "learning_rate": 4.669682712720065e-05,
      "loss": 4.3926,
      "step": 139
    },
    {
      "epoch": 0.0427807486631016,
      "grad_norm": 1.4766433238983154,
      "learning_rate": 4.530518418775733e-05,
      "loss": 1.1949,
      "step": 140
    },
    {
      "epoch": 0.04308632543926662,
      "grad_norm": 2.0691680908203125,
      "learning_rate": 4.392849426483274e-05,
      "loss": 2.6009,
      "step": 141
    },
    {
      "epoch": 0.04339190221543163,
      "grad_norm": 2.074167490005493,
      "learning_rate": 4.256713373170564e-05,
      "loss": 2.3074,
      "step": 142
    },
    {
      "epoch": 0.043697478991596636,
      "grad_norm": 1.3897058963775635,
      "learning_rate": 4.12214747707527e-05,
      "loss": 3.5031,
      "step": 143
    },
    {
      "epoch": 0.04400305576776165,
      "grad_norm": 1.8366340398788452,
      "learning_rate": 3.9891885271697496e-05,
      "loss": 3.5897,
      "step": 144
    },
    {
      "epoch": 0.04430863254392666,
      "grad_norm": 1.4808062314987183,
      "learning_rate": 3.857872873103322e-05,
      "loss": 3.3973,
      "step": 145
    },
    {
      "epoch": 0.04461420932009167,
      "grad_norm": 1.730080485343933,
      "learning_rate": 3.7282364152646297e-05,
      "loss": 3.0046,
      "step": 146
    },
    {
      "epoch": 0.044919786096256686,
      "grad_norm": 1.9429744482040405,
      "learning_rate": 3.600314594966834e-05,
      "loss": 3.086,
      "step": 147
    },
    {
      "epoch": 0.045225362872421696,
      "grad_norm": 1.349107265472412,
      "learning_rate": 3.4741423847583134e-05,
      "loss": 3.3598,
      "step": 148
    },
    {
      "epoch": 0.045530939648586705,
      "grad_norm": 1.0275979042053223,
      "learning_rate": 3.349754278861517e-05,
      "loss": 1.5396,
      "step": 149
    },
    {
      "epoch": 0.04583651642475172,
      "grad_norm": 1.412550449371338,
      "learning_rate": 3.227184283742591e-05,
      "loss": 1.6656,
      "step": 150
    },
    {
      "epoch": 0.04583651642475172,
      "eval_loss": 3.142651081085205,
      "eval_runtime": 43.5899,
      "eval_samples_per_second": 31.613,
      "eval_steps_per_second": 15.806,
      "step": 150
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.8344572102836224e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|