{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.030557677616501147, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00030557677616501144, "grad_norm": 2.4270410537719727, "learning_rate": 2e-05, "loss": 4.3691, "step": 1 }, { "epoch": 0.00030557677616501144, "eval_loss": 3.825514316558838, "eval_runtime": 44.62, "eval_samples_per_second": 30.883, "eval_steps_per_second": 15.442, "step": 1 }, { "epoch": 0.0006111535523300229, "grad_norm": 1.158055305480957, "learning_rate": 4e-05, "loss": 1.9265, "step": 2 }, { "epoch": 0.0009167303284950344, "grad_norm": 1.5009372234344482, "learning_rate": 6e-05, "loss": 4.1948, "step": 3 }, { "epoch": 0.0012223071046600458, "grad_norm": 1.4214242696762085, "learning_rate": 8e-05, "loss": 3.1941, "step": 4 }, { "epoch": 0.0015278838808250573, "grad_norm": 1.7152305841445923, "learning_rate": 0.0001, "loss": 3.3993, "step": 5 }, { "epoch": 0.0018334606569900688, "grad_norm": 1.6367850303649902, "learning_rate": 0.00012, "loss": 4.2454, "step": 6 }, { "epoch": 0.0021390374331550803, "grad_norm": 1.2402284145355225, "learning_rate": 0.00014, "loss": 2.3504, "step": 7 }, { "epoch": 0.0024446142093200915, "grad_norm": 1.5380349159240723, "learning_rate": 0.00016, "loss": 3.6695, "step": 8 }, { "epoch": 0.0027501909854851033, "grad_norm": 2.2115941047668457, "learning_rate": 0.00018, "loss": 3.8667, "step": 9 }, { "epoch": 0.0030557677616501145, "grad_norm": 2.173429489135742, "learning_rate": 0.0002, "loss": 3.0004, "step": 10 }, { "epoch": 0.0033613445378151263, "grad_norm": 2.2001826763153076, "learning_rate": 0.0001999863304992469, "loss": 2.6077, "step": 11 }, { "epoch": 0.0036669213139801375, "grad_norm": 2.234281539916992, "learning_rate": 0.00019994532573409262, "loss": 3.6697, "step": 12 }, { "epoch": 0.003972498090145149, "grad_norm": 2.308614492416382, "learning_rate": 0.00019987699691483048, "loss": 2.7118, "step": 13 }, { "epoch": 0.0042780748663101605, "grad_norm": 2.648648500442505, "learning_rate": 0.00019978136272187747, "loss": 3.2435, "step": 14 }, { "epoch": 0.004583651642475172, "grad_norm": 1.7936925888061523, "learning_rate": 0.000199658449300667, "loss": 2.1392, "step": 15 }, { "epoch": 0.004889228418640183, "grad_norm": 2.6367146968841553, "learning_rate": 0.00019950829025450114, "loss": 3.592, "step": 16 }, { "epoch": 0.005194805194805195, "grad_norm": 2.5174176692962646, "learning_rate": 0.00019933092663536382, "loss": 2.977, "step": 17 }, { "epoch": 0.0055003819709702065, "grad_norm": 2.296748399734497, "learning_rate": 0.00019912640693269752, "loss": 3.093, "step": 18 }, { "epoch": 0.005805958747135217, "grad_norm": 3.448976993560791, "learning_rate": 0.00019889478706014687, "loss": 3.0724, "step": 19 }, { "epoch": 0.006111535523300229, "grad_norm": 2.230823278427124, "learning_rate": 0.00019863613034027224, "loss": 2.7141, "step": 20 }, { "epoch": 0.006417112299465241, "grad_norm": 2.3661162853240967, "learning_rate": 0.00019835050748723824, "loss": 3.6851, "step": 21 }, { "epoch": 0.0067226890756302525, "grad_norm": 1.993086338043213, "learning_rate": 0.00019803799658748094, "loss": 3.8929, "step": 22 }, { "epoch": 0.007028265851795263, "grad_norm": 1.727256417274475, "learning_rate": 0.00019769868307835994, "loss": 2.7433, "step": 23 }, { "epoch": 0.007333842627960275, "grad_norm": 2.6137921810150146, "learning_rate": 0.0001973326597248006, "loss": 3.9081, "step": 24 }, { "epoch": 0.007639419404125287, "grad_norm": 1.9214521646499634, "learning_rate": 0.00019694002659393305, "loss": 3.451, "step": 25 }, { "epoch": 0.007944996180290298, "grad_norm": 2.5544991493225098, "learning_rate": 0.00019652089102773488, "loss": 2.4955, "step": 26 }, { "epoch": 0.00825057295645531, "grad_norm": 3.0405311584472656, "learning_rate": 0.00019607536761368484, "loss": 3.6993, "step": 27 }, { "epoch": 0.008556149732620321, "grad_norm": 1.412721037864685, "learning_rate": 0.00019560357815343577, "loss": 3.4813, "step": 28 }, { "epoch": 0.008861726508785332, "grad_norm": 1.844821572303772, "learning_rate": 0.00019510565162951537, "loss": 2.4911, "step": 29 }, { "epoch": 0.009167303284950344, "grad_norm": 2.185194730758667, "learning_rate": 0.00019458172417006347, "loss": 3.0409, "step": 30 }, { "epoch": 0.009472880061115355, "grad_norm": 2.528438091278076, "learning_rate": 0.00019403193901161613, "loss": 4.4194, "step": 31 }, { "epoch": 0.009778456837280366, "grad_norm": 2.3545053005218506, "learning_rate": 0.0001934564464599461, "loss": 3.9219, "step": 32 }, { "epoch": 0.010084033613445379, "grad_norm": 2.4586379528045654, "learning_rate": 0.00019285540384897073, "loss": 3.8094, "step": 33 }, { "epoch": 0.01038961038961039, "grad_norm": 2.5422775745391846, "learning_rate": 0.00019222897549773848, "loss": 1.9784, "step": 34 }, { "epoch": 0.0106951871657754, "grad_norm": 1.6043715476989746, "learning_rate": 0.00019157733266550575, "loss": 2.5754, "step": 35 }, { "epoch": 0.011000763941940413, "grad_norm": 2.428220510482788, "learning_rate": 0.00019090065350491626, "loss": 3.7738, "step": 36 }, { "epoch": 0.011306340718105424, "grad_norm": 2.145481586456299, "learning_rate": 0.00019019912301329592, "loss": 3.2982, "step": 37 }, { "epoch": 0.011611917494270435, "grad_norm": 1.5569844245910645, "learning_rate": 0.00018947293298207635, "loss": 1.7218, "step": 38 }, { "epoch": 0.011917494270435447, "grad_norm": 2.1777217388153076, "learning_rate": 0.0001887222819443612, "loss": 2.581, "step": 39 }, { "epoch": 0.012223071046600458, "grad_norm": 2.7405147552490234, "learning_rate": 0.0001879473751206489, "loss": 3.8181, "step": 40 }, { "epoch": 0.012528647822765469, "grad_norm": 1.6006349325180054, "learning_rate": 0.00018714842436272773, "loss": 2.0009, "step": 41 }, { "epoch": 0.012834224598930482, "grad_norm": 1.8577955961227417, "learning_rate": 0.00018632564809575742, "loss": 3.3277, "step": 42 }, { "epoch": 0.013139801375095492, "grad_norm": 1.7019318342208862, "learning_rate": 0.0001854792712585539, "loss": 3.6416, "step": 43 }, { "epoch": 0.013445378151260505, "grad_norm": 2.172180414199829, "learning_rate": 0.00018460952524209355, "loss": 3.5653, "step": 44 }, { "epoch": 0.013750954927425516, "grad_norm": 1.173954963684082, "learning_rate": 0.00018371664782625287, "loss": 2.4792, "step": 45 }, { "epoch": 0.014056531703590527, "grad_norm": 2.2119481563568115, "learning_rate": 0.00018280088311480201, "loss": 3.7178, "step": 46 }, { "epoch": 0.01436210847975554, "grad_norm": 1.043997883796692, "learning_rate": 0.00018186248146866927, "loss": 2.3083, "step": 47 }, { "epoch": 0.01466768525592055, "grad_norm": 1.6460810899734497, "learning_rate": 0.00018090169943749476, "loss": 2.4319, "step": 48 }, { "epoch": 0.014973262032085561, "grad_norm": 1.5691938400268555, "learning_rate": 0.0001799187996894925, "loss": 2.5601, "step": 49 }, { "epoch": 0.015278838808250574, "grad_norm": 2.165229320526123, "learning_rate": 0.00017891405093963938, "loss": 3.1488, "step": 50 }, { "epoch": 0.015278838808250574, "eval_loss": 3.244551658630371, "eval_runtime": 43.5009, "eval_samples_per_second": 31.678, "eval_steps_per_second": 15.839, "step": 50 }, { "epoch": 0.015584415584415584, "grad_norm": 2.4259822368621826, "learning_rate": 0.00017788772787621126, "loss": 4.0701, "step": 51 }, { "epoch": 0.015889992360580595, "grad_norm": 1.828935146331787, "learning_rate": 0.00017684011108568592, "loss": 3.8272, "step": 52 }, { "epoch": 0.016195569136745608, "grad_norm": 1.6015702486038208, "learning_rate": 0.0001757714869760335, "loss": 4.0941, "step": 53 }, { "epoch": 0.01650114591291062, "grad_norm": 1.4622135162353516, "learning_rate": 0.0001746821476984154, "loss": 2.8427, "step": 54 }, { "epoch": 0.01680672268907563, "grad_norm": 2.106966018676758, "learning_rate": 0.00017357239106731317, "loss": 2.3866, "step": 55 }, { "epoch": 0.017112299465240642, "grad_norm": 2.9907472133636475, "learning_rate": 0.00017244252047910892, "loss": 3.2555, "step": 56 }, { "epoch": 0.017417876241405655, "grad_norm": 1.2781822681427002, "learning_rate": 0.00017129284482913972, "loss": 2.8657, "step": 57 }, { "epoch": 0.017723453017570664, "grad_norm": 1.5655877590179443, "learning_rate": 0.00017012367842724887, "loss": 2.6976, "step": 58 }, { "epoch": 0.018029029793735676, "grad_norm": 1.7124484777450562, "learning_rate": 0.0001689353409118566, "loss": 2.5161, "step": 59 }, { "epoch": 0.01833460656990069, "grad_norm": 2.1622180938720703, "learning_rate": 0.00016772815716257412, "loss": 2.9102, "step": 60 }, { "epoch": 0.018640183346065698, "grad_norm": 1.4135297536849976, "learning_rate": 0.0001665024572113848, "loss": 1.7485, "step": 61 }, { "epoch": 0.01894576012223071, "grad_norm": 1.8781421184539795, "learning_rate": 0.00016525857615241687, "loss": 3.8095, "step": 62 }, { "epoch": 0.019251336898395723, "grad_norm": 1.5626355409622192, "learning_rate": 0.00016399685405033167, "loss": 1.9366, "step": 63 }, { "epoch": 0.019556913674560732, "grad_norm": 1.412752389907837, "learning_rate": 0.0001627176358473537, "loss": 1.9336, "step": 64 }, { "epoch": 0.019862490450725745, "grad_norm": 5.998400688171387, "learning_rate": 0.0001614212712689668, "loss": 3.1919, "step": 65 }, { "epoch": 0.020168067226890758, "grad_norm": 1.5334243774414062, "learning_rate": 0.00016010811472830252, "loss": 2.7113, "step": 66 }, { "epoch": 0.020473644003055767, "grad_norm": 2.1354057788848877, "learning_rate": 0.00015877852522924732, "loss": 2.988, "step": 67 }, { "epoch": 0.02077922077922078, "grad_norm": 1.9327161312103271, "learning_rate": 0.00015743286626829437, "loss": 2.9572, "step": 68 }, { "epoch": 0.021084797555385792, "grad_norm": 2.1718881130218506, "learning_rate": 0.0001560715057351673, "loss": 3.5648, "step": 69 }, { "epoch": 0.0213903743315508, "grad_norm": 1.8955811262130737, "learning_rate": 0.00015469481581224272, "loss": 3.1244, "step": 70 }, { "epoch": 0.021695951107715813, "grad_norm": 1.5830830335617065, "learning_rate": 0.0001533031728727994, "loss": 2.7921, "step": 71 }, { "epoch": 0.022001527883880826, "grad_norm": 2.2739856243133545, "learning_rate": 0.00015189695737812152, "loss": 3.5455, "step": 72 }, { "epoch": 0.022307104660045835, "grad_norm": 0.9828822612762451, "learning_rate": 0.0001504765537734844, "loss": 1.6882, "step": 73 }, { "epoch": 0.022612681436210848, "grad_norm": 1.9325013160705566, "learning_rate": 0.00014904235038305083, "loss": 2.1146, "step": 74 }, { "epoch": 0.02291825821237586, "grad_norm": 1.3537631034851074, "learning_rate": 0.00014759473930370736, "loss": 2.2816, "step": 75 }, { "epoch": 0.02322383498854087, "grad_norm": 1.826690673828125, "learning_rate": 0.0001461341162978688, "loss": 1.5235, "step": 76 }, { "epoch": 0.023529411764705882, "grad_norm": 1.8681014776229858, "learning_rate": 0.00014466088068528068, "loss": 4.0703, "step": 77 }, { "epoch": 0.023834988540870895, "grad_norm": 4.881453514099121, "learning_rate": 0.00014317543523384928, "loss": 3.2547, "step": 78 }, { "epoch": 0.024140565317035904, "grad_norm": 2.301090955734253, "learning_rate": 0.00014167818604952906, "loss": 2.3378, "step": 79 }, { "epoch": 0.024446142093200916, "grad_norm": 1.1395305395126343, "learning_rate": 0.00014016954246529696, "loss": 1.5157, "step": 80 }, { "epoch": 0.02475171886936593, "grad_norm": 1.7658803462982178, "learning_rate": 0.00013864991692924523, "loss": 4.2361, "step": 81 }, { "epoch": 0.025057295645530938, "grad_norm": 1.827609896659851, "learning_rate": 0.00013711972489182208, "loss": 3.0601, "step": 82 }, { "epoch": 0.02536287242169595, "grad_norm": 1.9651907682418823, "learning_rate": 0.00013557938469225167, "loss": 3.6306, "step": 83 }, { "epoch": 0.025668449197860963, "grad_norm": 2.074267625808716, "learning_rate": 0.00013402931744416433, "loss": 2.9667, "step": 84 }, { "epoch": 0.025974025974025976, "grad_norm": 1.3315553665161133, "learning_rate": 0.00013246994692046836, "loss": 1.88, "step": 85 }, { "epoch": 0.026279602750190985, "grad_norm": 1.5968420505523682, "learning_rate": 0.00013090169943749476, "loss": 3.7276, "step": 86 }, { "epoch": 0.026585179526355997, "grad_norm": 1.9459774494171143, "learning_rate": 0.0001293250037384465, "loss": 2.5366, "step": 87 }, { "epoch": 0.02689075630252101, "grad_norm": 1.9473600387573242, "learning_rate": 0.00012774029087618446, "loss": 3.8513, "step": 88 }, { "epoch": 0.02719633307868602, "grad_norm": 1.4431513547897339, "learning_rate": 0.00012614799409538198, "loss": 2.5023, "step": 89 }, { "epoch": 0.02750190985485103, "grad_norm": 3.223552703857422, "learning_rate": 0.00012454854871407994, "loss": 2.0943, "step": 90 }, { "epoch": 0.027807486631016044, "grad_norm": 2.679762363433838, "learning_rate": 0.00012294239200467516, "loss": 3.3053, "step": 91 }, { "epoch": 0.028113063407181053, "grad_norm": 2.0697975158691406, "learning_rate": 0.0001213299630743747, "loss": 3.5176, "step": 92 }, { "epoch": 0.028418640183346066, "grad_norm": 2.661999464035034, "learning_rate": 0.00011971170274514802, "loss": 3.7355, "step": 93 }, { "epoch": 0.02872421695951108, "grad_norm": 1.8615680932998657, "learning_rate": 0.000118088053433211, "loss": 2.0932, "step": 94 }, { "epoch": 0.029029793735676088, "grad_norm": 2.583749532699585, "learning_rate": 0.00011645945902807341, "loss": 4.0713, "step": 95 }, { "epoch": 0.0293353705118411, "grad_norm": 1.8530974388122559, "learning_rate": 0.0001148263647711842, "loss": 1.5702, "step": 96 }, { "epoch": 0.029640947288006113, "grad_norm": 1.7810598611831665, "learning_rate": 0.00011318921713420691, "loss": 3.71, "step": 97 }, { "epoch": 0.029946524064171122, "grad_norm": 2.1363232135772705, "learning_rate": 0.00011154846369695863, "loss": 3.1915, "step": 98 }, { "epoch": 0.030252100840336135, "grad_norm": 1.8341383934020996, "learning_rate": 0.0001099045530250463, "loss": 2.4543, "step": 99 }, { "epoch": 0.030557677616501147, "grad_norm": 1.8934003114700317, "learning_rate": 0.00010825793454723325, "loss": 2.6079, "step": 100 }, { "epoch": 0.030557677616501147, "eval_loss": 3.186122417449951, "eval_runtime": 43.6356, "eval_samples_per_second": 31.58, "eval_steps_per_second": 15.79, "step": 100 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.895803354349568e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }