{ "best_metric": 1.3859784603118896, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.055348000553480006, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002767400027674, "grad_norm": 0.852175772190094, "learning_rate": 1.012e-05, "loss": 2.4008, "step": 1 }, { "epoch": 0.0002767400027674, "eval_loss": 1.7593120336532593, "eval_runtime": 153.6628, "eval_samples_per_second": 9.905, "eval_steps_per_second": 2.479, "step": 1 }, { "epoch": 0.0005534800055348, "grad_norm": 0.935762882232666, "learning_rate": 2.024e-05, "loss": 2.555, "step": 2 }, { "epoch": 0.0008302200083022001, "grad_norm": 0.978963315486908, "learning_rate": 3.0359999999999997e-05, "loss": 2.6445, "step": 3 }, { "epoch": 0.0011069600110696, "grad_norm": 1.0214682817459106, "learning_rate": 4.048e-05, "loss": 2.685, "step": 4 }, { "epoch": 0.0013837000138370002, "grad_norm": 1.059704065322876, "learning_rate": 5.06e-05, "loss": 1.7658, "step": 5 }, { "epoch": 0.0016604400166044002, "grad_norm": 0.9298608899116516, "learning_rate": 6.0719999999999995e-05, "loss": 1.6336, "step": 6 }, { "epoch": 0.0019371800193718, "grad_norm": 0.8906842470169067, "learning_rate": 7.083999999999999e-05, "loss": 1.54, "step": 7 }, { "epoch": 0.0022139200221392, "grad_norm": 0.8215664029121399, "learning_rate": 8.096e-05, "loss": 1.5611, "step": 8 }, { "epoch": 0.0024906600249066002, "grad_norm": 0.8630419373512268, "learning_rate": 9.108e-05, "loss": 1.5775, "step": 9 }, { "epoch": 0.0027674000276740004, "grad_norm": 0.8396645188331604, "learning_rate": 0.0001012, "loss": 1.6161, "step": 10 }, { "epoch": 0.0030441400304414, "grad_norm": 0.810003936290741, "learning_rate": 0.00010066736842105262, "loss": 1.293, "step": 11 }, { "epoch": 0.0033208800332088003, "grad_norm": 0.7545539736747742, "learning_rate": 0.00010013473684210525, "loss": 1.3874, "step": 12 }, { "epoch": 0.0035976200359762005, "grad_norm": 0.8366276621818542, "learning_rate": 9.960210526315788e-05, "loss": 1.5619, "step": 13 }, { "epoch": 0.0038743600387436, "grad_norm": 0.8285793662071228, "learning_rate": 9.906947368421052e-05, "loss": 1.3301, "step": 14 }, { "epoch": 0.004151100041511001, "grad_norm": 0.9211393594741821, "learning_rate": 9.853684210526316e-05, "loss": 1.4689, "step": 15 }, { "epoch": 0.0044278400442784, "grad_norm": 0.788477897644043, "learning_rate": 9.800421052631579e-05, "loss": 1.2793, "step": 16 }, { "epoch": 0.0047045800470458, "grad_norm": 0.8850870728492737, "learning_rate": 9.747157894736841e-05, "loss": 1.3232, "step": 17 }, { "epoch": 0.0049813200498132005, "grad_norm": 0.7503834962844849, "learning_rate": 9.693894736842104e-05, "loss": 1.2805, "step": 18 }, { "epoch": 0.005258060052580601, "grad_norm": 0.9954808950424194, "learning_rate": 9.640631578947367e-05, "loss": 1.3117, "step": 19 }, { "epoch": 0.005534800055348001, "grad_norm": 0.9654945135116577, "learning_rate": 9.58736842105263e-05, "loss": 1.2588, "step": 20 }, { "epoch": 0.005811540058115401, "grad_norm": 1.1801338195800781, "learning_rate": 9.534105263157894e-05, "loss": 1.3144, "step": 21 }, { "epoch": 0.0060882800608828, "grad_norm": 1.1572587490081787, "learning_rate": 9.480842105263158e-05, "loss": 1.3957, "step": 22 }, { "epoch": 0.0063650200636502004, "grad_norm": 0.9956503510475159, "learning_rate": 9.427578947368421e-05, "loss": 1.1871, "step": 23 }, { "epoch": 0.006641760066417601, "grad_norm": 0.9935819506645203, "learning_rate": 9.374315789473684e-05, "loss": 1.2236, "step": 24 }, { "epoch": 0.006918500069185001, "grad_norm": 1.0050981044769287, "learning_rate": 9.321052631578946e-05, "loss": 1.584, "step": 25 }, { "epoch": 0.007195240071952401, "grad_norm": 1.1631112098693848, "learning_rate": 9.267789473684209e-05, "loss": 1.387, "step": 26 }, { "epoch": 0.007471980074719801, "grad_norm": 0.9945346713066101, "learning_rate": 9.214526315789473e-05, "loss": 1.3317, "step": 27 }, { "epoch": 0.0077487200774872, "grad_norm": 1.177724838256836, "learning_rate": 9.161263157894736e-05, "loss": 1.2419, "step": 28 }, { "epoch": 0.008025460080254601, "grad_norm": 0.9147948026657104, "learning_rate": 9.108e-05, "loss": 1.2363, "step": 29 }, { "epoch": 0.008302200083022002, "grad_norm": 1.0739562511444092, "learning_rate": 9.054736842105263e-05, "loss": 1.3926, "step": 30 }, { "epoch": 0.0085789400857894, "grad_norm": 0.9552971720695496, "learning_rate": 9.001473684210526e-05, "loss": 1.2376, "step": 31 }, { "epoch": 0.0088556800885568, "grad_norm": 1.0618211030960083, "learning_rate": 8.948210526315789e-05, "loss": 1.3261, "step": 32 }, { "epoch": 0.0091324200913242, "grad_norm": 1.0548874139785767, "learning_rate": 8.894947368421051e-05, "loss": 1.3451, "step": 33 }, { "epoch": 0.0094091600940916, "grad_norm": 1.030394434928894, "learning_rate": 8.841684210526315e-05, "loss": 1.1694, "step": 34 }, { "epoch": 0.009685900096859, "grad_norm": 1.0048632621765137, "learning_rate": 8.788421052631578e-05, "loss": 1.3008, "step": 35 }, { "epoch": 0.009962640099626401, "grad_norm": 1.1488323211669922, "learning_rate": 8.735157894736842e-05, "loss": 1.3874, "step": 36 }, { "epoch": 0.010239380102393801, "grad_norm": 1.2645336389541626, "learning_rate": 8.681894736842105e-05, "loss": 1.4368, "step": 37 }, { "epoch": 0.010516120105161201, "grad_norm": 1.1045883893966675, "learning_rate": 8.628631578947368e-05, "loss": 1.2961, "step": 38 }, { "epoch": 0.010792860107928601, "grad_norm": 1.221477746963501, "learning_rate": 8.575368421052631e-05, "loss": 1.3975, "step": 39 }, { "epoch": 0.011069600110696002, "grad_norm": 1.4169154167175293, "learning_rate": 8.522105263157893e-05, "loss": 1.3287, "step": 40 }, { "epoch": 0.011346340113463402, "grad_norm": 0.9626021981239319, "learning_rate": 8.468842105263158e-05, "loss": 1.1188, "step": 41 }, { "epoch": 0.011623080116230802, "grad_norm": 1.2021511793136597, "learning_rate": 8.41557894736842e-05, "loss": 1.2036, "step": 42 }, { "epoch": 0.0118998201189982, "grad_norm": 1.3119330406188965, "learning_rate": 8.362315789473683e-05, "loss": 1.4336, "step": 43 }, { "epoch": 0.0121765601217656, "grad_norm": 1.5164241790771484, "learning_rate": 8.309052631578947e-05, "loss": 1.5811, "step": 44 }, { "epoch": 0.012453300124533, "grad_norm": 1.4372763633728027, "learning_rate": 8.25578947368421e-05, "loss": 1.3545, "step": 45 }, { "epoch": 0.012730040127300401, "grad_norm": 1.3324321508407593, "learning_rate": 8.202526315789473e-05, "loss": 1.4718, "step": 46 }, { "epoch": 0.013006780130067801, "grad_norm": 1.5894691944122314, "learning_rate": 8.149263157894736e-05, "loss": 1.5597, "step": 47 }, { "epoch": 0.013283520132835201, "grad_norm": 1.7893458604812622, "learning_rate": 8.096e-05, "loss": 1.5236, "step": 48 }, { "epoch": 0.013560260135602601, "grad_norm": 2.089571714401245, "learning_rate": 8.042736842105263e-05, "loss": 1.5392, "step": 49 }, { "epoch": 0.013837000138370002, "grad_norm": 2.590139865875244, "learning_rate": 7.989473684210525e-05, "loss": 1.7109, "step": 50 }, { "epoch": 0.013837000138370002, "eval_loss": 1.4664398431777954, "eval_runtime": 155.4729, "eval_samples_per_second": 9.789, "eval_steps_per_second": 2.451, "step": 50 }, { "epoch": 0.014113740141137402, "grad_norm": 0.5701592564582825, "learning_rate": 7.93621052631579e-05, "loss": 2.6106, "step": 51 }, { "epoch": 0.014390480143904802, "grad_norm": 0.5594712495803833, "learning_rate": 7.882947368421052e-05, "loss": 2.5369, "step": 52 }, { "epoch": 0.014667220146672202, "grad_norm": 0.7236589789390564, "learning_rate": 7.829684210526315e-05, "loss": 2.8481, "step": 53 }, { "epoch": 0.014943960149439602, "grad_norm": 0.6818345785140991, "learning_rate": 7.776421052631578e-05, "loss": 2.5326, "step": 54 }, { "epoch": 0.015220700152207, "grad_norm": 0.6065534353256226, "learning_rate": 7.723157894736842e-05, "loss": 1.8406, "step": 55 }, { "epoch": 0.0154974401549744, "grad_norm": 0.7356017827987671, "learning_rate": 7.669894736842105e-05, "loss": 1.7761, "step": 56 }, { "epoch": 0.0157741801577418, "grad_norm": 0.7852395176887512, "learning_rate": 7.616631578947367e-05, "loss": 1.725, "step": 57 }, { "epoch": 0.016050920160509203, "grad_norm": 0.5799427032470703, "learning_rate": 7.563368421052632e-05, "loss": 1.7881, "step": 58 }, { "epoch": 0.0163276601632766, "grad_norm": 0.8190228343009949, "learning_rate": 7.510105263157894e-05, "loss": 1.588, "step": 59 }, { "epoch": 0.016604400166044003, "grad_norm": 0.6333599090576172, "learning_rate": 7.456842105263157e-05, "loss": 1.2655, "step": 60 }, { "epoch": 0.0168811401688114, "grad_norm": 1.3629286289215088, "learning_rate": 7.403578947368421e-05, "loss": 1.3786, "step": 61 }, { "epoch": 0.0171578801715788, "grad_norm": 0.6778807640075684, "learning_rate": 7.350315789473684e-05, "loss": 1.348, "step": 62 }, { "epoch": 0.017434620174346202, "grad_norm": 0.6999670267105103, "learning_rate": 7.297052631578947e-05, "loss": 1.4429, "step": 63 }, { "epoch": 0.0177113601771136, "grad_norm": 0.6831374764442444, "learning_rate": 7.24378947368421e-05, "loss": 1.194, "step": 64 }, { "epoch": 0.017988100179881002, "grad_norm": 0.652746856212616, "learning_rate": 7.190526315789474e-05, "loss": 1.267, "step": 65 }, { "epoch": 0.0182648401826484, "grad_norm": 0.7085557579994202, "learning_rate": 7.137263157894736e-05, "loss": 1.3692, "step": 66 }, { "epoch": 0.018541580185415803, "grad_norm": 0.752521812915802, "learning_rate": 7.083999999999999e-05, "loss": 1.1922, "step": 67 }, { "epoch": 0.0188183201881832, "grad_norm": 0.7442921996116638, "learning_rate": 7.030736842105263e-05, "loss": 1.2814, "step": 68 }, { "epoch": 0.019095060190950603, "grad_norm": 0.8117621541023254, "learning_rate": 6.977473684210526e-05, "loss": 1.2822, "step": 69 }, { "epoch": 0.019371800193718, "grad_norm": 0.7203373312950134, "learning_rate": 6.924210526315789e-05, "loss": 1.0517, "step": 70 }, { "epoch": 0.019648540196485403, "grad_norm": 0.7640715837478638, "learning_rate": 6.870947368421052e-05, "loss": 1.1189, "step": 71 }, { "epoch": 0.019925280199252802, "grad_norm": 0.8596171140670776, "learning_rate": 6.817684210526316e-05, "loss": 1.2053, "step": 72 }, { "epoch": 0.020202020202020204, "grad_norm": 0.9384458661079407, "learning_rate": 6.764421052631579e-05, "loss": 1.2805, "step": 73 }, { "epoch": 0.020478760204787602, "grad_norm": 0.8166313171386719, "learning_rate": 6.711157894736841e-05, "loss": 1.1936, "step": 74 }, { "epoch": 0.020755500207555, "grad_norm": 0.8672533631324768, "learning_rate": 6.657894736842106e-05, "loss": 1.1106, "step": 75 }, { "epoch": 0.021032240210322403, "grad_norm": 0.8567204475402832, "learning_rate": 6.604631578947368e-05, "loss": 1.6046, "step": 76 }, { "epoch": 0.0213089802130898, "grad_norm": 0.9276119470596313, "learning_rate": 6.551368421052631e-05, "loss": 1.3299, "step": 77 }, { "epoch": 0.021585720215857203, "grad_norm": 0.8920117616653442, "learning_rate": 6.498105263157894e-05, "loss": 1.2766, "step": 78 }, { "epoch": 0.0218624602186246, "grad_norm": 0.8522023558616638, "learning_rate": 6.444842105263157e-05, "loss": 1.3039, "step": 79 }, { "epoch": 0.022139200221392003, "grad_norm": 0.9613226056098938, "learning_rate": 6.391578947368421e-05, "loss": 1.2998, "step": 80 }, { "epoch": 0.0224159402241594, "grad_norm": 0.8439605832099915, "learning_rate": 6.338315789473684e-05, "loss": 1.239, "step": 81 }, { "epoch": 0.022692680226926804, "grad_norm": 0.8934568166732788, "learning_rate": 6.285052631578948e-05, "loss": 1.3884, "step": 82 }, { "epoch": 0.022969420229694202, "grad_norm": 0.9310352802276611, "learning_rate": 6.23178947368421e-05, "loss": 1.3431, "step": 83 }, { "epoch": 0.023246160232461604, "grad_norm": 0.8858011364936829, "learning_rate": 6.178526315789473e-05, "loss": 1.2641, "step": 84 }, { "epoch": 0.023522900235229002, "grad_norm": 0.9092288017272949, "learning_rate": 6.125263157894736e-05, "loss": 1.2965, "step": 85 }, { "epoch": 0.0237996402379964, "grad_norm": 0.9616385102272034, "learning_rate": 6.0719999999999995e-05, "loss": 1.2244, "step": 86 }, { "epoch": 0.024076380240763803, "grad_norm": 0.9041621088981628, "learning_rate": 6.018736842105262e-05, "loss": 1.3132, "step": 87 }, { "epoch": 0.0243531202435312, "grad_norm": 0.9865350127220154, "learning_rate": 5.965473684210526e-05, "loss": 1.3897, "step": 88 }, { "epoch": 0.024629860246298603, "grad_norm": 1.0935101509094238, "learning_rate": 5.912210526315789e-05, "loss": 1.2502, "step": 89 }, { "epoch": 0.024906600249066, "grad_norm": 0.8915322422981262, "learning_rate": 5.8589473684210526e-05, "loss": 1.2404, "step": 90 }, { "epoch": 0.025183340251833403, "grad_norm": 1.0322345495224, "learning_rate": 5.8056842105263154e-05, "loss": 1.4389, "step": 91 }, { "epoch": 0.025460080254600802, "grad_norm": 1.1132317781448364, "learning_rate": 5.752421052631578e-05, "loss": 1.3657, "step": 92 }, { "epoch": 0.025736820257368204, "grad_norm": 1.0486310720443726, "learning_rate": 5.6991578947368416e-05, "loss": 1.3841, "step": 93 }, { "epoch": 0.026013560260135602, "grad_norm": 1.1889928579330444, "learning_rate": 5.6458947368421044e-05, "loss": 1.4876, "step": 94 }, { "epoch": 0.026290300262903004, "grad_norm": 1.1577107906341553, "learning_rate": 5.5926315789473685e-05, "loss": 1.2857, "step": 95 }, { "epoch": 0.026567040265670402, "grad_norm": 1.280145525932312, "learning_rate": 5.539368421052631e-05, "loss": 1.29, "step": 96 }, { "epoch": 0.026843780268437804, "grad_norm": 1.3187057971954346, "learning_rate": 5.486105263157895e-05, "loss": 1.3489, "step": 97 }, { "epoch": 0.027120520271205203, "grad_norm": 1.4107880592346191, "learning_rate": 5.4328421052631575e-05, "loss": 1.4531, "step": 98 }, { "epoch": 0.0273972602739726, "grad_norm": 1.625570297241211, "learning_rate": 5.37957894736842e-05, "loss": 1.318, "step": 99 }, { "epoch": 0.027674000276740003, "grad_norm": 1.671846866607666, "learning_rate": 5.326315789473684e-05, "loss": 1.619, "step": 100 }, { "epoch": 0.027674000276740003, "eval_loss": 1.4342150688171387, "eval_runtime": 153.6483, "eval_samples_per_second": 9.906, "eval_steps_per_second": 2.48, "step": 100 }, { "epoch": 0.0279507402795074, "grad_norm": 0.6019634008407593, "learning_rate": 5.2730526315789465e-05, "loss": 2.7598, "step": 101 }, { "epoch": 0.028227480282274803, "grad_norm": 0.5791364312171936, "learning_rate": 5.2197894736842107e-05, "loss": 2.3475, "step": 102 }, { "epoch": 0.028504220285042202, "grad_norm": 0.6060399413108826, "learning_rate": 5.1665263157894734e-05, "loss": 2.2584, "step": 103 }, { "epoch": 0.028780960287809604, "grad_norm": 0.6558189392089844, "learning_rate": 5.113263157894737e-05, "loss": 2.0299, "step": 104 }, { "epoch": 0.029057700290577002, "grad_norm": 0.7048506140708923, "learning_rate": 5.06e-05, "loss": 1.7873, "step": 105 }, { "epoch": 0.029334440293344404, "grad_norm": 0.7497207522392273, "learning_rate": 5.0067368421052624e-05, "loss": 1.4258, "step": 106 }, { "epoch": 0.029611180296111803, "grad_norm": 0.6850358247756958, "learning_rate": 4.953473684210526e-05, "loss": 1.3026, "step": 107 }, { "epoch": 0.029887920298879204, "grad_norm": 0.6229945421218872, "learning_rate": 4.9002105263157893e-05, "loss": 1.3109, "step": 108 }, { "epoch": 0.030164660301646603, "grad_norm": 0.7342796921730042, "learning_rate": 4.846947368421052e-05, "loss": 1.5, "step": 109 }, { "epoch": 0.030441400304414, "grad_norm": 0.656924307346344, "learning_rate": 4.793684210526315e-05, "loss": 1.5616, "step": 110 }, { "epoch": 0.030718140307181403, "grad_norm": 0.6830202341079712, "learning_rate": 4.740421052631579e-05, "loss": 1.144, "step": 111 }, { "epoch": 0.0309948803099488, "grad_norm": 0.6222048997879028, "learning_rate": 4.687157894736842e-05, "loss": 1.0449, "step": 112 }, { "epoch": 0.0312716203127162, "grad_norm": 0.6360278129577637, "learning_rate": 4.6338947368421046e-05, "loss": 1.0367, "step": 113 }, { "epoch": 0.0315483603154836, "grad_norm": 0.7706067562103271, "learning_rate": 4.580631578947368e-05, "loss": 1.5067, "step": 114 }, { "epoch": 0.031825100318251004, "grad_norm": 0.7164391279220581, "learning_rate": 4.5273684210526315e-05, "loss": 1.0675, "step": 115 }, { "epoch": 0.032101840321018406, "grad_norm": 0.6832680702209473, "learning_rate": 4.474105263157894e-05, "loss": 1.222, "step": 116 }, { "epoch": 0.0323785803237858, "grad_norm": 0.810901403427124, "learning_rate": 4.420842105263158e-05, "loss": 1.1712, "step": 117 }, { "epoch": 0.0326553203265532, "grad_norm": 0.660420298576355, "learning_rate": 4.367578947368421e-05, "loss": 1.3118, "step": 118 }, { "epoch": 0.032932060329320605, "grad_norm": 0.7108460664749146, "learning_rate": 4.314315789473684e-05, "loss": 1.2022, "step": 119 }, { "epoch": 0.033208800332088007, "grad_norm": 0.680159330368042, "learning_rate": 4.261052631578947e-05, "loss": 1.1978, "step": 120 }, { "epoch": 0.0334855403348554, "grad_norm": 0.8145832419395447, "learning_rate": 4.20778947368421e-05, "loss": 1.4316, "step": 121 }, { "epoch": 0.0337622803376228, "grad_norm": 0.808811604976654, "learning_rate": 4.1545263157894736e-05, "loss": 1.2853, "step": 122 }, { "epoch": 0.034039020340390205, "grad_norm": 0.7780376076698303, "learning_rate": 4.1012631578947364e-05, "loss": 1.3115, "step": 123 }, { "epoch": 0.0343157603431576, "grad_norm": 0.7685719728469849, "learning_rate": 4.048e-05, "loss": 1.2184, "step": 124 }, { "epoch": 0.034592500345925, "grad_norm": 0.7712112665176392, "learning_rate": 3.9947368421052626e-05, "loss": 1.1671, "step": 125 }, { "epoch": 0.034869240348692404, "grad_norm": 0.8383251428604126, "learning_rate": 3.941473684210526e-05, "loss": 1.2699, "step": 126 }, { "epoch": 0.035145980351459806, "grad_norm": 0.8275764584541321, "learning_rate": 3.888210526315789e-05, "loss": 1.1739, "step": 127 }, { "epoch": 0.0354227203542272, "grad_norm": 0.8801849484443665, "learning_rate": 3.834947368421052e-05, "loss": 1.4452, "step": 128 }, { "epoch": 0.0356994603569946, "grad_norm": 0.7709352374076843, "learning_rate": 3.781684210526316e-05, "loss": 1.1203, "step": 129 }, { "epoch": 0.035976200359762005, "grad_norm": 0.9169853329658508, "learning_rate": 3.7284210526315786e-05, "loss": 1.1997, "step": 130 }, { "epoch": 0.03625294036252941, "grad_norm": 0.8363617062568665, "learning_rate": 3.675157894736842e-05, "loss": 1.3281, "step": 131 }, { "epoch": 0.0365296803652968, "grad_norm": 0.8911722302436829, "learning_rate": 3.621894736842105e-05, "loss": 1.2928, "step": 132 }, { "epoch": 0.036806420368064204, "grad_norm": 0.8886839747428894, "learning_rate": 3.568631578947368e-05, "loss": 1.1933, "step": 133 }, { "epoch": 0.037083160370831605, "grad_norm": 0.8812587261199951, "learning_rate": 3.515368421052632e-05, "loss": 1.1706, "step": 134 }, { "epoch": 0.037359900373599, "grad_norm": 0.9277761578559875, "learning_rate": 3.4621052631578945e-05, "loss": 1.3746, "step": 135 }, { "epoch": 0.0376366403763664, "grad_norm": 0.9608489871025085, "learning_rate": 3.408842105263158e-05, "loss": 1.3974, "step": 136 }, { "epoch": 0.037913380379133804, "grad_norm": 0.9445280432701111, "learning_rate": 3.355578947368421e-05, "loss": 1.4277, "step": 137 }, { "epoch": 0.038190120381901206, "grad_norm": 0.9991944432258606, "learning_rate": 3.302315789473684e-05, "loss": 1.3596, "step": 138 }, { "epoch": 0.0384668603846686, "grad_norm": 1.0606067180633545, "learning_rate": 3.249052631578947e-05, "loss": 1.3531, "step": 139 }, { "epoch": 0.038743600387436, "grad_norm": 0.8716339468955994, "learning_rate": 3.1957894736842104e-05, "loss": 1.2648, "step": 140 }, { "epoch": 0.039020340390203405, "grad_norm": 1.0521880388259888, "learning_rate": 3.142526315789474e-05, "loss": 1.498, "step": 141 }, { "epoch": 0.03929708039297081, "grad_norm": 1.025915265083313, "learning_rate": 3.0892631578947366e-05, "loss": 1.3495, "step": 142 }, { "epoch": 0.0395738203957382, "grad_norm": 1.0220515727996826, "learning_rate": 3.0359999999999997e-05, "loss": 1.2362, "step": 143 }, { "epoch": 0.039850560398505604, "grad_norm": 0.9568931460380554, "learning_rate": 2.982736842105263e-05, "loss": 1.1941, "step": 144 }, { "epoch": 0.040127300401273006, "grad_norm": 1.0511956214904785, "learning_rate": 2.9294736842105263e-05, "loss": 1.1299, "step": 145 }, { "epoch": 0.04040404040404041, "grad_norm": 1.0851856470108032, "learning_rate": 2.876210526315789e-05, "loss": 1.2415, "step": 146 }, { "epoch": 0.0406807804068078, "grad_norm": 1.4850367307662964, "learning_rate": 2.8229473684210522e-05, "loss": 1.5644, "step": 147 }, { "epoch": 0.040957520409575204, "grad_norm": 1.582217812538147, "learning_rate": 2.7696842105263156e-05, "loss": 1.4194, "step": 148 }, { "epoch": 0.041234260412342606, "grad_norm": 1.2814207077026367, "learning_rate": 2.7164210526315788e-05, "loss": 1.4819, "step": 149 }, { "epoch": 0.04151100041511, "grad_norm": 2.7679193019866943, "learning_rate": 2.663157894736842e-05, "loss": 1.769, "step": 150 }, { "epoch": 0.04151100041511, "eval_loss": 1.3988063335418701, "eval_runtime": 153.9303, "eval_samples_per_second": 9.888, "eval_steps_per_second": 2.475, "step": 150 }, { "epoch": 0.0417877404178774, "grad_norm": 0.6982325315475464, "learning_rate": 2.6098947368421053e-05, "loss": 2.6541, "step": 151 }, { "epoch": 0.042064480420644805, "grad_norm": 0.6466959714889526, "learning_rate": 2.5566315789473684e-05, "loss": 2.7869, "step": 152 }, { "epoch": 0.04234122042341221, "grad_norm": 0.5397589802742004, "learning_rate": 2.5033684210526312e-05, "loss": 1.9081, "step": 153 }, { "epoch": 0.0426179604261796, "grad_norm": 0.5460906624794006, "learning_rate": 2.4501052631578947e-05, "loss": 1.5494, "step": 154 }, { "epoch": 0.042894700428947004, "grad_norm": 0.5999884009361267, "learning_rate": 2.3968421052631575e-05, "loss": 1.7669, "step": 155 }, { "epoch": 0.043171440431714406, "grad_norm": 0.6195064187049866, "learning_rate": 2.343578947368421e-05, "loss": 1.4744, "step": 156 }, { "epoch": 0.04344818043448181, "grad_norm": 0.5452435612678528, "learning_rate": 2.290315789473684e-05, "loss": 1.5237, "step": 157 }, { "epoch": 0.0437249204372492, "grad_norm": 0.6407039761543274, "learning_rate": 2.237052631578947e-05, "loss": 2.0735, "step": 158 }, { "epoch": 0.044001660440016604, "grad_norm": 0.6194781064987183, "learning_rate": 2.1837894736842106e-05, "loss": 1.4127, "step": 159 }, { "epoch": 0.044278400442784006, "grad_norm": 0.6215983629226685, "learning_rate": 2.1305263157894734e-05, "loss": 1.2298, "step": 160 }, { "epoch": 0.0445551404455514, "grad_norm": 0.6356123685836792, "learning_rate": 2.0772631578947368e-05, "loss": 1.2279, "step": 161 }, { "epoch": 0.0448318804483188, "grad_norm": 0.6377651691436768, "learning_rate": 2.024e-05, "loss": 1.1827, "step": 162 }, { "epoch": 0.045108620451086205, "grad_norm": 0.6660798192024231, "learning_rate": 1.970736842105263e-05, "loss": 1.1657, "step": 163 }, { "epoch": 0.04538536045385361, "grad_norm": 0.7134098410606384, "learning_rate": 1.917473684210526e-05, "loss": 1.5894, "step": 164 }, { "epoch": 0.045662100456621, "grad_norm": 0.6191778182983398, "learning_rate": 1.8642105263157893e-05, "loss": 1.154, "step": 165 }, { "epoch": 0.045938840459388404, "grad_norm": 0.6857507228851318, "learning_rate": 1.8109473684210524e-05, "loss": 1.3369, "step": 166 }, { "epoch": 0.046215580462155806, "grad_norm": 0.6705174446105957, "learning_rate": 1.757684210526316e-05, "loss": 1.4583, "step": 167 }, { "epoch": 0.04649232046492321, "grad_norm": 0.7063128352165222, "learning_rate": 1.704421052631579e-05, "loss": 1.3876, "step": 168 }, { "epoch": 0.0467690604676906, "grad_norm": 0.7156012058258057, "learning_rate": 1.651157894736842e-05, "loss": 1.252, "step": 169 }, { "epoch": 0.047045800470458005, "grad_norm": 0.7198076248168945, "learning_rate": 1.5978947368421052e-05, "loss": 1.3088, "step": 170 }, { "epoch": 0.047322540473225407, "grad_norm": 0.7079565525054932, "learning_rate": 1.5446315789473683e-05, "loss": 1.1477, "step": 171 }, { "epoch": 0.0475992804759928, "grad_norm": 0.7721232175827026, "learning_rate": 1.4913684210526314e-05, "loss": 1.2165, "step": 172 }, { "epoch": 0.0478760204787602, "grad_norm": 0.772697389125824, "learning_rate": 1.4381052631578945e-05, "loss": 1.3874, "step": 173 }, { "epoch": 0.048152760481527605, "grad_norm": 0.7915751338005066, "learning_rate": 1.3848421052631578e-05, "loss": 1.2733, "step": 174 }, { "epoch": 0.04842950048429501, "grad_norm": 0.6995910406112671, "learning_rate": 1.331578947368421e-05, "loss": 1.1716, "step": 175 }, { "epoch": 0.0487062404870624, "grad_norm": 0.8334876298904419, "learning_rate": 1.2783157894736842e-05, "loss": 1.2003, "step": 176 }, { "epoch": 0.048982980489829804, "grad_norm": 0.7988669872283936, "learning_rate": 1.2250526315789473e-05, "loss": 1.2631, "step": 177 }, { "epoch": 0.049259720492597206, "grad_norm": 0.7876620888710022, "learning_rate": 1.1717894736842105e-05, "loss": 1.144, "step": 178 }, { "epoch": 0.04953646049536461, "grad_norm": 0.7998309135437012, "learning_rate": 1.1185263157894736e-05, "loss": 1.256, "step": 179 }, { "epoch": 0.049813200498132, "grad_norm": 0.7581685781478882, "learning_rate": 1.0652631578947367e-05, "loss": 1.1314, "step": 180 }, { "epoch": 0.050089940500899405, "grad_norm": 0.8552046418190002, "learning_rate": 1.012e-05, "loss": 1.2926, "step": 181 }, { "epoch": 0.05036668050366681, "grad_norm": 0.7959903478622437, "learning_rate": 9.58736842105263e-06, "loss": 1.1671, "step": 182 }, { "epoch": 0.0506434205064342, "grad_norm": 0.8624477982521057, "learning_rate": 9.054736842105262e-06, "loss": 1.2291, "step": 183 }, { "epoch": 0.050920160509201604, "grad_norm": 0.7806897163391113, "learning_rate": 8.522105263157895e-06, "loss": 1.1011, "step": 184 }, { "epoch": 0.051196900511969005, "grad_norm": 0.8457885980606079, "learning_rate": 7.989473684210526e-06, "loss": 1.2111, "step": 185 }, { "epoch": 0.05147364051473641, "grad_norm": 0.912453830242157, "learning_rate": 7.456842105263157e-06, "loss": 1.2933, "step": 186 }, { "epoch": 0.0517503805175038, "grad_norm": 0.8722508549690247, "learning_rate": 6.924210526315789e-06, "loss": 1.2674, "step": 187 }, { "epoch": 0.052027120520271204, "grad_norm": 0.9331529140472412, "learning_rate": 6.391578947368421e-06, "loss": 1.3586, "step": 188 }, { "epoch": 0.052303860523038606, "grad_norm": 0.9258183836936951, "learning_rate": 5.858947368421052e-06, "loss": 1.2182, "step": 189 }, { "epoch": 0.05258060052580601, "grad_norm": 0.8901333212852478, "learning_rate": 5.326315789473683e-06, "loss": 1.3312, "step": 190 }, { "epoch": 0.0528573405285734, "grad_norm": 0.9532617330551147, "learning_rate": 4.793684210526315e-06, "loss": 1.2047, "step": 191 }, { "epoch": 0.053134080531340805, "grad_norm": 1.0242388248443604, "learning_rate": 4.261052631578947e-06, "loss": 1.3325, "step": 192 }, { "epoch": 0.05341082053410821, "grad_norm": 1.0415493249893188, "learning_rate": 3.7284210526315786e-06, "loss": 1.4721, "step": 193 }, { "epoch": 0.05368756053687561, "grad_norm": 1.0794676542282104, "learning_rate": 3.1957894736842106e-06, "loss": 1.2548, "step": 194 }, { "epoch": 0.053964300539643004, "grad_norm": 0.989457368850708, "learning_rate": 2.6631578947368417e-06, "loss": 1.2458, "step": 195 }, { "epoch": 0.054241040542410406, "grad_norm": 1.0559757947921753, "learning_rate": 2.1305263157894737e-06, "loss": 1.2328, "step": 196 }, { "epoch": 0.05451778054517781, "grad_norm": 1.1546615362167358, "learning_rate": 1.5978947368421053e-06, "loss": 1.3439, "step": 197 }, { "epoch": 0.0547945205479452, "grad_norm": 1.6110146045684814, "learning_rate": 1.0652631578947369e-06, "loss": 1.3329, "step": 198 }, { "epoch": 0.055071260550712604, "grad_norm": 1.3636716604232788, "learning_rate": 5.326315789473684e-07, "loss": 1.4477, "step": 199 }, { "epoch": 0.055348000553480006, "grad_norm": 1.5938727855682373, "learning_rate": 0.0, "loss": 1.7578, "step": 200 }, { "epoch": 0.055348000553480006, "eval_loss": 1.3859784603118896, "eval_runtime": 153.7767, "eval_samples_per_second": 9.897, "eval_steps_per_second": 2.478, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.669110068484506e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }