{ "best_metric": 0.7234218120574951, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.009289363678588018, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.644681839294008e-05, "grad_norm": 3.092576503753662, "learning_rate": 1.018e-05, "loss": 0.997, "step": 1 }, { "epoch": 4.644681839294008e-05, "eval_loss": 1.5409296751022339, "eval_runtime": 162.8209, "eval_samples_per_second": 55.681, "eval_steps_per_second": 13.923, "step": 1 }, { "epoch": 9.289363678588016e-05, "grad_norm": 21.47955322265625, "learning_rate": 2.036e-05, "loss": 1.9273, "step": 2 }, { "epoch": 0.00013934045517882026, "grad_norm": 9.852048873901367, "learning_rate": 3.0539999999999996e-05, "loss": 1.6379, "step": 3 }, { "epoch": 0.00018578727357176033, "grad_norm": 17.509506225585938, "learning_rate": 4.072e-05, "loss": 1.4906, "step": 4 }, { "epoch": 0.00023223409196470042, "grad_norm": 14.546015739440918, "learning_rate": 5.09e-05, "loss": 1.4437, "step": 5 }, { "epoch": 0.0002786809103576405, "grad_norm": 5.5904154777526855, "learning_rate": 6.107999999999999e-05, "loss": 1.6272, "step": 6 }, { "epoch": 0.0003251277287505806, "grad_norm": 4.112157821655273, "learning_rate": 7.125999999999999e-05, "loss": 1.4437, "step": 7 }, { "epoch": 0.00037157454714352065, "grad_norm": 4.384551048278809, "learning_rate": 8.144e-05, "loss": 1.3646, "step": 8 }, { "epoch": 0.0004180213655364608, "grad_norm": 8.227641105651855, "learning_rate": 9.162e-05, "loss": 1.1622, "step": 9 }, { "epoch": 0.00046446818392940084, "grad_norm": 5.1519856452941895, "learning_rate": 0.0001018, "loss": 1.4487, "step": 10 }, { "epoch": 0.0005109150023223409, "grad_norm": 9.391843795776367, "learning_rate": 0.00010126421052631578, "loss": 1.2691, "step": 11 }, { "epoch": 0.000557361820715281, "grad_norm": 5.215498924255371, "learning_rate": 0.00010072842105263156, "loss": 0.9428, "step": 12 }, { "epoch": 0.000603808639108221, "grad_norm": 2.952871084213257, "learning_rate": 0.00010019263157894736, "loss": 1.0689, "step": 13 }, { "epoch": 0.0006502554575011612, "grad_norm": 2.8391802310943604, "learning_rate": 9.965684210526316e-05, "loss": 0.9291, "step": 14 }, { "epoch": 0.0006967022758941013, "grad_norm": 2.461745023727417, "learning_rate": 9.912105263157895e-05, "loss": 0.8318, "step": 15 }, { "epoch": 0.0007431490942870413, "grad_norm": 2.3736178874969482, "learning_rate": 9.858526315789473e-05, "loss": 0.5747, "step": 16 }, { "epoch": 0.0007895959126799814, "grad_norm": 3.788600444793701, "learning_rate": 9.804947368421052e-05, "loss": 0.9676, "step": 17 }, { "epoch": 0.0008360427310729215, "grad_norm": 3.9328246116638184, "learning_rate": 9.75136842105263e-05, "loss": 1.0193, "step": 18 }, { "epoch": 0.0008824895494658616, "grad_norm": 2.693528652191162, "learning_rate": 9.69778947368421e-05, "loss": 0.7466, "step": 19 }, { "epoch": 0.0009289363678588017, "grad_norm": 3.4937636852264404, "learning_rate": 9.644210526315789e-05, "loss": 0.8632, "step": 20 }, { "epoch": 0.0009753831862517418, "grad_norm": 2.1122593879699707, "learning_rate": 9.590631578947369e-05, "loss": 0.7247, "step": 21 }, { "epoch": 0.0010218300046446818, "grad_norm": 3.186523675918579, "learning_rate": 9.537052631578947e-05, "loss": 0.6886, "step": 22 }, { "epoch": 0.001068276823037622, "grad_norm": 2.758934497833252, "learning_rate": 9.483473684210526e-05, "loss": 0.6994, "step": 23 }, { "epoch": 0.001114723641430562, "grad_norm": 2.430767774581909, "learning_rate": 9.429894736842104e-05, "loss": 0.6421, "step": 24 }, { "epoch": 0.0011611704598235022, "grad_norm": 3.4728782176971436, "learning_rate": 9.376315789473684e-05, "loss": 0.8879, "step": 25 }, { "epoch": 0.001207617278216442, "grad_norm": 3.511162519454956, "learning_rate": 9.322736842105262e-05, "loss": 0.9696, "step": 26 }, { "epoch": 0.0012540640966093822, "grad_norm": 3.0230774879455566, "learning_rate": 9.269157894736842e-05, "loss": 0.9589, "step": 27 }, { "epoch": 0.0013005109150023223, "grad_norm": 2.1474082469940186, "learning_rate": 9.215578947368421e-05, "loss": 0.7707, "step": 28 }, { "epoch": 0.0013469577333952625, "grad_norm": 2.3117761611938477, "learning_rate": 9.162e-05, "loss": 0.7724, "step": 29 }, { "epoch": 0.0013934045517882026, "grad_norm": 1.9512385129928589, "learning_rate": 9.108421052631578e-05, "loss": 0.603, "step": 30 }, { "epoch": 0.0014398513701811425, "grad_norm": 6.22908878326416, "learning_rate": 9.054842105263158e-05, "loss": 1.0569, "step": 31 }, { "epoch": 0.0014862981885740826, "grad_norm": 2.2944276332855225, "learning_rate": 9.001263157894736e-05, "loss": 0.6613, "step": 32 }, { "epoch": 0.0015327450069670227, "grad_norm": 2.312437057495117, "learning_rate": 8.947684210526315e-05, "loss": 0.8758, "step": 33 }, { "epoch": 0.0015791918253599629, "grad_norm": 2.5488150119781494, "learning_rate": 8.894105263157895e-05, "loss": 0.635, "step": 34 }, { "epoch": 0.001625638643752903, "grad_norm": 3.658079147338867, "learning_rate": 8.840526315789473e-05, "loss": 1.0831, "step": 35 }, { "epoch": 0.001672085462145843, "grad_norm": 2.474161386489868, "learning_rate": 8.786947368421052e-05, "loss": 0.7918, "step": 36 }, { "epoch": 0.001718532280538783, "grad_norm": 2.5074217319488525, "learning_rate": 8.733368421052632e-05, "loss": 0.8514, "step": 37 }, { "epoch": 0.0017649790989317231, "grad_norm": 2.0377755165100098, "learning_rate": 8.67978947368421e-05, "loss": 0.6768, "step": 38 }, { "epoch": 0.0018114259173246632, "grad_norm": 2.2819857597351074, "learning_rate": 8.626210526315789e-05, "loss": 0.6528, "step": 39 }, { "epoch": 0.0018578727357176034, "grad_norm": 2.3973352909088135, "learning_rate": 8.572631578947367e-05, "loss": 0.8542, "step": 40 }, { "epoch": 0.0019043195541105435, "grad_norm": 2.628427267074585, "learning_rate": 8.519052631578947e-05, "loss": 0.8507, "step": 41 }, { "epoch": 0.0019507663725034836, "grad_norm": 3.116105556488037, "learning_rate": 8.465473684210527e-05, "loss": 0.9283, "step": 42 }, { "epoch": 0.0019972131908964235, "grad_norm": 3.5683062076568604, "learning_rate": 8.411894736842105e-05, "loss": 1.1937, "step": 43 }, { "epoch": 0.0020436600092893636, "grad_norm": 2.569610118865967, "learning_rate": 8.358315789473684e-05, "loss": 0.9237, "step": 44 }, { "epoch": 0.0020901068276823038, "grad_norm": 3.7425827980041504, "learning_rate": 8.304736842105262e-05, "loss": 1.3234, "step": 45 }, { "epoch": 0.002136553646075244, "grad_norm": 3.7030258178710938, "learning_rate": 8.251157894736841e-05, "loss": 0.7639, "step": 46 }, { "epoch": 0.002183000464468184, "grad_norm": 3.188816785812378, "learning_rate": 8.197578947368421e-05, "loss": 1.1931, "step": 47 }, { "epoch": 0.002229447282861124, "grad_norm": 5.445688724517822, "learning_rate": 8.144e-05, "loss": 0.8938, "step": 48 }, { "epoch": 0.0022758941012540643, "grad_norm": 2.3576200008392334, "learning_rate": 8.090421052631579e-05, "loss": 0.836, "step": 49 }, { "epoch": 0.0023223409196470044, "grad_norm": 22.260164260864258, "learning_rate": 8.036842105263158e-05, "loss": 1.0266, "step": 50 }, { "epoch": 0.0023223409196470044, "eval_loss": 0.8269708752632141, "eval_runtime": 162.7994, "eval_samples_per_second": 55.688, "eval_steps_per_second": 13.925, "step": 50 }, { "epoch": 0.002368787738039944, "grad_norm": 2.7304725646972656, "learning_rate": 7.983263157894736e-05, "loss": 0.7953, "step": 51 }, { "epoch": 0.002415234556432884, "grad_norm": 2.282052516937256, "learning_rate": 7.929684210526315e-05, "loss": 0.8493, "step": 52 }, { "epoch": 0.0024616813748258243, "grad_norm": 2.524898052215576, "learning_rate": 7.876105263157895e-05, "loss": 1.0108, "step": 53 }, { "epoch": 0.0025081281932187644, "grad_norm": 2.6745500564575195, "learning_rate": 7.822526315789473e-05, "loss": 1.0531, "step": 54 }, { "epoch": 0.0025545750116117046, "grad_norm": 3.1213998794555664, "learning_rate": 7.768947368421053e-05, "loss": 1.1068, "step": 55 }, { "epoch": 0.0026010218300046447, "grad_norm": 3.499636173248291, "learning_rate": 7.715368421052631e-05, "loss": 1.0808, "step": 56 }, { "epoch": 0.002647468648397585, "grad_norm": 3.125180721282959, "learning_rate": 7.66178947368421e-05, "loss": 0.8485, "step": 57 }, { "epoch": 0.002693915466790525, "grad_norm": 2.547490119934082, "learning_rate": 7.608210526315788e-05, "loss": 0.7604, "step": 58 }, { "epoch": 0.002740362285183465, "grad_norm": 2.058576822280884, "learning_rate": 7.554631578947368e-05, "loss": 0.665, "step": 59 }, { "epoch": 0.002786809103576405, "grad_norm": 2.624077320098877, "learning_rate": 7.501052631578947e-05, "loss": 0.6822, "step": 60 }, { "epoch": 0.0028332559219693453, "grad_norm": 1.4456878900527954, "learning_rate": 7.447473684210527e-05, "loss": 0.554, "step": 61 }, { "epoch": 0.002879702740362285, "grad_norm": 1.9477028846740723, "learning_rate": 7.393894736842105e-05, "loss": 0.712, "step": 62 }, { "epoch": 0.002926149558755225, "grad_norm": 2.0057406425476074, "learning_rate": 7.340315789473684e-05, "loss": 0.7368, "step": 63 }, { "epoch": 0.0029725963771481652, "grad_norm": 2.041309356689453, "learning_rate": 7.286736842105262e-05, "loss": 0.9478, "step": 64 }, { "epoch": 0.0030190431955411053, "grad_norm": 1.8585742712020874, "learning_rate": 7.233157894736842e-05, "loss": 0.5883, "step": 65 }, { "epoch": 0.0030654900139340455, "grad_norm": 2.3726940155029297, "learning_rate": 7.179578947368421e-05, "loss": 0.8448, "step": 66 }, { "epoch": 0.0031119368323269856, "grad_norm": 4.274697303771973, "learning_rate": 7.125999999999999e-05, "loss": 0.693, "step": 67 }, { "epoch": 0.0031583836507199257, "grad_norm": 1.7456036806106567, "learning_rate": 7.072421052631579e-05, "loss": 0.6139, "step": 68 }, { "epoch": 0.003204830469112866, "grad_norm": 2.077462673187256, "learning_rate": 7.018842105263158e-05, "loss": 0.6687, "step": 69 }, { "epoch": 0.003251277287505806, "grad_norm": 1.6591328382492065, "learning_rate": 6.965263157894736e-05, "loss": 0.6602, "step": 70 }, { "epoch": 0.003297724105898746, "grad_norm": 2.040104866027832, "learning_rate": 6.911684210526316e-05, "loss": 0.6718, "step": 71 }, { "epoch": 0.003344170924291686, "grad_norm": 2.031517505645752, "learning_rate": 6.858105263157894e-05, "loss": 0.6965, "step": 72 }, { "epoch": 0.003390617742684626, "grad_norm": 1.677396297454834, "learning_rate": 6.804526315789473e-05, "loss": 0.6022, "step": 73 }, { "epoch": 0.003437064561077566, "grad_norm": 2.6971142292022705, "learning_rate": 6.750947368421052e-05, "loss": 0.7803, "step": 74 }, { "epoch": 0.003483511379470506, "grad_norm": 1.2442923784255981, "learning_rate": 6.697368421052631e-05, "loss": 0.4904, "step": 75 }, { "epoch": 0.0035299581978634463, "grad_norm": 1.520882487297058, "learning_rate": 6.64378947368421e-05, "loss": 0.5449, "step": 76 }, { "epoch": 0.0035764050162563864, "grad_norm": 2.945136070251465, "learning_rate": 6.59021052631579e-05, "loss": 0.5945, "step": 77 }, { "epoch": 0.0036228518346493265, "grad_norm": 2.225796937942505, "learning_rate": 6.536631578947368e-05, "loss": 0.8414, "step": 78 }, { "epoch": 0.0036692986530422666, "grad_norm": 3.5419042110443115, "learning_rate": 6.483052631578947e-05, "loss": 0.951, "step": 79 }, { "epoch": 0.0037157454714352067, "grad_norm": 2.4470789432525635, "learning_rate": 6.429473684210525e-05, "loss": 0.6766, "step": 80 }, { "epoch": 0.003762192289828147, "grad_norm": 1.8150739669799805, "learning_rate": 6.375894736842104e-05, "loss": 0.6212, "step": 81 }, { "epoch": 0.003808639108221087, "grad_norm": 2.2378828525543213, "learning_rate": 6.322315789473684e-05, "loss": 0.912, "step": 82 }, { "epoch": 0.003855085926614027, "grad_norm": 2.66448974609375, "learning_rate": 6.268736842105264e-05, "loss": 0.7284, "step": 83 }, { "epoch": 0.0039015327450069672, "grad_norm": 2.0171289443969727, "learning_rate": 6.215157894736842e-05, "loss": 0.5339, "step": 84 }, { "epoch": 0.003947979563399907, "grad_norm": 1.829827070236206, "learning_rate": 6.16157894736842e-05, "loss": 0.6982, "step": 85 }, { "epoch": 0.003994426381792847, "grad_norm": 1.3786966800689697, "learning_rate": 6.107999999999999e-05, "loss": 0.4433, "step": 86 }, { "epoch": 0.004040873200185788, "grad_norm": 2.0562403202056885, "learning_rate": 6.054421052631578e-05, "loss": 0.791, "step": 87 }, { "epoch": 0.004087320018578727, "grad_norm": 1.8710417747497559, "learning_rate": 6.000842105263157e-05, "loss": 0.5487, "step": 88 }, { "epoch": 0.004133766836971668, "grad_norm": 2.46244215965271, "learning_rate": 5.947263157894737e-05, "loss": 0.9045, "step": 89 }, { "epoch": 0.0041802136553646075, "grad_norm": 1.9283982515335083, "learning_rate": 5.893684210526316e-05, "loss": 0.6472, "step": 90 }, { "epoch": 0.004226660473757547, "grad_norm": 1.794073462486267, "learning_rate": 5.8401052631578944e-05, "loss": 0.7332, "step": 91 }, { "epoch": 0.004273107292150488, "grad_norm": 2.4211764335632324, "learning_rate": 5.7865263157894736e-05, "loss": 0.9175, "step": 92 }, { "epoch": 0.0043195541105434275, "grad_norm": 2.131087064743042, "learning_rate": 5.732947368421052e-05, "loss": 0.846, "step": 93 }, { "epoch": 0.004366000928936368, "grad_norm": 3.606595993041992, "learning_rate": 5.6793684210526306e-05, "loss": 1.023, "step": 94 }, { "epoch": 0.004412447747329308, "grad_norm": 2.0817458629608154, "learning_rate": 5.6257894736842105e-05, "loss": 0.7585, "step": 95 }, { "epoch": 0.004458894565722248, "grad_norm": 2.736661672592163, "learning_rate": 5.57221052631579e-05, "loss": 0.9443, "step": 96 }, { "epoch": 0.004505341384115188, "grad_norm": 1.7814656496047974, "learning_rate": 5.518631578947368e-05, "loss": 0.9056, "step": 97 }, { "epoch": 0.0045517882025081285, "grad_norm": 2.098845958709717, "learning_rate": 5.4650526315789474e-05, "loss": 0.7014, "step": 98 }, { "epoch": 0.004598235020901068, "grad_norm": 2.316159963607788, "learning_rate": 5.411473684210526e-05, "loss": 0.7147, "step": 99 }, { "epoch": 0.004644681839294009, "grad_norm": 2.182925224304199, "learning_rate": 5.3578947368421044e-05, "loss": 0.7637, "step": 100 }, { "epoch": 0.004644681839294009, "eval_loss": 0.7575440406799316, "eval_runtime": 162.5731, "eval_samples_per_second": 55.766, "eval_steps_per_second": 13.944, "step": 100 }, { "epoch": 0.0046911286576869484, "grad_norm": 1.547819972038269, "learning_rate": 5.3043157894736836e-05, "loss": 0.7768, "step": 101 }, { "epoch": 0.004737575476079888, "grad_norm": 2.0550365447998047, "learning_rate": 5.2507368421052635e-05, "loss": 0.8542, "step": 102 }, { "epoch": 0.004784022294472829, "grad_norm": 1.7644928693771362, "learning_rate": 5.197157894736842e-05, "loss": 0.8406, "step": 103 }, { "epoch": 0.004830469112865768, "grad_norm": 2.784821033477783, "learning_rate": 5.143578947368421e-05, "loss": 1.0809, "step": 104 }, { "epoch": 0.004876915931258709, "grad_norm": 2.643968105316162, "learning_rate": 5.09e-05, "loss": 1.1358, "step": 105 }, { "epoch": 0.004923362749651649, "grad_norm": 2.6479332447052, "learning_rate": 5.036421052631578e-05, "loss": 0.9534, "step": 106 }, { "epoch": 0.004969809568044589, "grad_norm": 1.5139284133911133, "learning_rate": 4.982842105263158e-05, "loss": 0.6037, "step": 107 }, { "epoch": 0.005016256386437529, "grad_norm": 2.2001686096191406, "learning_rate": 4.9292631578947366e-05, "loss": 1.0875, "step": 108 }, { "epoch": 0.005062703204830469, "grad_norm": 1.906663417816162, "learning_rate": 4.875684210526315e-05, "loss": 0.8251, "step": 109 }, { "epoch": 0.005109150023223409, "grad_norm": 1.6133707761764526, "learning_rate": 4.822105263157894e-05, "loss": 0.7804, "step": 110 }, { "epoch": 0.00515559684161635, "grad_norm": 1.6872289180755615, "learning_rate": 4.7685263157894735e-05, "loss": 0.5731, "step": 111 }, { "epoch": 0.005202043660009289, "grad_norm": 1.2829549312591553, "learning_rate": 4.714947368421052e-05, "loss": 0.4865, "step": 112 }, { "epoch": 0.005248490478402229, "grad_norm": 1.8299009799957275, "learning_rate": 4.661368421052631e-05, "loss": 0.7806, "step": 113 }, { "epoch": 0.00529493729679517, "grad_norm": 1.3792545795440674, "learning_rate": 4.6077894736842104e-05, "loss": 0.5824, "step": 114 }, { "epoch": 0.005341384115188109, "grad_norm": 1.554002046585083, "learning_rate": 4.554210526315789e-05, "loss": 0.7399, "step": 115 }, { "epoch": 0.00538783093358105, "grad_norm": 1.8911974430084229, "learning_rate": 4.500631578947368e-05, "loss": 0.8756, "step": 116 }, { "epoch": 0.0054342777519739895, "grad_norm": 2.071706771850586, "learning_rate": 4.447052631578947e-05, "loss": 0.8007, "step": 117 }, { "epoch": 0.00548072457036693, "grad_norm": 2.202437162399292, "learning_rate": 4.393473684210526e-05, "loss": 0.8207, "step": 118 }, { "epoch": 0.00552717138875987, "grad_norm": 1.33773672580719, "learning_rate": 4.339894736842105e-05, "loss": 0.5947, "step": 119 }, { "epoch": 0.00557361820715281, "grad_norm": 1.8306225538253784, "learning_rate": 4.2863157894736835e-05, "loss": 0.7513, "step": 120 }, { "epoch": 0.00562006502554575, "grad_norm": 1.6813061237335205, "learning_rate": 4.2327368421052634e-05, "loss": 0.6929, "step": 121 }, { "epoch": 0.005666511843938691, "grad_norm": 1.5658451318740845, "learning_rate": 4.179157894736842e-05, "loss": 0.5594, "step": 122 }, { "epoch": 0.00571295866233163, "grad_norm": 1.4536268711090088, "learning_rate": 4.1255789473684204e-05, "loss": 0.6208, "step": 123 }, { "epoch": 0.00575940548072457, "grad_norm": 1.9043149948120117, "learning_rate": 4.072e-05, "loss": 0.6332, "step": 124 }, { "epoch": 0.0058058522991175105, "grad_norm": 2.0733814239501953, "learning_rate": 4.018421052631579e-05, "loss": 0.6764, "step": 125 }, { "epoch": 0.00585229911751045, "grad_norm": 1.7627897262573242, "learning_rate": 3.9648421052631573e-05, "loss": 0.7384, "step": 126 }, { "epoch": 0.005898745935903391, "grad_norm": 1.6006054878234863, "learning_rate": 3.9112631578947365e-05, "loss": 0.6752, "step": 127 }, { "epoch": 0.0059451927542963304, "grad_norm": 1.4541168212890625, "learning_rate": 3.857684210526316e-05, "loss": 0.6692, "step": 128 }, { "epoch": 0.005991639572689271, "grad_norm": 1.3292078971862793, "learning_rate": 3.804105263157894e-05, "loss": 0.5353, "step": 129 }, { "epoch": 0.006038086391082211, "grad_norm": 1.6884562969207764, "learning_rate": 3.7505263157894734e-05, "loss": 0.7562, "step": 130 }, { "epoch": 0.006084533209475151, "grad_norm": 1.0477324724197388, "learning_rate": 3.6969473684210526e-05, "loss": 0.3243, "step": 131 }, { "epoch": 0.006130980027868091, "grad_norm": 1.4753937721252441, "learning_rate": 3.643368421052631e-05, "loss": 0.5291, "step": 132 }, { "epoch": 0.0061774268462610315, "grad_norm": 1.7509891986846924, "learning_rate": 3.5897894736842103e-05, "loss": 0.6364, "step": 133 }, { "epoch": 0.006223873664653971, "grad_norm": 2.055713653564453, "learning_rate": 3.5362105263157895e-05, "loss": 0.782, "step": 134 }, { "epoch": 0.006270320483046911, "grad_norm": 2.0711967945098877, "learning_rate": 3.482631578947368e-05, "loss": 0.7677, "step": 135 }, { "epoch": 0.006316767301439851, "grad_norm": 1.3271763324737549, "learning_rate": 3.429052631578947e-05, "loss": 0.5314, "step": 136 }, { "epoch": 0.006363214119832791, "grad_norm": 1.7668476104736328, "learning_rate": 3.375473684210526e-05, "loss": 0.8441, "step": 137 }, { "epoch": 0.006409660938225732, "grad_norm": 1.773807168006897, "learning_rate": 3.321894736842105e-05, "loss": 0.7551, "step": 138 }, { "epoch": 0.006456107756618671, "grad_norm": 1.6312812566757202, "learning_rate": 3.268315789473684e-05, "loss": 0.8111, "step": 139 }, { "epoch": 0.006502554575011612, "grad_norm": 1.6187984943389893, "learning_rate": 3.2147368421052627e-05, "loss": 0.6781, "step": 140 }, { "epoch": 0.006549001393404552, "grad_norm": 1.6448986530303955, "learning_rate": 3.161157894736842e-05, "loss": 0.5815, "step": 141 }, { "epoch": 0.006595448211797492, "grad_norm": 1.9651342630386353, "learning_rate": 3.107578947368421e-05, "loss": 0.7199, "step": 142 }, { "epoch": 0.006641895030190432, "grad_norm": 2.4397366046905518, "learning_rate": 3.0539999999999996e-05, "loss": 0.7959, "step": 143 }, { "epoch": 0.006688341848583372, "grad_norm": 1.7463246583938599, "learning_rate": 3.0004210526315784e-05, "loss": 0.7066, "step": 144 }, { "epoch": 0.006734788666976312, "grad_norm": 1.6383179426193237, "learning_rate": 2.946842105263158e-05, "loss": 0.5844, "step": 145 }, { "epoch": 0.006781235485369252, "grad_norm": 2.03802752494812, "learning_rate": 2.8932631578947368e-05, "loss": 0.7851, "step": 146 }, { "epoch": 0.006827682303762192, "grad_norm": 1.5965886116027832, "learning_rate": 2.8396842105263153e-05, "loss": 0.7421, "step": 147 }, { "epoch": 0.006874129122155132, "grad_norm": 1.6589584350585938, "learning_rate": 2.786105263157895e-05, "loss": 0.7362, "step": 148 }, { "epoch": 0.006920575940548073, "grad_norm": 1.904215693473816, "learning_rate": 2.7325263157894737e-05, "loss": 0.842, "step": 149 }, { "epoch": 0.006967022758941012, "grad_norm": 1.954518437385559, "learning_rate": 2.6789473684210522e-05, "loss": 0.8687, "step": 150 }, { "epoch": 0.006967022758941012, "eval_loss": 0.7381066679954529, "eval_runtime": 163.587, "eval_samples_per_second": 55.42, "eval_steps_per_second": 13.858, "step": 150 }, { "epoch": 0.007013469577333953, "grad_norm": 1.321113109588623, "learning_rate": 2.6253684210526317e-05, "loss": 0.6275, "step": 151 }, { "epoch": 0.0070599163957268925, "grad_norm": 1.9313557147979736, "learning_rate": 2.5717894736842106e-05, "loss": 0.8083, "step": 152 }, { "epoch": 0.007106363214119833, "grad_norm": 2.39707350730896, "learning_rate": 2.518210526315789e-05, "loss": 1.1125, "step": 153 }, { "epoch": 0.007152810032512773, "grad_norm": 2.2258388996124268, "learning_rate": 2.4646315789473683e-05, "loss": 0.9885, "step": 154 }, { "epoch": 0.007199256850905713, "grad_norm": 2.207796096801758, "learning_rate": 2.411052631578947e-05, "loss": 0.8165, "step": 155 }, { "epoch": 0.007245703669298653, "grad_norm": 2.068021774291992, "learning_rate": 2.357473684210526e-05, "loss": 0.9621, "step": 156 }, { "epoch": 0.0072921504876915936, "grad_norm": 3.123298168182373, "learning_rate": 2.3038947368421052e-05, "loss": 0.9623, "step": 157 }, { "epoch": 0.007338597306084533, "grad_norm": 1.7516857385635376, "learning_rate": 2.250315789473684e-05, "loss": 0.7126, "step": 158 }, { "epoch": 0.007385044124477473, "grad_norm": 1.756352424621582, "learning_rate": 2.196736842105263e-05, "loss": 0.6112, "step": 159 }, { "epoch": 0.0074314909428704135, "grad_norm": 1.324313998222351, "learning_rate": 2.1431578947368418e-05, "loss": 0.4837, "step": 160 }, { "epoch": 0.007477937761263353, "grad_norm": 1.6090558767318726, "learning_rate": 2.089578947368421e-05, "loss": 0.5255, "step": 161 }, { "epoch": 0.007524384579656294, "grad_norm": 1.3804148435592651, "learning_rate": 2.036e-05, "loss": 0.4674, "step": 162 }, { "epoch": 0.007570831398049233, "grad_norm": 1.3651041984558105, "learning_rate": 1.9824210526315787e-05, "loss": 0.7306, "step": 163 }, { "epoch": 0.007617278216442174, "grad_norm": 1.8530007600784302, "learning_rate": 1.928842105263158e-05, "loss": 0.7491, "step": 164 }, { "epoch": 0.007663725034835114, "grad_norm": 1.493820309638977, "learning_rate": 1.8752631578947367e-05, "loss": 0.6576, "step": 165 }, { "epoch": 0.007710171853228054, "grad_norm": 1.199458360671997, "learning_rate": 1.8216842105263156e-05, "loss": 0.3963, "step": 166 }, { "epoch": 0.007756618671620994, "grad_norm": 1.6788829565048218, "learning_rate": 1.7681052631578948e-05, "loss": 0.7574, "step": 167 }, { "epoch": 0.0078030654900139345, "grad_norm": 1.2864102125167847, "learning_rate": 1.7145263157894736e-05, "loss": 0.5522, "step": 168 }, { "epoch": 0.007849512308406874, "grad_norm": 1.8316515684127808, "learning_rate": 1.6609473684210525e-05, "loss": 0.5285, "step": 169 }, { "epoch": 0.007895959126799815, "grad_norm": 1.116195559501648, "learning_rate": 1.6073684210526313e-05, "loss": 0.5598, "step": 170 }, { "epoch": 0.007942405945192754, "grad_norm": 1.7328448295593262, "learning_rate": 1.5537894736842105e-05, "loss": 0.5905, "step": 171 }, { "epoch": 0.007988852763585694, "grad_norm": 1.5131913423538208, "learning_rate": 1.5002105263157892e-05, "loss": 0.5292, "step": 172 }, { "epoch": 0.008035299581978635, "grad_norm": 1.8316717147827148, "learning_rate": 1.4466315789473684e-05, "loss": 0.547, "step": 173 }, { "epoch": 0.008081746400371575, "grad_norm": 1.4963319301605225, "learning_rate": 1.3930526315789474e-05, "loss": 0.6936, "step": 174 }, { "epoch": 0.008128193218764514, "grad_norm": 1.6800758838653564, "learning_rate": 1.3394736842105261e-05, "loss": 0.6586, "step": 175 }, { "epoch": 0.008174640037157455, "grad_norm": 1.0843589305877686, "learning_rate": 1.2858947368421053e-05, "loss": 0.4544, "step": 176 }, { "epoch": 0.008221086855550395, "grad_norm": 1.6353403329849243, "learning_rate": 1.2323157894736842e-05, "loss": 0.6454, "step": 177 }, { "epoch": 0.008267533673943336, "grad_norm": 1.6226987838745117, "learning_rate": 1.178736842105263e-05, "loss": 0.7083, "step": 178 }, { "epoch": 0.008313980492336275, "grad_norm": 1.2014755010604858, "learning_rate": 1.125157894736842e-05, "loss": 0.518, "step": 179 }, { "epoch": 0.008360427310729215, "grad_norm": 0.9961537718772888, "learning_rate": 1.0715789473684209e-05, "loss": 0.5034, "step": 180 }, { "epoch": 0.008406874129122156, "grad_norm": 1.2505362033843994, "learning_rate": 1.018e-05, "loss": 0.3744, "step": 181 }, { "epoch": 0.008453320947515094, "grad_norm": 1.209218144416809, "learning_rate": 9.64421052631579e-06, "loss": 0.5659, "step": 182 }, { "epoch": 0.008499767765908035, "grad_norm": 1.5287011861801147, "learning_rate": 9.108421052631578e-06, "loss": 0.6864, "step": 183 }, { "epoch": 0.008546214584300976, "grad_norm": 1.5412635803222656, "learning_rate": 8.572631578947368e-06, "loss": 0.784, "step": 184 }, { "epoch": 0.008592661402693916, "grad_norm": 1.2702871561050415, "learning_rate": 8.036842105263157e-06, "loss": 0.6373, "step": 185 }, { "epoch": 0.008639108221086855, "grad_norm": 1.2671583890914917, "learning_rate": 7.501052631578946e-06, "loss": 0.4503, "step": 186 }, { "epoch": 0.008685555039479795, "grad_norm": 1.6440976858139038, "learning_rate": 6.965263157894737e-06, "loss": 0.6182, "step": 187 }, { "epoch": 0.008732001857872736, "grad_norm": 1.6860370635986328, "learning_rate": 6.4294736842105265e-06, "loss": 0.6629, "step": 188 }, { "epoch": 0.008778448676265677, "grad_norm": 1.778744101524353, "learning_rate": 5.893684210526315e-06, "loss": 0.707, "step": 189 }, { "epoch": 0.008824895494658615, "grad_norm": 2.259239673614502, "learning_rate": 5.3578947368421044e-06, "loss": 0.8088, "step": 190 }, { "epoch": 0.008871342313051556, "grad_norm": 1.5541491508483887, "learning_rate": 4.822105263157895e-06, "loss": 0.6756, "step": 191 }, { "epoch": 0.008917789131444497, "grad_norm": 1.634876012802124, "learning_rate": 4.286315789473684e-06, "loss": 0.8733, "step": 192 }, { "epoch": 0.008964235949837435, "grad_norm": 1.7315068244934082, "learning_rate": 3.750526315789473e-06, "loss": 0.6555, "step": 193 }, { "epoch": 0.009010682768230376, "grad_norm": 1.7454522848129272, "learning_rate": 3.2147368421052633e-06, "loss": 0.754, "step": 194 }, { "epoch": 0.009057129586623316, "grad_norm": 1.7474086284637451, "learning_rate": 2.6789473684210522e-06, "loss": 0.8278, "step": 195 }, { "epoch": 0.009103576405016257, "grad_norm": 1.9185843467712402, "learning_rate": 2.143157894736842e-06, "loss": 0.7852, "step": 196 }, { "epoch": 0.009150023223409196, "grad_norm": 1.923701286315918, "learning_rate": 1.6073684210526316e-06, "loss": 0.6713, "step": 197 }, { "epoch": 0.009196470041802136, "grad_norm": 1.524601697921753, "learning_rate": 1.071578947368421e-06, "loss": 0.7535, "step": 198 }, { "epoch": 0.009242916860195077, "grad_norm": 2.139697313308716, "learning_rate": 5.357894736842105e-07, "loss": 0.9147, "step": 199 }, { "epoch": 0.009289363678588018, "grad_norm": 1.9575085639953613, "learning_rate": 0.0, "loss": 0.8268, "step": 200 }, { "epoch": 0.009289363678588018, "eval_loss": 0.7234218120574951, "eval_runtime": 163.2305, "eval_samples_per_second": 55.541, "eval_steps_per_second": 13.888, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2158942759092224e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }