{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999341672152732, "eval_steps": 500, "global_step": 759, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003949967083607637, "grad_norm": 6.504404997081429, "learning_rate": 1.3157894736842107e-07, "loss": 0.85, "step": 1 }, { "epoch": 0.007899934167215274, "grad_norm": 6.4285522276729195, "learning_rate": 2.6315789473684213e-07, "loss": 0.8628, "step": 2 }, { "epoch": 0.01184990125082291, "grad_norm": 6.509733011342855, "learning_rate": 3.9473684210526315e-07, "loss": 0.872, "step": 3 }, { "epoch": 0.015799868334430547, "grad_norm": 6.490006436867934, "learning_rate": 5.263157894736843e-07, "loss": 0.8617, "step": 4 }, { "epoch": 0.019749835418038184, "grad_norm": 6.403260703843927, "learning_rate": 6.578947368421053e-07, "loss": 0.853, "step": 5 }, { "epoch": 0.02369980250164582, "grad_norm": 6.311435368116544, "learning_rate": 7.894736842105263e-07, "loss": 0.8506, "step": 6 }, { "epoch": 0.027649769585253458, "grad_norm": 6.151881596200702, "learning_rate": 9.210526315789474e-07, "loss": 0.8414, "step": 7 }, { "epoch": 0.031599736668861095, "grad_norm": 6.02357759760398, "learning_rate": 1.0526315789473685e-06, "loss": 0.8159, "step": 8 }, { "epoch": 0.03554970375246873, "grad_norm": 5.71136757874485, "learning_rate": 1.1842105263157894e-06, "loss": 0.8102, "step": 9 }, { "epoch": 0.03949967083607637, "grad_norm": 5.177345192704993, "learning_rate": 1.3157894736842106e-06, "loss": 0.8192, "step": 10 }, { "epoch": 0.043449637919684, "grad_norm": 4.774028846631175, "learning_rate": 1.4473684210526317e-06, "loss": 0.7968, "step": 11 }, { "epoch": 0.04739960500329164, "grad_norm": 4.609199934531555, "learning_rate": 1.5789473684210526e-06, "loss": 0.7932, "step": 12 }, { "epoch": 0.051349572086899276, "grad_norm": 2.897172635017503, "learning_rate": 1.710526315789474e-06, "loss": 0.7464, "step": 13 }, { "epoch": 0.055299539170506916, "grad_norm": 2.8086012655784605, "learning_rate": 1.8421052631578948e-06, "loss": 0.7183, "step": 14 }, { "epoch": 0.05924950625411455, "grad_norm": 2.6085154141793274, "learning_rate": 1.973684210526316e-06, "loss": 0.762, "step": 15 }, { "epoch": 0.06319947333772219, "grad_norm": 2.4552833281104145, "learning_rate": 2.105263157894737e-06, "loss": 0.7306, "step": 16 }, { "epoch": 0.06714944042132982, "grad_norm": 2.375545866697905, "learning_rate": 2.236842105263158e-06, "loss": 0.7317, "step": 17 }, { "epoch": 0.07109940750493746, "grad_norm": 2.866954345791216, "learning_rate": 2.368421052631579e-06, "loss": 0.685, "step": 18 }, { "epoch": 0.07504937458854509, "grad_norm": 3.175071700080693, "learning_rate": 2.5e-06, "loss": 0.6969, "step": 19 }, { "epoch": 0.07899934167215274, "grad_norm": 2.939403446465285, "learning_rate": 2.631578947368421e-06, "loss": 0.6845, "step": 20 }, { "epoch": 0.08294930875576037, "grad_norm": 2.8817452325863884, "learning_rate": 2.7631578947368424e-06, "loss": 0.6679, "step": 21 }, { "epoch": 0.086899275839368, "grad_norm": 2.545083903627149, "learning_rate": 2.8947368421052634e-06, "loss": 0.6959, "step": 22 }, { "epoch": 0.09084924292297564, "grad_norm": 2.0198030689295976, "learning_rate": 3.0263157894736843e-06, "loss": 0.6729, "step": 23 }, { "epoch": 0.09479921000658328, "grad_norm": 1.5568634659879192, "learning_rate": 3.157894736842105e-06, "loss": 0.6586, "step": 24 }, { "epoch": 0.09874917709019092, "grad_norm": 1.4100915921761508, "learning_rate": 3.289473684210527e-06, "loss": 0.6434, "step": 25 }, { "epoch": 0.10269914417379855, "grad_norm": 1.2296149889545172, "learning_rate": 3.421052631578948e-06, "loss": 0.6393, "step": 26 }, { "epoch": 0.10664911125740618, "grad_norm": 1.113941306841056, "learning_rate": 3.5526315789473687e-06, "loss": 0.6222, "step": 27 }, { "epoch": 0.11059907834101383, "grad_norm": 1.143957658795969, "learning_rate": 3.6842105263157896e-06, "loss": 0.5969, "step": 28 }, { "epoch": 0.11454904542462147, "grad_norm": 1.1505693655950588, "learning_rate": 3.815789473684211e-06, "loss": 0.5826, "step": 29 }, { "epoch": 0.1184990125082291, "grad_norm": 0.933931796719158, "learning_rate": 3.947368421052632e-06, "loss": 0.5768, "step": 30 }, { "epoch": 0.12244897959183673, "grad_norm": 0.8600006106071842, "learning_rate": 4.078947368421053e-06, "loss": 0.6014, "step": 31 }, { "epoch": 0.12639894667544438, "grad_norm": 0.7826439353184591, "learning_rate": 4.210526315789474e-06, "loss": 0.5652, "step": 32 }, { "epoch": 0.130348913759052, "grad_norm": 0.8581968909931792, "learning_rate": 4.342105263157895e-06, "loss": 0.6003, "step": 33 }, { "epoch": 0.13429888084265965, "grad_norm": 0.9006376952129296, "learning_rate": 4.473684210526316e-06, "loss": 0.5722, "step": 34 }, { "epoch": 0.1382488479262673, "grad_norm": 0.8991294946337406, "learning_rate": 4.605263157894737e-06, "loss": 0.5713, "step": 35 }, { "epoch": 0.1421988150098749, "grad_norm": 0.782481973940663, "learning_rate": 4.736842105263158e-06, "loss": 0.5542, "step": 36 }, { "epoch": 0.14614878209348256, "grad_norm": 0.6895184112566858, "learning_rate": 4.8684210526315795e-06, "loss": 0.5614, "step": 37 }, { "epoch": 0.15009874917709018, "grad_norm": 0.6740638206595433, "learning_rate": 5e-06, "loss": 0.5249, "step": 38 }, { "epoch": 0.15404871626069783, "grad_norm": 0.8492118783304596, "learning_rate": 5.131578947368422e-06, "loss": 0.5786, "step": 39 }, { "epoch": 0.15799868334430547, "grad_norm": 0.732941645850187, "learning_rate": 5.263157894736842e-06, "loss": 0.567, "step": 40 }, { "epoch": 0.1619486504279131, "grad_norm": 0.744593808806397, "learning_rate": 5.394736842105264e-06, "loss": 0.5653, "step": 41 }, { "epoch": 0.16589861751152074, "grad_norm": 0.6422110673750279, "learning_rate": 5.526315789473685e-06, "loss": 0.5425, "step": 42 }, { "epoch": 0.16984858459512836, "grad_norm": 0.7405946476228535, "learning_rate": 5.657894736842106e-06, "loss": 0.5482, "step": 43 }, { "epoch": 0.173798551678736, "grad_norm": 0.7361356453970808, "learning_rate": 5.789473684210527e-06, "loss": 0.5435, "step": 44 }, { "epoch": 0.17774851876234365, "grad_norm": 0.6194209742518226, "learning_rate": 5.921052631578948e-06, "loss": 0.5745, "step": 45 }, { "epoch": 0.18169848584595127, "grad_norm": 0.5716973876414746, "learning_rate": 6.0526315789473685e-06, "loss": 0.5485, "step": 46 }, { "epoch": 0.18564845292955892, "grad_norm": 0.5981144628332149, "learning_rate": 6.18421052631579e-06, "loss": 0.5392, "step": 47 }, { "epoch": 0.18959842001316657, "grad_norm": 0.6813367973023082, "learning_rate": 6.31578947368421e-06, "loss": 0.5464, "step": 48 }, { "epoch": 0.1935483870967742, "grad_norm": 0.5771532920223513, "learning_rate": 6.447368421052632e-06, "loss": 0.5002, "step": 49 }, { "epoch": 0.19749835418038184, "grad_norm": 0.5485589353480836, "learning_rate": 6.578947368421054e-06, "loss": 0.5608, "step": 50 }, { "epoch": 0.20144832126398945, "grad_norm": 0.5978256639860449, "learning_rate": 6.710526315789474e-06, "loss": 0.5452, "step": 51 }, { "epoch": 0.2053982883475971, "grad_norm": 0.5459021681272838, "learning_rate": 6.842105263157896e-06, "loss": 0.5095, "step": 52 }, { "epoch": 0.20934825543120475, "grad_norm": 0.6360883088560538, "learning_rate": 6.973684210526316e-06, "loss": 0.5084, "step": 53 }, { "epoch": 0.21329822251481237, "grad_norm": 0.5498172201066589, "learning_rate": 7.1052631578947375e-06, "loss": 0.5131, "step": 54 }, { "epoch": 0.21724818959842002, "grad_norm": 0.5580895114375191, "learning_rate": 7.236842105263158e-06, "loss": 0.5085, "step": 55 }, { "epoch": 0.22119815668202766, "grad_norm": 0.6017850242151779, "learning_rate": 7.368421052631579e-06, "loss": 0.5286, "step": 56 }, { "epoch": 0.22514812376563528, "grad_norm": 0.5721354518970821, "learning_rate": 7.500000000000001e-06, "loss": 0.4982, "step": 57 }, { "epoch": 0.22909809084924293, "grad_norm": 0.5489159060150893, "learning_rate": 7.631578947368423e-06, "loss": 0.4964, "step": 58 }, { "epoch": 0.23304805793285055, "grad_norm": 0.5252428125465181, "learning_rate": 7.763157894736843e-06, "loss": 0.5029, "step": 59 }, { "epoch": 0.2369980250164582, "grad_norm": 0.6252972830139001, "learning_rate": 7.894736842105265e-06, "loss": 0.5358, "step": 60 }, { "epoch": 0.24094799210006584, "grad_norm": 0.5847025247731619, "learning_rate": 8.026315789473685e-06, "loss": 0.5335, "step": 61 }, { "epoch": 0.24489795918367346, "grad_norm": 0.517405600168881, "learning_rate": 8.157894736842106e-06, "loss": 0.5345, "step": 62 }, { "epoch": 0.2488479262672811, "grad_norm": 0.5759616130629226, "learning_rate": 8.289473684210526e-06, "loss": 0.5271, "step": 63 }, { "epoch": 0.25279789335088876, "grad_norm": 0.5408380279166887, "learning_rate": 8.421052631578948e-06, "loss": 0.5313, "step": 64 }, { "epoch": 0.2567478604344964, "grad_norm": 0.5599709668921061, "learning_rate": 8.552631578947368e-06, "loss": 0.5014, "step": 65 }, { "epoch": 0.260697827518104, "grad_norm": 0.49276069628291064, "learning_rate": 8.68421052631579e-06, "loss": 0.5023, "step": 66 }, { "epoch": 0.2646477946017117, "grad_norm": 0.5253626072163272, "learning_rate": 8.81578947368421e-06, "loss": 0.4912, "step": 67 }, { "epoch": 0.2685977616853193, "grad_norm": 0.6592208211843796, "learning_rate": 8.947368421052632e-06, "loss": 0.5081, "step": 68 }, { "epoch": 0.2725477287689269, "grad_norm": 0.5996182282163528, "learning_rate": 9.078947368421054e-06, "loss": 0.5198, "step": 69 }, { "epoch": 0.2764976958525346, "grad_norm": 0.5435008824717386, "learning_rate": 9.210526315789474e-06, "loss": 0.4816, "step": 70 }, { "epoch": 0.2804476629361422, "grad_norm": 0.568678975917265, "learning_rate": 9.342105263157895e-06, "loss": 0.4906, "step": 71 }, { "epoch": 0.2843976300197498, "grad_norm": 0.5861743970771205, "learning_rate": 9.473684210526315e-06, "loss": 0.5111, "step": 72 }, { "epoch": 0.28834759710335744, "grad_norm": 0.5101410545315266, "learning_rate": 9.605263157894737e-06, "loss": 0.4832, "step": 73 }, { "epoch": 0.2922975641869651, "grad_norm": 0.5497631883203306, "learning_rate": 9.736842105263159e-06, "loss": 0.4666, "step": 74 }, { "epoch": 0.29624753127057274, "grad_norm": 0.5770945737038101, "learning_rate": 9.868421052631579e-06, "loss": 0.4693, "step": 75 }, { "epoch": 0.30019749835418036, "grad_norm": 0.5897957166870517, "learning_rate": 1e-05, "loss": 0.5043, "step": 76 }, { "epoch": 0.30414746543778803, "grad_norm": 0.5201074615650162, "learning_rate": 9.99994710707541e-06, "loss": 0.5208, "step": 77 }, { "epoch": 0.30809743252139565, "grad_norm": 0.6480448160556778, "learning_rate": 9.999788429420697e-06, "loss": 0.4755, "step": 78 }, { "epoch": 0.3120473996050033, "grad_norm": 0.6169194869319706, "learning_rate": 9.999523970393038e-06, "loss": 0.5107, "step": 79 }, { "epoch": 0.31599736668861095, "grad_norm": 0.6195541575778166, "learning_rate": 9.999153735587632e-06, "loss": 0.5021, "step": 80 }, { "epoch": 0.31994733377221857, "grad_norm": 0.621162782715734, "learning_rate": 9.998677732837604e-06, "loss": 0.4876, "step": 81 }, { "epoch": 0.3238973008558262, "grad_norm": 0.6720452136270956, "learning_rate": 9.99809597221382e-06, "loss": 0.4902, "step": 82 }, { "epoch": 0.32784726793943386, "grad_norm": 0.651630608660343, "learning_rate": 9.997408466024692e-06, "loss": 0.4894, "step": 83 }, { "epoch": 0.3317972350230415, "grad_norm": 0.5676776039893645, "learning_rate": 9.996615228815906e-06, "loss": 0.4957, "step": 84 }, { "epoch": 0.3357472021066491, "grad_norm": 0.6005717297536874, "learning_rate": 9.995716277370114e-06, "loss": 0.4695, "step": 85 }, { "epoch": 0.3396971691902567, "grad_norm": 0.626768869144381, "learning_rate": 9.994711630706585e-06, "loss": 0.4934, "step": 86 }, { "epoch": 0.3436471362738644, "grad_norm": 0.5455161777212448, "learning_rate": 9.9936013100808e-06, "loss": 0.491, "step": 87 }, { "epoch": 0.347597103357472, "grad_norm": 0.5333006097387643, "learning_rate": 9.992385338984e-06, "loss": 0.4603, "step": 88 }, { "epoch": 0.35154707044107963, "grad_norm": 0.6366561524059222, "learning_rate": 9.991063743142693e-06, "loss": 0.5062, "step": 89 }, { "epoch": 0.3554970375246873, "grad_norm": 0.5499475574742378, "learning_rate": 9.989636550518105e-06, "loss": 0.4913, "step": 90 }, { "epoch": 0.35944700460829493, "grad_norm": 0.549241711572128, "learning_rate": 9.988103791305594e-06, "loss": 0.4863, "step": 91 }, { "epoch": 0.36339697169190255, "grad_norm": 0.5604139974675725, "learning_rate": 9.986465497934008e-06, "loss": 0.4955, "step": 92 }, { "epoch": 0.3673469387755102, "grad_norm": 0.576968134806151, "learning_rate": 9.984721705064994e-06, "loss": 0.4776, "step": 93 }, { "epoch": 0.37129690585911784, "grad_norm": 0.5766772052442326, "learning_rate": 9.98287244959228e-06, "loss": 0.479, "step": 94 }, { "epoch": 0.37524687294272546, "grad_norm": 0.5937654028305102, "learning_rate": 9.980917770640873e-06, "loss": 0.476, "step": 95 }, { "epoch": 0.37919684002633314, "grad_norm": 0.5671163571769688, "learning_rate": 9.97885770956625e-06, "loss": 0.4783, "step": 96 }, { "epoch": 0.38314680710994076, "grad_norm": 0.5781463142704665, "learning_rate": 9.976692309953472e-06, "loss": 0.499, "step": 97 }, { "epoch": 0.3870967741935484, "grad_norm": 0.4923452243680876, "learning_rate": 9.974421617616267e-06, "loss": 0.4823, "step": 98 }, { "epoch": 0.39104674127715605, "grad_norm": 0.5686942429331671, "learning_rate": 9.97204568059606e-06, "loss": 0.5018, "step": 99 }, { "epoch": 0.39499670836076367, "grad_norm": 0.5309009070888455, "learning_rate": 9.969564549160952e-06, "loss": 0.4894, "step": 100 }, { "epoch": 0.3989466754443713, "grad_norm": 0.5717848268595466, "learning_rate": 9.96697827580466e-06, "loss": 0.4812, "step": 101 }, { "epoch": 0.4028966425279789, "grad_norm": 0.5104085309145232, "learning_rate": 9.964286915245414e-06, "loss": 0.4872, "step": 102 }, { "epoch": 0.4068466096115866, "grad_norm": 0.5581199237176204, "learning_rate": 9.961490524424781e-06, "loss": 0.4895, "step": 103 }, { "epoch": 0.4107965766951942, "grad_norm": 0.5441395470345458, "learning_rate": 9.958589162506481e-06, "loss": 0.4791, "step": 104 }, { "epoch": 0.4147465437788018, "grad_norm": 0.5428579182326724, "learning_rate": 9.955582890875118e-06, "loss": 0.4676, "step": 105 }, { "epoch": 0.4186965108624095, "grad_norm": 0.589119471133675, "learning_rate": 9.952471773134893e-06, "loss": 0.4697, "step": 106 }, { "epoch": 0.4226464779460171, "grad_norm": 0.5178678243366326, "learning_rate": 9.949255875108252e-06, "loss": 0.5059, "step": 107 }, { "epoch": 0.42659644502962474, "grad_norm": 0.5415413848635673, "learning_rate": 9.945935264834495e-06, "loss": 0.4965, "step": 108 }, { "epoch": 0.4305464121132324, "grad_norm": 0.5169475717611434, "learning_rate": 9.942510012568338e-06, "loss": 0.4927, "step": 109 }, { "epoch": 0.43449637919684003, "grad_norm": 0.5184074445147275, "learning_rate": 9.938980190778426e-06, "loss": 0.4826, "step": 110 }, { "epoch": 0.43844634628044765, "grad_norm": 0.6083738891717713, "learning_rate": 9.935345874145797e-06, "loss": 0.4718, "step": 111 }, { "epoch": 0.4423963133640553, "grad_norm": 0.5351570593538689, "learning_rate": 9.931607139562304e-06, "loss": 0.4861, "step": 112 }, { "epoch": 0.44634628044766295, "grad_norm": 0.6015513256486655, "learning_rate": 9.927764066128992e-06, "loss": 0.4602, "step": 113 }, { "epoch": 0.45029624753127057, "grad_norm": 0.5668635320391895, "learning_rate": 9.923816735154417e-06, "loss": 0.4916, "step": 114 }, { "epoch": 0.4542462146148782, "grad_norm": 0.5189427254025247, "learning_rate": 9.919765230152932e-06, "loss": 0.4638, "step": 115 }, { "epoch": 0.45819618169848586, "grad_norm": 0.6319877446824944, "learning_rate": 9.915609636842914e-06, "loss": 0.4817, "step": 116 }, { "epoch": 0.4621461487820935, "grad_norm": 0.5981012229295962, "learning_rate": 9.911350043144958e-06, "loss": 0.4986, "step": 117 }, { "epoch": 0.4660961158657011, "grad_norm": 0.6181624748677259, "learning_rate": 9.906986539180012e-06, "loss": 0.4691, "step": 118 }, { "epoch": 0.4700460829493088, "grad_norm": 0.5628840410354462, "learning_rate": 9.90251921726747e-06, "loss": 0.4767, "step": 119 }, { "epoch": 0.4739960500329164, "grad_norm": 0.6404850244355799, "learning_rate": 9.89794817192322e-06, "loss": 0.501, "step": 120 }, { "epoch": 0.477946017116524, "grad_norm": 0.5258810495702417, "learning_rate": 9.893273499857642e-06, "loss": 0.4767, "step": 121 }, { "epoch": 0.4818959842001317, "grad_norm": 0.6991575782395473, "learning_rate": 9.888495299973574e-06, "loss": 0.4881, "step": 122 }, { "epoch": 0.4858459512837393, "grad_norm": 0.5179390886923614, "learning_rate": 9.883613673364197e-06, "loss": 0.4926, "step": 123 }, { "epoch": 0.4897959183673469, "grad_norm": 0.6629485700830527, "learning_rate": 9.878628723310914e-06, "loss": 0.4821, "step": 124 }, { "epoch": 0.4937458854509546, "grad_norm": 0.5890040880860206, "learning_rate": 9.873540555281162e-06, "loss": 0.4846, "step": 125 }, { "epoch": 0.4976958525345622, "grad_norm": 0.6464512835706927, "learning_rate": 9.868349276926174e-06, "loss": 0.4817, "step": 126 }, { "epoch": 0.5016458196181699, "grad_norm": 0.576460881576221, "learning_rate": 9.863054998078711e-06, "loss": 0.4547, "step": 127 }, { "epoch": 0.5055957867017775, "grad_norm": 0.6618740734226224, "learning_rate": 9.857657830750727e-06, "loss": 0.4747, "step": 128 }, { "epoch": 0.5095457537853851, "grad_norm": 0.6635835006083409, "learning_rate": 9.85215788913101e-06, "loss": 0.4899, "step": 129 }, { "epoch": 0.5134957208689928, "grad_norm": 0.6196320028368675, "learning_rate": 9.846555289582757e-06, "loss": 0.464, "step": 130 }, { "epoch": 0.5174456879526004, "grad_norm": 0.7017645391670726, "learning_rate": 9.840850150641117e-06, "loss": 0.5099, "step": 131 }, { "epoch": 0.521395655036208, "grad_norm": 0.6156041743103616, "learning_rate": 9.835042593010688e-06, "loss": 0.4528, "step": 132 }, { "epoch": 0.5253456221198156, "grad_norm": 0.6101813150476937, "learning_rate": 9.82913273956295e-06, "loss": 0.4358, "step": 133 }, { "epoch": 0.5292955892034233, "grad_norm": 0.5670769429239731, "learning_rate": 9.823120715333677e-06, "loss": 0.4464, "step": 134 }, { "epoch": 0.533245556287031, "grad_norm": 0.6022397090632055, "learning_rate": 9.817006647520285e-06, "loss": 0.4751, "step": 135 }, { "epoch": 0.5371955233706386, "grad_norm": 0.5297220445443962, "learning_rate": 9.810790665479147e-06, "loss": 0.4441, "step": 136 }, { "epoch": 0.5411454904542462, "grad_norm": 0.5258520864490437, "learning_rate": 9.80447290072285e-06, "loss": 0.4797, "step": 137 }, { "epoch": 0.5450954575378538, "grad_norm": 0.4996418334796664, "learning_rate": 9.798053486917417e-06, "loss": 0.4617, "step": 138 }, { "epoch": 0.5490454246214614, "grad_norm": 0.5671128908945776, "learning_rate": 9.791532559879475e-06, "loss": 0.4575, "step": 139 }, { "epoch": 0.5529953917050692, "grad_norm": 0.46439037381822834, "learning_rate": 9.784910257573384e-06, "loss": 0.4605, "step": 140 }, { "epoch": 0.5569453587886768, "grad_norm": 0.5385566253621885, "learning_rate": 9.77818672010832e-06, "loss": 0.4632, "step": 141 }, { "epoch": 0.5608953258722844, "grad_norm": 0.5379645512099025, "learning_rate": 9.771362089735308e-06, "loss": 0.4782, "step": 142 }, { "epoch": 0.564845292955892, "grad_norm": 0.5291475366458595, "learning_rate": 9.764436510844211e-06, "loss": 0.4722, "step": 143 }, { "epoch": 0.5687952600394997, "grad_norm": 0.5549937488638883, "learning_rate": 9.757410129960677e-06, "loss": 0.475, "step": 144 }, { "epoch": 0.5727452271231073, "grad_norm": 0.5473290434958883, "learning_rate": 9.750283095743038e-06, "loss": 0.4606, "step": 145 }, { "epoch": 0.5766951942067149, "grad_norm": 0.5268038681083071, "learning_rate": 9.743055558979172e-06, "loss": 0.4689, "step": 146 }, { "epoch": 0.5806451612903226, "grad_norm": 0.5096669822291863, "learning_rate": 9.735727672583298e-06, "loss": 0.4683, "step": 147 }, { "epoch": 0.5845951283739302, "grad_norm": 0.5695837246815498, "learning_rate": 9.728299591592754e-06, "loss": 0.4854, "step": 148 }, { "epoch": 0.5885450954575379, "grad_norm": 0.5888568576067914, "learning_rate": 9.720771473164711e-06, "loss": 0.4723, "step": 149 }, { "epoch": 0.5924950625411455, "grad_norm": 0.5252127492904591, "learning_rate": 9.713143476572853e-06, "loss": 0.47, "step": 150 }, { "epoch": 0.5964450296247531, "grad_norm": 0.5094055424742244, "learning_rate": 9.705415763203992e-06, "loss": 0.4729, "step": 151 }, { "epoch": 0.6003949967083607, "grad_norm": 0.6282469950692865, "learning_rate": 9.697588496554679e-06, "loss": 0.4761, "step": 152 }, { "epoch": 0.6043449637919684, "grad_norm": 0.6426767885434058, "learning_rate": 9.68966184222772e-06, "loss": 0.4755, "step": 153 }, { "epoch": 0.6082949308755761, "grad_norm": 0.5622361615609462, "learning_rate": 9.681635967928687e-06, "loss": 0.4651, "step": 154 }, { "epoch": 0.6122448979591837, "grad_norm": 0.6648424628876591, "learning_rate": 9.673511043462367e-06, "loss": 0.456, "step": 155 }, { "epoch": 0.6161948650427913, "grad_norm": 0.7333902821604775, "learning_rate": 9.665287240729166e-06, "loss": 0.4695, "step": 156 }, { "epoch": 0.6201448321263989, "grad_norm": 0.6443651454553839, "learning_rate": 9.656964733721476e-06, "loss": 0.4808, "step": 157 }, { "epoch": 0.6240947992100065, "grad_norm": 0.7932466298591265, "learning_rate": 9.648543698519993e-06, "loss": 0.496, "step": 158 }, { "epoch": 0.6280447662936142, "grad_norm": 0.5665540196550471, "learning_rate": 9.640024313289982e-06, "loss": 0.4908, "step": 159 }, { "epoch": 0.6319947333772219, "grad_norm": 0.6356083361139434, "learning_rate": 9.63140675827753e-06, "loss": 0.4982, "step": 160 }, { "epoch": 0.6359447004608295, "grad_norm": 0.5186680316983545, "learning_rate": 9.62269121580571e-06, "loss": 0.4652, "step": 161 }, { "epoch": 0.6398946675444371, "grad_norm": 0.6228924174757652, "learning_rate": 9.613877870270735e-06, "loss": 0.4732, "step": 162 }, { "epoch": 0.6438446346280448, "grad_norm": 0.5497308794093734, "learning_rate": 9.604966908138052e-06, "loss": 0.4713, "step": 163 }, { "epoch": 0.6477946017116524, "grad_norm": 0.5208533245267041, "learning_rate": 9.5959585179384e-06, "loss": 0.4555, "step": 164 }, { "epoch": 0.65174456879526, "grad_norm": 0.6379633267065392, "learning_rate": 9.586852890263822e-06, "loss": 0.4686, "step": 165 }, { "epoch": 0.6556945358788677, "grad_norm": 0.5433162450238969, "learning_rate": 9.577650217763627e-06, "loss": 0.4776, "step": 166 }, { "epoch": 0.6596445029624753, "grad_norm": 0.5125049288783692, "learning_rate": 9.568350695140322e-06, "loss": 0.4683, "step": 167 }, { "epoch": 0.663594470046083, "grad_norm": 0.5850586628671525, "learning_rate": 9.558954519145487e-06, "loss": 0.4491, "step": 168 }, { "epoch": 0.6675444371296906, "grad_norm": 0.5637652572450188, "learning_rate": 9.549461888575611e-06, "loss": 0.4764, "step": 169 }, { "epoch": 0.6714944042132982, "grad_norm": 0.4724538986566349, "learning_rate": 9.539873004267892e-06, "loss": 0.4477, "step": 170 }, { "epoch": 0.6754443712969058, "grad_norm": 0.5974403493801838, "learning_rate": 9.530188069095985e-06, "loss": 0.4526, "step": 171 }, { "epoch": 0.6793943383805134, "grad_norm": 0.5554923337902067, "learning_rate": 9.520407287965707e-06, "loss": 0.4777, "step": 172 }, { "epoch": 0.6833443054641212, "grad_norm": 0.6338405413860229, "learning_rate": 9.510530867810706e-06, "loss": 0.4449, "step": 173 }, { "epoch": 0.6872942725477288, "grad_norm": 0.5618931553939812, "learning_rate": 9.500559017588081e-06, "loss": 0.4648, "step": 174 }, { "epoch": 0.6912442396313364, "grad_norm": 0.5819308869756974, "learning_rate": 9.490491948273961e-06, "loss": 0.4677, "step": 175 }, { "epoch": 0.695194206714944, "grad_norm": 0.5914857222260143, "learning_rate": 9.48032987285904e-06, "loss": 0.4536, "step": 176 }, { "epoch": 0.6991441737985516, "grad_norm": 0.5889793190678951, "learning_rate": 9.470073006344074e-06, "loss": 0.4647, "step": 177 }, { "epoch": 0.7030941408821593, "grad_norm": 0.5098628885738195, "learning_rate": 9.45972156573533e-06, "loss": 0.4515, "step": 178 }, { "epoch": 0.707044107965767, "grad_norm": 0.5691095895141721, "learning_rate": 9.449275770039995e-06, "loss": 0.4435, "step": 179 }, { "epoch": 0.7109940750493746, "grad_norm": 0.5182604084751163, "learning_rate": 9.438735840261541e-06, "loss": 0.4866, "step": 180 }, { "epoch": 0.7149440421329822, "grad_norm": 0.5108378482576097, "learning_rate": 9.428101999395057e-06, "loss": 0.4923, "step": 181 }, { "epoch": 0.7188940092165899, "grad_norm": 0.5337595682679888, "learning_rate": 9.417374472422514e-06, "loss": 0.4364, "step": 182 }, { "epoch": 0.7228439763001975, "grad_norm": 0.5001601656405088, "learning_rate": 9.406553486308028e-06, "loss": 0.4704, "step": 183 }, { "epoch": 0.7267939433838051, "grad_norm": 0.507633534833019, "learning_rate": 9.395639269993035e-06, "loss": 0.4542, "step": 184 }, { "epoch": 0.7307439104674127, "grad_norm": 0.5676958888775687, "learning_rate": 9.384632054391468e-06, "loss": 0.4563, "step": 185 }, { "epoch": 0.7346938775510204, "grad_norm": 0.5367000202389174, "learning_rate": 9.373532072384852e-06, "loss": 0.4493, "step": 186 }, { "epoch": 0.7386438446346281, "grad_norm": 0.46460640437170164, "learning_rate": 9.362339558817395e-06, "loss": 0.4366, "step": 187 }, { "epoch": 0.7425938117182357, "grad_norm": 0.5751294580853923, "learning_rate": 9.351054750491005e-06, "loss": 0.448, "step": 188 }, { "epoch": 0.7465437788018433, "grad_norm": 0.577033532716407, "learning_rate": 9.339677886160293e-06, "loss": 0.4775, "step": 189 }, { "epoch": 0.7504937458854509, "grad_norm": 0.5136719814842392, "learning_rate": 9.328209206527503e-06, "loss": 0.4484, "step": 190 }, { "epoch": 0.7544437129690585, "grad_norm": 0.5257009381474894, "learning_rate": 9.316648954237441e-06, "loss": 0.4532, "step": 191 }, { "epoch": 0.7583936800526663, "grad_norm": 0.48549776905425, "learning_rate": 9.304997373872332e-06, "loss": 0.4845, "step": 192 }, { "epoch": 0.7623436471362739, "grad_norm": 0.4944468650331745, "learning_rate": 9.293254711946634e-06, "loss": 0.4557, "step": 193 }, { "epoch": 0.7662936142198815, "grad_norm": 0.4886408400631905, "learning_rate": 9.281421216901844e-06, "loss": 0.452, "step": 194 }, { "epoch": 0.7702435813034891, "grad_norm": 0.5229732058159794, "learning_rate": 9.269497139101224e-06, "loss": 0.4711, "step": 195 }, { "epoch": 0.7741935483870968, "grad_norm": 0.599866494481382, "learning_rate": 9.257482730824516e-06, "loss": 0.4629, "step": 196 }, { "epoch": 0.7781435154707044, "grad_norm": 0.5050711429543929, "learning_rate": 9.245378246262593e-06, "loss": 0.4586, "step": 197 }, { "epoch": 0.7820934825543121, "grad_norm": 0.5055494208192743, "learning_rate": 9.233183941512093e-06, "loss": 0.4717, "step": 198 }, { "epoch": 0.7860434496379197, "grad_norm": 0.5498748434843441, "learning_rate": 9.220900074569994e-06, "loss": 0.4316, "step": 199 }, { "epoch": 0.7899934167215273, "grad_norm": 0.5250619998109198, "learning_rate": 9.208526905328151e-06, "loss": 0.4753, "step": 200 }, { "epoch": 0.793943383805135, "grad_norm": 0.5730292920017251, "learning_rate": 9.19606469556781e-06, "loss": 0.4618, "step": 201 }, { "epoch": 0.7978933508887426, "grad_norm": 0.5834853364687494, "learning_rate": 9.183513708954058e-06, "loss": 0.4474, "step": 202 }, { "epoch": 0.8018433179723502, "grad_norm": 0.5766779992714471, "learning_rate": 9.170874211030252e-06, "loss": 0.4796, "step": 203 }, { "epoch": 0.8057932850559578, "grad_norm": 0.6339335179507832, "learning_rate": 9.158146469212394e-06, "loss": 0.4762, "step": 204 }, { "epoch": 0.8097432521395656, "grad_norm": 0.5302572756545678, "learning_rate": 9.145330752783482e-06, "loss": 0.466, "step": 205 }, { "epoch": 0.8136932192231732, "grad_norm": 0.593440741154616, "learning_rate": 9.132427332887804e-06, "loss": 0.4666, "step": 206 }, { "epoch": 0.8176431863067808, "grad_norm": 0.5859461907043721, "learning_rate": 9.119436482525205e-06, "loss": 0.462, "step": 207 }, { "epoch": 0.8215931533903884, "grad_norm": 0.5237454115346878, "learning_rate": 9.106358476545313e-06, "loss": 0.4519, "step": 208 }, { "epoch": 0.825543120473996, "grad_norm": 0.6172976769919012, "learning_rate": 9.093193591641723e-06, "loss": 0.4621, "step": 209 }, { "epoch": 0.8294930875576036, "grad_norm": 0.5749660198403135, "learning_rate": 9.079942106346138e-06, "loss": 0.4472, "step": 210 }, { "epoch": 0.8334430546412114, "grad_norm": 0.5539958289134616, "learning_rate": 9.066604301022485e-06, "loss": 0.467, "step": 211 }, { "epoch": 0.837393021724819, "grad_norm": 0.6008114863070498, "learning_rate": 9.053180457860978e-06, "loss": 0.4692, "step": 212 }, { "epoch": 0.8413429888084266, "grad_norm": 0.6736909702465329, "learning_rate": 9.039670860872145e-06, "loss": 0.4711, "step": 213 }, { "epoch": 0.8452929558920342, "grad_norm": 0.5245072828257612, "learning_rate": 9.026075795880822e-06, "loss": 0.465, "step": 214 }, { "epoch": 0.8492429229756419, "grad_norm": 0.6355448137905055, "learning_rate": 9.01239555052011e-06, "loss": 0.4703, "step": 215 }, { "epoch": 0.8531928900592495, "grad_norm": 0.5605598954757884, "learning_rate": 8.998630414225284e-06, "loss": 0.4491, "step": 216 }, { "epoch": 0.8571428571428571, "grad_norm": 0.6127064016250322, "learning_rate": 8.98478067822767e-06, "loss": 0.4687, "step": 217 }, { "epoch": 0.8610928242264648, "grad_norm": 0.6392360868189375, "learning_rate": 8.970846635548483e-06, "loss": 0.4742, "step": 218 }, { "epoch": 0.8650427913100724, "grad_norm": 0.6526369659230478, "learning_rate": 8.956828580992633e-06, "loss": 0.4566, "step": 219 }, { "epoch": 0.8689927583936801, "grad_norm": 0.6783773870903784, "learning_rate": 8.942726811142478e-06, "loss": 0.4686, "step": 220 }, { "epoch": 0.8729427254772877, "grad_norm": 0.5418694177687857, "learning_rate": 8.928541624351563e-06, "loss": 0.4598, "step": 221 }, { "epoch": 0.8768926925608953, "grad_norm": 0.6545190257040637, "learning_rate": 8.914273320738288e-06, "loss": 0.4876, "step": 222 }, { "epoch": 0.8808426596445029, "grad_norm": 0.5399982294464548, "learning_rate": 8.899922202179581e-06, "loss": 0.4524, "step": 223 }, { "epoch": 0.8847926267281107, "grad_norm": 0.5568746126845764, "learning_rate": 8.88548857230449e-06, "loss": 0.4682, "step": 224 }, { "epoch": 0.8887425938117183, "grad_norm": 0.5030276500366323, "learning_rate": 8.870972736487775e-06, "loss": 0.4472, "step": 225 }, { "epoch": 0.8926925608953259, "grad_norm": 0.5865526165014883, "learning_rate": 8.856375001843442e-06, "loss": 0.4909, "step": 226 }, { "epoch": 0.8966425279789335, "grad_norm": 0.5663275709546551, "learning_rate": 8.84169567721824e-06, "loss": 0.4474, "step": 227 }, { "epoch": 0.9005924950625411, "grad_norm": 0.6265419589882282, "learning_rate": 8.826935073185135e-06, "loss": 0.476, "step": 228 }, { "epoch": 0.9045424621461488, "grad_norm": 0.5331805011597713, "learning_rate": 8.812093502036732e-06, "loss": 0.4598, "step": 229 }, { "epoch": 0.9084924292297564, "grad_norm": 0.5928841305928686, "learning_rate": 8.797171277778672e-06, "loss": 0.4766, "step": 230 }, { "epoch": 0.9124423963133641, "grad_norm": 0.6555032003037463, "learning_rate": 8.782168716122988e-06, "loss": 0.4515, "step": 231 }, { "epoch": 0.9163923633969717, "grad_norm": 0.5064869069889608, "learning_rate": 8.767086134481427e-06, "loss": 0.4459, "step": 232 }, { "epoch": 0.9203423304805793, "grad_norm": 0.638200939068264, "learning_rate": 8.751923851958728e-06, "loss": 0.4807, "step": 233 }, { "epoch": 0.924292297564187, "grad_norm": 0.4633736298652728, "learning_rate": 8.73668218934588e-06, "loss": 0.4218, "step": 234 }, { "epoch": 0.9282422646477946, "grad_norm": 0.6702414679069545, "learning_rate": 8.721361469113324e-06, "loss": 0.4694, "step": 235 }, { "epoch": 0.9321922317314022, "grad_norm": 0.5247264787366135, "learning_rate": 8.705962015404143e-06, "loss": 0.4399, "step": 236 }, { "epoch": 0.9361421988150099, "grad_norm": 0.540794256034141, "learning_rate": 8.690484154027192e-06, "loss": 0.4172, "step": 237 }, { "epoch": 0.9400921658986175, "grad_norm": 0.5879208517625489, "learning_rate": 8.674928212450216e-06, "loss": 0.4773, "step": 238 }, { "epoch": 0.9440421329822252, "grad_norm": 0.6181191136722756, "learning_rate": 8.659294519792909e-06, "loss": 0.4733, "step": 239 }, { "epoch": 0.9479921000658328, "grad_norm": 0.5583946921545812, "learning_rate": 8.643583406819965e-06, "loss": 0.4369, "step": 240 }, { "epoch": 0.9519420671494404, "grad_norm": 0.5504099027588751, "learning_rate": 8.627795205934069e-06, "loss": 0.464, "step": 241 }, { "epoch": 0.955892034233048, "grad_norm": 0.6593101224262681, "learning_rate": 8.611930251168867e-06, "loss": 0.4803, "step": 242 }, { "epoch": 0.9598420013166556, "grad_norm": 0.5582778976615183, "learning_rate": 8.595988878181904e-06, "loss": 0.4543, "step": 243 }, { "epoch": 0.9637919684002634, "grad_norm": 0.5112952824773496, "learning_rate": 8.579971424247514e-06, "loss": 0.4697, "step": 244 }, { "epoch": 0.967741935483871, "grad_norm": 0.5585560532283136, "learning_rate": 8.563878228249686e-06, "loss": 0.4555, "step": 245 }, { "epoch": 0.9716919025674786, "grad_norm": 0.6853082634878661, "learning_rate": 8.547709630674909e-06, "loss": 0.4635, "step": 246 }, { "epoch": 0.9756418696510862, "grad_norm": 0.6035479887143546, "learning_rate": 8.531465973604946e-06, "loss": 0.4779, "step": 247 }, { "epoch": 0.9795918367346939, "grad_norm": 0.5717332618303782, "learning_rate": 8.515147600709604e-06, "loss": 0.4747, "step": 248 }, { "epoch": 0.9835418038183015, "grad_norm": 0.6227826342825249, "learning_rate": 8.498754857239472e-06, "loss": 0.4459, "step": 249 }, { "epoch": 0.9874917709019092, "grad_norm": 0.5617502965826755, "learning_rate": 8.482288090018608e-06, "loss": 0.4557, "step": 250 }, { "epoch": 0.9914417379855168, "grad_norm": 0.549726221233288, "learning_rate": 8.465747647437205e-06, "loss": 0.4597, "step": 251 }, { "epoch": 0.9953917050691244, "grad_norm": 0.605000332315275, "learning_rate": 8.449133879444211e-06, "loss": 0.4709, "step": 252 }, { "epoch": 0.9993416721527321, "grad_norm": 0.5306724995883702, "learning_rate": 8.43244713753994e-06, "loss": 0.4615, "step": 253 }, { "epoch": 1.0032916392363398, "grad_norm": 0.5468070013061788, "learning_rate": 8.415687774768625e-06, "loss": 0.4203, "step": 254 }, { "epoch": 1.0072416063199474, "grad_norm": 0.5253125597043682, "learning_rate": 8.398856145710953e-06, "loss": 0.3922, "step": 255 }, { "epoch": 1.011191573403555, "grad_norm": 0.5426955599750107, "learning_rate": 8.381952606476552e-06, "loss": 0.4057, "step": 256 }, { "epoch": 1.0151415404871627, "grad_norm": 0.4773406095752995, "learning_rate": 8.364977514696474e-06, "loss": 0.4171, "step": 257 }, { "epoch": 1.0190915075707703, "grad_norm": 0.5988750520897403, "learning_rate": 8.347931229515625e-06, "loss": 0.429, "step": 258 }, { "epoch": 1.023041474654378, "grad_norm": 0.503670886345039, "learning_rate": 8.330814111585149e-06, "loss": 0.4207, "step": 259 }, { "epoch": 1.0269914417379855, "grad_norm": 0.5468995103994461, "learning_rate": 8.31362652305482e-06, "loss": 0.4054, "step": 260 }, { "epoch": 1.0309414088215931, "grad_norm": 0.5053019654372004, "learning_rate": 8.296368827565365e-06, "loss": 0.4109, "step": 261 }, { "epoch": 1.0348913759052007, "grad_norm": 0.6196190589274644, "learning_rate": 8.279041390240781e-06, "loss": 0.4287, "step": 262 }, { "epoch": 1.0388413429888084, "grad_norm": 0.5403626479871063, "learning_rate": 8.261644577680603e-06, "loss": 0.4363, "step": 263 }, { "epoch": 1.042791310072416, "grad_norm": 0.5868791088397879, "learning_rate": 8.244178757952149e-06, "loss": 0.421, "step": 264 }, { "epoch": 1.0467412771560236, "grad_norm": 0.6183506529026369, "learning_rate": 8.22664430058273e-06, "loss": 0.4173, "step": 265 }, { "epoch": 1.0506912442396312, "grad_norm": 0.494874302279448, "learning_rate": 8.209041576551842e-06, "loss": 0.3834, "step": 266 }, { "epoch": 1.054641211323239, "grad_norm": 0.6219023667379031, "learning_rate": 8.191370958283305e-06, "loss": 0.4074, "step": 267 }, { "epoch": 1.0585911784068467, "grad_norm": 0.5600381950919785, "learning_rate": 8.17363281963739e-06, "loss": 0.3892, "step": 268 }, { "epoch": 1.0625411454904543, "grad_norm": 0.6337715084794726, "learning_rate": 8.155827535902912e-06, "loss": 0.4109, "step": 269 }, { "epoch": 1.066491112574062, "grad_norm": 0.5790760517013389, "learning_rate": 8.137955483789279e-06, "loss": 0.3938, "step": 270 }, { "epoch": 1.0704410796576695, "grad_norm": 0.5725751077156453, "learning_rate": 8.120017041418539e-06, "loss": 0.3974, "step": 271 }, { "epoch": 1.0743910467412772, "grad_norm": 0.5270558371378474, "learning_rate": 8.102012588317356e-06, "loss": 0.4044, "step": 272 }, { "epoch": 1.0783410138248848, "grad_norm": 0.554768694475077, "learning_rate": 8.083942505409009e-06, "loss": 0.4076, "step": 273 }, { "epoch": 1.0822909809084924, "grad_norm": 0.5399286833096253, "learning_rate": 8.065807175005307e-06, "loss": 0.4064, "step": 274 }, { "epoch": 1.0862409479921, "grad_norm": 0.5131321744363273, "learning_rate": 8.047606980798517e-06, "loss": 0.4083, "step": 275 }, { "epoch": 1.0901909150757076, "grad_norm": 0.6155334091097188, "learning_rate": 8.029342307853238e-06, "loss": 0.4336, "step": 276 }, { "epoch": 1.0941408821593153, "grad_norm": 0.4963326594997601, "learning_rate": 8.011013542598258e-06, "loss": 0.4283, "step": 277 }, { "epoch": 1.0980908492429229, "grad_norm": 0.4931375150904232, "learning_rate": 7.992621072818377e-06, "loss": 0.4345, "step": 278 }, { "epoch": 1.1020408163265305, "grad_norm": 0.5455901405669513, "learning_rate": 7.9741652876462e-06, "loss": 0.4128, "step": 279 }, { "epoch": 1.1059907834101383, "grad_norm": 0.5588668489786742, "learning_rate": 7.95564657755391e-06, "loss": 0.4147, "step": 280 }, { "epoch": 1.109940750493746, "grad_norm": 0.5616946557431427, "learning_rate": 7.937065334345002e-06, "loss": 0.4166, "step": 281 }, { "epoch": 1.1138907175773536, "grad_norm": 0.5656400204184403, "learning_rate": 7.918421951145993e-06, "loss": 0.4168, "step": 282 }, { "epoch": 1.1178406846609612, "grad_norm": 0.601724269921387, "learning_rate": 7.899716822398107e-06, "loss": 0.4168, "step": 283 }, { "epoch": 1.1217906517445688, "grad_norm": 0.5334102795862128, "learning_rate": 7.880950343848933e-06, "loss": 0.4183, "step": 284 }, { "epoch": 1.1257406188281764, "grad_norm": 0.5196141736064548, "learning_rate": 7.862122912544043e-06, "loss": 0.411, "step": 285 }, { "epoch": 1.129690585911784, "grad_norm": 0.5410773174416676, "learning_rate": 7.843234926818595e-06, "loss": 0.4199, "step": 286 }, { "epoch": 1.1336405529953917, "grad_norm": 0.521512637148522, "learning_rate": 7.824286786288919e-06, "loss": 0.3898, "step": 287 }, { "epoch": 1.1375905200789993, "grad_norm": 0.4743870737171394, "learning_rate": 7.805278891844036e-06, "loss": 0.3994, "step": 288 }, { "epoch": 1.141540487162607, "grad_norm": 0.5227700481039957, "learning_rate": 7.786211645637197e-06, "loss": 0.418, "step": 289 }, { "epoch": 1.1454904542462145, "grad_norm": 0.4677344521226036, "learning_rate": 7.76708545107737e-06, "loss": 0.4186, "step": 290 }, { "epoch": 1.1494404213298222, "grad_norm": 0.4915983580050301, "learning_rate": 7.747900712820705e-06, "loss": 0.4133, "step": 291 }, { "epoch": 1.1533903884134298, "grad_norm": 0.4677633729157998, "learning_rate": 7.72865783676196e-06, "loss": 0.4067, "step": 292 }, { "epoch": 1.1573403554970376, "grad_norm": 0.5591599779072501, "learning_rate": 7.709357230025937e-06, "loss": 0.4236, "step": 293 }, { "epoch": 1.1612903225806452, "grad_norm": 0.4773208174606624, "learning_rate": 7.689999300958853e-06, "loss": 0.4126, "step": 294 }, { "epoch": 1.1652402896642529, "grad_norm": 0.5075008240750409, "learning_rate": 7.670584459119695e-06, "loss": 0.399, "step": 295 }, { "epoch": 1.1691902567478605, "grad_norm": 0.5614704606769243, "learning_rate": 7.651113115271573e-06, "loss": 0.4256, "step": 296 }, { "epoch": 1.173140223831468, "grad_norm": 0.5394048297427791, "learning_rate": 7.631585681373014e-06, "loss": 0.4179, "step": 297 }, { "epoch": 1.1770901909150757, "grad_norm": 0.46228670128781124, "learning_rate": 7.612002570569254e-06, "loss": 0.411, "step": 298 }, { "epoch": 1.1810401579986833, "grad_norm": 0.4886061196127952, "learning_rate": 7.592364197183495e-06, "loss": 0.42, "step": 299 }, { "epoch": 1.184990125082291, "grad_norm": 0.5066201067505931, "learning_rate": 7.572670976708137e-06, "loss": 0.4082, "step": 300 }, { "epoch": 1.1889400921658986, "grad_norm": 0.458978956556694, "learning_rate": 7.552923325795991e-06, "loss": 0.3967, "step": 301 }, { "epoch": 1.1928900592495062, "grad_norm": 0.4676936026340293, "learning_rate": 7.5331216622514595e-06, "loss": 0.3964, "step": 302 }, { "epoch": 1.1968400263331138, "grad_norm": 0.49396690831001383, "learning_rate": 7.513266405021704e-06, "loss": 0.3961, "step": 303 }, { "epoch": 1.2007899934167214, "grad_norm": 0.4891807114221917, "learning_rate": 7.4933579741877715e-06, "loss": 0.3979, "step": 304 }, { "epoch": 1.204739960500329, "grad_norm": 0.49042001839802846, "learning_rate": 7.473396790955715e-06, "loss": 0.4031, "step": 305 }, { "epoch": 1.208689927583937, "grad_norm": 0.6007902779640868, "learning_rate": 7.4533832776476785e-06, "loss": 0.4095, "step": 306 }, { "epoch": 1.2126398946675445, "grad_norm": 0.47247263801988787, "learning_rate": 7.433317857692963e-06, "loss": 0.3968, "step": 307 }, { "epoch": 1.2165898617511521, "grad_norm": 0.4766011056040539, "learning_rate": 7.413200955619066e-06, "loss": 0.4025, "step": 308 }, { "epoch": 1.2205398288347598, "grad_norm": 0.5982182446831422, "learning_rate": 7.3930329970426984e-06, "loss": 0.404, "step": 309 }, { "epoch": 1.2244897959183674, "grad_norm": 0.5349831648259383, "learning_rate": 7.372814408660789e-06, "loss": 0.3941, "step": 310 }, { "epoch": 1.228439763001975, "grad_norm": 0.485611104331788, "learning_rate": 7.352545618241444e-06, "loss": 0.3979, "step": 311 }, { "epoch": 1.2323897300855826, "grad_norm": 0.5332021733971769, "learning_rate": 7.332227054614904e-06, "loss": 0.3991, "step": 312 }, { "epoch": 1.2363396971691902, "grad_norm": 0.47398708172575216, "learning_rate": 7.311859147664473e-06, "loss": 0.417, "step": 313 }, { "epoch": 1.2402896642527979, "grad_norm": 0.507133082649169, "learning_rate": 7.291442328317414e-06, "loss": 0.4184, "step": 314 }, { "epoch": 1.2442396313364055, "grad_norm": 0.5598309241048224, "learning_rate": 7.270977028535846e-06, "loss": 0.4217, "step": 315 }, { "epoch": 1.248189598420013, "grad_norm": 0.47250208694649426, "learning_rate": 7.250463681307589e-06, "loss": 0.4102, "step": 316 }, { "epoch": 1.252139565503621, "grad_norm": 0.5519959757532354, "learning_rate": 7.229902720637014e-06, "loss": 0.4044, "step": 317 }, { "epoch": 1.2560895325872283, "grad_norm": 0.4854196157312305, "learning_rate": 7.2092945815358605e-06, "loss": 0.3894, "step": 318 }, { "epoch": 1.2600394996708362, "grad_norm": 0.4735739967048277, "learning_rate": 7.1886397000140265e-06, "loss": 0.4041, "step": 319 }, { "epoch": 1.2639894667544438, "grad_norm": 0.556876077278747, "learning_rate": 7.167938513070345e-06, "loss": 0.4298, "step": 320 }, { "epoch": 1.2679394338380514, "grad_norm": 0.5013257083184863, "learning_rate": 7.147191458683349e-06, "loss": 0.4357, "step": 321 }, { "epoch": 1.271889400921659, "grad_norm": 0.49830143585623826, "learning_rate": 7.126398975801989e-06, "loss": 0.419, "step": 322 }, { "epoch": 1.2758393680052666, "grad_norm": 0.48052619402439883, "learning_rate": 7.105561504336357e-06, "loss": 0.423, "step": 323 }, { "epoch": 1.2797893350888743, "grad_norm": 0.4675034381397055, "learning_rate": 7.084679485148376e-06, "loss": 0.3911, "step": 324 }, { "epoch": 1.2837393021724819, "grad_norm": 0.49732814701427736, "learning_rate": 7.063753360042471e-06, "loss": 0.4052, "step": 325 }, { "epoch": 1.2876892692560895, "grad_norm": 0.5123892209600395, "learning_rate": 7.042783571756229e-06, "loss": 0.4113, "step": 326 }, { "epoch": 1.2916392363396971, "grad_norm": 0.44087573321390805, "learning_rate": 7.021770563951018e-06, "loss": 0.4239, "step": 327 }, { "epoch": 1.2955892034233047, "grad_norm": 0.4337002783461821, "learning_rate": 7.0007147812026136e-06, "loss": 0.4184, "step": 328 }, { "epoch": 1.2995391705069124, "grad_norm": 0.46289536747161647, "learning_rate": 6.979616668991791e-06, "loss": 0.3832, "step": 329 }, { "epoch": 1.3034891375905202, "grad_norm": 0.45650576688624306, "learning_rate": 6.958476673694888e-06, "loss": 0.3903, "step": 330 }, { "epoch": 1.3074391046741276, "grad_norm": 0.4793994719486388, "learning_rate": 6.93729524257438e-06, "loss": 0.3995, "step": 331 }, { "epoch": 1.3113890717577354, "grad_norm": 0.44080301462548366, "learning_rate": 6.9160728237694e-06, "loss": 0.4023, "step": 332 }, { "epoch": 1.315339038841343, "grad_norm": 0.5005822253090466, "learning_rate": 6.89480986628627e-06, "loss": 0.415, "step": 333 }, { "epoch": 1.3192890059249507, "grad_norm": 0.4914348440490074, "learning_rate": 6.873506819988986e-06, "loss": 0.4011, "step": 334 }, { "epoch": 1.3232389730085583, "grad_norm": 0.4490367053053073, "learning_rate": 6.852164135589725e-06, "loss": 0.3991, "step": 335 }, { "epoch": 1.327188940092166, "grad_norm": 0.49779632699430143, "learning_rate": 6.830782264639281e-06, "loss": 0.3997, "step": 336 }, { "epoch": 1.3311389071757735, "grad_norm": 0.47757641752113966, "learning_rate": 6.809361659517528e-06, "loss": 0.4072, "step": 337 }, { "epoch": 1.3350888742593812, "grad_norm": 0.47517228293488023, "learning_rate": 6.78790277342385e-06, "loss": 0.4233, "step": 338 }, { "epoch": 1.3390388413429888, "grad_norm": 0.5603239117996478, "learning_rate": 6.766406060367544e-06, "loss": 0.4165, "step": 339 }, { "epoch": 1.3429888084265964, "grad_norm": 0.45220772374120183, "learning_rate": 6.744871975158216e-06, "loss": 0.4454, "step": 340 }, { "epoch": 1.346938775510204, "grad_norm": 0.5405755655244042, "learning_rate": 6.723300973396167e-06, "loss": 0.3973, "step": 341 }, { "epoch": 1.3508887425938116, "grad_norm": 0.5919336354728978, "learning_rate": 6.701693511462744e-06, "loss": 0.4257, "step": 342 }, { "epoch": 1.3548387096774195, "grad_norm": 0.4512061556063902, "learning_rate": 6.680050046510689e-06, "loss": 0.4136, "step": 343 }, { "epoch": 1.3587886767610269, "grad_norm": 0.502381259326561, "learning_rate": 6.658371036454464e-06, "loss": 0.4257, "step": 344 }, { "epoch": 1.3627386438446347, "grad_norm": 0.5100548598777424, "learning_rate": 6.636656939960569e-06, "loss": 0.3852, "step": 345 }, { "epoch": 1.3666886109282423, "grad_norm": 0.5170027310577614, "learning_rate": 6.614908216437832e-06, "loss": 0.3894, "step": 346 }, { "epoch": 1.37063857801185, "grad_norm": 0.4927734595024461, "learning_rate": 6.59312532602769e-06, "loss": 0.4179, "step": 347 }, { "epoch": 1.3745885450954576, "grad_norm": 0.5038874800728005, "learning_rate": 6.57130872959445e-06, "loss": 0.4277, "step": 348 }, { "epoch": 1.3785385121790652, "grad_norm": 0.4878507205829387, "learning_rate": 6.549458888715555e-06, "loss": 0.4195, "step": 349 }, { "epoch": 1.3824884792626728, "grad_norm": 0.4576463489000922, "learning_rate": 6.527576265671796e-06, "loss": 0.4223, "step": 350 }, { "epoch": 1.3864384463462804, "grad_norm": 0.5195912966585179, "learning_rate": 6.505661323437544e-06, "loss": 0.416, "step": 351 }, { "epoch": 1.390388413429888, "grad_norm": 0.5533984569888453, "learning_rate": 6.483714525670956e-06, "loss": 0.4091, "step": 352 }, { "epoch": 1.3943383805134957, "grad_norm": 0.4898112788050507, "learning_rate": 6.4617363367041605e-06, "loss": 0.4334, "step": 353 }, { "epoch": 1.3982883475971033, "grad_norm": 0.4486573851270682, "learning_rate": 6.439727221533431e-06, "loss": 0.3876, "step": 354 }, { "epoch": 1.402238314680711, "grad_norm": 0.47599229330514786, "learning_rate": 6.417687645809358e-06, "loss": 0.4088, "step": 355 }, { "epoch": 1.4061882817643188, "grad_norm": 0.43433754814635545, "learning_rate": 6.395618075826987e-06, "loss": 0.4114, "step": 356 }, { "epoch": 1.4101382488479262, "grad_norm": 0.4875632625437401, "learning_rate": 6.373518978515958e-06, "loss": 0.4059, "step": 357 }, { "epoch": 1.414088215931534, "grad_norm": 0.47169886053282933, "learning_rate": 6.351390821430626e-06, "loss": 0.4256, "step": 358 }, { "epoch": 1.4180381830151416, "grad_norm": 0.4789247712767584, "learning_rate": 6.329234072740169e-06, "loss": 0.3952, "step": 359 }, { "epoch": 1.4219881500987492, "grad_norm": 0.5094725112212999, "learning_rate": 6.3070492012186836e-06, "loss": 0.3974, "step": 360 }, { "epoch": 1.4259381171823569, "grad_norm": 0.4815644892162477, "learning_rate": 6.284836676235262e-06, "loss": 0.4097, "step": 361 }, { "epoch": 1.4298880842659645, "grad_norm": 0.4719204154983206, "learning_rate": 6.262596967744069e-06, "loss": 0.4084, "step": 362 }, { "epoch": 1.433838051349572, "grad_norm": 0.4370506512519176, "learning_rate": 6.240330546274394e-06, "loss": 0.4127, "step": 363 }, { "epoch": 1.4377880184331797, "grad_norm": 0.45254691404430053, "learning_rate": 6.218037882920698e-06, "loss": 0.4015, "step": 364 }, { "epoch": 1.4417379855167873, "grad_norm": 0.49046161658621534, "learning_rate": 6.195719449332645e-06, "loss": 0.4153, "step": 365 }, { "epoch": 1.445687952600395, "grad_norm": 0.46774617033467114, "learning_rate": 6.173375717705124e-06, "loss": 0.3919, "step": 366 }, { "epoch": 1.4496379196840026, "grad_norm": 0.45709278711551554, "learning_rate": 6.151007160768265e-06, "loss": 0.3833, "step": 367 }, { "epoch": 1.4535878867676102, "grad_norm": 0.469135808911332, "learning_rate": 6.128614251777417e-06, "loss": 0.4113, "step": 368 }, { "epoch": 1.457537853851218, "grad_norm": 0.4746360055024692, "learning_rate": 6.106197464503168e-06, "loss": 0.4073, "step": 369 }, { "epoch": 1.4614878209348254, "grad_norm": 0.5013438596515611, "learning_rate": 6.083757273221288e-06, "loss": 0.4163, "step": 370 }, { "epoch": 1.4654377880184333, "grad_norm": 0.4595992452611871, "learning_rate": 6.061294152702717e-06, "loss": 0.4124, "step": 371 }, { "epoch": 1.469387755102041, "grad_norm": 0.5241170125469728, "learning_rate": 6.03880857820351e-06, "loss": 0.4082, "step": 372 }, { "epoch": 1.4733377221856485, "grad_norm": 0.479756853144827, "learning_rate": 6.016301025454787e-06, "loss": 0.4218, "step": 373 }, { "epoch": 1.4772876892692561, "grad_norm": 0.46499871986914276, "learning_rate": 5.993771970652661e-06, "loss": 0.4075, "step": 374 }, { "epoch": 1.4812376563528638, "grad_norm": 0.47322531835936726, "learning_rate": 5.971221890448175e-06, "loss": 0.4164, "step": 375 }, { "epoch": 1.4851876234364714, "grad_norm": 0.4521699831160759, "learning_rate": 5.948651261937203e-06, "loss": 0.4135, "step": 376 }, { "epoch": 1.489137590520079, "grad_norm": 0.4695500653310168, "learning_rate": 5.926060562650365e-06, "loss": 0.444, "step": 377 }, { "epoch": 1.4930875576036866, "grad_norm": 0.44943870796703855, "learning_rate": 5.903450270542925e-06, "loss": 0.418, "step": 378 }, { "epoch": 1.4970375246872942, "grad_norm": 0.5154250566167112, "learning_rate": 5.880820863984672e-06, "loss": 0.402, "step": 379 }, { "epoch": 1.500987491770902, "grad_norm": 0.511365558299148, "learning_rate": 5.858172821749804e-06, "loss": 0.4217, "step": 380 }, { "epoch": 1.5049374588545095, "grad_norm": 0.4544965900807186, "learning_rate": 5.835506623006798e-06, "loss": 0.4264, "step": 381 }, { "epoch": 1.5088874259381173, "grad_norm": 0.49440660798954045, "learning_rate": 5.8128227473082676e-06, "loss": 0.4302, "step": 382 }, { "epoch": 1.5128373930217247, "grad_norm": 0.489131402726814, "learning_rate": 5.790121674580825e-06, "loss": 0.4156, "step": 383 }, { "epoch": 1.5167873601053325, "grad_norm": 0.5033162113342163, "learning_rate": 5.7674038851149225e-06, "loss": 0.3956, "step": 384 }, { "epoch": 1.52073732718894, "grad_norm": 0.482556259472661, "learning_rate": 5.744669859554689e-06, "loss": 0.4047, "step": 385 }, { "epoch": 1.5246872942725478, "grad_norm": 0.43750760442832703, "learning_rate": 5.721920078887764e-06, "loss": 0.4058, "step": 386 }, { "epoch": 1.5286372613561554, "grad_norm": 0.45671044388903165, "learning_rate": 5.699155024435123e-06, "loss": 0.4055, "step": 387 }, { "epoch": 1.532587228439763, "grad_norm": 0.5497451843691814, "learning_rate": 5.676375177840886e-06, "loss": 0.4041, "step": 388 }, { "epoch": 1.5365371955233706, "grad_norm": 0.5170104366588478, "learning_rate": 5.653581021062139e-06, "loss": 0.4148, "step": 389 }, { "epoch": 1.5404871626069783, "grad_norm": 0.4299221031648158, "learning_rate": 5.630773036358727e-06, "loss": 0.4024, "step": 390 }, { "epoch": 1.5444371296905859, "grad_norm": 0.5117881151204825, "learning_rate": 5.607951706283056e-06, "loss": 0.4064, "step": 391 }, { "epoch": 1.5483870967741935, "grad_norm": 0.5175949944009517, "learning_rate": 5.585117513669883e-06, "loss": 0.4191, "step": 392 }, { "epoch": 1.5523370638578013, "grad_norm": 0.4505520670573529, "learning_rate": 5.562270941626099e-06, "loss": 0.4, "step": 393 }, { "epoch": 1.5562870309414087, "grad_norm": 0.574850111853717, "learning_rate": 5.539412473520508e-06, "loss": 0.4273, "step": 394 }, { "epoch": 1.5602369980250166, "grad_norm": 0.4872900640799847, "learning_rate": 5.516542592973604e-06, "loss": 0.4038, "step": 395 }, { "epoch": 1.564186965108624, "grad_norm": 0.4135072675279007, "learning_rate": 5.493661783847331e-06, "loss": 0.4136, "step": 396 }, { "epoch": 1.5681369321922318, "grad_norm": 0.5024585398609805, "learning_rate": 5.470770530234856e-06, "loss": 0.4265, "step": 397 }, { "epoch": 1.5720868992758392, "grad_norm": 0.5924280274097413, "learning_rate": 5.447869316450318e-06, "loss": 0.4213, "step": 398 }, { "epoch": 1.576036866359447, "grad_norm": 0.437652603807874, "learning_rate": 5.424958627018587e-06, "loss": 0.4127, "step": 399 }, { "epoch": 1.5799868334430547, "grad_norm": 0.4315037850948973, "learning_rate": 5.402038946665011e-06, "loss": 0.4116, "step": 400 }, { "epoch": 1.5839368005266623, "grad_norm": 0.46101217218230606, "learning_rate": 5.3791107603051605e-06, "loss": 0.4025, "step": 401 }, { "epoch": 1.58788676761027, "grad_norm": 0.4851945993701666, "learning_rate": 5.356174553034566e-06, "loss": 0.4327, "step": 402 }, { "epoch": 1.5918367346938775, "grad_norm": 0.5221176567462656, "learning_rate": 5.3332308101184616e-06, "loss": 0.4134, "step": 403 }, { "epoch": 1.5957867017774852, "grad_norm": 0.4868957031981493, "learning_rate": 5.310280016981513e-06, "loss": 0.4085, "step": 404 }, { "epoch": 1.5997366688610928, "grad_norm": 0.433474447142496, "learning_rate": 5.287322659197548e-06, "loss": 0.3755, "step": 405 }, { "epoch": 1.6036866359447006, "grad_norm": 0.4679793459336981, "learning_rate": 5.264359222479284e-06, "loss": 0.4207, "step": 406 }, { "epoch": 1.607636603028308, "grad_norm": 0.41856962341421294, "learning_rate": 5.2413901926680535e-06, "loss": 0.3997, "step": 407 }, { "epoch": 1.6115865701119159, "grad_norm": 0.4508231948324545, "learning_rate": 5.218416055723517e-06, "loss": 0.395, "step": 408 }, { "epoch": 1.6155365371955233, "grad_norm": 0.4225120061067675, "learning_rate": 5.195437297713397e-06, "loss": 0.4227, "step": 409 }, { "epoch": 1.619486504279131, "grad_norm": 0.4373396707386501, "learning_rate": 5.172454404803176e-06, "loss": 0.4063, "step": 410 }, { "epoch": 1.6234364713627385, "grad_norm": 0.44810914699444376, "learning_rate": 5.149467863245824e-06, "loss": 0.4046, "step": 411 }, { "epoch": 1.6273864384463463, "grad_norm": 0.4560814813263572, "learning_rate": 5.126478159371503e-06, "loss": 0.4093, "step": 412 }, { "epoch": 1.631336405529954, "grad_norm": 0.46985481242896954, "learning_rate": 5.103485779577285e-06, "loss": 0.3985, "step": 413 }, { "epoch": 1.6352863726135616, "grad_norm": 0.5189345228761076, "learning_rate": 5.0804912103168504e-06, "loss": 0.4237, "step": 414 }, { "epoch": 1.6392363396971692, "grad_norm": 0.4337485281907629, "learning_rate": 5.057494938090212e-06, "loss": 0.3884, "step": 415 }, { "epoch": 1.6431863067807768, "grad_norm": 0.48276829670076205, "learning_rate": 5.034497449433402e-06, "loss": 0.3956, "step": 416 }, { "epoch": 1.6471362738643844, "grad_norm": 0.6193306871372818, "learning_rate": 5.011499230908195e-06, "loss": 0.409, "step": 417 }, { "epoch": 1.651086240947992, "grad_norm": 0.4436292426394267, "learning_rate": 4.988500769091808e-06, "loss": 0.4256, "step": 418 }, { "epoch": 1.6550362080316, "grad_norm": 0.518026765939683, "learning_rate": 4.9655025505666e-06, "loss": 0.4184, "step": 419 }, { "epoch": 1.6589861751152073, "grad_norm": 0.5631145369132682, "learning_rate": 4.94250506190979e-06, "loss": 0.3948, "step": 420 }, { "epoch": 1.6629361421988151, "grad_norm": 0.4268349219414941, "learning_rate": 4.9195087896831495e-06, "loss": 0.3839, "step": 421 }, { "epoch": 1.6668861092824225, "grad_norm": 0.4532231838179289, "learning_rate": 4.896514220422718e-06, "loss": 0.4164, "step": 422 }, { "epoch": 1.6708360763660304, "grad_norm": 0.4647448195507562, "learning_rate": 4.873521840628498e-06, "loss": 0.417, "step": 423 }, { "epoch": 1.6747860434496378, "grad_norm": 0.4637892055660408, "learning_rate": 4.850532136754178e-06, "loss": 0.3774, "step": 424 }, { "epoch": 1.6787360105332456, "grad_norm": 0.4395794925168542, "learning_rate": 4.827545595196825e-06, "loss": 0.4052, "step": 425 }, { "epoch": 1.6826859776168532, "grad_norm": 0.4623868987342022, "learning_rate": 4.804562702286606e-06, "loss": 0.4023, "step": 426 }, { "epoch": 1.6866359447004609, "grad_norm": 0.4554434564381581, "learning_rate": 4.7815839442764846e-06, "loss": 0.3982, "step": 427 }, { "epoch": 1.6905859117840685, "grad_norm": 0.4698223389382366, "learning_rate": 4.758609807331948e-06, "loss": 0.401, "step": 428 }, { "epoch": 1.694535878867676, "grad_norm": 0.42716797015893476, "learning_rate": 4.735640777520716e-06, "loss": 0.396, "step": 429 }, { "epoch": 1.6984858459512837, "grad_norm": 0.4422859842690804, "learning_rate": 4.712677340802454e-06, "loss": 0.4206, "step": 430 }, { "epoch": 1.7024358130348913, "grad_norm": 0.45215065138089927, "learning_rate": 4.6897199830184885e-06, "loss": 0.4102, "step": 431 }, { "epoch": 1.7063857801184992, "grad_norm": 0.4127271822740493, "learning_rate": 4.66676918988154e-06, "loss": 0.3942, "step": 432 }, { "epoch": 1.7103357472021066, "grad_norm": 0.4907955389157334, "learning_rate": 4.643825446965435e-06, "loss": 0.4128, "step": 433 }, { "epoch": 1.7142857142857144, "grad_norm": 0.5132047703476442, "learning_rate": 4.620889239694842e-06, "loss": 0.3891, "step": 434 }, { "epoch": 1.7182356813693218, "grad_norm": 0.4480322295786916, "learning_rate": 4.5979610533349904e-06, "loss": 0.4186, "step": 435 }, { "epoch": 1.7221856484529297, "grad_norm": 0.45021415296751505, "learning_rate": 4.575041372981414e-06, "loss": 0.3813, "step": 436 }, { "epoch": 1.726135615536537, "grad_norm": 0.45266735971736105, "learning_rate": 4.5521306835496825e-06, "loss": 0.4099, "step": 437 }, { "epoch": 1.730085582620145, "grad_norm": 0.44944029688099313, "learning_rate": 4.5292294697651476e-06, "loss": 0.3868, "step": 438 }, { "epoch": 1.7340355497037525, "grad_norm": 0.4016594022941216, "learning_rate": 4.50633821615267e-06, "loss": 0.3974, "step": 439 }, { "epoch": 1.7379855167873601, "grad_norm": 0.4330866352620027, "learning_rate": 4.483457407026398e-06, "loss": 0.4043, "step": 440 }, { "epoch": 1.7419354838709677, "grad_norm": 0.44619784310096816, "learning_rate": 4.460587526479493e-06, "loss": 0.4104, "step": 441 }, { "epoch": 1.7458854509545754, "grad_norm": 0.4662020738618116, "learning_rate": 4.437729058373903e-06, "loss": 0.4099, "step": 442 }, { "epoch": 1.749835418038183, "grad_norm": 0.4181935146039366, "learning_rate": 4.4148824863301185e-06, "loss": 0.3904, "step": 443 }, { "epoch": 1.7537853851217906, "grad_norm": 0.42099154126072585, "learning_rate": 4.392048293716945e-06, "loss": 0.4045, "step": 444 }, { "epoch": 1.7577353522053984, "grad_norm": 0.510803401796639, "learning_rate": 4.369226963641274e-06, "loss": 0.4197, "step": 445 }, { "epoch": 1.7616853192890058, "grad_norm": 0.4601135128404259, "learning_rate": 4.346418978937863e-06, "loss": 0.4125, "step": 446 }, { "epoch": 1.7656352863726137, "grad_norm": 0.4574324880973365, "learning_rate": 4.323624822159116e-06, "loss": 0.4035, "step": 447 }, { "epoch": 1.769585253456221, "grad_norm": 0.4433286274343357, "learning_rate": 4.300844975564878e-06, "loss": 0.4084, "step": 448 }, { "epoch": 1.773535220539829, "grad_norm": 0.4475797238299404, "learning_rate": 4.278079921112236e-06, "loss": 0.4023, "step": 449 }, { "epoch": 1.7774851876234363, "grad_norm": 0.4310854879904227, "learning_rate": 4.2553301404453125e-06, "loss": 0.4113, "step": 450 }, { "epoch": 1.7814351547070442, "grad_norm": 0.3942406170060367, "learning_rate": 4.232596114885078e-06, "loss": 0.3824, "step": 451 }, { "epoch": 1.7853851217906518, "grad_norm": 0.43866947761072433, "learning_rate": 4.209878325419176e-06, "loss": 0.4209, "step": 452 }, { "epoch": 1.7893350888742594, "grad_norm": 0.4484539632108044, "learning_rate": 4.187177252691734e-06, "loss": 0.3928, "step": 453 }, { "epoch": 1.793285055957867, "grad_norm": 0.4125743407142397, "learning_rate": 4.164493376993205e-06, "loss": 0.391, "step": 454 }, { "epoch": 1.7972350230414746, "grad_norm": 0.4308195739919132, "learning_rate": 4.1418271782501974e-06, "loss": 0.4272, "step": 455 }, { "epoch": 1.8011849901250823, "grad_norm": 0.42637046095576137, "learning_rate": 4.119179136015329e-06, "loss": 0.3992, "step": 456 }, { "epoch": 1.8051349572086899, "grad_norm": 0.4341349047891367, "learning_rate": 4.096549729457074e-06, "loss": 0.4086, "step": 457 }, { "epoch": 1.8090849242922977, "grad_norm": 0.4201839126701762, "learning_rate": 4.0739394373496364e-06, "loss": 0.4099, "step": 458 }, { "epoch": 1.8130348913759051, "grad_norm": 0.40686526475532786, "learning_rate": 4.051348738062798e-06, "loss": 0.399, "step": 459 }, { "epoch": 1.816984858459513, "grad_norm": 0.4100480491440975, "learning_rate": 4.028778109551826e-06, "loss": 0.3987, "step": 460 }, { "epoch": 1.8209348255431204, "grad_norm": 0.4892844510437587, "learning_rate": 4.006228029347339e-06, "loss": 0.4016, "step": 461 }, { "epoch": 1.8248847926267282, "grad_norm": 0.4814780207307003, "learning_rate": 3.983698974545216e-06, "loss": 0.4468, "step": 462 }, { "epoch": 1.8288347597103356, "grad_norm": 0.4252213813505314, "learning_rate": 3.961191421796492e-06, "loss": 0.4065, "step": 463 }, { "epoch": 1.8327847267939434, "grad_norm": 0.4203331049481512, "learning_rate": 3.938705847297285e-06, "loss": 0.4047, "step": 464 }, { "epoch": 1.836734693877551, "grad_norm": 0.40192784963827966, "learning_rate": 3.916242726778712e-06, "loss": 0.3964, "step": 465 }, { "epoch": 1.8406846609611587, "grad_norm": 0.409440482724312, "learning_rate": 3.893802535496834e-06, "loss": 0.3943, "step": 466 }, { "epoch": 1.8446346280447663, "grad_norm": 0.41120858118300013, "learning_rate": 3.871385748222584e-06, "loss": 0.3855, "step": 467 }, { "epoch": 1.848584595128374, "grad_norm": 0.39783258663990556, "learning_rate": 3.848992839231738e-06, "loss": 0.3775, "step": 468 }, { "epoch": 1.8525345622119815, "grad_norm": 0.4275419318693948, "learning_rate": 3.826624282294876e-06, "loss": 0.413, "step": 469 }, { "epoch": 1.8564845292955892, "grad_norm": 0.4270922240508849, "learning_rate": 3.804280550667357e-06, "loss": 0.4289, "step": 470 }, { "epoch": 1.860434496379197, "grad_norm": 0.45024838700787906, "learning_rate": 3.781962117079304e-06, "loss": 0.3981, "step": 471 }, { "epoch": 1.8643844634628044, "grad_norm": 0.42067389907946806, "learning_rate": 3.759669453725607e-06, "loss": 0.417, "step": 472 }, { "epoch": 1.8683344305464122, "grad_norm": 0.4707783243304426, "learning_rate": 3.737403032255932e-06, "loss": 0.4121, "step": 473 }, { "epoch": 1.8722843976300196, "grad_norm": 0.3995849923628797, "learning_rate": 3.71516332376474e-06, "loss": 0.3841, "step": 474 }, { "epoch": 1.8762343647136275, "grad_norm": 0.45411304786811546, "learning_rate": 3.6929507987813185e-06, "loss": 0.4243, "step": 475 }, { "epoch": 1.8801843317972349, "grad_norm": 0.47918399015425245, "learning_rate": 3.670765927259832e-06, "loss": 0.3992, "step": 476 }, { "epoch": 1.8841342988808427, "grad_norm": 0.4224141580891673, "learning_rate": 3.6486091785693744e-06, "loss": 0.3943, "step": 477 }, { "epoch": 1.8880842659644503, "grad_norm": 0.4488920343631708, "learning_rate": 3.626481021484045e-06, "loss": 0.428, "step": 478 }, { "epoch": 1.892034233048058, "grad_norm": 0.4382917141453344, "learning_rate": 3.6043819241730148e-06, "loss": 0.3825, "step": 479 }, { "epoch": 1.8959842001316656, "grad_norm": 0.37775492713758124, "learning_rate": 3.582312354190643e-06, "loss": 0.4105, "step": 480 }, { "epoch": 1.8999341672152732, "grad_norm": 0.46484065488529275, "learning_rate": 3.5602727784665692e-06, "loss": 0.4051, "step": 481 }, { "epoch": 1.9038841342988808, "grad_norm": 0.45884720923183464, "learning_rate": 3.538263663295841e-06, "loss": 0.4022, "step": 482 }, { "epoch": 1.9078341013824884, "grad_norm": 0.42472011802273496, "learning_rate": 3.516285474329045e-06, "loss": 0.4243, "step": 483 }, { "epoch": 1.9117840684660963, "grad_norm": 0.4509475197275075, "learning_rate": 3.4943386765624564e-06, "loss": 0.4248, "step": 484 }, { "epoch": 1.9157340355497037, "grad_norm": 0.43816037565201393, "learning_rate": 3.472423734328205e-06, "loss": 0.4087, "step": 485 }, { "epoch": 1.9196840026333115, "grad_norm": 0.4391563289644798, "learning_rate": 3.450541111284447e-06, "loss": 0.3895, "step": 486 }, { "epoch": 1.923633969716919, "grad_norm": 0.40539973952049296, "learning_rate": 3.4286912704055507e-06, "loss": 0.4013, "step": 487 }, { "epoch": 1.9275839368005268, "grad_norm": 0.4584761843561075, "learning_rate": 3.4068746739723124e-06, "loss": 0.4029, "step": 488 }, { "epoch": 1.9315339038841342, "grad_norm": 0.4227660889345728, "learning_rate": 3.385091783562168e-06, "loss": 0.4027, "step": 489 }, { "epoch": 1.935483870967742, "grad_norm": 0.4165287202906532, "learning_rate": 3.363343060039431e-06, "loss": 0.4084, "step": 490 }, { "epoch": 1.9394338380513496, "grad_norm": 0.4395180884156357, "learning_rate": 3.3416289635455367e-06, "loss": 0.3901, "step": 491 }, { "epoch": 1.9433838051349572, "grad_norm": 0.4265197016870327, "learning_rate": 3.319949953489313e-06, "loss": 0.3998, "step": 492 }, { "epoch": 1.9473337722185649, "grad_norm": 0.42307017828945875, "learning_rate": 3.2983064885372574e-06, "loss": 0.4203, "step": 493 }, { "epoch": 1.9512837393021725, "grad_norm": 0.42243124304991475, "learning_rate": 3.2766990266038358e-06, "loss": 0.4219, "step": 494 }, { "epoch": 1.95523370638578, "grad_norm": 0.4024127725334922, "learning_rate": 3.255128024841786e-06, "loss": 0.4122, "step": 495 }, { "epoch": 1.9591836734693877, "grad_norm": 0.38922920871678796, "learning_rate": 3.233593939632458e-06, "loss": 0.3799, "step": 496 }, { "epoch": 1.9631336405529956, "grad_norm": 0.4249248211603702, "learning_rate": 3.21209722657615e-06, "loss": 0.3999, "step": 497 }, { "epoch": 1.967083607636603, "grad_norm": 0.4018077491551274, "learning_rate": 3.1906383404824735e-06, "loss": 0.3874, "step": 498 }, { "epoch": 1.9710335747202108, "grad_norm": 0.39911091164207707, "learning_rate": 3.169217735360721e-06, "loss": 0.4128, "step": 499 }, { "epoch": 1.9749835418038182, "grad_norm": 0.38167889787417103, "learning_rate": 3.1478358644102763e-06, "loss": 0.3842, "step": 500 }, { "epoch": 1.978933508887426, "grad_norm": 0.4328880678043028, "learning_rate": 3.1264931800110143e-06, "loss": 0.4145, "step": 501 }, { "epoch": 1.9828834759710334, "grad_norm": 0.43194842471460126, "learning_rate": 3.1051901337137337e-06, "loss": 0.4017, "step": 502 }, { "epoch": 1.9868334430546413, "grad_norm": 0.4196309410010332, "learning_rate": 3.083927176230601e-06, "loss": 0.4043, "step": 503 }, { "epoch": 1.9907834101382489, "grad_norm": 0.4004872255725005, "learning_rate": 3.062704757425622e-06, "loss": 0.4033, "step": 504 }, { "epoch": 1.9947333772218565, "grad_norm": 0.406258433876982, "learning_rate": 3.041523326305112e-06, "loss": 0.4008, "step": 505 }, { "epoch": 1.9986833443054641, "grad_norm": 0.4308591251265654, "learning_rate": 3.020383331008212e-06, "loss": 0.4164, "step": 506 }, { "epoch": 2.0026333113890717, "grad_norm": 0.43059490612139534, "learning_rate": 2.9992852187973877e-06, "loss": 0.369, "step": 507 }, { "epoch": 2.0065832784726796, "grad_norm": 0.431105969070634, "learning_rate": 2.978229436048983e-06, "loss": 0.368, "step": 508 }, { "epoch": 2.010533245556287, "grad_norm": 0.41426917949772585, "learning_rate": 2.957216428243772e-06, "loss": 0.35, "step": 509 }, { "epoch": 2.014483212639895, "grad_norm": 0.4269273192623821, "learning_rate": 2.9362466399575295e-06, "loss": 0.3703, "step": 510 }, { "epoch": 2.0184331797235022, "grad_norm": 0.39628783192824596, "learning_rate": 2.915320514851627e-06, "loss": 0.3477, "step": 511 }, { "epoch": 2.02238314680711, "grad_norm": 0.4233369101603715, "learning_rate": 2.894438495663644e-06, "loss": 0.3695, "step": 512 }, { "epoch": 2.0263331138907175, "grad_norm": 0.44233393949613237, "learning_rate": 2.8736010241980115e-06, "loss": 0.3611, "step": 513 }, { "epoch": 2.0302830809743253, "grad_norm": 0.4075987938679799, "learning_rate": 2.8528085413166527e-06, "loss": 0.3646, "step": 514 }, { "epoch": 2.0342330480579327, "grad_norm": 0.445477829810087, "learning_rate": 2.8320614869296566e-06, "loss": 0.355, "step": 515 }, { "epoch": 2.0381830151415405, "grad_norm": 0.4545056386462829, "learning_rate": 2.8113602999859764e-06, "loss": 0.3656, "step": 516 }, { "epoch": 2.042132982225148, "grad_norm": 0.4125714188313961, "learning_rate": 2.7907054184641412e-06, "loss": 0.3627, "step": 517 }, { "epoch": 2.046082949308756, "grad_norm": 0.41598068539741595, "learning_rate": 2.7700972793629866e-06, "loss": 0.3686, "step": 518 }, { "epoch": 2.050032916392363, "grad_norm": 0.4868676758694044, "learning_rate": 2.7495363186924125e-06, "loss": 0.3539, "step": 519 }, { "epoch": 2.053982883475971, "grad_norm": 0.4320824082577185, "learning_rate": 2.7290229714641546e-06, "loss": 0.361, "step": 520 }, { "epoch": 2.057932850559579, "grad_norm": 0.4306405126687012, "learning_rate": 2.708557671682586e-06, "loss": 0.3761, "step": 521 }, { "epoch": 2.0618828176431863, "grad_norm": 0.41321349284775005, "learning_rate": 2.68814085233553e-06, "loss": 0.3698, "step": 522 }, { "epoch": 2.065832784726794, "grad_norm": 0.4269062243121182, "learning_rate": 2.6677729453850964e-06, "loss": 0.377, "step": 523 }, { "epoch": 2.0697827518104015, "grad_norm": 0.3927859833952262, "learning_rate": 2.6474543817585575e-06, "loss": 0.3513, "step": 524 }, { "epoch": 2.0737327188940093, "grad_norm": 0.42306130932238234, "learning_rate": 2.627185591339212e-06, "loss": 0.3695, "step": 525 }, { "epoch": 2.0776826859776167, "grad_norm": 0.42125830895574407, "learning_rate": 2.6069670029573036e-06, "loss": 0.3616, "step": 526 }, { "epoch": 2.0816326530612246, "grad_norm": 0.41342464377627897, "learning_rate": 2.5867990443809373e-06, "loss": 0.3684, "step": 527 }, { "epoch": 2.085582620144832, "grad_norm": 0.43988242492827745, "learning_rate": 2.5666821423070386e-06, "loss": 0.3622, "step": 528 }, { "epoch": 2.08953258722844, "grad_norm": 0.38860970984118265, "learning_rate": 2.546616722352321e-06, "loss": 0.3631, "step": 529 }, { "epoch": 2.093482554312047, "grad_norm": 0.3951257356740448, "learning_rate": 2.526603209044286e-06, "loss": 0.3517, "step": 530 }, { "epoch": 2.097432521395655, "grad_norm": 0.41241866412818307, "learning_rate": 2.50664202581223e-06, "loss": 0.3755, "step": 531 }, { "epoch": 2.1013824884792625, "grad_norm": 0.4044492505914914, "learning_rate": 2.486733594978298e-06, "loss": 0.3727, "step": 532 }, { "epoch": 2.1053324555628703, "grad_norm": 0.43531379891859384, "learning_rate": 2.466878337748541e-06, "loss": 0.3888, "step": 533 }, { "epoch": 2.109282422646478, "grad_norm": 0.41429282944556184, "learning_rate": 2.447076674204011e-06, "loss": 0.3363, "step": 534 }, { "epoch": 2.1132323897300855, "grad_norm": 0.4460531962899919, "learning_rate": 2.427329023291864e-06, "loss": 0.3864, "step": 535 }, { "epoch": 2.1171823568136934, "grad_norm": 0.37833726467287027, "learning_rate": 2.407635802816506e-06, "loss": 0.353, "step": 536 }, { "epoch": 2.1211323238973008, "grad_norm": 0.4083131273171637, "learning_rate": 2.387997429430746e-06, "loss": 0.3567, "step": 537 }, { "epoch": 2.1250822909809086, "grad_norm": 0.3922555285342666, "learning_rate": 2.3684143186269887e-06, "loss": 0.3675, "step": 538 }, { "epoch": 2.129032258064516, "grad_norm": 0.41199781869008384, "learning_rate": 2.3488868847284296e-06, "loss": 0.3748, "step": 539 }, { "epoch": 2.132982225148124, "grad_norm": 0.40005886313150185, "learning_rate": 2.329415540880307e-06, "loss": 0.3575, "step": 540 }, { "epoch": 2.1369321922317313, "grad_norm": 0.3951582956319914, "learning_rate": 2.3100006990411476e-06, "loss": 0.3522, "step": 541 }, { "epoch": 2.140882159315339, "grad_norm": 0.42020310559856866, "learning_rate": 2.290642769974063e-06, "loss": 0.385, "step": 542 }, { "epoch": 2.1448321263989465, "grad_norm": 0.3875802405133011, "learning_rate": 2.271342163238041e-06, "loss": 0.3664, "step": 543 }, { "epoch": 2.1487820934825543, "grad_norm": 0.3943994701897059, "learning_rate": 2.252099287179298e-06, "loss": 0.3552, "step": 544 }, { "epoch": 2.152732060566162, "grad_norm": 0.420961284339023, "learning_rate": 2.2329145489226307e-06, "loss": 0.3565, "step": 545 }, { "epoch": 2.1566820276497696, "grad_norm": 0.41299040054852304, "learning_rate": 2.2137883543628047e-06, "loss": 0.3855, "step": 546 }, { "epoch": 2.1606319947333774, "grad_norm": 0.3767418556378203, "learning_rate": 2.1947211081559666e-06, "loss": 0.3671, "step": 547 }, { "epoch": 2.164581961816985, "grad_norm": 0.38465919031274487, "learning_rate": 2.1757132137110826e-06, "loss": 0.358, "step": 548 }, { "epoch": 2.1685319289005927, "grad_norm": 0.39199728396963307, "learning_rate": 2.1567650731814045e-06, "loss": 0.3837, "step": 549 }, { "epoch": 2.1724818959842, "grad_norm": 0.3743628005362859, "learning_rate": 2.1378770874559607e-06, "loss": 0.3723, "step": 550 }, { "epoch": 2.176431863067808, "grad_norm": 0.38575592852365864, "learning_rate": 2.1190496561510693e-06, "loss": 0.3779, "step": 551 }, { "epoch": 2.1803818301514153, "grad_norm": 0.40155673668791925, "learning_rate": 2.100283177601892e-06, "loss": 0.3656, "step": 552 }, { "epoch": 2.184331797235023, "grad_norm": 0.4042299633710585, "learning_rate": 2.081578048854007e-06, "loss": 0.354, "step": 553 }, { "epoch": 2.1882817643186305, "grad_norm": 0.3668726365387741, "learning_rate": 2.0629346656549996e-06, "loss": 0.3439, "step": 554 }, { "epoch": 2.1922317314022384, "grad_norm": 0.3905864097648711, "learning_rate": 2.044353422446091e-06, "loss": 0.3787, "step": 555 }, { "epoch": 2.1961816984858458, "grad_norm": 0.4054936998042014, "learning_rate": 2.0258347123538013e-06, "loss": 0.3716, "step": 556 }, { "epoch": 2.2001316655694536, "grad_norm": 0.4192215431775659, "learning_rate": 2.0073789271816246e-06, "loss": 0.3595, "step": 557 }, { "epoch": 2.204081632653061, "grad_norm": 0.4168184665737652, "learning_rate": 1.9889864574017433e-06, "loss": 0.3623, "step": 558 }, { "epoch": 2.208031599736669, "grad_norm": 0.40664876948303114, "learning_rate": 1.9706576921467627e-06, "loss": 0.3871, "step": 559 }, { "epoch": 2.2119815668202767, "grad_norm": 0.3836543781476228, "learning_rate": 1.952393019201484e-06, "loss": 0.3688, "step": 560 }, { "epoch": 2.215931533903884, "grad_norm": 0.3679645987638231, "learning_rate": 1.934192824994694e-06, "loss": 0.346, "step": 561 }, { "epoch": 2.219881500987492, "grad_norm": 0.4124024293063666, "learning_rate": 1.916057494590994e-06, "loss": 0.3776, "step": 562 }, { "epoch": 2.2238314680710993, "grad_norm": 0.3822911994445428, "learning_rate": 1.8979874116826436e-06, "loss": 0.33, "step": 563 }, { "epoch": 2.227781435154707, "grad_norm": 0.40672859259985183, "learning_rate": 1.8799829585814627e-06, "loss": 0.3703, "step": 564 }, { "epoch": 2.2317314022383146, "grad_norm": 0.40255156986748153, "learning_rate": 1.8620445162107204e-06, "loss": 0.3613, "step": 565 }, { "epoch": 2.2356813693219224, "grad_norm": 0.3750484934368842, "learning_rate": 1.8441724640970904e-06, "loss": 0.3501, "step": 566 }, { "epoch": 2.23963133640553, "grad_norm": 0.36988549816572636, "learning_rate": 1.826367180362612e-06, "loss": 0.3619, "step": 567 }, { "epoch": 2.2435813034891376, "grad_norm": 0.38468075651687694, "learning_rate": 1.808629041716698e-06, "loss": 0.3582, "step": 568 }, { "epoch": 2.247531270572745, "grad_norm": 0.4231114441171588, "learning_rate": 1.7909584234481591e-06, "loss": 0.3575, "step": 569 }, { "epoch": 2.251481237656353, "grad_norm": 0.39356011748465514, "learning_rate": 1.7733556994172719e-06, "loss": 0.3649, "step": 570 }, { "epoch": 2.2554312047399607, "grad_norm": 0.3883012291298516, "learning_rate": 1.7558212420478533e-06, "loss": 0.3411, "step": 571 }, { "epoch": 2.259381171823568, "grad_norm": 0.39517064184980494, "learning_rate": 1.7383554223193977e-06, "loss": 0.3579, "step": 572 }, { "epoch": 2.263331138907176, "grad_norm": 0.381405712965206, "learning_rate": 1.720958609759219e-06, "loss": 0.3549, "step": 573 }, { "epoch": 2.2672811059907834, "grad_norm": 0.4171125304777428, "learning_rate": 1.703631172434636e-06, "loss": 0.369, "step": 574 }, { "epoch": 2.271231073074391, "grad_norm": 0.41534822797094995, "learning_rate": 1.6863734769451822e-06, "loss": 0.369, "step": 575 }, { "epoch": 2.2751810401579986, "grad_norm": 0.3829033030491985, "learning_rate": 1.6691858884148527e-06, "loss": 0.3555, "step": 576 }, { "epoch": 2.2791310072416064, "grad_norm": 0.3974042504654446, "learning_rate": 1.6520687704843762e-06, "loss": 0.3665, "step": 577 }, { "epoch": 2.283080974325214, "grad_norm": 0.3734845484357207, "learning_rate": 1.6350224853035268e-06, "loss": 0.3581, "step": 578 }, { "epoch": 2.2870309414088217, "grad_norm": 0.4030764111709865, "learning_rate": 1.6180473935234508e-06, "loss": 0.3674, "step": 579 }, { "epoch": 2.290980908492429, "grad_norm": 0.4085313150786639, "learning_rate": 1.6011438542890484e-06, "loss": 0.3662, "step": 580 }, { "epoch": 2.294930875576037, "grad_norm": 0.3927302845326312, "learning_rate": 1.584312225231373e-06, "loss": 0.3536, "step": 581 }, { "epoch": 2.2988808426596443, "grad_norm": 0.37039745972724736, "learning_rate": 1.56755286246006e-06, "loss": 0.3447, "step": 582 }, { "epoch": 2.302830809743252, "grad_norm": 0.38690232276102043, "learning_rate": 1.5508661205557902e-06, "loss": 0.3704, "step": 583 }, { "epoch": 2.3067807768268596, "grad_norm": 0.3989754371011779, "learning_rate": 1.5342523525627973e-06, "loss": 0.3723, "step": 584 }, { "epoch": 2.3107307439104674, "grad_norm": 0.368927625151238, "learning_rate": 1.5177119099813925e-06, "loss": 0.3479, "step": 585 }, { "epoch": 2.3146807109940752, "grad_norm": 0.4227395352574204, "learning_rate": 1.5012451427605295e-06, "loss": 0.3659, "step": 586 }, { "epoch": 2.3186306780776826, "grad_norm": 0.38209167569268143, "learning_rate": 1.484852399290398e-06, "loss": 0.3665, "step": 587 }, { "epoch": 2.3225806451612905, "grad_norm": 0.37910038254162204, "learning_rate": 1.468534026395056e-06, "loss": 0.3383, "step": 588 }, { "epoch": 2.326530612244898, "grad_norm": 0.3819479996869078, "learning_rate": 1.4522903693250906e-06, "loss": 0.3662, "step": 589 }, { "epoch": 2.3304805793285057, "grad_norm": 0.4022721866940796, "learning_rate": 1.4361217717503145e-06, "loss": 0.3445, "step": 590 }, { "epoch": 2.334430546412113, "grad_norm": 0.3903299389258058, "learning_rate": 1.4200285757524896e-06, "loss": 0.3628, "step": 591 }, { "epoch": 2.338380513495721, "grad_norm": 0.3985606505323535, "learning_rate": 1.4040111218180968e-06, "loss": 0.3628, "step": 592 }, { "epoch": 2.3423304805793284, "grad_norm": 0.39305196713812435, "learning_rate": 1.388069748831133e-06, "loss": 0.3564, "step": 593 }, { "epoch": 2.346280447662936, "grad_norm": 0.38499219163864845, "learning_rate": 1.3722047940659328e-06, "loss": 0.3455, "step": 594 }, { "epoch": 2.3502304147465436, "grad_norm": 0.37559938578424784, "learning_rate": 1.356416593180036e-06, "loss": 0.3615, "step": 595 }, { "epoch": 2.3541803818301514, "grad_norm": 0.39024814417901343, "learning_rate": 1.3407054802070923e-06, "loss": 0.3715, "step": 596 }, { "epoch": 2.3581303489137593, "grad_norm": 0.3714883797936843, "learning_rate": 1.3250717875497865e-06, "loss": 0.3471, "step": 597 }, { "epoch": 2.3620803159973667, "grad_norm": 0.385164737546989, "learning_rate": 1.3095158459728092e-06, "loss": 0.3573, "step": 598 }, { "epoch": 2.3660302830809745, "grad_norm": 0.41498492499428885, "learning_rate": 1.294037984595859e-06, "loss": 0.3727, "step": 599 }, { "epoch": 2.369980250164582, "grad_norm": 0.3654601769315185, "learning_rate": 1.2786385308866772e-06, "loss": 0.3511, "step": 600 }, { "epoch": 2.3739302172481898, "grad_norm": 0.36745399166756065, "learning_rate": 1.2633178106541217e-06, "loss": 0.3623, "step": 601 }, { "epoch": 2.377880184331797, "grad_norm": 0.37841080451951603, "learning_rate": 1.248076148041274e-06, "loss": 0.3641, "step": 602 }, { "epoch": 2.381830151415405, "grad_norm": 0.3583118964667342, "learning_rate": 1.2329138655185736e-06, "loss": 0.3452, "step": 603 }, { "epoch": 2.3857801184990124, "grad_norm": 0.3853246574091475, "learning_rate": 1.2178312838770119e-06, "loss": 0.3517, "step": 604 }, { "epoch": 2.3897300855826202, "grad_norm": 0.3754496924339384, "learning_rate": 1.2028287222213286e-06, "loss": 0.3822, "step": 605 }, { "epoch": 2.3936800526662276, "grad_norm": 0.3736466947095948, "learning_rate": 1.1879064979632705e-06, "loss": 0.3793, "step": 606 }, { "epoch": 2.3976300197498355, "grad_norm": 0.3911153134079558, "learning_rate": 1.1730649268148663e-06, "loss": 0.3745, "step": 607 }, { "epoch": 2.401579986833443, "grad_norm": 0.3870714306606935, "learning_rate": 1.1583043227817608e-06, "loss": 0.3643, "step": 608 }, { "epoch": 2.4055299539170507, "grad_norm": 0.3798036980492469, "learning_rate": 1.1436249981565577e-06, "loss": 0.37, "step": 609 }, { "epoch": 2.409479921000658, "grad_norm": 0.36859317117789914, "learning_rate": 1.1290272635122256e-06, "loss": 0.3628, "step": 610 }, { "epoch": 2.413429888084266, "grad_norm": 0.3647837984082812, "learning_rate": 1.114511427695512e-06, "loss": 0.3423, "step": 611 }, { "epoch": 2.417379855167874, "grad_norm": 0.3582215068933737, "learning_rate": 1.1000777978204214e-06, "loss": 0.3502, "step": 612 }, { "epoch": 2.421329822251481, "grad_norm": 0.3614836927172191, "learning_rate": 1.0857266792617122e-06, "loss": 0.3682, "step": 613 }, { "epoch": 2.425279789335089, "grad_norm": 0.37894988770166294, "learning_rate": 1.0714583756484382e-06, "loss": 0.368, "step": 614 }, { "epoch": 2.4292297564186964, "grad_norm": 0.39545840824628087, "learning_rate": 1.057273188857521e-06, "loss": 0.3801, "step": 615 }, { "epoch": 2.4331797235023043, "grad_norm": 0.3802171467885492, "learning_rate": 1.0431714190073673e-06, "loss": 0.3575, "step": 616 }, { "epoch": 2.4371296905859117, "grad_norm": 0.3688160859172857, "learning_rate": 1.0291533644515167e-06, "loss": 0.3595, "step": 617 }, { "epoch": 2.4410796576695195, "grad_norm": 0.38412735540728, "learning_rate": 1.0152193217723316e-06, "loss": 0.3617, "step": 618 }, { "epoch": 2.445029624753127, "grad_norm": 0.36777245553644944, "learning_rate": 1.0013695857747175e-06, "loss": 0.373, "step": 619 }, { "epoch": 2.4489795918367347, "grad_norm": 0.3629906502014108, "learning_rate": 9.876044494798897e-07, "loss": 0.3468, "step": 620 }, { "epoch": 2.452929558920342, "grad_norm": 0.3932315320924159, "learning_rate": 9.739242041191782e-07, "loss": 0.3737, "step": 621 }, { "epoch": 2.45687952600395, "grad_norm": 0.3508629317317041, "learning_rate": 9.60329139127857e-07, "loss": 0.3491, "step": 622 }, { "epoch": 2.460829493087558, "grad_norm": 0.37482808336302703, "learning_rate": 9.46819542139023e-07, "loss": 0.3476, "step": 623 }, { "epoch": 2.4647794601711652, "grad_norm": 0.39866198970279526, "learning_rate": 9.333956989775151e-07, "loss": 0.3686, "step": 624 }, { "epoch": 2.468729427254773, "grad_norm": 0.38121036327955937, "learning_rate": 9.200578936538629e-07, "loss": 0.3802, "step": 625 }, { "epoch": 2.4726793943383805, "grad_norm": 0.37755821537271617, "learning_rate": 9.06806408358279e-07, "loss": 0.3683, "step": 626 }, { "epoch": 2.4766293614219883, "grad_norm": 0.3834446065122302, "learning_rate": 8.93641523454688e-07, "loss": 0.3561, "step": 627 }, { "epoch": 2.4805793285055957, "grad_norm": 0.3743048236625299, "learning_rate": 8.805635174747962e-07, "loss": 0.3407, "step": 628 }, { "epoch": 2.4845292955892035, "grad_norm": 0.3682088387307466, "learning_rate": 8.67572667112197e-07, "loss": 0.3621, "step": 629 }, { "epoch": 2.488479262672811, "grad_norm": 0.37122751493539824, "learning_rate": 8.546692472165197e-07, "loss": 0.3742, "step": 630 }, { "epoch": 2.492429229756419, "grad_norm": 0.3752625573526226, "learning_rate": 8.418535307876058e-07, "loss": 0.3612, "step": 631 }, { "epoch": 2.496379196840026, "grad_norm": 0.3804433354541563, "learning_rate": 8.291257889697485e-07, "loss": 0.3636, "step": 632 }, { "epoch": 2.500329163923634, "grad_norm": 0.35123769396935123, "learning_rate": 8.16486291045942e-07, "loss": 0.3482, "step": 633 }, { "epoch": 2.504279131007242, "grad_norm": 0.3671289537456502, "learning_rate": 8.039353044321918e-07, "loss": 0.3735, "step": 634 }, { "epoch": 2.5082290980908493, "grad_norm": 0.3750612473420445, "learning_rate": 7.914730946718507e-07, "loss": 0.3553, "step": 635 }, { "epoch": 2.5121790651744567, "grad_norm": 0.380341805864739, "learning_rate": 7.790999254300081e-07, "loss": 0.3665, "step": 636 }, { "epoch": 2.5161290322580645, "grad_norm": 0.37612698458230404, "learning_rate": 7.66816058487907e-07, "loss": 0.3737, "step": 637 }, { "epoch": 2.5200789993416723, "grad_norm": 0.3610796611659749, "learning_rate": 7.546217537374073e-07, "loss": 0.3589, "step": 638 }, { "epoch": 2.5240289664252797, "grad_norm": 0.361329208637621, "learning_rate": 7.425172691754851e-07, "loss": 0.3663, "step": 639 }, { "epoch": 2.5279789335088876, "grad_norm": 0.36627999341212697, "learning_rate": 7.305028608987763e-07, "loss": 0.3602, "step": 640 }, { "epoch": 2.531928900592495, "grad_norm": 0.38231347808268445, "learning_rate": 7.185787830981572e-07, "loss": 0.3604, "step": 641 }, { "epoch": 2.535878867676103, "grad_norm": 0.3766644173222793, "learning_rate": 7.067452880533665e-07, "loss": 0.3492, "step": 642 }, { "epoch": 2.53982883475971, "grad_norm": 0.3787148248440278, "learning_rate": 6.950026261276699e-07, "loss": 0.3722, "step": 643 }, { "epoch": 2.543778801843318, "grad_norm": 0.3634633807057774, "learning_rate": 6.833510457625586e-07, "loss": 0.3646, "step": 644 }, { "epoch": 2.5477287689269255, "grad_norm": 0.3562081059439772, "learning_rate": 6.717907934724982e-07, "loss": 0.346, "step": 645 }, { "epoch": 2.5516787360105333, "grad_norm": 0.38384305642415906, "learning_rate": 6.603221138397104e-07, "loss": 0.3534, "step": 646 }, { "epoch": 2.5556287030941407, "grad_norm": 0.38960105395288636, "learning_rate": 6.489452495089959e-07, "loss": 0.3666, "step": 647 }, { "epoch": 2.5595786701777485, "grad_norm": 0.37953771788167656, "learning_rate": 6.376604411826071e-07, "loss": 0.3605, "step": 648 }, { "epoch": 2.5635286372613564, "grad_norm": 0.36783699257112756, "learning_rate": 6.264679276151486e-07, "loss": 0.3563, "step": 649 }, { "epoch": 2.5674786043449638, "grad_norm": 0.3866249383183639, "learning_rate": 6.153679456085344e-07, "loss": 0.3949, "step": 650 }, { "epoch": 2.571428571428571, "grad_norm": 0.36481228778481656, "learning_rate": 6.043607300069654e-07, "loss": 0.3638, "step": 651 }, { "epoch": 2.575378538512179, "grad_norm": 0.3542661338518236, "learning_rate": 5.934465136919737e-07, "loss": 0.3608, "step": 652 }, { "epoch": 2.579328505595787, "grad_norm": 0.3848480985285939, "learning_rate": 5.826255275774861e-07, "loss": 0.3827, "step": 653 }, { "epoch": 2.5832784726793943, "grad_norm": 0.36522745207931606, "learning_rate": 5.718980006049446e-07, "loss": 0.3615, "step": 654 }, { "epoch": 2.587228439763002, "grad_norm": 0.39179583618407227, "learning_rate": 5.612641597384588e-07, "loss": 0.3551, "step": 655 }, { "epoch": 2.5911784068466095, "grad_norm": 0.38932000829917784, "learning_rate": 5.507242299600063e-07, "loss": 0.3563, "step": 656 }, { "epoch": 2.5951283739302173, "grad_norm": 0.36546680922687486, "learning_rate": 5.402784342646711e-07, "loss": 0.3546, "step": 657 }, { "epoch": 2.5990783410138247, "grad_norm": 0.4057498231355954, "learning_rate": 5.299269936559276e-07, "loss": 0.3649, "step": 658 }, { "epoch": 2.6030283080974326, "grad_norm": 0.3632264466026109, "learning_rate": 5.196701271409616e-07, "loss": 0.3618, "step": 659 }, { "epoch": 2.6069782751810404, "grad_norm": 0.36168215579369317, "learning_rate": 5.095080517260398e-07, "loss": 0.3456, "step": 660 }, { "epoch": 2.610928242264648, "grad_norm": 0.388729521614058, "learning_rate": 4.994409824119189e-07, "loss": 0.3881, "step": 661 }, { "epoch": 2.614878209348255, "grad_norm": 0.3747091788021845, "learning_rate": 4.894691321892947e-07, "loss": 0.3642, "step": 662 }, { "epoch": 2.618828176431863, "grad_norm": 0.3795066899669426, "learning_rate": 4.795927120342941e-07, "loss": 0.3492, "step": 663 }, { "epoch": 2.622778143515471, "grad_norm": 0.3601158492370546, "learning_rate": 4.698119309040161e-07, "loss": 0.3562, "step": 664 }, { "epoch": 2.6267281105990783, "grad_norm": 0.3563679746656453, "learning_rate": 4.601269957321092e-07, "loss": 0.3576, "step": 665 }, { "epoch": 2.630678077682686, "grad_norm": 0.3744423872400407, "learning_rate": 4.5053811142439065e-07, "loss": 0.3567, "step": 666 }, { "epoch": 2.6346280447662935, "grad_norm": 0.36235217126204783, "learning_rate": 4.410454808545145e-07, "loss": 0.3554, "step": 667 }, { "epoch": 2.6385780118499014, "grad_norm": 0.36268964564016637, "learning_rate": 4.316493048596787e-07, "loss": 0.3616, "step": 668 }, { "epoch": 2.6425279789335088, "grad_norm": 0.36831100045189846, "learning_rate": 4.2234978223637367e-07, "loss": 0.3573, "step": 669 }, { "epoch": 2.6464779460171166, "grad_norm": 0.36373080730318275, "learning_rate": 4.1314710973618e-07, "loss": 0.3493, "step": 670 }, { "epoch": 2.650427913100724, "grad_norm": 0.372787378185949, "learning_rate": 4.0404148206160067e-07, "loss": 0.366, "step": 671 }, { "epoch": 2.654377880184332, "grad_norm": 0.35625205259307574, "learning_rate": 3.9503309186194883e-07, "loss": 0.3654, "step": 672 }, { "epoch": 2.6583278472679392, "grad_norm": 0.37009847844986465, "learning_rate": 3.861221297292656e-07, "loss": 0.3658, "step": 673 }, { "epoch": 2.662277814351547, "grad_norm": 0.34674408066576584, "learning_rate": 3.7730878419429074e-07, "loss": 0.3584, "step": 674 }, { "epoch": 2.666227781435155, "grad_norm": 0.35438727742904624, "learning_rate": 3.6859324172247024e-07, "loss": 0.3466, "step": 675 }, { "epoch": 2.6701777485187623, "grad_norm": 0.36395601016300216, "learning_rate": 3.599756867100185e-07, "loss": 0.3586, "step": 676 }, { "epoch": 2.6741277156023697, "grad_norm": 0.3551035511091147, "learning_rate": 3.5145630148000984e-07, "loss": 0.3594, "step": 677 }, { "epoch": 2.6780776826859776, "grad_norm": 0.4023515069704998, "learning_rate": 3.4303526627852467e-07, "loss": 0.3941, "step": 678 }, { "epoch": 2.6820276497695854, "grad_norm": 0.34106072747737726, "learning_rate": 3.3471275927083435e-07, "loss": 0.3366, "step": 679 }, { "epoch": 2.685977616853193, "grad_norm": 0.37492257232233306, "learning_rate": 3.264889565376339e-07, "loss": 0.3556, "step": 680 }, { "epoch": 2.6899275839368006, "grad_norm": 0.3561162555245172, "learning_rate": 3.1836403207131384e-07, "loss": 0.3449, "step": 681 }, { "epoch": 2.693877551020408, "grad_norm": 0.3740486926076543, "learning_rate": 3.103381577722814e-07, "loss": 0.3606, "step": 682 }, { "epoch": 2.697827518104016, "grad_norm": 0.3789540130941264, "learning_rate": 3.0241150344532235e-07, "loss": 0.3784, "step": 683 }, { "epoch": 2.7017774851876233, "grad_norm": 0.36810135991449355, "learning_rate": 2.945842367960083e-07, "loss": 0.3657, "step": 684 }, { "epoch": 2.705727452271231, "grad_norm": 0.37026024844088956, "learning_rate": 2.8685652342714866e-07, "loss": 0.3642, "step": 685 }, { "epoch": 2.709677419354839, "grad_norm": 0.35059547152331344, "learning_rate": 2.79228526835289e-07, "loss": 0.3456, "step": 686 }, { "epoch": 2.7136273864384464, "grad_norm": 0.36988409653125376, "learning_rate": 2.717004084072472e-07, "loss": 0.3555, "step": 687 }, { "epoch": 2.7175773535220538, "grad_norm": 0.36515259558716734, "learning_rate": 2.642723274167036e-07, "loss": 0.3671, "step": 688 }, { "epoch": 2.7215273206056616, "grad_norm": 0.37278930088974765, "learning_rate": 2.5694444102082937e-07, "loss": 0.3577, "step": 689 }, { "epoch": 2.7254772876892694, "grad_norm": 0.35399131467416955, "learning_rate": 2.4971690425696304e-07, "loss": 0.347, "step": 690 }, { "epoch": 2.729427254772877, "grad_norm": 0.3620811851230565, "learning_rate": 2.425898700393253e-07, "loss": 0.3403, "step": 691 }, { "epoch": 2.7333772218564847, "grad_norm": 0.3791362812815794, "learning_rate": 2.3556348915579064e-07, "loss": 0.3794, "step": 692 }, { "epoch": 2.737327188940092, "grad_norm": 0.36219729185740157, "learning_rate": 2.286379102646924e-07, "loss": 0.3536, "step": 693 }, { "epoch": 2.7412771560237, "grad_norm": 0.38657739140669617, "learning_rate": 2.2181327989168e-07, "loss": 0.3702, "step": 694 }, { "epoch": 2.7452271231073073, "grad_norm": 0.36251083856239635, "learning_rate": 2.150897424266163e-07, "loss": 0.3509, "step": 695 }, { "epoch": 2.749177090190915, "grad_norm": 0.3699052654169764, "learning_rate": 2.084674401205261e-07, "loss": 0.3513, "step": 696 }, { "epoch": 2.7531270572745226, "grad_norm": 0.35425470456248437, "learning_rate": 2.0194651308258374e-07, "loss": 0.3632, "step": 697 }, { "epoch": 2.7570770243581304, "grad_norm": 0.3656567082173593, "learning_rate": 1.9552709927715073e-07, "loss": 0.3653, "step": 698 }, { "epoch": 2.761026991441738, "grad_norm": 0.388813927081414, "learning_rate": 1.8920933452085398e-07, "loss": 0.3701, "step": 699 }, { "epoch": 2.7649769585253456, "grad_norm": 0.37156836907607405, "learning_rate": 1.829933524797156e-07, "loss": 0.3661, "step": 700 }, { "epoch": 2.7689269256089535, "grad_norm": 0.36201117460129345, "learning_rate": 1.7687928466632421e-07, "loss": 0.355, "step": 701 }, { "epoch": 2.772876892692561, "grad_norm": 0.3821513829894693, "learning_rate": 1.7086726043705093e-07, "loss": 0.3578, "step": 702 }, { "epoch": 2.7768268597761683, "grad_norm": 0.3804095149613746, "learning_rate": 1.6495740698931283e-07, "loss": 0.3728, "step": 703 }, { "epoch": 2.780776826859776, "grad_norm": 0.3619040686495984, "learning_rate": 1.5914984935888278e-07, "loss": 0.3581, "step": 704 }, { "epoch": 2.784726793943384, "grad_norm": 0.3693825714869537, "learning_rate": 1.5344471041724485e-07, "loss": 0.3622, "step": 705 }, { "epoch": 2.7886767610269914, "grad_norm": 0.3650393865411166, "learning_rate": 1.4784211086899147e-07, "loss": 0.3697, "step": 706 }, { "epoch": 2.792626728110599, "grad_norm": 0.38527136143072716, "learning_rate": 1.423421692492738e-07, "loss": 0.3846, "step": 707 }, { "epoch": 2.7965766951942066, "grad_norm": 0.36713892007514554, "learning_rate": 1.369450019212898e-07, "loss": 0.3554, "step": 708 }, { "epoch": 2.8005266622778144, "grad_norm": 0.3666117226502091, "learning_rate": 1.316507230738262e-07, "loss": 0.3407, "step": 709 }, { "epoch": 2.804476629361422, "grad_norm": 0.3615703464208447, "learning_rate": 1.2645944471883997e-07, "loss": 0.3576, "step": 710 }, { "epoch": 2.8084265964450297, "grad_norm": 0.35753791232395016, "learning_rate": 1.2137127668908733e-07, "loss": 0.355, "step": 711 }, { "epoch": 2.8123765635286375, "grad_norm": 0.363319808777208, "learning_rate": 1.1638632663580452e-07, "loss": 0.353, "step": 712 }, { "epoch": 2.816326530612245, "grad_norm": 0.363593029429133, "learning_rate": 1.1150470002642689e-07, "loss": 0.3596, "step": 713 }, { "epoch": 2.8202764976958523, "grad_norm": 0.35036008826746873, "learning_rate": 1.067265001423573e-07, "loss": 0.3554, "step": 714 }, { "epoch": 2.82422646477946, "grad_norm": 0.36031115481719755, "learning_rate": 1.0205182807678182e-07, "loss": 0.3376, "step": 715 }, { "epoch": 2.828176431863068, "grad_norm": 0.34795532927900635, "learning_rate": 9.748078273253137e-08, "loss": 0.3525, "step": 716 }, { "epoch": 2.8321263989466754, "grad_norm": 0.3699417700297945, "learning_rate": 9.30134608199884e-08, "loss": 0.376, "step": 717 }, { "epoch": 2.8360763660302832, "grad_norm": 0.3814162433792073, "learning_rate": 8.864995685504252e-08, "loss": 0.3512, "step": 718 }, { "epoch": 2.8400263331138906, "grad_norm": 0.35988470754799523, "learning_rate": 8.439036315708693e-08, "loss": 0.3565, "step": 719 }, { "epoch": 2.8439763001974985, "grad_norm": 0.3496770371220566, "learning_rate": 8.023476984706957e-08, "loss": 0.3508, "step": 720 }, { "epoch": 2.847926267281106, "grad_norm": 0.36328144259958667, "learning_rate": 7.618326484558402e-08, "loss": 0.3584, "step": 721 }, { "epoch": 2.8518762343647137, "grad_norm": 0.381455287445506, "learning_rate": 7.22359338710088e-08, "loss": 0.3672, "step": 722 }, { "epoch": 2.855826201448321, "grad_norm": 0.363344294321485, "learning_rate": 6.839286043769655e-08, "loss": 0.3574, "step": 723 }, { "epoch": 2.859776168531929, "grad_norm": 0.3674820403857133, "learning_rate": 6.465412585420439e-08, "loss": 0.3678, "step": 724 }, { "epoch": 2.8637261356155364, "grad_norm": 0.3948740442548483, "learning_rate": 6.101980922157524e-08, "loss": 0.3855, "step": 725 }, { "epoch": 2.867676102699144, "grad_norm": 0.39253483765361225, "learning_rate": 5.748998743166256e-08, "loss": 0.3794, "step": 726 }, { "epoch": 2.871626069782752, "grad_norm": 0.38937620014514557, "learning_rate": 5.4064735165506035e-08, "loss": 0.3815, "step": 727 }, { "epoch": 2.8755760368663594, "grad_norm": 0.37164297628789084, "learning_rate": 5.0744124891748956e-08, "loss": 0.3654, "step": 728 }, { "epoch": 2.879526003949967, "grad_norm": 0.37005814070272575, "learning_rate": 4.7528226865107274e-08, "loss": 0.3561, "step": 729 }, { "epoch": 2.8834759710335747, "grad_norm": 0.36468008921484535, "learning_rate": 4.4417109124882394e-08, "loss": 0.3741, "step": 730 }, { "epoch": 2.8874259381171825, "grad_norm": 0.3640973426010247, "learning_rate": 4.1410837493519595e-08, "loss": 0.3726, "step": 731 }, { "epoch": 2.89137590520079, "grad_norm": 0.36641511282013955, "learning_rate": 3.8509475575219115e-08, "loss": 0.3851, "step": 732 }, { "epoch": 2.8953258722843978, "grad_norm": 0.37233582653678365, "learning_rate": 3.571308475458723e-08, "loss": 0.3669, "step": 733 }, { "epoch": 2.899275839368005, "grad_norm": 0.35965049350239775, "learning_rate": 3.302172419534011e-08, "loss": 0.357, "step": 734 }, { "epoch": 2.903225806451613, "grad_norm": 0.3505488315287831, "learning_rate": 3.04354508390492e-08, "loss": 0.3399, "step": 735 }, { "epoch": 2.9071757735352204, "grad_norm": 0.37021111813053637, "learning_rate": 2.7954319403940555e-08, "loss": 0.368, "step": 736 }, { "epoch": 2.9111257406188282, "grad_norm": 0.36577366486991375, "learning_rate": 2.5578382383732446e-08, "loss": 0.3574, "step": 737 }, { "epoch": 2.915075707702436, "grad_norm": 0.3691586046629489, "learning_rate": 2.3307690046527887e-08, "loss": 0.3805, "step": 738 }, { "epoch": 2.9190256747860435, "grad_norm": 0.3672213156503764, "learning_rate": 2.114229043375049e-08, "loss": 0.3699, "step": 739 }, { "epoch": 2.922975641869651, "grad_norm": 0.36493544527232824, "learning_rate": 1.9082229359127512e-08, "loss": 0.3743, "step": 740 }, { "epoch": 2.9282422646477944, "grad_norm": 0.3606043242411447, "learning_rate": 1.7127550407721184e-08, "loss": 0.3609, "step": 741 }, { "epoch": 2.932192231731402, "grad_norm": 0.35895064839276425, "learning_rate": 1.52782949350061e-08, "loss": 0.3483, "step": 742 }, { "epoch": 2.93614219881501, "grad_norm": 0.3485329943101189, "learning_rate": 1.3534502065993826e-08, "loss": 0.3577, "step": 743 }, { "epoch": 2.9400921658986174, "grad_norm": 0.3790848184294848, "learning_rate": 1.1896208694406886e-08, "loss": 0.3537, "step": 744 }, { "epoch": 2.9440421329822253, "grad_norm": 0.364942751537554, "learning_rate": 1.0363449481896604e-08, "loss": 0.3611, "step": 745 }, { "epoch": 2.9479921000658327, "grad_norm": 0.33691137642330465, "learning_rate": 8.936256857308701e-09, "loss": 0.3369, "step": 746 }, { "epoch": 2.9519420671494405, "grad_norm": 0.35838053745559706, "learning_rate": 7.614661016001056e-09, "loss": 0.3636, "step": 747 }, { "epoch": 2.955892034233048, "grad_norm": 0.358668481280941, "learning_rate": 6.398689919201451e-09, "loss": 0.3549, "step": 748 }, { "epoch": 2.9598420013166558, "grad_norm": 0.3647484351170196, "learning_rate": 5.288369293415807e-09, "loss": 0.3502, "step": 749 }, { "epoch": 2.9637919684002636, "grad_norm": 0.3514237264002563, "learning_rate": 4.283722629887521e-09, "loss": 0.36, "step": 750 }, { "epoch": 2.967741935483871, "grad_norm": 0.36014372505333425, "learning_rate": 3.3847711840950813e-09, "loss": 0.3594, "step": 751 }, { "epoch": 2.9716919025674784, "grad_norm": 0.3812826494959831, "learning_rate": 2.5915339753085355e-09, "loss": 0.3745, "step": 752 }, { "epoch": 2.9756418696510862, "grad_norm": 0.3892841587592995, "learning_rate": 1.9040277861814836e-09, "loss": 0.388, "step": 753 }, { "epoch": 2.979591836734694, "grad_norm": 0.36151626054414365, "learning_rate": 1.3222671623991379e-09, "loss": 0.3546, "step": 754 }, { "epoch": 2.9835418038183015, "grad_norm": 0.3626834588186693, "learning_rate": 8.462644123696794e-10, "loss": 0.353, "step": 755 }, { "epoch": 2.9874917709019093, "grad_norm": 0.3500539162541361, "learning_rate": 4.760296069639125e-10, "loss": 0.3509, "step": 756 }, { "epoch": 2.9914417379855167, "grad_norm": 0.37315571459471814, "learning_rate": 2.1157057930321079e-10, "loss": 0.355, "step": 757 }, { "epoch": 2.9953917050691246, "grad_norm": 0.3446852557602504, "learning_rate": 5.289292459187412e-11, "loss": 0.359, "step": 758 }, { "epoch": 2.999341672152732, "grad_norm": 0.3685417641758729, "learning_rate": 0.0, "loss": 0.3499, "step": 759 }, { "epoch": 2.999341672152732, "step": 759, "total_flos": 8.138715082826711e+17, "train_loss": 0.008948151262695454, "train_runtime": 693.8105, "train_samples_per_second": 105.059, "train_steps_per_second": 1.094 } ], "logging_steps": 1, "max_steps": 759, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.138715082826711e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }