diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,25968 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 3705, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008097165991902834, + "grad_norm": 8.212680311060101, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.8584, + "step": 1 + }, + { + "epoch": 0.0016194331983805667, + "grad_norm": 8.289202083488915, + "learning_rate": 8.000000000000001e-07, + "loss": 1.8781, + "step": 2 + }, + { + "epoch": 0.0024291497975708503, + "grad_norm": 8.121793660526253, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.8632, + "step": 3 + }, + { + "epoch": 0.0032388663967611335, + "grad_norm": 8.282365329091148, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.8428, + "step": 4 + }, + { + "epoch": 0.004048582995951417, + "grad_norm": 6.205624862594086, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.8148, + "step": 5 + }, + { + "epoch": 0.004858299595141701, + "grad_norm": 3.874015809560423, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.8443, + "step": 6 + }, + { + "epoch": 0.005668016194331984, + "grad_norm": 3.295685239888731, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.7624, + "step": 7 + }, + { + "epoch": 0.006477732793522267, + "grad_norm": 3.35229547435271, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.7892, + "step": 8 + }, + { + "epoch": 0.0072874493927125505, + "grad_norm": 3.6152531340095924, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.7692, + "step": 9 + }, + { + "epoch": 0.008097165991902834, + "grad_norm": 2.839020140388393, + "learning_rate": 4.000000000000001e-06, + "loss": 1.7546, + "step": 10 + }, + { + "epoch": 0.008906882591093117, + "grad_norm": 3.280470763841352, + "learning_rate": 4.4e-06, + "loss": 1.781, + "step": 11 + }, + { + "epoch": 0.009716599190283401, + "grad_norm": 2.919143442971804, + "learning_rate": 4.800000000000001e-06, + "loss": 1.7255, + "step": 12 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 2.1780641385791957, + "learning_rate": 5.2e-06, + "loss": 1.7069, + "step": 13 + }, + { + "epoch": 0.011336032388663968, + "grad_norm": 2.0864543101995356, + "learning_rate": 5.600000000000001e-06, + "loss": 1.7801, + "step": 14 + }, + { + "epoch": 0.012145748987854251, + "grad_norm": 2.433989036218932, + "learning_rate": 6e-06, + "loss": 1.6886, + "step": 15 + }, + { + "epoch": 0.012955465587044534, + "grad_norm": 2.108153096043187, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.7845, + "step": 16 + }, + { + "epoch": 0.013765182186234818, + "grad_norm": 2.147481330273472, + "learning_rate": 6.800000000000001e-06, + "loss": 1.7888, + "step": 17 + }, + { + "epoch": 0.014574898785425101, + "grad_norm": 2.0230155835526658, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.7484, + "step": 18 + }, + { + "epoch": 0.015384615384615385, + "grad_norm": 1.8351338768344623, + "learning_rate": 7.600000000000001e-06, + "loss": 1.6815, + "step": 19 + }, + { + "epoch": 0.016194331983805668, + "grad_norm": 1.8385443541885478, + "learning_rate": 8.000000000000001e-06, + "loss": 1.7259, + "step": 20 + }, + { + "epoch": 0.01700404858299595, + "grad_norm": 1.9450404722821018, + "learning_rate": 8.400000000000001e-06, + "loss": 1.7513, + "step": 21 + }, + { + "epoch": 0.017813765182186234, + "grad_norm": 1.7329165869002578, + "learning_rate": 8.8e-06, + "loss": 1.6934, + "step": 22 + }, + { + "epoch": 0.01862348178137652, + "grad_norm": 1.8043465898806423, + "learning_rate": 9.200000000000002e-06, + "loss": 1.7411, + "step": 23 + }, + { + "epoch": 0.019433198380566803, + "grad_norm": 1.6575139155258072, + "learning_rate": 9.600000000000001e-06, + "loss": 1.7141, + "step": 24 + }, + { + "epoch": 0.020242914979757085, + "grad_norm": 1.6149477634364793, + "learning_rate": 1e-05, + "loss": 1.6359, + "step": 25 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 1.720501765932264, + "learning_rate": 1.04e-05, + "loss": 1.6983, + "step": 26 + }, + { + "epoch": 0.02186234817813765, + "grad_norm": 1.669970332575193, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.7, + "step": 27 + }, + { + "epoch": 0.022672064777327937, + "grad_norm": 1.588644479806073, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.6018, + "step": 28 + }, + { + "epoch": 0.02348178137651822, + "grad_norm": 1.6915129269465283, + "learning_rate": 1.16e-05, + "loss": 1.6732, + "step": 29 + }, + { + "epoch": 0.024291497975708502, + "grad_norm": 1.67842726423677, + "learning_rate": 1.2e-05, + "loss": 1.7291, + "step": 30 + }, + { + "epoch": 0.025101214574898785, + "grad_norm": 1.662602613195812, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.714, + "step": 31 + }, + { + "epoch": 0.025910931174089068, + "grad_norm": 1.5692125530449308, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.7681, + "step": 32 + }, + { + "epoch": 0.026720647773279354, + "grad_norm": 1.5593466931369984, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.6625, + "step": 33 + }, + { + "epoch": 0.027530364372469637, + "grad_norm": 1.5314296376489476, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.6711, + "step": 34 + }, + { + "epoch": 0.02834008097165992, + "grad_norm": 1.7793373457448158, + "learning_rate": 1.4e-05, + "loss": 1.7056, + "step": 35 + }, + { + "epoch": 0.029149797570850202, + "grad_norm": 1.5178537061767556, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.7037, + "step": 36 + }, + { + "epoch": 0.029959514170040485, + "grad_norm": 1.5459895717695264, + "learning_rate": 1.48e-05, + "loss": 1.7683, + "step": 37 + }, + { + "epoch": 0.03076923076923077, + "grad_norm": 1.5735552822755368, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.6872, + "step": 38 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 1.4358615071215153, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.7402, + "step": 39 + }, + { + "epoch": 0.032388663967611336, + "grad_norm": 1.712555492023088, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.7413, + "step": 40 + }, + { + "epoch": 0.03319838056680162, + "grad_norm": 1.527931327004627, + "learning_rate": 1.64e-05, + "loss": 1.7361, + "step": 41 + }, + { + "epoch": 0.0340080971659919, + "grad_norm": 1.5654078241724048, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.7174, + "step": 42 + }, + { + "epoch": 0.034817813765182185, + "grad_norm": 1.5967020304138295, + "learning_rate": 1.72e-05, + "loss": 1.6863, + "step": 43 + }, + { + "epoch": 0.03562753036437247, + "grad_norm": 1.5449242343523315, + "learning_rate": 1.76e-05, + "loss": 1.7186, + "step": 44 + }, + { + "epoch": 0.03643724696356275, + "grad_norm": 1.5664894285095186, + "learning_rate": 1.8e-05, + "loss": 1.677, + "step": 45 + }, + { + "epoch": 0.03724696356275304, + "grad_norm": 1.6465253425137338, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.767, + "step": 46 + }, + { + "epoch": 0.03805668016194332, + "grad_norm": 1.6352491164142435, + "learning_rate": 1.88e-05, + "loss": 1.5998, + "step": 47 + }, + { + "epoch": 0.038866396761133605, + "grad_norm": 1.6363309288994599, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.6266, + "step": 48 + }, + { + "epoch": 0.03967611336032389, + "grad_norm": 1.8173439281602377, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.6592, + "step": 49 + }, + { + "epoch": 0.04048582995951417, + "grad_norm": 1.5272192265553266, + "learning_rate": 2e-05, + "loss": 1.738, + "step": 50 + }, + { + "epoch": 0.04129554655870445, + "grad_norm": 1.8649531326109419, + "learning_rate": 1.9999996306016426e-05, + "loss": 1.7247, + "step": 51 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 1.7074872326048571, + "learning_rate": 1.9999985224068418e-05, + "loss": 1.7084, + "step": 52 + }, + { + "epoch": 0.04291497975708502, + "grad_norm": 1.4977194700952365, + "learning_rate": 1.9999966754164176e-05, + "loss": 1.7055, + "step": 53 + }, + { + "epoch": 0.0437246963562753, + "grad_norm": 1.6230626841795457, + "learning_rate": 1.9999940896317337e-05, + "loss": 1.5649, + "step": 54 + }, + { + "epoch": 0.044534412955465584, + "grad_norm": 1.6916647362719663, + "learning_rate": 1.9999907650547006e-05, + "loss": 1.7438, + "step": 55 + }, + { + "epoch": 0.045344129554655874, + "grad_norm": 1.4840562908731014, + "learning_rate": 1.999986701687775e-05, + "loss": 1.6202, + "step": 56 + }, + { + "epoch": 0.046153846153846156, + "grad_norm": 2.1064626020990596, + "learning_rate": 1.9999818995339587e-05, + "loss": 1.6874, + "step": 57 + }, + { + "epoch": 0.04696356275303644, + "grad_norm": 1.514821876037186, + "learning_rate": 1.999976358596799e-05, + "loss": 1.6145, + "step": 58 + }, + { + "epoch": 0.04777327935222672, + "grad_norm": 1.5837774362892976, + "learning_rate": 1.9999700788803902e-05, + "loss": 1.7424, + "step": 59 + }, + { + "epoch": 0.048582995951417005, + "grad_norm": 1.5015921648476362, + "learning_rate": 1.999963060389371e-05, + "loss": 1.7074, + "step": 60 + }, + { + "epoch": 0.04939271255060729, + "grad_norm": 1.5081987493072453, + "learning_rate": 1.9999553031289277e-05, + "loss": 1.7671, + "step": 61 + }, + { + "epoch": 0.05020242914979757, + "grad_norm": 1.449828681342794, + "learning_rate": 1.9999468071047904e-05, + "loss": 1.7608, + "step": 62 + }, + { + "epoch": 0.05101214574898785, + "grad_norm": 1.601994782437138, + "learning_rate": 1.9999375723232362e-05, + "loss": 1.7219, + "step": 63 + }, + { + "epoch": 0.051821862348178135, + "grad_norm": 1.5204297943669711, + "learning_rate": 1.999927598791088e-05, + "loss": 1.756, + "step": 64 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 1.4755593142051295, + "learning_rate": 1.9999168865157137e-05, + "loss": 1.6837, + "step": 65 + }, + { + "epoch": 0.05344129554655871, + "grad_norm": 1.5345109179851053, + "learning_rate": 1.999905435505028e-05, + "loss": 1.7347, + "step": 66 + }, + { + "epoch": 0.05425101214574899, + "grad_norm": 1.6001626392593242, + "learning_rate": 1.9998932457674904e-05, + "loss": 1.654, + "step": 67 + }, + { + "epoch": 0.05506072874493927, + "grad_norm": 1.4701409508033054, + "learning_rate": 1.999880317312107e-05, + "loss": 1.7161, + "step": 68 + }, + { + "epoch": 0.055870445344129556, + "grad_norm": 1.3692751510613166, + "learning_rate": 1.999866650148429e-05, + "loss": 1.7001, + "step": 69 + }, + { + "epoch": 0.05668016194331984, + "grad_norm": 1.48694331218701, + "learning_rate": 1.999852244286554e-05, + "loss": 1.812, + "step": 70 + }, + { + "epoch": 0.05748987854251012, + "grad_norm": 1.3924742554067722, + "learning_rate": 1.999837099737125e-05, + "loss": 1.6882, + "step": 71 + }, + { + "epoch": 0.058299595141700404, + "grad_norm": 1.4558556277711194, + "learning_rate": 1.9998212165113305e-05, + "loss": 1.7295, + "step": 72 + }, + { + "epoch": 0.05910931174089069, + "grad_norm": 1.4086032761044172, + "learning_rate": 1.999804594620905e-05, + "loss": 1.6551, + "step": 73 + }, + { + "epoch": 0.05991902834008097, + "grad_norm": 1.4939708802308547, + "learning_rate": 1.999787234078129e-05, + "loss": 1.7266, + "step": 74 + }, + { + "epoch": 0.06072874493927125, + "grad_norm": 1.4235299645544777, + "learning_rate": 1.9997691348958278e-05, + "loss": 1.7154, + "step": 75 + }, + { + "epoch": 0.06153846153846154, + "grad_norm": 1.644992651490282, + "learning_rate": 1.9997502970873736e-05, + "loss": 1.7414, + "step": 76 + }, + { + "epoch": 0.062348178137651825, + "grad_norm": 1.4731685460504425, + "learning_rate": 1.9997307206666835e-05, + "loss": 1.6953, + "step": 77 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 1.4684965944812223, + "learning_rate": 1.9997104056482206e-05, + "loss": 1.7127, + "step": 78 + }, + { + "epoch": 0.06396761133603239, + "grad_norm": 1.5057059153504087, + "learning_rate": 1.9996893520469934e-05, + "loss": 1.7393, + "step": 79 + }, + { + "epoch": 0.06477732793522267, + "grad_norm": 1.3158005918665274, + "learning_rate": 1.999667559878556e-05, + "loss": 1.6802, + "step": 80 + }, + { + "epoch": 0.06558704453441296, + "grad_norm": 1.405150913782594, + "learning_rate": 1.9996450291590093e-05, + "loss": 1.7208, + "step": 81 + }, + { + "epoch": 0.06639676113360324, + "grad_norm": 1.540226649005545, + "learning_rate": 1.9996217599049978e-05, + "loss": 1.7939, + "step": 82 + }, + { + "epoch": 0.06720647773279352, + "grad_norm": 1.4885304031726005, + "learning_rate": 1.9995977521337134e-05, + "loss": 1.6682, + "step": 83 + }, + { + "epoch": 0.0680161943319838, + "grad_norm": 1.3901851985498637, + "learning_rate": 1.9995730058628928e-05, + "loss": 1.667, + "step": 84 + }, + { + "epoch": 0.06882591093117409, + "grad_norm": 1.4725073250334886, + "learning_rate": 1.9995475211108183e-05, + "loss": 1.7547, + "step": 85 + }, + { + "epoch": 0.06963562753036437, + "grad_norm": 1.452556987551732, + "learning_rate": 1.9995212978963185e-05, + "loss": 1.7608, + "step": 86 + }, + { + "epoch": 0.07044534412955465, + "grad_norm": 1.4996449206080487, + "learning_rate": 1.9994943362387666e-05, + "loss": 1.6664, + "step": 87 + }, + { + "epoch": 0.07125506072874493, + "grad_norm": 1.4199683810583805, + "learning_rate": 1.9994666361580815e-05, + "loss": 1.6367, + "step": 88 + }, + { + "epoch": 0.07206477732793522, + "grad_norm": 1.571126361723556, + "learning_rate": 1.999438197674729e-05, + "loss": 1.7765, + "step": 89 + }, + { + "epoch": 0.0728744939271255, + "grad_norm": 1.3926335473604161, + "learning_rate": 1.9994090208097176e-05, + "loss": 1.722, + "step": 90 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 1.4552870672529465, + "learning_rate": 1.9993791055846048e-05, + "loss": 1.6651, + "step": 91 + }, + { + "epoch": 0.07449392712550608, + "grad_norm": 1.379931558535098, + "learning_rate": 1.999348452021491e-05, + "loss": 1.7096, + "step": 92 + }, + { + "epoch": 0.07530364372469636, + "grad_norm": 1.4198581520574205, + "learning_rate": 1.9993170601430233e-05, + "loss": 1.6601, + "step": 93 + }, + { + "epoch": 0.07611336032388664, + "grad_norm": 1.5078914507853327, + "learning_rate": 1.9992849299723933e-05, + "loss": 1.8005, + "step": 94 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 1.5076216489476741, + "learning_rate": 1.9992520615333393e-05, + "loss": 1.682, + "step": 95 + }, + { + "epoch": 0.07773279352226721, + "grad_norm": 1.3937658114601108, + "learning_rate": 1.9992184548501444e-05, + "loss": 1.6415, + "step": 96 + }, + { + "epoch": 0.07854251012145749, + "grad_norm": 1.5041710747062258, + "learning_rate": 1.9991841099476365e-05, + "loss": 1.6654, + "step": 97 + }, + { + "epoch": 0.07935222672064778, + "grad_norm": 1.4006737880954128, + "learning_rate": 1.9991490268511903e-05, + "loss": 1.6976, + "step": 98 + }, + { + "epoch": 0.08016194331983806, + "grad_norm": 1.5631879172620895, + "learning_rate": 1.9991132055867244e-05, + "loss": 1.669, + "step": 99 + }, + { + "epoch": 0.08097165991902834, + "grad_norm": 1.394392254430093, + "learning_rate": 1.9990766461807037e-05, + "loss": 1.6968, + "step": 100 + }, + { + "epoch": 0.08178137651821862, + "grad_norm": 1.472525619978714, + "learning_rate": 1.9990393486601385e-05, + "loss": 1.6773, + "step": 101 + }, + { + "epoch": 0.0825910931174089, + "grad_norm": 1.378206403293524, + "learning_rate": 1.9990013130525835e-05, + "loss": 1.6821, + "step": 102 + }, + { + "epoch": 0.08340080971659919, + "grad_norm": 1.328026656261711, + "learning_rate": 1.9989625393861397e-05, + "loss": 1.6103, + "step": 103 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 1.4064722782717616, + "learning_rate": 1.9989230276894525e-05, + "loss": 1.7324, + "step": 104 + }, + { + "epoch": 0.08502024291497975, + "grad_norm": 1.4813376774614593, + "learning_rate": 1.9988827779917138e-05, + "loss": 1.7272, + "step": 105 + }, + { + "epoch": 0.08582995951417004, + "grad_norm": 1.3687982499187121, + "learning_rate": 1.998841790322659e-05, + "loss": 1.673, + "step": 106 + }, + { + "epoch": 0.08663967611336032, + "grad_norm": 1.422876362819873, + "learning_rate": 1.9988000647125703e-05, + "loss": 1.7597, + "step": 107 + }, + { + "epoch": 0.0874493927125506, + "grad_norm": 1.308219863689626, + "learning_rate": 1.9987576011922743e-05, + "loss": 1.7697, + "step": 108 + }, + { + "epoch": 0.08825910931174089, + "grad_norm": 1.4183268142705985, + "learning_rate": 1.9987143997931428e-05, + "loss": 1.6809, + "step": 109 + }, + { + "epoch": 0.08906882591093117, + "grad_norm": 1.4357441361156031, + "learning_rate": 1.9986704605470932e-05, + "loss": 1.6358, + "step": 110 + }, + { + "epoch": 0.08987854251012145, + "grad_norm": 1.3529581082840894, + "learning_rate": 1.998625783486587e-05, + "loss": 1.6681, + "step": 111 + }, + { + "epoch": 0.09068825910931175, + "grad_norm": 1.408818420235949, + "learning_rate": 1.998580368644632e-05, + "loss": 1.6938, + "step": 112 + }, + { + "epoch": 0.09149797570850203, + "grad_norm": 1.4451180541868947, + "learning_rate": 1.998534216054781e-05, + "loss": 1.7739, + "step": 113 + }, + { + "epoch": 0.09230769230769231, + "grad_norm": 1.2941303668841717, + "learning_rate": 1.9984873257511296e-05, + "loss": 1.6704, + "step": 114 + }, + { + "epoch": 0.0931174089068826, + "grad_norm": 1.440123130144802, + "learning_rate": 1.9984396977683223e-05, + "loss": 1.7456, + "step": 115 + }, + { + "epoch": 0.09392712550607288, + "grad_norm": 1.3845008076241843, + "learning_rate": 1.998391332141545e-05, + "loss": 1.6968, + "step": 116 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 1.5220029955302263, + "learning_rate": 1.998342228906531e-05, + "loss": 1.8585, + "step": 117 + }, + { + "epoch": 0.09554655870445344, + "grad_norm": 1.350510831118027, + "learning_rate": 1.998292388099557e-05, + "loss": 1.7009, + "step": 118 + }, + { + "epoch": 0.09635627530364373, + "grad_norm": 1.410947289102555, + "learning_rate": 1.9982418097574458e-05, + "loss": 1.7082, + "step": 119 + }, + { + "epoch": 0.09716599190283401, + "grad_norm": 1.4674312785560688, + "learning_rate": 1.998190493917564e-05, + "loss": 1.6933, + "step": 120 + }, + { + "epoch": 0.09797570850202429, + "grad_norm": 1.3586528211235367, + "learning_rate": 1.9981384406178235e-05, + "loss": 1.697, + "step": 121 + }, + { + "epoch": 0.09878542510121457, + "grad_norm": 1.323874170111494, + "learning_rate": 1.998085649896682e-05, + "loss": 1.6996, + "step": 122 + }, + { + "epoch": 0.09959514170040486, + "grad_norm": 1.3857202766997219, + "learning_rate": 1.99803212179314e-05, + "loss": 1.6158, + "step": 123 + }, + { + "epoch": 0.10040485829959514, + "grad_norm": 1.3135040066156267, + "learning_rate": 1.9979778563467446e-05, + "loss": 1.6917, + "step": 124 + }, + { + "epoch": 0.10121457489878542, + "grad_norm": 1.450855282133572, + "learning_rate": 1.9979228535975866e-05, + "loss": 1.6773, + "step": 125 + }, + { + "epoch": 0.1020242914979757, + "grad_norm": 1.347188642125951, + "learning_rate": 1.997867113586302e-05, + "loss": 1.7144, + "step": 126 + }, + { + "epoch": 0.10283400809716599, + "grad_norm": 1.345981107132026, + "learning_rate": 1.997810636354071e-05, + "loss": 1.6041, + "step": 127 + }, + { + "epoch": 0.10364372469635627, + "grad_norm": 1.2972173884632567, + "learning_rate": 1.9977534219426195e-05, + "loss": 1.6855, + "step": 128 + }, + { + "epoch": 0.10445344129554655, + "grad_norm": 1.275527287436048, + "learning_rate": 1.997695470394217e-05, + "loss": 1.696, + "step": 129 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.3215678390799637, + "learning_rate": 1.9976367817516773e-05, + "loss": 1.6935, + "step": 130 + }, + { + "epoch": 0.10607287449392712, + "grad_norm": 1.3828255635499427, + "learning_rate": 1.99757735605836e-05, + "loss": 1.7256, + "step": 131 + }, + { + "epoch": 0.10688259109311742, + "grad_norm": 1.448631635568024, + "learning_rate": 1.997517193358169e-05, + "loss": 1.6388, + "step": 132 + }, + { + "epoch": 0.1076923076923077, + "grad_norm": 1.3338631760173656, + "learning_rate": 1.9974562936955513e-05, + "loss": 1.6965, + "step": 133 + }, + { + "epoch": 0.10850202429149798, + "grad_norm": 1.3424092875489613, + "learning_rate": 1.9973946571155e-05, + "loss": 1.709, + "step": 134 + }, + { + "epoch": 0.10931174089068826, + "grad_norm": 1.3898696103299275, + "learning_rate": 1.9973322836635517e-05, + "loss": 1.6822, + "step": 135 + }, + { + "epoch": 0.11012145748987855, + "grad_norm": 1.4706810809601643, + "learning_rate": 1.997269173385788e-05, + "loss": 1.7058, + "step": 136 + }, + { + "epoch": 0.11093117408906883, + "grad_norm": 1.4026119282799607, + "learning_rate": 1.9972053263288346e-05, + "loss": 1.6531, + "step": 137 + }, + { + "epoch": 0.11174089068825911, + "grad_norm": 1.3288506201522425, + "learning_rate": 1.9971407425398614e-05, + "loss": 1.6532, + "step": 138 + }, + { + "epoch": 0.1125506072874494, + "grad_norm": 1.5239912432928768, + "learning_rate": 1.9970754220665824e-05, + "loss": 1.7604, + "step": 139 + }, + { + "epoch": 0.11336032388663968, + "grad_norm": 1.444106448065426, + "learning_rate": 1.9970093649572567e-05, + "loss": 1.7596, + "step": 140 + }, + { + "epoch": 0.11417004048582996, + "grad_norm": 1.3200905343039002, + "learning_rate": 1.9969425712606864e-05, + "loss": 1.6867, + "step": 141 + }, + { + "epoch": 0.11497975708502024, + "grad_norm": 1.3681666300591042, + "learning_rate": 1.996875041026219e-05, + "loss": 1.6352, + "step": 142 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 1.4975233179603091, + "learning_rate": 1.9968067743037453e-05, + "loss": 1.679, + "step": 143 + }, + { + "epoch": 0.11659919028340081, + "grad_norm": 1.3172078248631487, + "learning_rate": 1.9967377711437008e-05, + "loss": 1.7421, + "step": 144 + }, + { + "epoch": 0.11740890688259109, + "grad_norm": 1.260492754026676, + "learning_rate": 1.9966680315970647e-05, + "loss": 1.5965, + "step": 145 + }, + { + "epoch": 0.11821862348178137, + "grad_norm": 1.3786263323616315, + "learning_rate": 1.9965975557153604e-05, + "loss": 1.7367, + "step": 146 + }, + { + "epoch": 0.11902834008097166, + "grad_norm": 1.3104589610244388, + "learning_rate": 1.996526343550655e-05, + "loss": 1.6468, + "step": 147 + }, + { + "epoch": 0.11983805668016194, + "grad_norm": 1.3212338618017334, + "learning_rate": 1.99645439515556e-05, + "loss": 1.6932, + "step": 148 + }, + { + "epoch": 0.12064777327935222, + "grad_norm": 1.2823071957240442, + "learning_rate": 1.9963817105832305e-05, + "loss": 1.6809, + "step": 149 + }, + { + "epoch": 0.1214574898785425, + "grad_norm": 1.2331269340221573, + "learning_rate": 1.996308289887366e-05, + "loss": 1.7086, + "step": 150 + }, + { + "epoch": 0.12226720647773279, + "grad_norm": 1.3183668422073427, + "learning_rate": 1.9962341331222092e-05, + "loss": 1.6732, + "step": 151 + }, + { + "epoch": 0.12307692307692308, + "grad_norm": 1.3211093232564248, + "learning_rate": 1.996159240342547e-05, + "loss": 1.6752, + "step": 152 + }, + { + "epoch": 0.12388663967611337, + "grad_norm": 1.3807756953246901, + "learning_rate": 1.9960836116037095e-05, + "loss": 1.6819, + "step": 153 + }, + { + "epoch": 0.12469635627530365, + "grad_norm": 1.229226565824642, + "learning_rate": 1.9960072469615716e-05, + "loss": 1.7211, + "step": 154 + }, + { + "epoch": 0.12550607287449392, + "grad_norm": 1.37757201668452, + "learning_rate": 1.9959301464725507e-05, + "loss": 1.7252, + "step": 155 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 1.3625152047582556, + "learning_rate": 1.9958523101936083e-05, + "loss": 1.7572, + "step": 156 + }, + { + "epoch": 0.12712550607287448, + "grad_norm": 1.3700626216344285, + "learning_rate": 1.9957737381822505e-05, + "loss": 1.7384, + "step": 157 + }, + { + "epoch": 0.12793522267206478, + "grad_norm": 1.3265891040200295, + "learning_rate": 1.9956944304965257e-05, + "loss": 1.7058, + "step": 158 + }, + { + "epoch": 0.12874493927125505, + "grad_norm": 1.477875528364369, + "learning_rate": 1.9956143871950252e-05, + "loss": 1.6846, + "step": 159 + }, + { + "epoch": 0.12955465587044535, + "grad_norm": 1.3079566287214603, + "learning_rate": 1.995533608336886e-05, + "loss": 1.7252, + "step": 160 + }, + { + "epoch": 0.13036437246963561, + "grad_norm": 1.422258383650131, + "learning_rate": 1.9954520939817863e-05, + "loss": 1.7018, + "step": 161 + }, + { + "epoch": 0.1311740890688259, + "grad_norm": 1.4264781196637615, + "learning_rate": 1.9953698441899494e-05, + "loss": 1.6585, + "step": 162 + }, + { + "epoch": 0.1319838056680162, + "grad_norm": 1.2963346637584106, + "learning_rate": 1.9952868590221403e-05, + "loss": 1.6369, + "step": 163 + }, + { + "epoch": 0.13279352226720648, + "grad_norm": 1.5083900065401323, + "learning_rate": 1.9952031385396694e-05, + "loss": 1.7224, + "step": 164 + }, + { + "epoch": 0.13360323886639677, + "grad_norm": 1.5179214854876595, + "learning_rate": 1.995118682804388e-05, + "loss": 1.7588, + "step": 165 + }, + { + "epoch": 0.13441295546558704, + "grad_norm": 1.3017296176425739, + "learning_rate": 1.995033491878692e-05, + "loss": 1.7246, + "step": 166 + }, + { + "epoch": 0.13522267206477734, + "grad_norm": 1.505962687706292, + "learning_rate": 1.9949475658255207e-05, + "loss": 1.7567, + "step": 167 + }, + { + "epoch": 0.1360323886639676, + "grad_norm": 1.2555297272542156, + "learning_rate": 1.994860904708355e-05, + "loss": 1.7062, + "step": 168 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 1.485635141654141, + "learning_rate": 1.994773508591221e-05, + "loss": 1.6812, + "step": 169 + }, + { + "epoch": 0.13765182186234817, + "grad_norm": 1.2908226955038942, + "learning_rate": 1.9946853775386857e-05, + "loss": 1.6867, + "step": 170 + }, + { + "epoch": 0.13846153846153847, + "grad_norm": 1.32792940285529, + "learning_rate": 1.9945965116158605e-05, + "loss": 1.6529, + "step": 171 + }, + { + "epoch": 0.13927125506072874, + "grad_norm": 1.2871011365969764, + "learning_rate": 1.9945069108883993e-05, + "loss": 1.6857, + "step": 172 + }, + { + "epoch": 0.14008097165991903, + "grad_norm": 1.3266369892431742, + "learning_rate": 1.994416575422499e-05, + "loss": 1.7121, + "step": 173 + }, + { + "epoch": 0.1408906882591093, + "grad_norm": 1.3444583702433448, + "learning_rate": 1.9943255052848984e-05, + "loss": 1.6784, + "step": 174 + }, + { + "epoch": 0.1417004048582996, + "grad_norm": 1.2393516324343374, + "learning_rate": 1.9942337005428805e-05, + "loss": 1.6686, + "step": 175 + }, + { + "epoch": 0.14251012145748987, + "grad_norm": 1.3248524170639142, + "learning_rate": 1.99414116126427e-05, + "loss": 1.6597, + "step": 176 + }, + { + "epoch": 0.14331983805668017, + "grad_norm": 1.1663228528592244, + "learning_rate": 1.9940478875174346e-05, + "loss": 1.5819, + "step": 177 + }, + { + "epoch": 0.14412955465587043, + "grad_norm": 1.3787198785492285, + "learning_rate": 1.9939538793712852e-05, + "loss": 1.6482, + "step": 178 + }, + { + "epoch": 0.14493927125506073, + "grad_norm": 1.350980964072927, + "learning_rate": 1.993859136895274e-05, + "loss": 1.6578, + "step": 179 + }, + { + "epoch": 0.145748987854251, + "grad_norm": 1.4172264638957366, + "learning_rate": 1.9937636601593965e-05, + "loss": 1.7364, + "step": 180 + }, + { + "epoch": 0.1465587044534413, + "grad_norm": 1.2164996262071472, + "learning_rate": 1.9936674492341913e-05, + "loss": 1.6566, + "step": 181 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 1.163342097978053, + "learning_rate": 1.9935705041907375e-05, + "loss": 1.6742, + "step": 182 + }, + { + "epoch": 0.14817813765182186, + "grad_norm": 1.3251945549485633, + "learning_rate": 1.9934728251006593e-05, + "loss": 1.6875, + "step": 183 + }, + { + "epoch": 0.14898785425101216, + "grad_norm": 1.2641308251973702, + "learning_rate": 1.9933744120361202e-05, + "loss": 1.7407, + "step": 184 + }, + { + "epoch": 0.14979757085020243, + "grad_norm": 1.3486708746298524, + "learning_rate": 1.9932752650698285e-05, + "loss": 1.6667, + "step": 185 + }, + { + "epoch": 0.15060728744939272, + "grad_norm": 1.2926218776022609, + "learning_rate": 1.993175384275033e-05, + "loss": 1.6742, + "step": 186 + }, + { + "epoch": 0.151417004048583, + "grad_norm": 1.367440454792052, + "learning_rate": 1.9930747697255263e-05, + "loss": 1.6944, + "step": 187 + }, + { + "epoch": 0.1522267206477733, + "grad_norm": 1.3527036774330383, + "learning_rate": 1.992973421495641e-05, + "loss": 1.626, + "step": 188 + }, + { + "epoch": 0.15303643724696356, + "grad_norm": 1.287788530805172, + "learning_rate": 1.992871339660253e-05, + "loss": 1.6921, + "step": 189 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 1.4084861938787652, + "learning_rate": 1.9927685242947804e-05, + "loss": 1.7108, + "step": 190 + }, + { + "epoch": 0.15465587044534412, + "grad_norm": 1.279148085366124, + "learning_rate": 1.9926649754751825e-05, + "loss": 1.6675, + "step": 191 + }, + { + "epoch": 0.15546558704453442, + "grad_norm": 1.2664930272396757, + "learning_rate": 1.9925606932779615e-05, + "loss": 1.7115, + "step": 192 + }, + { + "epoch": 0.1562753036437247, + "grad_norm": 1.321724868931457, + "learning_rate": 1.99245567778016e-05, + "loss": 1.6596, + "step": 193 + }, + { + "epoch": 0.15708502024291499, + "grad_norm": 1.2948713460179608, + "learning_rate": 1.9923499290593637e-05, + "loss": 1.6664, + "step": 194 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 1.3818189742460931, + "learning_rate": 1.9922434471936987e-05, + "loss": 1.7305, + "step": 195 + }, + { + "epoch": 0.15870445344129555, + "grad_norm": 1.2479572194235047, + "learning_rate": 1.9921362322618337e-05, + "loss": 1.7553, + "step": 196 + }, + { + "epoch": 0.15951417004048582, + "grad_norm": 1.2898516448229447, + "learning_rate": 1.9920282843429795e-05, + "loss": 1.65, + "step": 197 + }, + { + "epoch": 0.16032388663967612, + "grad_norm": 1.344427118443871, + "learning_rate": 1.9919196035168865e-05, + "loss": 1.6366, + "step": 198 + }, + { + "epoch": 0.16113360323886639, + "grad_norm": 1.2531134681362286, + "learning_rate": 1.9918101898638488e-05, + "loss": 1.6608, + "step": 199 + }, + { + "epoch": 0.16194331983805668, + "grad_norm": 1.5273799001568773, + "learning_rate": 1.9917000434647e-05, + "loss": 1.6969, + "step": 200 + }, + { + "epoch": 0.16275303643724695, + "grad_norm": 1.4281468350234128, + "learning_rate": 1.9915891644008164e-05, + "loss": 1.6933, + "step": 201 + }, + { + "epoch": 0.16356275303643725, + "grad_norm": 1.2479553523055322, + "learning_rate": 1.991477552754115e-05, + "loss": 1.633, + "step": 202 + }, + { + "epoch": 0.16437246963562754, + "grad_norm": 1.3889081568820743, + "learning_rate": 1.9913652086070535e-05, + "loss": 1.6847, + "step": 203 + }, + { + "epoch": 0.1651821862348178, + "grad_norm": 1.3301717360886383, + "learning_rate": 1.9912521320426327e-05, + "loss": 1.6712, + "step": 204 + }, + { + "epoch": 0.1659919028340081, + "grad_norm": 1.2916336401520587, + "learning_rate": 1.991138323144392e-05, + "loss": 1.6672, + "step": 205 + }, + { + "epoch": 0.16680161943319838, + "grad_norm": 1.2694035888961783, + "learning_rate": 1.9910237819964135e-05, + "loss": 1.6759, + "step": 206 + }, + { + "epoch": 0.16761133603238867, + "grad_norm": 1.2658690812785272, + "learning_rate": 1.9909085086833198e-05, + "loss": 1.6743, + "step": 207 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 1.2672804576639893, + "learning_rate": 1.9907925032902745e-05, + "loss": 1.664, + "step": 208 + }, + { + "epoch": 0.16923076923076924, + "grad_norm": 1.1845691219627643, + "learning_rate": 1.9906757659029817e-05, + "loss": 1.6352, + "step": 209 + }, + { + "epoch": 0.1700404858299595, + "grad_norm": 1.260533932342834, + "learning_rate": 1.990558296607687e-05, + "loss": 1.6717, + "step": 210 + }, + { + "epoch": 0.1708502024291498, + "grad_norm": 1.1724140852250677, + "learning_rate": 1.9904400954911763e-05, + "loss": 1.5936, + "step": 211 + }, + { + "epoch": 0.17165991902834007, + "grad_norm": 1.3376574942903026, + "learning_rate": 1.990321162640776e-05, + "loss": 1.6479, + "step": 212 + }, + { + "epoch": 0.17246963562753037, + "grad_norm": 1.3302629754630635, + "learning_rate": 1.9902014981443532e-05, + "loss": 1.6688, + "step": 213 + }, + { + "epoch": 0.17327935222672064, + "grad_norm": 1.2504083776545458, + "learning_rate": 1.9900811020903158e-05, + "loss": 1.6221, + "step": 214 + }, + { + "epoch": 0.17408906882591094, + "grad_norm": 1.279410572569658, + "learning_rate": 1.9899599745676123e-05, + "loss": 1.7135, + "step": 215 + }, + { + "epoch": 0.1748987854251012, + "grad_norm": 1.3819762266443132, + "learning_rate": 1.989838115665731e-05, + "loss": 1.6995, + "step": 216 + }, + { + "epoch": 0.1757085020242915, + "grad_norm": 1.260595817978411, + "learning_rate": 1.9897155254747006e-05, + "loss": 1.6651, + "step": 217 + }, + { + "epoch": 0.17651821862348177, + "grad_norm": 1.3746335152221298, + "learning_rate": 1.989592204085091e-05, + "loss": 1.732, + "step": 218 + }, + { + "epoch": 0.17732793522267207, + "grad_norm": 1.2946738994826934, + "learning_rate": 1.9894681515880106e-05, + "loss": 1.6498, + "step": 219 + }, + { + "epoch": 0.17813765182186234, + "grad_norm": 1.3214716293210784, + "learning_rate": 1.9893433680751105e-05, + "loss": 1.6552, + "step": 220 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 1.267390260422078, + "learning_rate": 1.9892178536385788e-05, + "loss": 1.6601, + "step": 221 + }, + { + "epoch": 0.1797570850202429, + "grad_norm": 1.2661450912214733, + "learning_rate": 1.9890916083711463e-05, + "loss": 1.6884, + "step": 222 + }, + { + "epoch": 0.1805668016194332, + "grad_norm": 1.2915140670465655, + "learning_rate": 1.9889646323660816e-05, + "loss": 1.7442, + "step": 223 + }, + { + "epoch": 0.1813765182186235, + "grad_norm": 1.2082943407898998, + "learning_rate": 1.9888369257171952e-05, + "loss": 1.6767, + "step": 224 + }, + { + "epoch": 0.18218623481781376, + "grad_norm": 1.3731016770046895, + "learning_rate": 1.9887084885188354e-05, + "loss": 1.7334, + "step": 225 + }, + { + "epoch": 0.18299595141700406, + "grad_norm": 1.2555298583209036, + "learning_rate": 1.988579320865892e-05, + "loss": 1.7203, + "step": 226 + }, + { + "epoch": 0.18380566801619433, + "grad_norm": 1.263140649035035, + "learning_rate": 1.988449422853793e-05, + "loss": 1.6895, + "step": 227 + }, + { + "epoch": 0.18461538461538463, + "grad_norm": 1.2749775919052824, + "learning_rate": 1.9883187945785067e-05, + "loss": 1.6948, + "step": 228 + }, + { + "epoch": 0.1854251012145749, + "grad_norm": 1.2429895536427575, + "learning_rate": 1.9881874361365413e-05, + "loss": 1.6361, + "step": 229 + }, + { + "epoch": 0.1862348178137652, + "grad_norm": 1.2416951269903427, + "learning_rate": 1.9880553476249437e-05, + "loss": 1.6758, + "step": 230 + }, + { + "epoch": 0.18704453441295546, + "grad_norm": 1.239555283382809, + "learning_rate": 1.9879225291413e-05, + "loss": 1.6989, + "step": 231 + }, + { + "epoch": 0.18785425101214576, + "grad_norm": 1.2919283930709724, + "learning_rate": 1.9877889807837373e-05, + "loss": 1.6731, + "step": 232 + }, + { + "epoch": 0.18866396761133603, + "grad_norm": 1.2233652648633166, + "learning_rate": 1.9876547026509194e-05, + "loss": 1.6967, + "step": 233 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 1.2745651498433337, + "learning_rate": 1.987519694842051e-05, + "loss": 1.7336, + "step": 234 + }, + { + "epoch": 0.1902834008097166, + "grad_norm": 1.2732698824293152, + "learning_rate": 1.9873839574568756e-05, + "loss": 1.6757, + "step": 235 + }, + { + "epoch": 0.1910931174089069, + "grad_norm": 1.3370656954094273, + "learning_rate": 1.9872474905956752e-05, + "loss": 1.7574, + "step": 236 + }, + { + "epoch": 0.19190283400809716, + "grad_norm": 1.337679917073954, + "learning_rate": 1.9871102943592717e-05, + "loss": 1.6757, + "step": 237 + }, + { + "epoch": 0.19271255060728745, + "grad_norm": 1.3246803706512758, + "learning_rate": 1.9869723688490247e-05, + "loss": 1.7, + "step": 238 + }, + { + "epoch": 0.19352226720647772, + "grad_norm": 1.2000006625597532, + "learning_rate": 1.9868337141668333e-05, + "loss": 1.6744, + "step": 239 + }, + { + "epoch": 0.19433198380566802, + "grad_norm": 1.3328073640320248, + "learning_rate": 1.9866943304151346e-05, + "loss": 1.6301, + "step": 240 + }, + { + "epoch": 0.1951417004048583, + "grad_norm": 1.2595984845371397, + "learning_rate": 1.9865542176969055e-05, + "loss": 1.7314, + "step": 241 + }, + { + "epoch": 0.19595141700404858, + "grad_norm": 1.2697481850011105, + "learning_rate": 1.986413376115661e-05, + "loss": 1.7403, + "step": 242 + }, + { + "epoch": 0.19676113360323888, + "grad_norm": 1.184815618587701, + "learning_rate": 1.9862718057754536e-05, + "loss": 1.6913, + "step": 243 + }, + { + "epoch": 0.19757085020242915, + "grad_norm": 1.2575508256727617, + "learning_rate": 1.9861295067808754e-05, + "loss": 1.6972, + "step": 244 + }, + { + "epoch": 0.19838056680161945, + "grad_norm": 1.2037684360733698, + "learning_rate": 1.9859864792370565e-05, + "loss": 1.6808, + "step": 245 + }, + { + "epoch": 0.19919028340080971, + "grad_norm": 1.1984658119885334, + "learning_rate": 1.985842723249665e-05, + "loss": 1.6486, + "step": 246 + }, + { + "epoch": 0.2, + "grad_norm": 1.3302394542035112, + "learning_rate": 1.985698238924908e-05, + "loss": 1.6491, + "step": 247 + }, + { + "epoch": 0.20080971659919028, + "grad_norm": 1.2819039031405692, + "learning_rate": 1.9855530263695287e-05, + "loss": 1.7145, + "step": 248 + }, + { + "epoch": 0.20161943319838058, + "grad_norm": 1.2645396518124346, + "learning_rate": 1.9854070856908113e-05, + "loss": 1.6766, + "step": 249 + }, + { + "epoch": 0.20242914979757085, + "grad_norm": 1.3551588648492328, + "learning_rate": 1.985260416996575e-05, + "loss": 1.7532, + "step": 250 + }, + { + "epoch": 0.20323886639676114, + "grad_norm": 1.1872824323555884, + "learning_rate": 1.9851130203951787e-05, + "loss": 1.722, + "step": 251 + }, + { + "epoch": 0.2040485829959514, + "grad_norm": 1.3233391136396748, + "learning_rate": 1.9849648959955187e-05, + "loss": 1.6066, + "step": 252 + }, + { + "epoch": 0.2048582995951417, + "grad_norm": 1.3232293456672852, + "learning_rate": 1.9848160439070284e-05, + "loss": 1.6556, + "step": 253 + }, + { + "epoch": 0.20566801619433198, + "grad_norm": 1.3252440210019765, + "learning_rate": 1.9846664642396793e-05, + "loss": 1.6771, + "step": 254 + }, + { + "epoch": 0.20647773279352227, + "grad_norm": 1.2091779306727122, + "learning_rate": 1.9845161571039805e-05, + "loss": 1.7038, + "step": 255 + }, + { + "epoch": 0.20728744939271254, + "grad_norm": 1.2887939358404155, + "learning_rate": 1.9843651226109784e-05, + "loss": 1.6948, + "step": 256 + }, + { + "epoch": 0.20809716599190284, + "grad_norm": 1.2583822543738294, + "learning_rate": 1.984213360872257e-05, + "loss": 1.6888, + "step": 257 + }, + { + "epoch": 0.2089068825910931, + "grad_norm": 1.3279683247483347, + "learning_rate": 1.9840608719999367e-05, + "loss": 1.7059, + "step": 258 + }, + { + "epoch": 0.2097165991902834, + "grad_norm": 1.270081125900351, + "learning_rate": 1.9839076561066766e-05, + "loss": 1.7175, + "step": 259 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 1.254273959928416, + "learning_rate": 1.983753713305672e-05, + "loss": 1.6763, + "step": 260 + }, + { + "epoch": 0.21133603238866397, + "grad_norm": 1.3521388079612242, + "learning_rate": 1.9835990437106542e-05, + "loss": 1.6948, + "step": 261 + }, + { + "epoch": 0.21214574898785424, + "grad_norm": 1.267536006413824, + "learning_rate": 1.983443647435894e-05, + "loss": 1.675, + "step": 262 + }, + { + "epoch": 0.21295546558704453, + "grad_norm": 1.3388930511150345, + "learning_rate": 1.9832875245961972e-05, + "loss": 1.7291, + "step": 263 + }, + { + "epoch": 0.21376518218623483, + "grad_norm": 1.2427093269086842, + "learning_rate": 1.9831306753069066e-05, + "loss": 1.6094, + "step": 264 + }, + { + "epoch": 0.2145748987854251, + "grad_norm": 1.349674351751306, + "learning_rate": 1.982973099683902e-05, + "loss": 1.6964, + "step": 265 + }, + { + "epoch": 0.2153846153846154, + "grad_norm": 1.365432576876562, + "learning_rate": 1.9828147978436e-05, + "loss": 1.7176, + "step": 266 + }, + { + "epoch": 0.21619433198380567, + "grad_norm": 1.2307745643880328, + "learning_rate": 1.982655769902953e-05, + "loss": 1.6596, + "step": 267 + }, + { + "epoch": 0.21700404858299596, + "grad_norm": 1.2552754011065252, + "learning_rate": 1.9824960159794512e-05, + "loss": 1.7251, + "step": 268 + }, + { + "epoch": 0.21781376518218623, + "grad_norm": 1.2821452869506051, + "learning_rate": 1.9823355361911192e-05, + "loss": 1.6452, + "step": 269 + }, + { + "epoch": 0.21862348178137653, + "grad_norm": 1.2062731542559535, + "learning_rate": 1.98217433065652e-05, + "loss": 1.6814, + "step": 270 + }, + { + "epoch": 0.2194331983805668, + "grad_norm": 1.2262168648135, + "learning_rate": 1.9820123994947505e-05, + "loss": 1.6816, + "step": 271 + }, + { + "epoch": 0.2202429149797571, + "grad_norm": 1.2938841697813679, + "learning_rate": 1.981849742825446e-05, + "loss": 1.7483, + "step": 272 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 1.2645687786911326, + "learning_rate": 1.981686360768776e-05, + "loss": 1.6805, + "step": 273 + }, + { + "epoch": 0.22186234817813766, + "grad_norm": 1.222316415160104, + "learning_rate": 1.9815222534454472e-05, + "loss": 1.7249, + "step": 274 + }, + { + "epoch": 0.22267206477732793, + "grad_norm": 1.2204096535680333, + "learning_rate": 1.9813574209767013e-05, + "loss": 1.6393, + "step": 275 + }, + { + "epoch": 0.22348178137651822, + "grad_norm": 1.2272417519081043, + "learning_rate": 1.981191863484316e-05, + "loss": 1.665, + "step": 276 + }, + { + "epoch": 0.2242914979757085, + "grad_norm": 1.2688048841631627, + "learning_rate": 1.9810255810906046e-05, + "loss": 1.6725, + "step": 277 + }, + { + "epoch": 0.2251012145748988, + "grad_norm": 1.1858272375472187, + "learning_rate": 1.9808585739184156e-05, + "loss": 1.6861, + "step": 278 + }, + { + "epoch": 0.22591093117408906, + "grad_norm": 1.2436292029105176, + "learning_rate": 1.980690842091134e-05, + "loss": 1.679, + "step": 279 + }, + { + "epoch": 0.22672064777327935, + "grad_norm": 1.2348131442739951, + "learning_rate": 1.9805223857326794e-05, + "loss": 1.6451, + "step": 280 + }, + { + "epoch": 0.22753036437246962, + "grad_norm": 1.195784828107809, + "learning_rate": 1.9803532049675062e-05, + "loss": 1.637, + "step": 281 + }, + { + "epoch": 0.22834008097165992, + "grad_norm": 1.2391790592489964, + "learning_rate": 1.9801832999206057e-05, + "loss": 1.7539, + "step": 282 + }, + { + "epoch": 0.2291497975708502, + "grad_norm": 1.1984543549350761, + "learning_rate": 1.980012670717502e-05, + "loss": 1.6307, + "step": 283 + }, + { + "epoch": 0.22995951417004049, + "grad_norm": 1.2082020794861243, + "learning_rate": 1.9798413174842565e-05, + "loss": 1.7061, + "step": 284 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 1.17049178703938, + "learning_rate": 1.9796692403474632e-05, + "loss": 1.6614, + "step": 285 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 1.1776506077280637, + "learning_rate": 1.9794964394342532e-05, + "loss": 1.7012, + "step": 286 + }, + { + "epoch": 0.23238866396761135, + "grad_norm": 1.250032564625677, + "learning_rate": 1.9793229148722907e-05, + "loss": 1.7472, + "step": 287 + }, + { + "epoch": 0.23319838056680162, + "grad_norm": 1.1923743701902663, + "learning_rate": 1.979148666789775e-05, + "loss": 1.6169, + "step": 288 + }, + { + "epoch": 0.2340080971659919, + "grad_norm": 1.2445641617846566, + "learning_rate": 1.9789736953154405e-05, + "loss": 1.6697, + "step": 289 + }, + { + "epoch": 0.23481781376518218, + "grad_norm": 1.2338781631842157, + "learning_rate": 1.9787980005785553e-05, + "loss": 1.6691, + "step": 290 + }, + { + "epoch": 0.23562753036437248, + "grad_norm": 1.263470721107049, + "learning_rate": 1.9786215827089216e-05, + "loss": 1.6716, + "step": 291 + }, + { + "epoch": 0.23643724696356275, + "grad_norm": 1.2561973670317472, + "learning_rate": 1.978444441836877e-05, + "loss": 1.6759, + "step": 292 + }, + { + "epoch": 0.23724696356275304, + "grad_norm": 1.2208658433077548, + "learning_rate": 1.9782665780932926e-05, + "loss": 1.6881, + "step": 293 + }, + { + "epoch": 0.2380566801619433, + "grad_norm": 1.2129567864078932, + "learning_rate": 1.9780879916095733e-05, + "loss": 1.6417, + "step": 294 + }, + { + "epoch": 0.2388663967611336, + "grad_norm": 1.260630886604336, + "learning_rate": 1.977908682517658e-05, + "loss": 1.6602, + "step": 295 + }, + { + "epoch": 0.23967611336032388, + "grad_norm": 1.2506139069671103, + "learning_rate": 1.97772865095002e-05, + "loss": 1.7444, + "step": 296 + }, + { + "epoch": 0.24048582995951417, + "grad_norm": 1.360572780404123, + "learning_rate": 1.9775478970396663e-05, + "loss": 1.6975, + "step": 297 + }, + { + "epoch": 0.24129554655870444, + "grad_norm": 1.2420340903090243, + "learning_rate": 1.9773664209201368e-05, + "loss": 1.6733, + "step": 298 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 1.2691192435537564, + "learning_rate": 1.977184222725505e-05, + "loss": 1.6496, + "step": 299 + }, + { + "epoch": 0.242914979757085, + "grad_norm": 1.251420231930437, + "learning_rate": 1.9770013025903797e-05, + "loss": 1.6639, + "step": 300 + }, + { + "epoch": 0.2437246963562753, + "grad_norm": 1.3007823671114107, + "learning_rate": 1.9768176606499005e-05, + "loss": 1.7034, + "step": 301 + }, + { + "epoch": 0.24453441295546557, + "grad_norm": 1.2809858772390412, + "learning_rate": 1.976633297039742e-05, + "loss": 1.6903, + "step": 302 + }, + { + "epoch": 0.24534412955465587, + "grad_norm": 1.2280345470671303, + "learning_rate": 1.976448211896111e-05, + "loss": 1.6723, + "step": 303 + }, + { + "epoch": 0.24615384615384617, + "grad_norm": 1.1497220827533912, + "learning_rate": 1.9762624053557485e-05, + "loss": 1.6747, + "step": 304 + }, + { + "epoch": 0.24696356275303644, + "grad_norm": 1.318495032765947, + "learning_rate": 1.9760758775559275e-05, + "loss": 1.6121, + "step": 305 + }, + { + "epoch": 0.24777327935222673, + "grad_norm": 1.2399534094992168, + "learning_rate": 1.9758886286344536e-05, + "loss": 1.7047, + "step": 306 + }, + { + "epoch": 0.248582995951417, + "grad_norm": 1.2398070884711676, + "learning_rate": 1.9757006587296664e-05, + "loss": 1.6521, + "step": 307 + }, + { + "epoch": 0.2493927125506073, + "grad_norm": 1.148807377709599, + "learning_rate": 1.975511967980437e-05, + "loss": 1.7039, + "step": 308 + }, + { + "epoch": 0.25020242914979757, + "grad_norm": 1.213998992883127, + "learning_rate": 1.9753225565261695e-05, + "loss": 1.6879, + "step": 309 + }, + { + "epoch": 0.25101214574898784, + "grad_norm": 1.2397569747054522, + "learning_rate": 1.9751324245068008e-05, + "loss": 1.7233, + "step": 310 + }, + { + "epoch": 0.25182186234817816, + "grad_norm": 1.1848797839629908, + "learning_rate": 1.9749415720627993e-05, + "loss": 1.6697, + "step": 311 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 1.334789053117042, + "learning_rate": 1.974749999335167e-05, + "loss": 1.6449, + "step": 312 + }, + { + "epoch": 0.2534412955465587, + "grad_norm": 1.1684958236354195, + "learning_rate": 1.974557706465436e-05, + "loss": 1.6163, + "step": 313 + }, + { + "epoch": 0.25425101214574897, + "grad_norm": 1.1517446837404286, + "learning_rate": 1.9743646935956727e-05, + "loss": 1.6145, + "step": 314 + }, + { + "epoch": 0.2550607287449393, + "grad_norm": 1.3651988245433082, + "learning_rate": 1.974170960868474e-05, + "loss": 1.7181, + "step": 315 + }, + { + "epoch": 0.25587044534412956, + "grad_norm": 1.2278060656467518, + "learning_rate": 1.973976508426969e-05, + "loss": 1.7003, + "step": 316 + }, + { + "epoch": 0.25668016194331983, + "grad_norm": 1.298927878086763, + "learning_rate": 1.9737813364148187e-05, + "loss": 1.7066, + "step": 317 + }, + { + "epoch": 0.2574898785425101, + "grad_norm": 1.280731725281388, + "learning_rate": 1.973585444976215e-05, + "loss": 1.6962, + "step": 318 + }, + { + "epoch": 0.2582995951417004, + "grad_norm": 1.3481441118932556, + "learning_rate": 1.973388834255882e-05, + "loss": 1.6744, + "step": 319 + }, + { + "epoch": 0.2591093117408907, + "grad_norm": 1.2693166429730327, + "learning_rate": 1.973191504399076e-05, + "loss": 1.6762, + "step": 320 + }, + { + "epoch": 0.25991902834008096, + "grad_norm": 1.2340493742688543, + "learning_rate": 1.9729934555515823e-05, + "loss": 1.6857, + "step": 321 + }, + { + "epoch": 0.26072874493927123, + "grad_norm": 1.2452283567023132, + "learning_rate": 1.9727946878597193e-05, + "loss": 1.6444, + "step": 322 + }, + { + "epoch": 0.26153846153846155, + "grad_norm": 1.3029700463193483, + "learning_rate": 1.9725952014703366e-05, + "loss": 1.7188, + "step": 323 + }, + { + "epoch": 0.2623481781376518, + "grad_norm": 1.179155915390884, + "learning_rate": 1.9723949965308132e-05, + "loss": 1.6801, + "step": 324 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 1.286301672885254, + "learning_rate": 1.97219407318906e-05, + "loss": 1.6617, + "step": 325 + }, + { + "epoch": 0.2639676113360324, + "grad_norm": 1.2990996064383904, + "learning_rate": 1.9719924315935185e-05, + "loss": 1.7269, + "step": 326 + }, + { + "epoch": 0.2647773279352227, + "grad_norm": 1.2224528081088977, + "learning_rate": 1.971790071893161e-05, + "loss": 1.7372, + "step": 327 + }, + { + "epoch": 0.26558704453441295, + "grad_norm": 1.2367757457178075, + "learning_rate": 1.9715869942374902e-05, + "loss": 1.6821, + "step": 328 + }, + { + "epoch": 0.2663967611336032, + "grad_norm": 1.2373862157826907, + "learning_rate": 1.9713831987765394e-05, + "loss": 1.7366, + "step": 329 + }, + { + "epoch": 0.26720647773279355, + "grad_norm": 1.2623515990334666, + "learning_rate": 1.9711786856608714e-05, + "loss": 1.6406, + "step": 330 + }, + { + "epoch": 0.2680161943319838, + "grad_norm": 1.1587501394940165, + "learning_rate": 1.9709734550415804e-05, + "loss": 1.6572, + "step": 331 + }, + { + "epoch": 0.2688259109311741, + "grad_norm": 1.3564850748615598, + "learning_rate": 1.97076750707029e-05, + "loss": 1.6981, + "step": 332 + }, + { + "epoch": 0.26963562753036435, + "grad_norm": 1.1910339514470332, + "learning_rate": 1.9705608418991534e-05, + "loss": 1.712, + "step": 333 + }, + { + "epoch": 0.2704453441295547, + "grad_norm": 1.2174211006655304, + "learning_rate": 1.9703534596808547e-05, + "loss": 1.6783, + "step": 334 + }, + { + "epoch": 0.27125506072874495, + "grad_norm": 1.1830375775538684, + "learning_rate": 1.970145360568607e-05, + "loss": 1.7121, + "step": 335 + }, + { + "epoch": 0.2720647773279352, + "grad_norm": 1.1619637628427306, + "learning_rate": 1.9699365447161535e-05, + "loss": 1.6468, + "step": 336 + }, + { + "epoch": 0.2728744939271255, + "grad_norm": 1.2028809053995226, + "learning_rate": 1.969727012277766e-05, + "loss": 1.6848, + "step": 337 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 1.1621842682222943, + "learning_rate": 1.969516763408247e-05, + "loss": 1.644, + "step": 338 + }, + { + "epoch": 0.2744939271255061, + "grad_norm": 1.2484226953452637, + "learning_rate": 1.9693057982629277e-05, + "loss": 1.6179, + "step": 339 + }, + { + "epoch": 0.27530364372469635, + "grad_norm": 1.3119636180148664, + "learning_rate": 1.969094116997668e-05, + "loss": 1.7032, + "step": 340 + }, + { + "epoch": 0.2761133603238866, + "grad_norm": 1.2508340881483084, + "learning_rate": 1.9688817197688576e-05, + "loss": 1.6472, + "step": 341 + }, + { + "epoch": 0.27692307692307694, + "grad_norm": 1.202434258153614, + "learning_rate": 1.968668606733415e-05, + "loss": 1.6996, + "step": 342 + }, + { + "epoch": 0.2777327935222672, + "grad_norm": 1.1862376977274527, + "learning_rate": 1.9684547780487873e-05, + "loss": 1.6536, + "step": 343 + }, + { + "epoch": 0.2785425101214575, + "grad_norm": 1.3042945008375981, + "learning_rate": 1.9682402338729504e-05, + "loss": 1.714, + "step": 344 + }, + { + "epoch": 0.2793522267206478, + "grad_norm": 1.2321323526734465, + "learning_rate": 1.968024974364408e-05, + "loss": 1.6788, + "step": 345 + }, + { + "epoch": 0.28016194331983807, + "grad_norm": 1.176640860678081, + "learning_rate": 1.967808999682195e-05, + "loss": 1.6697, + "step": 346 + }, + { + "epoch": 0.28097165991902834, + "grad_norm": 1.2519779435098668, + "learning_rate": 1.9675923099858712e-05, + "loss": 1.663, + "step": 347 + }, + { + "epoch": 0.2817813765182186, + "grad_norm": 1.2594751793446184, + "learning_rate": 1.9673749054355268e-05, + "loss": 1.7454, + "step": 348 + }, + { + "epoch": 0.28259109311740893, + "grad_norm": 1.1910600819228896, + "learning_rate": 1.9671567861917796e-05, + "loss": 1.6278, + "step": 349 + }, + { + "epoch": 0.2834008097165992, + "grad_norm": 1.1545397374773405, + "learning_rate": 1.9669379524157755e-05, + "loss": 1.6888, + "step": 350 + }, + { + "epoch": 0.28421052631578947, + "grad_norm": 1.1774954626925083, + "learning_rate": 1.9667184042691877e-05, + "loss": 1.62, + "step": 351 + }, + { + "epoch": 0.28502024291497974, + "grad_norm": 1.1669061183444, + "learning_rate": 1.966498141914218e-05, + "loss": 1.6305, + "step": 352 + }, + { + "epoch": 0.28582995951417006, + "grad_norm": 1.2826242009462285, + "learning_rate": 1.9662771655135954e-05, + "loss": 1.7025, + "step": 353 + }, + { + "epoch": 0.28663967611336033, + "grad_norm": 1.215510453464448, + "learning_rate": 1.9660554752305763e-05, + "loss": 1.7183, + "step": 354 + }, + { + "epoch": 0.2874493927125506, + "grad_norm": 1.1500016494292455, + "learning_rate": 1.9658330712289456e-05, + "loss": 1.6474, + "step": 355 + }, + { + "epoch": 0.28825910931174087, + "grad_norm": 1.1771753941978587, + "learning_rate": 1.965609953673014e-05, + "loss": 1.6303, + "step": 356 + }, + { + "epoch": 0.2890688259109312, + "grad_norm": 1.1888955521350377, + "learning_rate": 1.9653861227276197e-05, + "loss": 1.7237, + "step": 357 + }, + { + "epoch": 0.28987854251012146, + "grad_norm": 1.3602337615622975, + "learning_rate": 1.9651615785581287e-05, + "loss": 1.6749, + "step": 358 + }, + { + "epoch": 0.29068825910931173, + "grad_norm": 1.1994666759895225, + "learning_rate": 1.9649363213304337e-05, + "loss": 1.6335, + "step": 359 + }, + { + "epoch": 0.291497975708502, + "grad_norm": 1.1952494705657468, + "learning_rate": 1.9647103512109535e-05, + "loss": 1.6583, + "step": 360 + }, + { + "epoch": 0.2923076923076923, + "grad_norm": 1.2493452825647193, + "learning_rate": 1.9644836683666347e-05, + "loss": 1.7336, + "step": 361 + }, + { + "epoch": 0.2931174089068826, + "grad_norm": 1.2043236715266186, + "learning_rate": 1.9642562729649492e-05, + "loss": 1.6457, + "step": 362 + }, + { + "epoch": 0.29392712550607286, + "grad_norm": 1.3258481801373758, + "learning_rate": 1.964028165173896e-05, + "loss": 1.7147, + "step": 363 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 1.3308234537433237, + "learning_rate": 1.963799345162001e-05, + "loss": 1.7239, + "step": 364 + }, + { + "epoch": 0.29554655870445345, + "grad_norm": 1.2105650981201128, + "learning_rate": 1.9635698130983153e-05, + "loss": 1.69, + "step": 365 + }, + { + "epoch": 0.2963562753036437, + "grad_norm": 1.4724018359938562, + "learning_rate": 1.9633395691524163e-05, + "loss": 1.7904, + "step": 366 + }, + { + "epoch": 0.297165991902834, + "grad_norm": 1.167011876559427, + "learning_rate": 1.9631086134944076e-05, + "loss": 1.6719, + "step": 367 + }, + { + "epoch": 0.2979757085020243, + "grad_norm": 1.2115645319681754, + "learning_rate": 1.9628769462949187e-05, + "loss": 1.7076, + "step": 368 + }, + { + "epoch": 0.2987854251012146, + "grad_norm": 1.2930285867259568, + "learning_rate": 1.9626445677251043e-05, + "loss": 1.7555, + "step": 369 + }, + { + "epoch": 0.29959514170040485, + "grad_norm": 1.1530085275414237, + "learning_rate": 1.962411477956645e-05, + "loss": 1.6965, + "step": 370 + }, + { + "epoch": 0.3004048582995951, + "grad_norm": 1.4097103876759387, + "learning_rate": 1.9621776771617464e-05, + "loss": 1.6768, + "step": 371 + }, + { + "epoch": 0.30121457489878545, + "grad_norm": 1.168938484091032, + "learning_rate": 1.9619431655131404e-05, + "loss": 1.6209, + "step": 372 + }, + { + "epoch": 0.3020242914979757, + "grad_norm": 1.2057005406635852, + "learning_rate": 1.961707943184083e-05, + "loss": 1.6893, + "step": 373 + }, + { + "epoch": 0.302834008097166, + "grad_norm": 1.2784392150421353, + "learning_rate": 1.9614720103483562e-05, + "loss": 1.7162, + "step": 374 + }, + { + "epoch": 0.30364372469635625, + "grad_norm": 1.2120518452791202, + "learning_rate": 1.9612353671802658e-05, + "loss": 1.5952, + "step": 375 + }, + { + "epoch": 0.3044534412955466, + "grad_norm": 1.2873149200801106, + "learning_rate": 1.960998013854643e-05, + "loss": 1.7035, + "step": 376 + }, + { + "epoch": 0.30526315789473685, + "grad_norm": 1.3006849812067949, + "learning_rate": 1.960759950546844e-05, + "loss": 1.6379, + "step": 377 + }, + { + "epoch": 0.3060728744939271, + "grad_norm": 1.2990265392717135, + "learning_rate": 1.960521177432749e-05, + "loss": 1.7101, + "step": 378 + }, + { + "epoch": 0.3068825910931174, + "grad_norm": 1.3082477762819935, + "learning_rate": 1.9602816946887634e-05, + "loss": 1.691, + "step": 379 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 1.2057202532696198, + "learning_rate": 1.960041502491815e-05, + "loss": 1.7229, + "step": 380 + }, + { + "epoch": 0.308502024291498, + "grad_norm": 1.155390745921477, + "learning_rate": 1.959800601019358e-05, + "loss": 1.649, + "step": 381 + }, + { + "epoch": 0.30931174089068825, + "grad_norm": 1.2547425138455335, + "learning_rate": 1.9595589904493696e-05, + "loss": 1.7094, + "step": 382 + }, + { + "epoch": 0.3101214574898785, + "grad_norm": 1.2268479082620285, + "learning_rate": 1.9593166709603503e-05, + "loss": 1.6612, + "step": 383 + }, + { + "epoch": 0.31093117408906884, + "grad_norm": 1.247022945456135, + "learning_rate": 1.9590736427313255e-05, + "loss": 1.7201, + "step": 384 + }, + { + "epoch": 0.3117408906882591, + "grad_norm": 1.1780738130918533, + "learning_rate": 1.9588299059418434e-05, + "loss": 1.707, + "step": 385 + }, + { + "epoch": 0.3125506072874494, + "grad_norm": 1.3110540287499943, + "learning_rate": 1.958585460771976e-05, + "loss": 1.5952, + "step": 386 + }, + { + "epoch": 0.3133603238866397, + "grad_norm": 1.2356287734723737, + "learning_rate": 1.9583403074023183e-05, + "loss": 1.7514, + "step": 387 + }, + { + "epoch": 0.31417004048582997, + "grad_norm": 1.251297643939624, + "learning_rate": 1.9580944460139896e-05, + "loss": 1.6577, + "step": 388 + }, + { + "epoch": 0.31497975708502024, + "grad_norm": 1.283479017126295, + "learning_rate": 1.9578478767886303e-05, + "loss": 1.6909, + "step": 389 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 1.1892005294340031, + "learning_rate": 1.957600599908406e-05, + "loss": 1.6956, + "step": 390 + }, + { + "epoch": 0.31659919028340083, + "grad_norm": 1.299357039138039, + "learning_rate": 1.957352615556004e-05, + "loss": 1.6994, + "step": 391 + }, + { + "epoch": 0.3174089068825911, + "grad_norm": 1.2192402309654093, + "learning_rate": 1.9571039239146332e-05, + "loss": 1.7085, + "step": 392 + }, + { + "epoch": 0.31821862348178137, + "grad_norm": 1.2089295961333866, + "learning_rate": 1.9568545251680272e-05, + "loss": 1.7312, + "step": 393 + }, + { + "epoch": 0.31902834008097164, + "grad_norm": 1.2002238729382657, + "learning_rate": 1.956604419500441e-05, + "loss": 1.6919, + "step": 394 + }, + { + "epoch": 0.31983805668016196, + "grad_norm": 1.1372502072400479, + "learning_rate": 1.9563536070966513e-05, + "loss": 1.6842, + "step": 395 + }, + { + "epoch": 0.32064777327935223, + "grad_norm": 1.1848402112810035, + "learning_rate": 1.956102088141958e-05, + "loss": 1.6854, + "step": 396 + }, + { + "epoch": 0.3214574898785425, + "grad_norm": 1.212172944607681, + "learning_rate": 1.9558498628221816e-05, + "loss": 1.6693, + "step": 397 + }, + { + "epoch": 0.32226720647773277, + "grad_norm": 1.2323473874443758, + "learning_rate": 1.9555969313236666e-05, + "loss": 1.6916, + "step": 398 + }, + { + "epoch": 0.3230769230769231, + "grad_norm": 1.197869807167727, + "learning_rate": 1.955343293833277e-05, + "loss": 1.6777, + "step": 399 + }, + { + "epoch": 0.32388663967611336, + "grad_norm": 1.2182554044770237, + "learning_rate": 1.9550889505383996e-05, + "loss": 1.7348, + "step": 400 + }, + { + "epoch": 0.32469635627530363, + "grad_norm": 1.2204060971165689, + "learning_rate": 1.954833901626943e-05, + "loss": 1.6656, + "step": 401 + }, + { + "epoch": 0.3255060728744939, + "grad_norm": 1.2003350489961402, + "learning_rate": 1.9545781472873354e-05, + "loss": 1.6162, + "step": 402 + }, + { + "epoch": 0.3263157894736842, + "grad_norm": 1.3712370071917266, + "learning_rate": 1.954321687708528e-05, + "loss": 1.6689, + "step": 403 + }, + { + "epoch": 0.3271255060728745, + "grad_norm": 1.2066042919530333, + "learning_rate": 1.954064523079992e-05, + "loss": 1.6092, + "step": 404 + }, + { + "epoch": 0.32793522267206476, + "grad_norm": 1.2696649824839026, + "learning_rate": 1.9538066535917196e-05, + "loss": 1.6442, + "step": 405 + }, + { + "epoch": 0.3287449392712551, + "grad_norm": 1.470140395099795, + "learning_rate": 1.9535480794342248e-05, + "loss": 1.6058, + "step": 406 + }, + { + "epoch": 0.32955465587044536, + "grad_norm": 1.3119894938694066, + "learning_rate": 1.9532888007985408e-05, + "loss": 1.7058, + "step": 407 + }, + { + "epoch": 0.3303643724696356, + "grad_norm": 1.4968426504458228, + "learning_rate": 1.9530288178762213e-05, + "loss": 1.6929, + "step": 408 + }, + { + "epoch": 0.3311740890688259, + "grad_norm": 1.2601976626437872, + "learning_rate": 1.9527681308593412e-05, + "loss": 1.6336, + "step": 409 + }, + { + "epoch": 0.3319838056680162, + "grad_norm": 1.3452338661981322, + "learning_rate": 1.952506739940496e-05, + "loss": 1.7445, + "step": 410 + }, + { + "epoch": 0.3327935222672065, + "grad_norm": 1.2717619212491589, + "learning_rate": 1.9522446453127994e-05, + "loss": 1.6666, + "step": 411 + }, + { + "epoch": 0.33360323886639676, + "grad_norm": 1.1984329741126858, + "learning_rate": 1.951981847169886e-05, + "loss": 1.7448, + "step": 412 + }, + { + "epoch": 0.334412955465587, + "grad_norm": 1.1961449074373034, + "learning_rate": 1.951718345705911e-05, + "loss": 1.667, + "step": 413 + }, + { + "epoch": 0.33522267206477735, + "grad_norm": 1.330214100403057, + "learning_rate": 1.9514541411155478e-05, + "loss": 1.7108, + "step": 414 + }, + { + "epoch": 0.3360323886639676, + "grad_norm": 1.183842085273524, + "learning_rate": 1.9511892335939904e-05, + "loss": 1.621, + "step": 415 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 1.2664210311376265, + "learning_rate": 1.950923623336951e-05, + "loss": 1.6512, + "step": 416 + }, + { + "epoch": 0.33765182186234816, + "grad_norm": 1.1261494079421026, + "learning_rate": 1.9506573105406623e-05, + "loss": 1.6123, + "step": 417 + }, + { + "epoch": 0.3384615384615385, + "grad_norm": 1.27913282791293, + "learning_rate": 1.9503902954018748e-05, + "loss": 1.7083, + "step": 418 + }, + { + "epoch": 0.33927125506072875, + "grad_norm": 1.1019506289059786, + "learning_rate": 1.9501225781178586e-05, + "loss": 1.6501, + "step": 419 + }, + { + "epoch": 0.340080971659919, + "grad_norm": 1.1163063251074246, + "learning_rate": 1.9498541588864022e-05, + "loss": 1.626, + "step": 420 + }, + { + "epoch": 0.3408906882591093, + "grad_norm": 1.1208221900644928, + "learning_rate": 1.9495850379058127e-05, + "loss": 1.6486, + "step": 421 + }, + { + "epoch": 0.3417004048582996, + "grad_norm": 1.1252985647383595, + "learning_rate": 1.9493152153749162e-05, + "loss": 1.6098, + "step": 422 + }, + { + "epoch": 0.3425101214574899, + "grad_norm": 1.1658129970131546, + "learning_rate": 1.9490446914930564e-05, + "loss": 1.6067, + "step": 423 + }, + { + "epoch": 0.34331983805668015, + "grad_norm": 1.2465190601078746, + "learning_rate": 1.9487734664600956e-05, + "loss": 1.7689, + "step": 424 + }, + { + "epoch": 0.3441295546558704, + "grad_norm": 1.2032865897994165, + "learning_rate": 1.948501540476414e-05, + "loss": 1.6348, + "step": 425 + }, + { + "epoch": 0.34493927125506074, + "grad_norm": 1.3211565552357492, + "learning_rate": 1.9482289137429098e-05, + "loss": 1.6349, + "step": 426 + }, + { + "epoch": 0.345748987854251, + "grad_norm": 1.1307592944834632, + "learning_rate": 1.9479555864609985e-05, + "loss": 1.593, + "step": 427 + }, + { + "epoch": 0.3465587044534413, + "grad_norm": 1.2787848323635016, + "learning_rate": 1.947681558832613e-05, + "loss": 1.7022, + "step": 428 + }, + { + "epoch": 0.3473684210526316, + "grad_norm": 1.2091547875890887, + "learning_rate": 1.9474068310602048e-05, + "loss": 1.6566, + "step": 429 + }, + { + "epoch": 0.3481781376518219, + "grad_norm": 1.2483509469560066, + "learning_rate": 1.9471314033467413e-05, + "loss": 1.6993, + "step": 430 + }, + { + "epoch": 0.34898785425101214, + "grad_norm": 1.2498538878012477, + "learning_rate": 1.9468552758957076e-05, + "loss": 1.7252, + "step": 431 + }, + { + "epoch": 0.3497975708502024, + "grad_norm": 1.1989025930704638, + "learning_rate": 1.9465784489111063e-05, + "loss": 1.6999, + "step": 432 + }, + { + "epoch": 0.35060728744939273, + "grad_norm": 1.1762119281728465, + "learning_rate": 1.9463009225974558e-05, + "loss": 1.6501, + "step": 433 + }, + { + "epoch": 0.351417004048583, + "grad_norm": 1.2950834978750903, + "learning_rate": 1.9460226971597916e-05, + "loss": 1.7399, + "step": 434 + }, + { + "epoch": 0.3522267206477733, + "grad_norm": 1.2455256194553699, + "learning_rate": 1.945743772803666e-05, + "loss": 1.6708, + "step": 435 + }, + { + "epoch": 0.35303643724696354, + "grad_norm": 1.2202828687384126, + "learning_rate": 1.945464149735147e-05, + "loss": 1.6896, + "step": 436 + }, + { + "epoch": 0.35384615384615387, + "grad_norm": 1.158420520477845, + "learning_rate": 1.94518382816082e-05, + "loss": 1.6619, + "step": 437 + }, + { + "epoch": 0.35465587044534413, + "grad_norm": 1.1779822358008685, + "learning_rate": 1.9449028082877843e-05, + "loss": 1.6571, + "step": 438 + }, + { + "epoch": 0.3554655870445344, + "grad_norm": 1.1534682512913468, + "learning_rate": 1.944621090323658e-05, + "loss": 1.5662, + "step": 439 + }, + { + "epoch": 0.3562753036437247, + "grad_norm": 1.231944980721245, + "learning_rate": 1.9443386744765726e-05, + "loss": 1.6072, + "step": 440 + }, + { + "epoch": 0.357085020242915, + "grad_norm": 1.2309431429104132, + "learning_rate": 1.9440555609551758e-05, + "loss": 1.6478, + "step": 441 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 1.1256347239923377, + "learning_rate": 1.9437717499686313e-05, + "loss": 1.5872, + "step": 442 + }, + { + "epoch": 0.35870445344129553, + "grad_norm": 1.177945395683949, + "learning_rate": 1.9434872417266176e-05, + "loss": 1.6847, + "step": 443 + }, + { + "epoch": 0.3595141700404858, + "grad_norm": 1.1817750781529393, + "learning_rate": 1.943202036439329e-05, + "loss": 1.6555, + "step": 444 + }, + { + "epoch": 0.3603238866396761, + "grad_norm": 1.155486180808622, + "learning_rate": 1.942916134317473e-05, + "loss": 1.703, + "step": 445 + }, + { + "epoch": 0.3611336032388664, + "grad_norm": 1.2249672033301868, + "learning_rate": 1.9426295355722745e-05, + "loss": 1.6686, + "step": 446 + }, + { + "epoch": 0.36194331983805667, + "grad_norm": 1.1939431449591948, + "learning_rate": 1.9423422404154708e-05, + "loss": 1.6403, + "step": 447 + }, + { + "epoch": 0.362753036437247, + "grad_norm": 1.283040329656735, + "learning_rate": 1.942054249059315e-05, + "loss": 1.7033, + "step": 448 + }, + { + "epoch": 0.36356275303643726, + "grad_norm": 1.2302822837989298, + "learning_rate": 1.941765561716574e-05, + "loss": 1.6309, + "step": 449 + }, + { + "epoch": 0.3643724696356275, + "grad_norm": 1.258830672551768, + "learning_rate": 1.9414761786005293e-05, + "loss": 1.6807, + "step": 450 + }, + { + "epoch": 0.3651821862348178, + "grad_norm": 1.2346970982772958, + "learning_rate": 1.9411860999249762e-05, + "loss": 1.6063, + "step": 451 + }, + { + "epoch": 0.3659919028340081, + "grad_norm": 1.323015686111149, + "learning_rate": 1.9408953259042236e-05, + "loss": 1.6917, + "step": 452 + }, + { + "epoch": 0.3668016194331984, + "grad_norm": 1.1863762011370744, + "learning_rate": 1.9406038567530944e-05, + "loss": 1.6365, + "step": 453 + }, + { + "epoch": 0.36761133603238866, + "grad_norm": 1.2001660652965502, + "learning_rate": 1.9403116926869253e-05, + "loss": 1.6383, + "step": 454 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 1.2089353111723722, + "learning_rate": 1.9400188339215657e-05, + "loss": 1.6778, + "step": 455 + }, + { + "epoch": 0.36923076923076925, + "grad_norm": 1.2546231222228346, + "learning_rate": 1.9397252806733793e-05, + "loss": 1.6405, + "step": 456 + }, + { + "epoch": 0.3700404858299595, + "grad_norm": 1.3359592739488402, + "learning_rate": 1.939431033159242e-05, + "loss": 1.643, + "step": 457 + }, + { + "epoch": 0.3708502024291498, + "grad_norm": 1.1715630102315027, + "learning_rate": 1.9391360915965426e-05, + "loss": 1.5935, + "step": 458 + }, + { + "epoch": 0.37165991902834006, + "grad_norm": 1.1385520627989953, + "learning_rate": 1.9388404562031836e-05, + "loss": 1.6979, + "step": 459 + }, + { + "epoch": 0.3724696356275304, + "grad_norm": 1.2399884358282447, + "learning_rate": 1.9385441271975786e-05, + "loss": 1.6434, + "step": 460 + }, + { + "epoch": 0.37327935222672065, + "grad_norm": 1.163385165068389, + "learning_rate": 1.9382471047986555e-05, + "loss": 1.6838, + "step": 461 + }, + { + "epoch": 0.3740890688259109, + "grad_norm": 1.1225008795816183, + "learning_rate": 1.9379493892258527e-05, + "loss": 1.6541, + "step": 462 + }, + { + "epoch": 0.3748987854251012, + "grad_norm": 1.2531286384618345, + "learning_rate": 1.937650980699122e-05, + "loss": 1.654, + "step": 463 + }, + { + "epoch": 0.3757085020242915, + "grad_norm": 1.1876213186476707, + "learning_rate": 1.9373518794389263e-05, + "loss": 1.6901, + "step": 464 + }, + { + "epoch": 0.3765182186234818, + "grad_norm": 1.112836379139156, + "learning_rate": 1.9370520856662406e-05, + "loss": 1.6399, + "step": 465 + }, + { + "epoch": 0.37732793522267205, + "grad_norm": 1.3185471861887414, + "learning_rate": 1.9367515996025516e-05, + "loss": 1.6746, + "step": 466 + }, + { + "epoch": 0.3781376518218624, + "grad_norm": 1.3149218591633627, + "learning_rate": 1.9364504214698578e-05, + "loss": 1.6721, + "step": 467 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 1.1367079723459552, + "learning_rate": 1.936148551490668e-05, + "loss": 1.7142, + "step": 468 + }, + { + "epoch": 0.3797570850202429, + "grad_norm": 1.1910905835785592, + "learning_rate": 1.935845989888003e-05, + "loss": 1.6307, + "step": 469 + }, + { + "epoch": 0.3805668016194332, + "grad_norm": 1.1936772698797782, + "learning_rate": 1.9355427368853946e-05, + "loss": 1.6976, + "step": 470 + }, + { + "epoch": 0.3813765182186235, + "grad_norm": 1.2030550020345074, + "learning_rate": 1.935238792706885e-05, + "loss": 1.7098, + "step": 471 + }, + { + "epoch": 0.3821862348178138, + "grad_norm": 1.171000490443613, + "learning_rate": 1.934934157577027e-05, + "loss": 1.7119, + "step": 472 + }, + { + "epoch": 0.38299595141700404, + "grad_norm": 1.164404989871964, + "learning_rate": 1.934628831720884e-05, + "loss": 1.6364, + "step": 473 + }, + { + "epoch": 0.3838056680161943, + "grad_norm": 1.1421132414106066, + "learning_rate": 1.9343228153640296e-05, + "loss": 1.6652, + "step": 474 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 1.1666775642165048, + "learning_rate": 1.9340161087325483e-05, + "loss": 1.6711, + "step": 475 + }, + { + "epoch": 0.3854251012145749, + "grad_norm": 1.1725292925597457, + "learning_rate": 1.9337087120530335e-05, + "loss": 1.5852, + "step": 476 + }, + { + "epoch": 0.3862348178137652, + "grad_norm": 1.2217452550533756, + "learning_rate": 1.9334006255525884e-05, + "loss": 1.6768, + "step": 477 + }, + { + "epoch": 0.38704453441295544, + "grad_norm": 1.1442586319126382, + "learning_rate": 1.9330918494588275e-05, + "loss": 1.6791, + "step": 478 + }, + { + "epoch": 0.38785425101214577, + "grad_norm": 1.0946129383309182, + "learning_rate": 1.9327823839998726e-05, + "loss": 1.7151, + "step": 479 + }, + { + "epoch": 0.38866396761133604, + "grad_norm": 1.2204161246543435, + "learning_rate": 1.932472229404356e-05, + "loss": 1.6237, + "step": 480 + }, + { + "epoch": 0.3894736842105263, + "grad_norm": 1.1880149763428396, + "learning_rate": 1.932161385901419e-05, + "loss": 1.6761, + "step": 481 + }, + { + "epoch": 0.3902834008097166, + "grad_norm": 1.2592668241512581, + "learning_rate": 1.931849853720712e-05, + "loss": 1.5898, + "step": 482 + }, + { + "epoch": 0.3910931174089069, + "grad_norm": 1.209479528339513, + "learning_rate": 1.931537633092393e-05, + "loss": 1.6279, + "step": 483 + }, + { + "epoch": 0.39190283400809717, + "grad_norm": 1.2513199284132253, + "learning_rate": 1.9312247242471306e-05, + "loss": 1.6343, + "step": 484 + }, + { + "epoch": 0.39271255060728744, + "grad_norm": 1.1989743043638685, + "learning_rate": 1.9309111274161005e-05, + "loss": 1.6881, + "step": 485 + }, + { + "epoch": 0.39352226720647776, + "grad_norm": 1.2273758702155306, + "learning_rate": 1.930596842830987e-05, + "loss": 1.6703, + "step": 486 + }, + { + "epoch": 0.39433198380566803, + "grad_norm": 1.1911609997920505, + "learning_rate": 1.9302818707239822e-05, + "loss": 1.6359, + "step": 487 + }, + { + "epoch": 0.3951417004048583, + "grad_norm": 1.1555821594774291, + "learning_rate": 1.9299662113277867e-05, + "loss": 1.7005, + "step": 488 + }, + { + "epoch": 0.39595141700404857, + "grad_norm": 1.161714222746895, + "learning_rate": 1.929649864875609e-05, + "loss": 1.6259, + "step": 489 + }, + { + "epoch": 0.3967611336032389, + "grad_norm": 1.2024264839115575, + "learning_rate": 1.9293328316011645e-05, + "loss": 1.6454, + "step": 490 + }, + { + "epoch": 0.39757085020242916, + "grad_norm": 1.182376794250563, + "learning_rate": 1.929015111738676e-05, + "loss": 1.6839, + "step": 491 + }, + { + "epoch": 0.39838056680161943, + "grad_norm": 1.2463525022449193, + "learning_rate": 1.9286967055228744e-05, + "loss": 1.7041, + "step": 492 + }, + { + "epoch": 0.3991902834008097, + "grad_norm": 1.1765792609257826, + "learning_rate": 1.928377613188997e-05, + "loss": 1.6963, + "step": 493 + }, + { + "epoch": 0.4, + "grad_norm": 1.1464191471865726, + "learning_rate": 1.9280578349727882e-05, + "loss": 1.7587, + "step": 494 + }, + { + "epoch": 0.4008097165991903, + "grad_norm": 1.2250198503430287, + "learning_rate": 1.927737371110499e-05, + "loss": 1.6011, + "step": 495 + }, + { + "epoch": 0.40161943319838056, + "grad_norm": 1.2273800649829825, + "learning_rate": 1.927416221838887e-05, + "loss": 1.7219, + "step": 496 + }, + { + "epoch": 0.40242914979757083, + "grad_norm": 1.1823664342882607, + "learning_rate": 1.9270943873952162e-05, + "loss": 1.6612, + "step": 497 + }, + { + "epoch": 0.40323886639676115, + "grad_norm": 1.2129258567045582, + "learning_rate": 1.9267718680172574e-05, + "loss": 1.6969, + "step": 498 + }, + { + "epoch": 0.4040485829959514, + "grad_norm": 1.142011991460994, + "learning_rate": 1.926448663943286e-05, + "loss": 1.6331, + "step": 499 + }, + { + "epoch": 0.4048582995951417, + "grad_norm": 1.1900750233989383, + "learning_rate": 1.9261247754120846e-05, + "loss": 1.5953, + "step": 500 + }, + { + "epoch": 0.40566801619433196, + "grad_norm": 1.1876038257951016, + "learning_rate": 1.925800202662941e-05, + "loss": 1.635, + "step": 501 + }, + { + "epoch": 0.4064777327935223, + "grad_norm": 1.1483754178488699, + "learning_rate": 1.9254749459356482e-05, + "loss": 1.672, + "step": 502 + }, + { + "epoch": 0.40728744939271255, + "grad_norm": 1.2261880502908553, + "learning_rate": 1.9251490054705053e-05, + "loss": 1.7095, + "step": 503 + }, + { + "epoch": 0.4080971659919028, + "grad_norm": 1.2012813998498333, + "learning_rate": 1.9248223815083155e-05, + "loss": 1.6417, + "step": 504 + }, + { + "epoch": 0.4089068825910931, + "grad_norm": 1.1128246203440006, + "learning_rate": 1.924495074290388e-05, + "loss": 1.6963, + "step": 505 + }, + { + "epoch": 0.4097165991902834, + "grad_norm": 1.1860636204765815, + "learning_rate": 1.9241670840585357e-05, + "loss": 1.6741, + "step": 506 + }, + { + "epoch": 0.4105263157894737, + "grad_norm": 1.3472485519941122, + "learning_rate": 1.923838411055077e-05, + "loss": 1.6403, + "step": 507 + }, + { + "epoch": 0.41133603238866395, + "grad_norm": 1.243377675921519, + "learning_rate": 1.923509055522835e-05, + "loss": 1.6858, + "step": 508 + }, + { + "epoch": 0.4121457489878543, + "grad_norm": 1.1780699401889794, + "learning_rate": 1.9231790177051354e-05, + "loss": 1.7021, + "step": 509 + }, + { + "epoch": 0.41295546558704455, + "grad_norm": 1.1235409565713237, + "learning_rate": 1.92284829784581e-05, + "loss": 1.8083, + "step": 510 + }, + { + "epoch": 0.4137651821862348, + "grad_norm": 1.2751087877042944, + "learning_rate": 1.922516896189193e-05, + "loss": 1.6194, + "step": 511 + }, + { + "epoch": 0.4145748987854251, + "grad_norm": 1.1876545871499165, + "learning_rate": 1.922184812980123e-05, + "loss": 1.6526, + "step": 512 + }, + { + "epoch": 0.4153846153846154, + "grad_norm": 1.1881650289467287, + "learning_rate": 1.921852048463942e-05, + "loss": 1.6574, + "step": 513 + }, + { + "epoch": 0.4161943319838057, + "grad_norm": 1.2137821843512242, + "learning_rate": 1.9215186028864955e-05, + "loss": 1.631, + "step": 514 + }, + { + "epoch": 0.41700404858299595, + "grad_norm": 1.2205249821982338, + "learning_rate": 1.9211844764941318e-05, + "loss": 1.6862, + "step": 515 + }, + { + "epoch": 0.4178137651821862, + "grad_norm": 1.229223049157285, + "learning_rate": 1.920849669533702e-05, + "loss": 1.7166, + "step": 516 + }, + { + "epoch": 0.41862348178137654, + "grad_norm": 1.1248482042561723, + "learning_rate": 1.920514182252561e-05, + "loss": 1.7045, + "step": 517 + }, + { + "epoch": 0.4194331983805668, + "grad_norm": 1.1551125743150399, + "learning_rate": 1.9201780148985657e-05, + "loss": 1.6242, + "step": 518 + }, + { + "epoch": 0.4202429149797571, + "grad_norm": 1.1669084355179586, + "learning_rate": 1.9198411677200753e-05, + "loss": 1.708, + "step": 519 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 1.1195972354108037, + "learning_rate": 1.919503640965951e-05, + "loss": 1.6543, + "step": 520 + }, + { + "epoch": 0.42186234817813767, + "grad_norm": 1.1430422701282208, + "learning_rate": 1.919165434885557e-05, + "loss": 1.6929, + "step": 521 + }, + { + "epoch": 0.42267206477732794, + "grad_norm": 1.248012426300318, + "learning_rate": 1.9188265497287587e-05, + "loss": 1.6886, + "step": 522 + }, + { + "epoch": 0.4234817813765182, + "grad_norm": 1.2892887919197453, + "learning_rate": 1.9184869857459233e-05, + "loss": 1.6493, + "step": 523 + }, + { + "epoch": 0.4242914979757085, + "grad_norm": 1.1122675853550497, + "learning_rate": 1.918146743187919e-05, + "loss": 1.6726, + "step": 524 + }, + { + "epoch": 0.4251012145748988, + "grad_norm": 1.2687690810982857, + "learning_rate": 1.917805822306117e-05, + "loss": 1.6835, + "step": 525 + }, + { + "epoch": 0.42591093117408907, + "grad_norm": 1.248733843761085, + "learning_rate": 1.9174642233523876e-05, + "loss": 1.6022, + "step": 526 + }, + { + "epoch": 0.42672064777327934, + "grad_norm": 1.1610405823160372, + "learning_rate": 1.9171219465791037e-05, + "loss": 1.6776, + "step": 527 + }, + { + "epoch": 0.42753036437246966, + "grad_norm": 1.1844102471692486, + "learning_rate": 1.9167789922391374e-05, + "loss": 1.7224, + "step": 528 + }, + { + "epoch": 0.42834008097165993, + "grad_norm": 1.1045406051602051, + "learning_rate": 1.916435360585863e-05, + "loss": 1.6201, + "step": 529 + }, + { + "epoch": 0.4291497975708502, + "grad_norm": 1.1718884314754825, + "learning_rate": 1.916091051873154e-05, + "loss": 1.6654, + "step": 530 + }, + { + "epoch": 0.42995951417004047, + "grad_norm": 1.2149881721900548, + "learning_rate": 1.915746066355385e-05, + "loss": 1.6386, + "step": 531 + }, + { + "epoch": 0.4307692307692308, + "grad_norm": 1.122281282890927, + "learning_rate": 1.9154004042874295e-05, + "loss": 1.6699, + "step": 532 + }, + { + "epoch": 0.43157894736842106, + "grad_norm": 1.1414789158751815, + "learning_rate": 1.915054065924662e-05, + "loss": 1.7295, + "step": 533 + }, + { + "epoch": 0.43238866396761133, + "grad_norm": 1.1773843495524834, + "learning_rate": 1.914707051522956e-05, + "loss": 1.6519, + "step": 534 + }, + { + "epoch": 0.4331983805668016, + "grad_norm": 1.1370478468226877, + "learning_rate": 1.9143593613386845e-05, + "loss": 1.663, + "step": 535 + }, + { + "epoch": 0.4340080971659919, + "grad_norm": 1.2049613179791026, + "learning_rate": 1.9140109956287202e-05, + "loss": 1.6453, + "step": 536 + }, + { + "epoch": 0.4348178137651822, + "grad_norm": 1.1634868725319234, + "learning_rate": 1.9136619546504344e-05, + "loss": 1.626, + "step": 537 + }, + { + "epoch": 0.43562753036437246, + "grad_norm": 1.151511606446221, + "learning_rate": 1.9133122386616972e-05, + "loss": 1.676, + "step": 538 + }, + { + "epoch": 0.43643724696356273, + "grad_norm": 1.1352020906409594, + "learning_rate": 1.9129618479208775e-05, + "loss": 1.665, + "step": 539 + }, + { + "epoch": 0.43724696356275305, + "grad_norm": 1.0886967891950499, + "learning_rate": 1.9126107826868436e-05, + "loss": 1.6408, + "step": 540 + }, + { + "epoch": 0.4380566801619433, + "grad_norm": 1.209588887486826, + "learning_rate": 1.91225904321896e-05, + "loss": 1.7039, + "step": 541 + }, + { + "epoch": 0.4388663967611336, + "grad_norm": 1.140155316990926, + "learning_rate": 1.9119066297770924e-05, + "loss": 1.6966, + "step": 542 + }, + { + "epoch": 0.43967611336032386, + "grad_norm": 1.1285673550031636, + "learning_rate": 1.9115535426216018e-05, + "loss": 1.6644, + "step": 543 + }, + { + "epoch": 0.4404858299595142, + "grad_norm": 1.179973190293816, + "learning_rate": 1.9111997820133472e-05, + "loss": 1.7061, + "step": 544 + }, + { + "epoch": 0.44129554655870445, + "grad_norm": 1.308360305063932, + "learning_rate": 1.9108453482136866e-05, + "loss": 1.7163, + "step": 545 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 1.1712581078253976, + "learning_rate": 1.9104902414844746e-05, + "loss": 1.644, + "step": 546 + }, + { + "epoch": 0.44291497975708505, + "grad_norm": 1.1720636280682097, + "learning_rate": 1.9101344620880625e-05, + "loss": 1.7228, + "step": 547 + }, + { + "epoch": 0.4437246963562753, + "grad_norm": 1.2371895291825066, + "learning_rate": 1.909778010287299e-05, + "loss": 1.6121, + "step": 548 + }, + { + "epoch": 0.4445344129554656, + "grad_norm": 1.0466963526468558, + "learning_rate": 1.9094208863455296e-05, + "loss": 1.6056, + "step": 549 + }, + { + "epoch": 0.44534412955465585, + "grad_norm": 1.1539410237456056, + "learning_rate": 1.9090630905265963e-05, + "loss": 1.7385, + "step": 550 + }, + { + "epoch": 0.4461538461538462, + "grad_norm": 1.1690769087937798, + "learning_rate": 1.9087046230948373e-05, + "loss": 1.7135, + "step": 551 + }, + { + "epoch": 0.44696356275303645, + "grad_norm": 1.2143571533897817, + "learning_rate": 1.9083454843150875e-05, + "loss": 1.6558, + "step": 552 + }, + { + "epoch": 0.4477732793522267, + "grad_norm": 1.1034768392638734, + "learning_rate": 1.9079856744526775e-05, + "loss": 1.643, + "step": 553 + }, + { + "epoch": 0.448582995951417, + "grad_norm": 1.150409314573767, + "learning_rate": 1.9076251937734328e-05, + "loss": 1.6522, + "step": 554 + }, + { + "epoch": 0.4493927125506073, + "grad_norm": 1.2891397853585957, + "learning_rate": 1.9072640425436762e-05, + "loss": 1.6858, + "step": 555 + }, + { + "epoch": 0.4502024291497976, + "grad_norm": 1.2158215396752632, + "learning_rate": 1.906902221030225e-05, + "loss": 1.6065, + "step": 556 + }, + { + "epoch": 0.45101214574898785, + "grad_norm": 1.1200407349127612, + "learning_rate": 1.9065397295003917e-05, + "loss": 1.651, + "step": 557 + }, + { + "epoch": 0.4518218623481781, + "grad_norm": 1.1631363825604073, + "learning_rate": 1.9061765682219833e-05, + "loss": 1.6214, + "step": 558 + }, + { + "epoch": 0.45263157894736844, + "grad_norm": 1.1276692541556212, + "learning_rate": 1.9058127374633027e-05, + "loss": 1.6493, + "step": 559 + }, + { + "epoch": 0.4534412955465587, + "grad_norm": 1.1960342249167806, + "learning_rate": 1.905448237493147e-05, + "loss": 1.6425, + "step": 560 + }, + { + "epoch": 0.454251012145749, + "grad_norm": 1.1128837104785292, + "learning_rate": 1.905083068580807e-05, + "loss": 1.6333, + "step": 561 + }, + { + "epoch": 0.45506072874493925, + "grad_norm": 1.2256171630437762, + "learning_rate": 1.9047172309960685e-05, + "loss": 1.6754, + "step": 562 + }, + { + "epoch": 0.45587044534412957, + "grad_norm": 1.2397790580338823, + "learning_rate": 1.9043507250092113e-05, + "loss": 1.6666, + "step": 563 + }, + { + "epoch": 0.45668016194331984, + "grad_norm": 1.1082657174113733, + "learning_rate": 1.9039835508910086e-05, + "loss": 1.6955, + "step": 564 + }, + { + "epoch": 0.4574898785425101, + "grad_norm": 1.1906286541004671, + "learning_rate": 1.9036157089127278e-05, + "loss": 1.6531, + "step": 565 + }, + { + "epoch": 0.4582995951417004, + "grad_norm": 1.1424432978234285, + "learning_rate": 1.903247199346129e-05, + "loss": 1.7165, + "step": 566 + }, + { + "epoch": 0.4591093117408907, + "grad_norm": 1.209971661625088, + "learning_rate": 1.902878022463466e-05, + "loss": 1.7129, + "step": 567 + }, + { + "epoch": 0.45991902834008097, + "grad_norm": 1.1858172748323061, + "learning_rate": 1.9025081785374854e-05, + "loss": 1.6542, + "step": 568 + }, + { + "epoch": 0.46072874493927124, + "grad_norm": 1.133164376536185, + "learning_rate": 1.9021376678414266e-05, + "loss": 1.6183, + "step": 569 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 1.0890524812398936, + "learning_rate": 1.901766490649022e-05, + "loss": 1.6258, + "step": 570 + }, + { + "epoch": 0.46234817813765183, + "grad_norm": 1.2014659731558368, + "learning_rate": 1.901394647234496e-05, + "loss": 1.7053, + "step": 571 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 1.1482794624137653, + "learning_rate": 1.901022137872565e-05, + "loss": 1.6733, + "step": 572 + }, + { + "epoch": 0.46396761133603237, + "grad_norm": 1.1400965356949173, + "learning_rate": 1.9006489628384374e-05, + "loss": 1.6498, + "step": 573 + }, + { + "epoch": 0.4647773279352227, + "grad_norm": 1.1411852291997495, + "learning_rate": 1.9002751224078148e-05, + "loss": 1.6479, + "step": 574 + }, + { + "epoch": 0.46558704453441296, + "grad_norm": 1.1952888204227503, + "learning_rate": 1.8999006168568883e-05, + "loss": 1.6397, + "step": 575 + }, + { + "epoch": 0.46639676113360323, + "grad_norm": 1.1088918517492075, + "learning_rate": 1.899525446462342e-05, + "loss": 1.6055, + "step": 576 + }, + { + "epoch": 0.4672064777327935, + "grad_norm": 1.1036176426671134, + "learning_rate": 1.89914961150135e-05, + "loss": 1.5637, + "step": 577 + }, + { + "epoch": 0.4680161943319838, + "grad_norm": 1.2620277434980276, + "learning_rate": 1.8987731122515783e-05, + "loss": 1.7563, + "step": 578 + }, + { + "epoch": 0.4688259109311741, + "grad_norm": 1.2114730386349555, + "learning_rate": 1.8983959489911833e-05, + "loss": 1.6029, + "step": 579 + }, + { + "epoch": 0.46963562753036436, + "grad_norm": 1.1429824486613362, + "learning_rate": 1.8980181219988117e-05, + "loss": 1.6325, + "step": 580 + }, + { + "epoch": 0.47044534412955463, + "grad_norm": 1.2235645463897764, + "learning_rate": 1.897639631553601e-05, + "loss": 1.6529, + "step": 581 + }, + { + "epoch": 0.47125506072874496, + "grad_norm": 1.2114311834584224, + "learning_rate": 1.897260477935179e-05, + "loss": 1.7054, + "step": 582 + }, + { + "epoch": 0.4720647773279352, + "grad_norm": 1.166387438730835, + "learning_rate": 1.8968806614236625e-05, + "loss": 1.5569, + "step": 583 + }, + { + "epoch": 0.4728744939271255, + "grad_norm": 1.195523192072768, + "learning_rate": 1.8965001822996597e-05, + "loss": 1.6743, + "step": 584 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 1.1792702417522374, + "learning_rate": 1.8961190408442662e-05, + "loss": 1.5886, + "step": 585 + }, + { + "epoch": 0.4744939271255061, + "grad_norm": 1.2088013728568938, + "learning_rate": 1.8957372373390686e-05, + "loss": 1.7337, + "step": 586 + }, + { + "epoch": 0.47530364372469636, + "grad_norm": 1.1827861831578297, + "learning_rate": 1.895354772066142e-05, + "loss": 1.6138, + "step": 587 + }, + { + "epoch": 0.4761133603238866, + "grad_norm": 1.2447777229069645, + "learning_rate": 1.8949716453080508e-05, + "loss": 1.6402, + "step": 588 + }, + { + "epoch": 0.47692307692307695, + "grad_norm": 1.2650084378850481, + "learning_rate": 1.894587857347847e-05, + "loss": 1.7021, + "step": 589 + }, + { + "epoch": 0.4777327935222672, + "grad_norm": 1.1035611492696404, + "learning_rate": 1.8942034084690727e-05, + "loss": 1.666, + "step": 590 + }, + { + "epoch": 0.4785425101214575, + "grad_norm": 1.2449301610510557, + "learning_rate": 1.893818298955757e-05, + "loss": 1.6586, + "step": 591 + }, + { + "epoch": 0.47935222672064776, + "grad_norm": 1.233085689754001, + "learning_rate": 1.8934325290924177e-05, + "loss": 1.6828, + "step": 592 + }, + { + "epoch": 0.4801619433198381, + "grad_norm": 1.1810873725015931, + "learning_rate": 1.8930460991640606e-05, + "loss": 1.6581, + "step": 593 + }, + { + "epoch": 0.48097165991902835, + "grad_norm": 1.2288793304782808, + "learning_rate": 1.8926590094561784e-05, + "loss": 1.691, + "step": 594 + }, + { + "epoch": 0.4817813765182186, + "grad_norm": 1.2201380059232416, + "learning_rate": 1.8922712602547516e-05, + "loss": 1.6666, + "step": 595 + }, + { + "epoch": 0.4825910931174089, + "grad_norm": 1.145832667110585, + "learning_rate": 1.891882851846249e-05, + "loss": 1.678, + "step": 596 + }, + { + "epoch": 0.4834008097165992, + "grad_norm": 1.1830711402174343, + "learning_rate": 1.891493784517624e-05, + "loss": 1.6358, + "step": 597 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 1.2028619373981595, + "learning_rate": 1.8911040585563196e-05, + "loss": 1.7163, + "step": 598 + }, + { + "epoch": 0.48502024291497975, + "grad_norm": 1.1632991822145227, + "learning_rate": 1.8907136742502633e-05, + "loss": 1.7096, + "step": 599 + }, + { + "epoch": 0.48582995951417, + "grad_norm": 1.177024703746382, + "learning_rate": 1.89032263188787e-05, + "loss": 1.6444, + "step": 600 + }, + { + "epoch": 0.48663967611336034, + "grad_norm": 1.2097396232078657, + "learning_rate": 1.8899309317580403e-05, + "loss": 1.7032, + "step": 601 + }, + { + "epoch": 0.4874493927125506, + "grad_norm": 1.0312500629058605, + "learning_rate": 1.8895385741501608e-05, + "loss": 1.6393, + "step": 602 + }, + { + "epoch": 0.4882591093117409, + "grad_norm": 1.1817251113637202, + "learning_rate": 1.889145559354105e-05, + "loss": 1.6357, + "step": 603 + }, + { + "epoch": 0.48906882591093115, + "grad_norm": 1.192720416754525, + "learning_rate": 1.88875188766023e-05, + "loss": 1.6244, + "step": 604 + }, + { + "epoch": 0.4898785425101215, + "grad_norm": 1.1432510606242128, + "learning_rate": 1.8883575593593793e-05, + "loss": 1.6741, + "step": 605 + }, + { + "epoch": 0.49068825910931174, + "grad_norm": 1.1569463862557519, + "learning_rate": 1.8879625747428815e-05, + "loss": 1.6761, + "step": 606 + }, + { + "epoch": 0.491497975708502, + "grad_norm": 1.187603448727291, + "learning_rate": 1.8875669341025498e-05, + "loss": 1.728, + "step": 607 + }, + { + "epoch": 0.49230769230769234, + "grad_norm": 1.2130166999284056, + "learning_rate": 1.8871706377306826e-05, + "loss": 1.6865, + "step": 608 + }, + { + "epoch": 0.4931174089068826, + "grad_norm": 1.1670591496027172, + "learning_rate": 1.886773685920062e-05, + "loss": 1.6744, + "step": 609 + }, + { + "epoch": 0.4939271255060729, + "grad_norm": 1.2163771157473018, + "learning_rate": 1.8863760789639548e-05, + "loss": 1.7203, + "step": 610 + }, + { + "epoch": 0.49473684210526314, + "grad_norm": 1.229371456590416, + "learning_rate": 1.8859778171561118e-05, + "loss": 1.7059, + "step": 611 + }, + { + "epoch": 0.49554655870445347, + "grad_norm": 1.192832189327257, + "learning_rate": 1.8855789007907672e-05, + "loss": 1.6862, + "step": 612 + }, + { + "epoch": 0.49635627530364373, + "grad_norm": 1.2220933781088503, + "learning_rate": 1.885179330162639e-05, + "loss": 1.6475, + "step": 613 + }, + { + "epoch": 0.497165991902834, + "grad_norm": 1.1263582806182404, + "learning_rate": 1.8847791055669297e-05, + "loss": 1.5831, + "step": 614 + }, + { + "epoch": 0.4979757085020243, + "grad_norm": 1.2054405676608149, + "learning_rate": 1.8843782272993225e-05, + "loss": 1.6485, + "step": 615 + }, + { + "epoch": 0.4987854251012146, + "grad_norm": 1.1513573612013865, + "learning_rate": 1.883976695655986e-05, + "loss": 1.6391, + "step": 616 + }, + { + "epoch": 0.49959514170040487, + "grad_norm": 1.1215561830277645, + "learning_rate": 1.88357451093357e-05, + "loss": 1.689, + "step": 617 + }, + { + "epoch": 0.5004048582995951, + "grad_norm": 1.2642509822675319, + "learning_rate": 1.8831716734292074e-05, + "loss": 1.6564, + "step": 618 + }, + { + "epoch": 0.5012145748987854, + "grad_norm": 1.1935335114428585, + "learning_rate": 1.882768183440513e-05, + "loss": 1.6118, + "step": 619 + }, + { + "epoch": 0.5020242914979757, + "grad_norm": 1.2850491601729415, + "learning_rate": 1.8823640412655844e-05, + "loss": 1.652, + "step": 620 + }, + { + "epoch": 0.5028340080971659, + "grad_norm": 1.202299368491898, + "learning_rate": 1.881959247203e-05, + "loss": 1.6723, + "step": 621 + }, + { + "epoch": 0.5036437246963563, + "grad_norm": 1.0583336420886984, + "learning_rate": 1.8815538015518203e-05, + "loss": 1.6403, + "step": 622 + }, + { + "epoch": 0.5044534412955466, + "grad_norm": 1.24127140876251, + "learning_rate": 1.8811477046115877e-05, + "loss": 1.692, + "step": 623 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 1.174302573947688, + "learning_rate": 1.880740956682325e-05, + "loss": 1.7306, + "step": 624 + }, + { + "epoch": 0.5060728744939271, + "grad_norm": 1.1958655509103453, + "learning_rate": 1.880333558064536e-05, + "loss": 1.7106, + "step": 625 + }, + { + "epoch": 0.5068825910931174, + "grad_norm": 1.230409287496346, + "learning_rate": 1.8799255090592056e-05, + "loss": 1.6266, + "step": 626 + }, + { + "epoch": 0.5076923076923077, + "grad_norm": 1.1988542561396345, + "learning_rate": 1.8795168099677992e-05, + "loss": 1.6516, + "step": 627 + }, + { + "epoch": 0.5085020242914979, + "grad_norm": 1.1887473097347059, + "learning_rate": 1.8791074610922624e-05, + "loss": 1.7295, + "step": 628 + }, + { + "epoch": 0.5093117408906883, + "grad_norm": 1.3303290476414642, + "learning_rate": 1.8786974627350206e-05, + "loss": 1.6802, + "step": 629 + }, + { + "epoch": 0.5101214574898786, + "grad_norm": 1.1121862887308984, + "learning_rate": 1.878286815198979e-05, + "loss": 1.6759, + "step": 630 + }, + { + "epoch": 0.5109311740890689, + "grad_norm": 1.1527683977442384, + "learning_rate": 1.8778755187875236e-05, + "loss": 1.675, + "step": 631 + }, + { + "epoch": 0.5117408906882591, + "grad_norm": 1.1283416836021556, + "learning_rate": 1.877463573804518e-05, + "loss": 1.7042, + "step": 632 + }, + { + "epoch": 0.5125506072874494, + "grad_norm": 1.111511552571126, + "learning_rate": 1.877050980554306e-05, + "loss": 1.6402, + "step": 633 + }, + { + "epoch": 0.5133603238866397, + "grad_norm": 1.1686794783887113, + "learning_rate": 1.8766377393417104e-05, + "loss": 1.6266, + "step": 634 + }, + { + "epoch": 0.5141700404858299, + "grad_norm": 1.0774322060832857, + "learning_rate": 1.876223850472032e-05, + "loss": 1.6109, + "step": 635 + }, + { + "epoch": 0.5149797570850202, + "grad_norm": 1.1204612114124226, + "learning_rate": 1.875809314251051e-05, + "loss": 1.5896, + "step": 636 + }, + { + "epoch": 0.5157894736842106, + "grad_norm": 1.1411930639576617, + "learning_rate": 1.8753941309850248e-05, + "loss": 1.6531, + "step": 637 + }, + { + "epoch": 0.5165991902834008, + "grad_norm": 1.246386803807688, + "learning_rate": 1.8749783009806898e-05, + "loss": 1.7025, + "step": 638 + }, + { + "epoch": 0.5174089068825911, + "grad_norm": 1.176836096422221, + "learning_rate": 1.8745618245452596e-05, + "loss": 1.6579, + "step": 639 + }, + { + "epoch": 0.5182186234817814, + "grad_norm": 1.10956833415483, + "learning_rate": 1.8741447019864263e-05, + "loss": 1.6245, + "step": 640 + }, + { + "epoch": 0.5190283400809717, + "grad_norm": 1.1188976219487057, + "learning_rate": 1.873726933612358e-05, + "loss": 1.6296, + "step": 641 + }, + { + "epoch": 0.5198380566801619, + "grad_norm": 1.2300857210684373, + "learning_rate": 1.873308519731701e-05, + "loss": 1.6823, + "step": 642 + }, + { + "epoch": 0.5206477732793522, + "grad_norm": 1.1437764748313324, + "learning_rate": 1.872889460653578e-05, + "loss": 1.6442, + "step": 643 + }, + { + "epoch": 0.5214574898785425, + "grad_norm": 1.1135134095150818, + "learning_rate": 1.872469756687588e-05, + "loss": 1.5917, + "step": 644 + }, + { + "epoch": 0.5222672064777328, + "grad_norm": 1.1837475043251595, + "learning_rate": 1.872049408143808e-05, + "loss": 1.7025, + "step": 645 + }, + { + "epoch": 0.5230769230769231, + "grad_norm": 1.1575973920612108, + "learning_rate": 1.8716284153327887e-05, + "loss": 1.6407, + "step": 646 + }, + { + "epoch": 0.5238866396761134, + "grad_norm": 1.1699024098280943, + "learning_rate": 1.871206778565559e-05, + "loss": 1.6838, + "step": 647 + }, + { + "epoch": 0.5246963562753036, + "grad_norm": 1.1006692187902312, + "learning_rate": 1.870784498153623e-05, + "loss": 1.6798, + "step": 648 + }, + { + "epoch": 0.5255060728744939, + "grad_norm": 1.1232448009893263, + "learning_rate": 1.87036157440896e-05, + "loss": 1.6862, + "step": 649 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.1938575522143917, + "learning_rate": 1.8699380076440242e-05, + "loss": 1.7397, + "step": 650 + }, + { + "epoch": 0.5271255060728745, + "grad_norm": 1.1943224317319683, + "learning_rate": 1.8695137981717452e-05, + "loss": 1.6201, + "step": 651 + }, + { + "epoch": 0.5279352226720648, + "grad_norm": 1.1217940510846476, + "learning_rate": 1.8690889463055285e-05, + "loss": 1.637, + "step": 652 + }, + { + "epoch": 0.5287449392712551, + "grad_norm": 1.1039437922379434, + "learning_rate": 1.8686634523592523e-05, + "loss": 1.6459, + "step": 653 + }, + { + "epoch": 0.5295546558704454, + "grad_norm": 1.1566442258336185, + "learning_rate": 1.868237316647271e-05, + "loss": 1.669, + "step": 654 + }, + { + "epoch": 0.5303643724696356, + "grad_norm": 1.2402960513081014, + "learning_rate": 1.8678105394844114e-05, + "loss": 1.6826, + "step": 655 + }, + { + "epoch": 0.5311740890688259, + "grad_norm": 1.1427219527507249, + "learning_rate": 1.8673831211859758e-05, + "loss": 1.6865, + "step": 656 + }, + { + "epoch": 0.5319838056680162, + "grad_norm": 1.1397055186220963, + "learning_rate": 1.866955062067739e-05, + "loss": 1.6982, + "step": 657 + }, + { + "epoch": 0.5327935222672064, + "grad_norm": 1.1723541895025018, + "learning_rate": 1.8665263624459497e-05, + "loss": 1.6186, + "step": 658 + }, + { + "epoch": 0.5336032388663967, + "grad_norm": 1.0732319910985701, + "learning_rate": 1.86609702263733e-05, + "loss": 1.7125, + "step": 659 + }, + { + "epoch": 0.5344129554655871, + "grad_norm": 1.102482299703252, + "learning_rate": 1.8656670429590745e-05, + "loss": 1.7209, + "step": 660 + }, + { + "epoch": 0.5352226720647774, + "grad_norm": 1.1656759203730942, + "learning_rate": 1.8652364237288507e-05, + "loss": 1.6379, + "step": 661 + }, + { + "epoch": 0.5360323886639676, + "grad_norm": 1.1796882306648648, + "learning_rate": 1.864805165264799e-05, + "loss": 1.6733, + "step": 662 + }, + { + "epoch": 0.5368421052631579, + "grad_norm": 1.0979399574684228, + "learning_rate": 1.8643732678855314e-05, + "loss": 1.6439, + "step": 663 + }, + { + "epoch": 0.5376518218623482, + "grad_norm": 1.1656259629485042, + "learning_rate": 1.8639407319101325e-05, + "loss": 1.7003, + "step": 664 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 1.109949494450682, + "learning_rate": 1.8635075576581587e-05, + "loss": 1.6913, + "step": 665 + }, + { + "epoch": 0.5392712550607287, + "grad_norm": 1.1384719929801121, + "learning_rate": 1.8630737454496374e-05, + "loss": 1.6767, + "step": 666 + }, + { + "epoch": 0.540080971659919, + "grad_norm": 1.1146510327537758, + "learning_rate": 1.8626392956050675e-05, + "loss": 1.6193, + "step": 667 + }, + { + "epoch": 0.5408906882591094, + "grad_norm": 1.1533020616397756, + "learning_rate": 1.862204208445419e-05, + "loss": 1.6373, + "step": 668 + }, + { + "epoch": 0.5417004048582996, + "grad_norm": 1.1409514813984398, + "learning_rate": 1.8617684842921337e-05, + "loss": 1.5814, + "step": 669 + }, + { + "epoch": 0.5425101214574899, + "grad_norm": 1.1892690565319142, + "learning_rate": 1.861332123467122e-05, + "loss": 1.71, + "step": 670 + }, + { + "epoch": 0.5433198380566802, + "grad_norm": 1.0770444731078834, + "learning_rate": 1.8608951262927667e-05, + "loss": 1.6821, + "step": 671 + }, + { + "epoch": 0.5441295546558704, + "grad_norm": 1.2201808284178386, + "learning_rate": 1.8604574930919198e-05, + "loss": 1.608, + "step": 672 + }, + { + "epoch": 0.5449392712550607, + "grad_norm": 1.1156767804046464, + "learning_rate": 1.860019224187903e-05, + "loss": 1.6091, + "step": 673 + }, + { + "epoch": 0.545748987854251, + "grad_norm": 1.1312661674531492, + "learning_rate": 1.8595803199045083e-05, + "loss": 1.6964, + "step": 674 + }, + { + "epoch": 0.5465587044534413, + "grad_norm": 1.2293348497150374, + "learning_rate": 1.859140780565996e-05, + "loss": 1.6872, + "step": 675 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 1.062664670461368, + "learning_rate": 1.858700606497097e-05, + "loss": 1.6528, + "step": 676 + }, + { + "epoch": 0.5481781376518219, + "grad_norm": 1.1400817362492008, + "learning_rate": 1.85825979802301e-05, + "loss": 1.6562, + "step": 677 + }, + { + "epoch": 0.5489878542510122, + "grad_norm": 1.0787086213793413, + "learning_rate": 1.8578183554694035e-05, + "loss": 1.7133, + "step": 678 + }, + { + "epoch": 0.5497975708502024, + "grad_norm": 1.0952800726386747, + "learning_rate": 1.8573762791624132e-05, + "loss": 1.6316, + "step": 679 + }, + { + "epoch": 0.5506072874493927, + "grad_norm": 1.140497018650844, + "learning_rate": 1.856933569428644e-05, + "loss": 1.5434, + "step": 680 + }, + { + "epoch": 0.551417004048583, + "grad_norm": 1.155661350966275, + "learning_rate": 1.856490226595168e-05, + "loss": 1.7302, + "step": 681 + }, + { + "epoch": 0.5522267206477732, + "grad_norm": 1.162518892367102, + "learning_rate": 1.856046250989526e-05, + "loss": 1.7335, + "step": 682 + }, + { + "epoch": 0.5530364372469636, + "grad_norm": 1.1661792099551183, + "learning_rate": 1.8556016429397248e-05, + "loss": 1.6408, + "step": 683 + }, + { + "epoch": 0.5538461538461539, + "grad_norm": 1.129401750350817, + "learning_rate": 1.8551564027742404e-05, + "loss": 1.6326, + "step": 684 + }, + { + "epoch": 0.5546558704453441, + "grad_norm": 1.2694837443603195, + "learning_rate": 1.8547105308220142e-05, + "loss": 1.6501, + "step": 685 + }, + { + "epoch": 0.5554655870445344, + "grad_norm": 1.0791347009798566, + "learning_rate": 1.854264027412455e-05, + "loss": 1.6494, + "step": 686 + }, + { + "epoch": 0.5562753036437247, + "grad_norm": 1.2420182578602168, + "learning_rate": 1.853816892875438e-05, + "loss": 1.6413, + "step": 687 + }, + { + "epoch": 0.557085020242915, + "grad_norm": 1.2039522453979503, + "learning_rate": 1.853369127541305e-05, + "loss": 1.639, + "step": 688 + }, + { + "epoch": 0.5578947368421052, + "grad_norm": 1.136925790043882, + "learning_rate": 1.8529207317408634e-05, + "loss": 1.6843, + "step": 689 + }, + { + "epoch": 0.5587044534412956, + "grad_norm": 1.2928638734791758, + "learning_rate": 1.852471705805387e-05, + "loss": 1.5799, + "step": 690 + }, + { + "epoch": 0.5595141700404859, + "grad_norm": 1.0426663613551523, + "learning_rate": 1.8520220500666133e-05, + "loss": 1.6005, + "step": 691 + }, + { + "epoch": 0.5603238866396761, + "grad_norm": 1.3130919141243858, + "learning_rate": 1.8515717648567476e-05, + "loss": 1.7198, + "step": 692 + }, + { + "epoch": 0.5611336032388664, + "grad_norm": 1.0955024598419019, + "learning_rate": 1.8511208505084593e-05, + "loss": 1.6197, + "step": 693 + }, + { + "epoch": 0.5619433198380567, + "grad_norm": 1.0883596134382716, + "learning_rate": 1.850669307354882e-05, + "loss": 1.6603, + "step": 694 + }, + { + "epoch": 0.562753036437247, + "grad_norm": 1.218741988618394, + "learning_rate": 1.8502171357296144e-05, + "loss": 1.6684, + "step": 695 + }, + { + "epoch": 0.5635627530364372, + "grad_norm": 1.1391537110795147, + "learning_rate": 1.8497643359667193e-05, + "loss": 1.6579, + "step": 696 + }, + { + "epoch": 0.5643724696356275, + "grad_norm": 1.1387711931236417, + "learning_rate": 1.8493109084007236e-05, + "loss": 1.6465, + "step": 697 + }, + { + "epoch": 0.5651821862348179, + "grad_norm": 1.1254009149677715, + "learning_rate": 1.8488568533666183e-05, + "loss": 1.6042, + "step": 698 + }, + { + "epoch": 0.5659919028340081, + "grad_norm": 1.1257268411329382, + "learning_rate": 1.848402171199858e-05, + "loss": 1.6514, + "step": 699 + }, + { + "epoch": 0.5668016194331984, + "grad_norm": 1.0927876479165208, + "learning_rate": 1.84794686223636e-05, + "loss": 1.6822, + "step": 700 + }, + { + "epoch": 0.5676113360323887, + "grad_norm": 1.090171472139705, + "learning_rate": 1.8474909268125053e-05, + "loss": 1.7033, + "step": 701 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 1.088693023569449, + "learning_rate": 1.8470343652651374e-05, + "loss": 1.615, + "step": 702 + }, + { + "epoch": 0.5692307692307692, + "grad_norm": 1.3432139320476364, + "learning_rate": 1.846577177931562e-05, + "loss": 1.6546, + "step": 703 + }, + { + "epoch": 0.5700404858299595, + "grad_norm": 1.0705730809837788, + "learning_rate": 1.8461193651495482e-05, + "loss": 1.6723, + "step": 704 + }, + { + "epoch": 0.5708502024291497, + "grad_norm": 1.121041236995467, + "learning_rate": 1.8456609272573268e-05, + "loss": 1.5757, + "step": 705 + }, + { + "epoch": 0.5716599190283401, + "grad_norm": 1.2097795973645553, + "learning_rate": 1.8452018645935895e-05, + "loss": 1.6559, + "step": 706 + }, + { + "epoch": 0.5724696356275304, + "grad_norm": 1.1312684155340222, + "learning_rate": 1.844742177497491e-05, + "loss": 1.6322, + "step": 707 + }, + { + "epoch": 0.5732793522267207, + "grad_norm": 1.126110922221143, + "learning_rate": 1.8442818663086456e-05, + "loss": 1.6482, + "step": 708 + }, + { + "epoch": 0.5740890688259109, + "grad_norm": 1.1010830873261865, + "learning_rate": 1.8438209313671307e-05, + "loss": 1.6829, + "step": 709 + }, + { + "epoch": 0.5748987854251012, + "grad_norm": 1.2743855652131832, + "learning_rate": 1.8433593730134835e-05, + "loss": 1.669, + "step": 710 + }, + { + "epoch": 0.5757085020242915, + "grad_norm": 1.1448636929742997, + "learning_rate": 1.842897191588701e-05, + "loss": 1.692, + "step": 711 + }, + { + "epoch": 0.5765182186234817, + "grad_norm": 1.1106904762427132, + "learning_rate": 1.842434387434242e-05, + "loss": 1.6477, + "step": 712 + }, + { + "epoch": 0.5773279352226721, + "grad_norm": 1.2682672769652985, + "learning_rate": 1.8419709608920243e-05, + "loss": 1.6194, + "step": 713 + }, + { + "epoch": 0.5781376518218624, + "grad_norm": 1.2002428737470912, + "learning_rate": 1.8415069123044263e-05, + "loss": 1.6244, + "step": 714 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 1.105036420479548, + "learning_rate": 1.841042242014285e-05, + "loss": 1.6496, + "step": 715 + }, + { + "epoch": 0.5797570850202429, + "grad_norm": 1.1794855208133117, + "learning_rate": 1.840576950364898e-05, + "loss": 1.6911, + "step": 716 + }, + { + "epoch": 0.5805668016194332, + "grad_norm": 1.1055378048289626, + "learning_rate": 1.8401110377000206e-05, + "loss": 1.6052, + "step": 717 + }, + { + "epoch": 0.5813765182186235, + "grad_norm": 1.2287534777350115, + "learning_rate": 1.839644504363868e-05, + "loss": 1.6905, + "step": 718 + }, + { + "epoch": 0.5821862348178137, + "grad_norm": 1.1569748526557413, + "learning_rate": 1.839177350701113e-05, + "loss": 1.6255, + "step": 719 + }, + { + "epoch": 0.582995951417004, + "grad_norm": 1.1818168990284688, + "learning_rate": 1.838709577056888e-05, + "loss": 1.5795, + "step": 720 + }, + { + "epoch": 0.5838056680161944, + "grad_norm": 1.1555576007146946, + "learning_rate": 1.838241183776782e-05, + "loss": 1.6674, + "step": 721 + }, + { + "epoch": 0.5846153846153846, + "grad_norm": 1.1401390602412884, + "learning_rate": 1.8377721712068424e-05, + "loss": 1.675, + "step": 722 + }, + { + "epoch": 0.5854251012145749, + "grad_norm": 1.162723139199304, + "learning_rate": 1.8373025396935743e-05, + "loss": 1.5534, + "step": 723 + }, + { + "epoch": 0.5862348178137652, + "grad_norm": 1.1799321705036754, + "learning_rate": 1.8368322895839397e-05, + "loss": 1.639, + "step": 724 + }, + { + "epoch": 0.5870445344129555, + "grad_norm": 1.2240975875375573, + "learning_rate": 1.8363614212253585e-05, + "loss": 1.597, + "step": 725 + }, + { + "epoch": 0.5878542510121457, + "grad_norm": 1.1698625227265869, + "learning_rate": 1.8358899349657063e-05, + "loss": 1.7323, + "step": 726 + }, + { + "epoch": 0.588663967611336, + "grad_norm": 1.1339957776380405, + "learning_rate": 1.8354178311533152e-05, + "loss": 1.6183, + "step": 727 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 1.2392896642339328, + "learning_rate": 1.8349451101369742e-05, + "loss": 1.6272, + "step": 728 + }, + { + "epoch": 0.5902834008097166, + "grad_norm": 1.1522226881000637, + "learning_rate": 1.8344717722659285e-05, + "loss": 1.6365, + "step": 729 + }, + { + "epoch": 0.5910931174089069, + "grad_norm": 1.2425239844288822, + "learning_rate": 1.833997817889878e-05, + "loss": 1.5836, + "step": 730 + }, + { + "epoch": 0.5919028340080972, + "grad_norm": 1.1120981246598145, + "learning_rate": 1.833523247358979e-05, + "loss": 1.6734, + "step": 731 + }, + { + "epoch": 0.5927125506072874, + "grad_norm": 1.1298710482715226, + "learning_rate": 1.8330480610238424e-05, + "loss": 1.607, + "step": 732 + }, + { + "epoch": 0.5935222672064777, + "grad_norm": 1.1377574517661373, + "learning_rate": 1.8325722592355344e-05, + "loss": 1.6454, + "step": 733 + }, + { + "epoch": 0.594331983805668, + "grad_norm": 1.1538583104420033, + "learning_rate": 1.8320958423455756e-05, + "loss": 1.6921, + "step": 734 + }, + { + "epoch": 0.5951417004048583, + "grad_norm": 1.0793087009627134, + "learning_rate": 1.8316188107059418e-05, + "loss": 1.6726, + "step": 735 + }, + { + "epoch": 0.5959514170040486, + "grad_norm": 1.1201444715810196, + "learning_rate": 1.8311411646690616e-05, + "loss": 1.6588, + "step": 736 + }, + { + "epoch": 0.5967611336032389, + "grad_norm": 1.096654674640351, + "learning_rate": 1.8306629045878192e-05, + "loss": 1.6684, + "step": 737 + }, + { + "epoch": 0.5975708502024292, + "grad_norm": 1.1079216672953291, + "learning_rate": 1.8301840308155507e-05, + "loss": 1.6618, + "step": 738 + }, + { + "epoch": 0.5983805668016194, + "grad_norm": 1.0879802564948844, + "learning_rate": 1.8297045437060474e-05, + "loss": 1.7383, + "step": 739 + }, + { + "epoch": 0.5991902834008097, + "grad_norm": 1.1424198121843356, + "learning_rate": 1.8292244436135517e-05, + "loss": 1.6709, + "step": 740 + }, + { + "epoch": 0.6, + "grad_norm": 1.1430964238759362, + "learning_rate": 1.828743730892761e-05, + "loss": 1.64, + "step": 741 + }, + { + "epoch": 0.6008097165991902, + "grad_norm": 1.0895066483885778, + "learning_rate": 1.8282624058988237e-05, + "loss": 1.6584, + "step": 742 + }, + { + "epoch": 0.6016194331983805, + "grad_norm": 1.1291234703082627, + "learning_rate": 1.827780468987341e-05, + "loss": 1.6452, + "step": 743 + }, + { + "epoch": 0.6024291497975709, + "grad_norm": 1.115557218075291, + "learning_rate": 1.8272979205143674e-05, + "loss": 1.6076, + "step": 744 + }, + { + "epoch": 0.6032388663967612, + "grad_norm": 1.1171405856483771, + "learning_rate": 1.8268147608364068e-05, + "loss": 1.6046, + "step": 745 + }, + { + "epoch": 0.6040485829959514, + "grad_norm": 1.0965261272427589, + "learning_rate": 1.8263309903104163e-05, + "loss": 1.6013, + "step": 746 + }, + { + "epoch": 0.6048582995951417, + "grad_norm": 1.2185450067830268, + "learning_rate": 1.8258466092938042e-05, + "loss": 1.6519, + "step": 747 + }, + { + "epoch": 0.605668016194332, + "grad_norm": 1.1702812963364408, + "learning_rate": 1.82536161814443e-05, + "loss": 1.6953, + "step": 748 + }, + { + "epoch": 0.6064777327935222, + "grad_norm": 1.0686726410287588, + "learning_rate": 1.8248760172206024e-05, + "loss": 1.6262, + "step": 749 + }, + { + "epoch": 0.6072874493927125, + "grad_norm": 1.136164025439449, + "learning_rate": 1.8243898068810833e-05, + "loss": 1.6661, + "step": 750 + }, + { + "epoch": 0.6080971659919029, + "grad_norm": 1.1954715351994403, + "learning_rate": 1.8239029874850823e-05, + "loss": 1.674, + "step": 751 + }, + { + "epoch": 0.6089068825910932, + "grad_norm": 1.1184558352613179, + "learning_rate": 1.82341555939226e-05, + "loss": 1.721, + "step": 752 + }, + { + "epoch": 0.6097165991902834, + "grad_norm": 1.129119070175351, + "learning_rate": 1.822927522962727e-05, + "loss": 1.6457, + "step": 753 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 1.1410321623438338, + "learning_rate": 1.822438878557043e-05, + "loss": 1.6529, + "step": 754 + }, + { + "epoch": 0.611336032388664, + "grad_norm": 1.1589440461468792, + "learning_rate": 1.8219496265362164e-05, + "loss": 1.7046, + "step": 755 + }, + { + "epoch": 0.6121457489878542, + "grad_norm": 1.115213970991795, + "learning_rate": 1.8214597672617054e-05, + "loss": 1.7141, + "step": 756 + }, + { + "epoch": 0.6129554655870445, + "grad_norm": 1.1739020879997546, + "learning_rate": 1.8209693010954166e-05, + "loss": 1.6513, + "step": 757 + }, + { + "epoch": 0.6137651821862348, + "grad_norm": 1.1420116503362077, + "learning_rate": 1.820478228399704e-05, + "loss": 1.6475, + "step": 758 + }, + { + "epoch": 0.6145748987854251, + "grad_norm": 1.2176203227373104, + "learning_rate": 1.819986549537372e-05, + "loss": 1.6648, + "step": 759 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 1.1052645276051432, + "learning_rate": 1.81949426487167e-05, + "loss": 1.6193, + "step": 760 + }, + { + "epoch": 0.6161943319838057, + "grad_norm": 1.099724825808027, + "learning_rate": 1.819001374766296e-05, + "loss": 1.5659, + "step": 761 + }, + { + "epoch": 0.617004048582996, + "grad_norm": 1.255382813893447, + "learning_rate": 1.818507879585397e-05, + "loss": 1.6868, + "step": 762 + }, + { + "epoch": 0.6178137651821862, + "grad_norm": 1.1337123092988888, + "learning_rate": 1.8180137796935648e-05, + "loss": 1.7117, + "step": 763 + }, + { + "epoch": 0.6186234817813765, + "grad_norm": 1.0916001764698793, + "learning_rate": 1.8175190754558384e-05, + "loss": 1.6079, + "step": 764 + }, + { + "epoch": 0.6194331983805668, + "grad_norm": 1.1462547249605433, + "learning_rate": 1.8170237672377046e-05, + "loss": 1.6266, + "step": 765 + }, + { + "epoch": 0.620242914979757, + "grad_norm": 1.184448821537451, + "learning_rate": 1.8165278554050946e-05, + "loss": 1.7218, + "step": 766 + }, + { + "epoch": 0.6210526315789474, + "grad_norm": 1.1988569585117117, + "learning_rate": 1.8160313403243874e-05, + "loss": 1.7002, + "step": 767 + }, + { + "epoch": 0.6218623481781377, + "grad_norm": 1.1270467979508891, + "learning_rate": 1.8155342223624054e-05, + "loss": 1.6241, + "step": 768 + }, + { + "epoch": 0.622672064777328, + "grad_norm": 1.0627366879538067, + "learning_rate": 1.8150365018864192e-05, + "loss": 1.63, + "step": 769 + }, + { + "epoch": 0.6234817813765182, + "grad_norm": 1.1499690148470678, + "learning_rate": 1.814538179264142e-05, + "loss": 1.6664, + "step": 770 + }, + { + "epoch": 0.6242914979757085, + "grad_norm": 1.1381510719594987, + "learning_rate": 1.8140392548637333e-05, + "loss": 1.6597, + "step": 771 + }, + { + "epoch": 0.6251012145748988, + "grad_norm": 1.1404467641041274, + "learning_rate": 1.8135397290537967e-05, + "loss": 1.6967, + "step": 772 + }, + { + "epoch": 0.625910931174089, + "grad_norm": 1.1388300535406857, + "learning_rate": 1.81303960220338e-05, + "loss": 1.607, + "step": 773 + }, + { + "epoch": 0.6267206477732794, + "grad_norm": 1.1302382932024566, + "learning_rate": 1.812538874681976e-05, + "loss": 1.6584, + "step": 774 + }, + { + "epoch": 0.6275303643724697, + "grad_norm": 1.1014453045454282, + "learning_rate": 1.81203754685952e-05, + "loss": 1.6767, + "step": 775 + }, + { + "epoch": 0.6283400809716599, + "grad_norm": 1.1824063773454823, + "learning_rate": 1.8115356191063913e-05, + "loss": 1.6373, + "step": 776 + }, + { + "epoch": 0.6291497975708502, + "grad_norm": 1.072773503139197, + "learning_rate": 1.811033091793413e-05, + "loss": 1.6094, + "step": 777 + }, + { + "epoch": 0.6299595141700405, + "grad_norm": 1.1293001784836003, + "learning_rate": 1.8105299652918496e-05, + "loss": 1.6591, + "step": 778 + }, + { + "epoch": 0.6307692307692307, + "grad_norm": 1.2465499878402888, + "learning_rate": 1.8100262399734102e-05, + "loss": 1.7228, + "step": 779 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 1.0649028197252695, + "learning_rate": 1.8095219162102453e-05, + "loss": 1.6456, + "step": 780 + }, + { + "epoch": 0.6323886639676113, + "grad_norm": 1.2144051445060644, + "learning_rate": 1.8090169943749477e-05, + "loss": 1.6515, + "step": 781 + }, + { + "epoch": 0.6331983805668017, + "grad_norm": 1.0813125393674639, + "learning_rate": 1.8085114748405514e-05, + "loss": 1.6634, + "step": 782 + }, + { + "epoch": 0.6340080971659919, + "grad_norm": 1.0907352727369097, + "learning_rate": 1.8080053579805333e-05, + "loss": 1.647, + "step": 783 + }, + { + "epoch": 0.6348178137651822, + "grad_norm": 1.1795860026440994, + "learning_rate": 1.8074986441688102e-05, + "loss": 1.6494, + "step": 784 + }, + { + "epoch": 0.6356275303643725, + "grad_norm": 1.0999290606147423, + "learning_rate": 1.8069913337797414e-05, + "loss": 1.6061, + "step": 785 + }, + { + "epoch": 0.6364372469635627, + "grad_norm": 1.1271587687692461, + "learning_rate": 1.8064834271881252e-05, + "loss": 1.6512, + "step": 786 + }, + { + "epoch": 0.637246963562753, + "grad_norm": 1.2475268374395798, + "learning_rate": 1.805974924769202e-05, + "loss": 1.5999, + "step": 787 + }, + { + "epoch": 0.6380566801619433, + "grad_norm": 1.1666492616855777, + "learning_rate": 1.8054658268986517e-05, + "loss": 1.6134, + "step": 788 + }, + { + "epoch": 0.6388663967611335, + "grad_norm": 1.190647434820481, + "learning_rate": 1.8049561339525938e-05, + "loss": 1.6325, + "step": 789 + }, + { + "epoch": 0.6396761133603239, + "grad_norm": 1.1567789620189808, + "learning_rate": 1.804445846307588e-05, + "loss": 1.6126, + "step": 790 + }, + { + "epoch": 0.6404858299595142, + "grad_norm": 1.1205352247753744, + "learning_rate": 1.803934964340633e-05, + "loss": 1.6759, + "step": 791 + }, + { + "epoch": 0.6412955465587045, + "grad_norm": 1.1623098699690382, + "learning_rate": 1.803423488429167e-05, + "loss": 1.6557, + "step": 792 + }, + { + "epoch": 0.6421052631578947, + "grad_norm": 1.0452603483361436, + "learning_rate": 1.8029114189510664e-05, + "loss": 1.637, + "step": 793 + }, + { + "epoch": 0.642914979757085, + "grad_norm": 1.1457994199931247, + "learning_rate": 1.8023987562846468e-05, + "loss": 1.7003, + "step": 794 + }, + { + "epoch": 0.6437246963562753, + "grad_norm": 1.1078125949520747, + "learning_rate": 1.801885500808661e-05, + "loss": 1.6411, + "step": 795 + }, + { + "epoch": 0.6445344129554655, + "grad_norm": 1.063287107792573, + "learning_rate": 1.8013716529023013e-05, + "loss": 1.5944, + "step": 796 + }, + { + "epoch": 0.6453441295546559, + "grad_norm": 1.1463230389504844, + "learning_rate": 1.8008572129451963e-05, + "loss": 1.6437, + "step": 797 + }, + { + "epoch": 0.6461538461538462, + "grad_norm": 1.0745077662710292, + "learning_rate": 1.800342181317413e-05, + "loss": 1.6141, + "step": 798 + }, + { + "epoch": 0.6469635627530365, + "grad_norm": 1.1207965270353415, + "learning_rate": 1.7998265583994544e-05, + "loss": 1.6043, + "step": 799 + }, + { + "epoch": 0.6477732793522267, + "grad_norm": 1.0890890467994665, + "learning_rate": 1.7993103445722615e-05, + "loss": 1.6425, + "step": 800 + }, + { + "epoch": 0.648582995951417, + "grad_norm": 1.13852357372144, + "learning_rate": 1.7987935402172114e-05, + "loss": 1.6762, + "step": 801 + }, + { + "epoch": 0.6493927125506073, + "grad_norm": 1.1427359144052136, + "learning_rate": 1.7982761457161175e-05, + "loss": 1.6973, + "step": 802 + }, + { + "epoch": 0.6502024291497975, + "grad_norm": 1.0845396194326615, + "learning_rate": 1.7977581614512286e-05, + "loss": 1.6781, + "step": 803 + }, + { + "epoch": 0.6510121457489878, + "grad_norm": 1.1491199289525131, + "learning_rate": 1.7972395878052304e-05, + "loss": 1.6786, + "step": 804 + }, + { + "epoch": 0.6518218623481782, + "grad_norm": 1.0901262157214715, + "learning_rate": 1.7967204251612432e-05, + "loss": 1.6037, + "step": 805 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 1.1102939312877738, + "learning_rate": 1.796200673902823e-05, + "loss": 1.6314, + "step": 806 + }, + { + "epoch": 0.6534412955465587, + "grad_norm": 1.1409639624890162, + "learning_rate": 1.7956803344139592e-05, + "loss": 1.7136, + "step": 807 + }, + { + "epoch": 0.654251012145749, + "grad_norm": 1.0991234850498368, + "learning_rate": 1.795159407079078e-05, + "loss": 1.6653, + "step": 808 + }, + { + "epoch": 0.6550607287449393, + "grad_norm": 1.1172599612597327, + "learning_rate": 1.7946378922830386e-05, + "loss": 1.625, + "step": 809 + }, + { + "epoch": 0.6558704453441295, + "grad_norm": 1.0552333200507726, + "learning_rate": 1.7941157904111346e-05, + "loss": 1.7037, + "step": 810 + }, + { + "epoch": 0.6566801619433198, + "grad_norm": 1.080338568915127, + "learning_rate": 1.7935931018490923e-05, + "loss": 1.6834, + "step": 811 + }, + { + "epoch": 0.6574898785425102, + "grad_norm": 1.086653104326212, + "learning_rate": 1.7930698269830733e-05, + "loss": 1.6364, + "step": 812 + }, + { + "epoch": 0.6582995951417004, + "grad_norm": 1.0712198372881643, + "learning_rate": 1.7925459661996707e-05, + "loss": 1.5958, + "step": 813 + }, + { + "epoch": 0.6591093117408907, + "grad_norm": 1.1256703101949748, + "learning_rate": 1.7920215198859114e-05, + "loss": 1.6405, + "step": 814 + }, + { + "epoch": 0.659919028340081, + "grad_norm": 1.1116563555135301, + "learning_rate": 1.7914964884292543e-05, + "loss": 1.6689, + "step": 815 + }, + { + "epoch": 0.6607287449392713, + "grad_norm": 1.12107713293399, + "learning_rate": 1.7909708722175914e-05, + "loss": 1.6928, + "step": 816 + }, + { + "epoch": 0.6615384615384615, + "grad_norm": 1.1619550044989124, + "learning_rate": 1.7904446716392457e-05, + "loss": 1.6034, + "step": 817 + }, + { + "epoch": 0.6623481781376518, + "grad_norm": 1.1005819448367389, + "learning_rate": 1.789917887082973e-05, + "loss": 1.6888, + "step": 818 + }, + { + "epoch": 0.6631578947368421, + "grad_norm": 1.1827182789963149, + "learning_rate": 1.7893905189379594e-05, + "loss": 1.6213, + "step": 819 + }, + { + "epoch": 0.6639676113360324, + "grad_norm": 1.1422059406503742, + "learning_rate": 1.7888625675938237e-05, + "loss": 1.6266, + "step": 820 + }, + { + "epoch": 0.6647773279352227, + "grad_norm": 1.0470507697677474, + "learning_rate": 1.7883340334406136e-05, + "loss": 1.6561, + "step": 821 + }, + { + "epoch": 0.665587044534413, + "grad_norm": 1.1216106844368927, + "learning_rate": 1.7878049168688087e-05, + "loss": 1.6705, + "step": 822 + }, + { + "epoch": 0.6663967611336032, + "grad_norm": 1.2138659468039459, + "learning_rate": 1.787275218269319e-05, + "loss": 1.6336, + "step": 823 + }, + { + "epoch": 0.6672064777327935, + "grad_norm": 1.0986476320949485, + "learning_rate": 1.7867449380334834e-05, + "loss": 1.7078, + "step": 824 + }, + { + "epoch": 0.6680161943319838, + "grad_norm": 1.288830051080927, + "learning_rate": 1.7862140765530718e-05, + "loss": 1.6254, + "step": 825 + }, + { + "epoch": 0.668825910931174, + "grad_norm": 1.1447438193383035, + "learning_rate": 1.7856826342202828e-05, + "loss": 1.6201, + "step": 826 + }, + { + "epoch": 0.6696356275303643, + "grad_norm": 1.1649226403537143, + "learning_rate": 1.785150611427744e-05, + "loss": 1.6864, + "step": 827 + }, + { + "epoch": 0.6704453441295547, + "grad_norm": 1.118218283333164, + "learning_rate": 1.7846180085685122e-05, + "loss": 1.6007, + "step": 828 + }, + { + "epoch": 0.671255060728745, + "grad_norm": 1.1014855871694855, + "learning_rate": 1.7840848260360728e-05, + "loss": 1.6499, + "step": 829 + }, + { + "epoch": 0.6720647773279352, + "grad_norm": 1.2218105006616033, + "learning_rate": 1.783551064224339e-05, + "loss": 1.6338, + "step": 830 + }, + { + "epoch": 0.6728744939271255, + "grad_norm": 1.1476489793760862, + "learning_rate": 1.7830167235276524e-05, + "loss": 1.6073, + "step": 831 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 1.1653286803987006, + "learning_rate": 1.7824818043407828e-05, + "loss": 1.5989, + "step": 832 + }, + { + "epoch": 0.674493927125506, + "grad_norm": 1.1159344507824178, + "learning_rate": 1.7819463070589256e-05, + "loss": 1.6211, + "step": 833 + }, + { + "epoch": 0.6753036437246963, + "grad_norm": 1.0909665741148316, + "learning_rate": 1.781410232077705e-05, + "loss": 1.6052, + "step": 834 + }, + { + "epoch": 0.6761133603238867, + "grad_norm": 1.1405758039320455, + "learning_rate": 1.7808735797931715e-05, + "loss": 1.7158, + "step": 835 + }, + { + "epoch": 0.676923076923077, + "grad_norm": 1.1662611008186603, + "learning_rate": 1.780336350601802e-05, + "loss": 1.714, + "step": 836 + }, + { + "epoch": 0.6777327935222672, + "grad_norm": 1.1318426366131789, + "learning_rate": 1.7797985449004996e-05, + "loss": 1.6362, + "step": 837 + }, + { + "epoch": 0.6785425101214575, + "grad_norm": 1.1122001699040855, + "learning_rate": 1.7792601630865937e-05, + "loss": 1.6337, + "step": 838 + }, + { + "epoch": 0.6793522267206478, + "grad_norm": 1.1309770551503653, + "learning_rate": 1.7787212055578383e-05, + "loss": 1.5989, + "step": 839 + }, + { + "epoch": 0.680161943319838, + "grad_norm": 1.1673264771022611, + "learning_rate": 1.7781816727124138e-05, + "loss": 1.6649, + "step": 840 + }, + { + "epoch": 0.6809716599190283, + "grad_norm": 1.1003607212760034, + "learning_rate": 1.7776415649489257e-05, + "loss": 1.6183, + "step": 841 + }, + { + "epoch": 0.6817813765182186, + "grad_norm": 1.1092869043197529, + "learning_rate": 1.7771008826664036e-05, + "loss": 1.6678, + "step": 842 + }, + { + "epoch": 0.682591093117409, + "grad_norm": 1.2211528749408376, + "learning_rate": 1.7765596262643013e-05, + "loss": 1.6584, + "step": 843 + }, + { + "epoch": 0.6834008097165992, + "grad_norm": 1.1758598116755494, + "learning_rate": 1.776017796142498e-05, + "loss": 1.6415, + "step": 844 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 1.1767326941123255, + "learning_rate": 1.7754753927012955e-05, + "loss": 1.6265, + "step": 845 + }, + { + "epoch": 0.6850202429149798, + "grad_norm": 1.1747867351240824, + "learning_rate": 1.77493241634142e-05, + "loss": 1.625, + "step": 846 + }, + { + "epoch": 0.68582995951417, + "grad_norm": 1.1345683268627573, + "learning_rate": 1.7743888674640203e-05, + "loss": 1.6418, + "step": 847 + }, + { + "epoch": 0.6866396761133603, + "grad_norm": 1.1283603891809075, + "learning_rate": 1.773844746470669e-05, + "loss": 1.6412, + "step": 848 + }, + { + "epoch": 0.6874493927125506, + "grad_norm": 1.1378377091313905, + "learning_rate": 1.7733000537633605e-05, + "loss": 1.6298, + "step": 849 + }, + { + "epoch": 0.6882591093117408, + "grad_norm": 1.1253702667997967, + "learning_rate": 1.7727547897445117e-05, + "loss": 1.6472, + "step": 850 + }, + { + "epoch": 0.6890688259109312, + "grad_norm": 1.1529285701924186, + "learning_rate": 1.772208954816963e-05, + "loss": 1.6063, + "step": 851 + }, + { + "epoch": 0.6898785425101215, + "grad_norm": 1.2088268908844393, + "learning_rate": 1.771662549383974e-05, + "loss": 1.6893, + "step": 852 + }, + { + "epoch": 0.6906882591093118, + "grad_norm": 1.1008137117384047, + "learning_rate": 1.7711155738492286e-05, + "loss": 1.7038, + "step": 853 + }, + { + "epoch": 0.691497975708502, + "grad_norm": 1.1203578405697336, + "learning_rate": 1.7705680286168297e-05, + "loss": 1.691, + "step": 854 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 1.0904874884732818, + "learning_rate": 1.770019914091302e-05, + "loss": 1.66, + "step": 855 + }, + { + "epoch": 0.6931174089068826, + "grad_norm": 1.2342232334641767, + "learning_rate": 1.769471230677591e-05, + "loss": 1.6029, + "step": 856 + }, + { + "epoch": 0.6939271255060728, + "grad_norm": 1.1892221297007457, + "learning_rate": 1.7689219787810618e-05, + "loss": 1.6115, + "step": 857 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 1.1436275460030911, + "learning_rate": 1.7683721588075005e-05, + "loss": 1.6779, + "step": 858 + }, + { + "epoch": 0.6955465587044535, + "grad_norm": 1.0531746133706077, + "learning_rate": 1.7678217711631115e-05, + "loss": 1.6254, + "step": 859 + }, + { + "epoch": 0.6963562753036437, + "grad_norm": 1.204345691882841, + "learning_rate": 1.76727081625452e-05, + "loss": 1.7005, + "step": 860 + }, + { + "epoch": 0.697165991902834, + "grad_norm": 1.1113015688625698, + "learning_rate": 1.7667192944887696e-05, + "loss": 1.6242, + "step": 861 + }, + { + "epoch": 0.6979757085020243, + "grad_norm": 1.0642180179681062, + "learning_rate": 1.7661672062733226e-05, + "loss": 1.6057, + "step": 862 + }, + { + "epoch": 0.6987854251012146, + "grad_norm": 1.1074568311327786, + "learning_rate": 1.76561455201606e-05, + "loss": 1.6569, + "step": 863 + }, + { + "epoch": 0.6995951417004048, + "grad_norm": 1.1475467339069338, + "learning_rate": 1.765061332125281e-05, + "loss": 1.6668, + "step": 864 + }, + { + "epoch": 0.7004048582995951, + "grad_norm": 1.0613765631942633, + "learning_rate": 1.7645075470097024e-05, + "loss": 1.6471, + "step": 865 + }, + { + "epoch": 0.7012145748987855, + "grad_norm": 1.1262125184431948, + "learning_rate": 1.7639531970784594e-05, + "loss": 1.632, + "step": 866 + }, + { + "epoch": 0.7020242914979757, + "grad_norm": 1.053405645515529, + "learning_rate": 1.763398282741103e-05, + "loss": 1.6628, + "step": 867 + }, + { + "epoch": 0.702834008097166, + "grad_norm": 1.053485822528232, + "learning_rate": 1.762842804407603e-05, + "loss": 1.6268, + "step": 868 + }, + { + "epoch": 0.7036437246963563, + "grad_norm": 1.1067274819740494, + "learning_rate": 1.7622867624883446e-05, + "loss": 1.6268, + "step": 869 + }, + { + "epoch": 0.7044534412955465, + "grad_norm": 1.1365300210950073, + "learning_rate": 1.7617301573941296e-05, + "loss": 1.6751, + "step": 870 + }, + { + "epoch": 0.7052631578947368, + "grad_norm": 1.0944652038336933, + "learning_rate": 1.7611729895361763e-05, + "loss": 1.7041, + "step": 871 + }, + { + "epoch": 0.7060728744939271, + "grad_norm": 1.0536399181541465, + "learning_rate": 1.760615259326118e-05, + "loss": 1.6283, + "step": 872 + }, + { + "epoch": 0.7068825910931175, + "grad_norm": 1.0489723553006804, + "learning_rate": 1.760056967176005e-05, + "loss": 1.7136, + "step": 873 + }, + { + "epoch": 0.7076923076923077, + "grad_norm": 1.0719751555919763, + "learning_rate": 1.7594981134983003e-05, + "loss": 1.6001, + "step": 874 + }, + { + "epoch": 0.708502024291498, + "grad_norm": 1.1499881882739111, + "learning_rate": 1.758938698705884e-05, + "loss": 1.677, + "step": 875 + }, + { + "epoch": 0.7093117408906883, + "grad_norm": 1.101761750936134, + "learning_rate": 1.75837872321205e-05, + "loss": 1.5747, + "step": 876 + }, + { + "epoch": 0.7101214574898785, + "grad_norm": 1.077980130931622, + "learning_rate": 1.757818187430506e-05, + "loss": 1.6827, + "step": 877 + }, + { + "epoch": 0.7109311740890688, + "grad_norm": 1.0897713278469439, + "learning_rate": 1.757257091775374e-05, + "loss": 1.659, + "step": 878 + }, + { + "epoch": 0.7117408906882591, + "grad_norm": 1.4838670869876838, + "learning_rate": 1.7566954366611896e-05, + "loss": 1.6481, + "step": 879 + }, + { + "epoch": 0.7125506072874493, + "grad_norm": 1.1522319490945014, + "learning_rate": 1.7561332225029022e-05, + "loss": 1.6426, + "step": 880 + }, + { + "epoch": 0.7133603238866397, + "grad_norm": 1.1601026341967655, + "learning_rate": 1.7555704497158734e-05, + "loss": 1.6299, + "step": 881 + }, + { + "epoch": 0.71417004048583, + "grad_norm": 1.0844245975035776, + "learning_rate": 1.755007118715878e-05, + "loss": 1.6366, + "step": 882 + }, + { + "epoch": 0.7149797570850203, + "grad_norm": 1.106288732347795, + "learning_rate": 1.754443229919103e-05, + "loss": 1.6275, + "step": 883 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 1.0464400021747562, + "learning_rate": 1.7538787837421475e-05, + "loss": 1.5273, + "step": 884 + }, + { + "epoch": 0.7165991902834008, + "grad_norm": 1.1156141109394004, + "learning_rate": 1.7533137806020226e-05, + "loss": 1.6415, + "step": 885 + }, + { + "epoch": 0.7174089068825911, + "grad_norm": 1.1617480668291316, + "learning_rate": 1.752748220916151e-05, + "loss": 1.6285, + "step": 886 + }, + { + "epoch": 0.7182186234817813, + "grad_norm": 1.0663413051063326, + "learning_rate": 1.752182105102366e-05, + "loss": 1.6715, + "step": 887 + }, + { + "epoch": 0.7190283400809716, + "grad_norm": 1.1351571282717112, + "learning_rate": 1.7516154335789123e-05, + "loss": 1.676, + "step": 888 + }, + { + "epoch": 0.719838056680162, + "grad_norm": 1.0745374695294985, + "learning_rate": 1.751048206764445e-05, + "loss": 1.6808, + "step": 889 + }, + { + "epoch": 0.7206477732793523, + "grad_norm": 1.1821923099732665, + "learning_rate": 1.7504804250780292e-05, + "loss": 1.6717, + "step": 890 + }, + { + "epoch": 0.7214574898785425, + "grad_norm": 1.1136129753607058, + "learning_rate": 1.7499120889391403e-05, + "loss": 1.6181, + "step": 891 + }, + { + "epoch": 0.7222672064777328, + "grad_norm": 1.114536036094605, + "learning_rate": 1.7493431987676628e-05, + "loss": 1.6641, + "step": 892 + }, + { + "epoch": 0.7230769230769231, + "grad_norm": 1.2106559019950824, + "learning_rate": 1.7487737549838915e-05, + "loss": 1.673, + "step": 893 + }, + { + "epoch": 0.7238866396761133, + "grad_norm": 1.1049671120265863, + "learning_rate": 1.748203758008529e-05, + "loss": 1.6829, + "step": 894 + }, + { + "epoch": 0.7246963562753036, + "grad_norm": 1.0330699329285005, + "learning_rate": 1.747633208262688e-05, + "loss": 1.5981, + "step": 895 + }, + { + "epoch": 0.725506072874494, + "grad_norm": 1.122795995838303, + "learning_rate": 1.747062106167888e-05, + "loss": 1.6874, + "step": 896 + }, + { + "epoch": 0.7263157894736842, + "grad_norm": 1.0411327718132521, + "learning_rate": 1.7464904521460574e-05, + "loss": 1.6655, + "step": 897 + }, + { + "epoch": 0.7271255060728745, + "grad_norm": 1.094203881932362, + "learning_rate": 1.7459182466195328e-05, + "loss": 1.6611, + "step": 898 + }, + { + "epoch": 0.7279352226720648, + "grad_norm": 1.1197343663542865, + "learning_rate": 1.7453454900110575e-05, + "loss": 1.6484, + "step": 899 + }, + { + "epoch": 0.728744939271255, + "grad_norm": 1.0891684716999215, + "learning_rate": 1.744772182743782e-05, + "loss": 1.6823, + "step": 900 + }, + { + "epoch": 0.7295546558704453, + "grad_norm": 1.0117179095825084, + "learning_rate": 1.744198325241264e-05, + "loss": 1.5874, + "step": 901 + }, + { + "epoch": 0.7303643724696356, + "grad_norm": 1.11669367321381, + "learning_rate": 1.743623917927468e-05, + "loss": 1.6327, + "step": 902 + }, + { + "epoch": 0.7311740890688259, + "grad_norm": 1.0477262743830713, + "learning_rate": 1.7430489612267634e-05, + "loss": 1.5939, + "step": 903 + }, + { + "epoch": 0.7319838056680162, + "grad_norm": 1.1755335589263856, + "learning_rate": 1.742473455563927e-05, + "loss": 1.6671, + "step": 904 + }, + { + "epoch": 0.7327935222672065, + "grad_norm": 1.1271564518170447, + "learning_rate": 1.74189740136414e-05, + "loss": 1.6179, + "step": 905 + }, + { + "epoch": 0.7336032388663968, + "grad_norm": 1.1024780353509462, + "learning_rate": 1.7413207990529897e-05, + "loss": 1.6315, + "step": 906 + }, + { + "epoch": 0.734412955465587, + "grad_norm": 1.057413980088522, + "learning_rate": 1.7407436490564675e-05, + "loss": 1.6019, + "step": 907 + }, + { + "epoch": 0.7352226720647773, + "grad_norm": 1.0903114614208873, + "learning_rate": 1.740165951800971e-05, + "loss": 1.6435, + "step": 908 + }, + { + "epoch": 0.7360323886639676, + "grad_norm": 1.1465351066882732, + "learning_rate": 1.7395877077132996e-05, + "loss": 1.6659, + "step": 909 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 1.0879106203040465, + "learning_rate": 1.7390089172206594e-05, + "loss": 1.6035, + "step": 910 + }, + { + "epoch": 0.7376518218623481, + "grad_norm": 1.0251909256160818, + "learning_rate": 1.738429580750658e-05, + "loss": 1.6751, + "step": 911 + }, + { + "epoch": 0.7384615384615385, + "grad_norm": 1.101454173734882, + "learning_rate": 1.7378496987313078e-05, + "loss": 1.6924, + "step": 912 + }, + { + "epoch": 0.7392712550607288, + "grad_norm": 1.0632875331068412, + "learning_rate": 1.7372692715910236e-05, + "loss": 1.7052, + "step": 913 + }, + { + "epoch": 0.740080971659919, + "grad_norm": 1.2019849547754717, + "learning_rate": 1.736688299758623e-05, + "loss": 1.6453, + "step": 914 + }, + { + "epoch": 0.7408906882591093, + "grad_norm": 1.0424618834849857, + "learning_rate": 1.736106783663326e-05, + "loss": 1.6095, + "step": 915 + }, + { + "epoch": 0.7417004048582996, + "grad_norm": 1.119522754130525, + "learning_rate": 1.735524723734755e-05, + "loss": 1.5714, + "step": 916 + }, + { + "epoch": 0.7425101214574898, + "grad_norm": 1.1126763401822644, + "learning_rate": 1.7349421204029343e-05, + "loss": 1.5884, + "step": 917 + }, + { + "epoch": 0.7433198380566801, + "grad_norm": 1.1436206847704529, + "learning_rate": 1.734358974098288e-05, + "loss": 1.6716, + "step": 918 + }, + { + "epoch": 0.7441295546558705, + "grad_norm": 1.1033761896656453, + "learning_rate": 1.7337752852516443e-05, + "loss": 1.6642, + "step": 919 + }, + { + "epoch": 0.7449392712550608, + "grad_norm": 1.0771058808303946, + "learning_rate": 1.7331910542942298e-05, + "loss": 1.6655, + "step": 920 + }, + { + "epoch": 0.745748987854251, + "grad_norm": 1.1540799878521129, + "learning_rate": 1.732606281657672e-05, + "loss": 1.6533, + "step": 921 + }, + { + "epoch": 0.7465587044534413, + "grad_norm": 1.1632383389035768, + "learning_rate": 1.732020967774e-05, + "loss": 1.6251, + "step": 922 + }, + { + "epoch": 0.7473684210526316, + "grad_norm": 1.0554497626152348, + "learning_rate": 1.7314351130756412e-05, + "loss": 1.6913, + "step": 923 + }, + { + "epoch": 0.7481781376518218, + "grad_norm": 1.0558743109746518, + "learning_rate": 1.7308487179954233e-05, + "loss": 1.6377, + "step": 924 + }, + { + "epoch": 0.7489878542510121, + "grad_norm": 1.0485799147334454, + "learning_rate": 1.7302617829665725e-05, + "loss": 1.6837, + "step": 925 + }, + { + "epoch": 0.7497975708502024, + "grad_norm": 1.1396931481843333, + "learning_rate": 1.729674308422715e-05, + "loss": 1.6475, + "step": 926 + }, + { + "epoch": 0.7506072874493928, + "grad_norm": 1.089831914564661, + "learning_rate": 1.7290862947978753e-05, + "loss": 1.6612, + "step": 927 + }, + { + "epoch": 0.751417004048583, + "grad_norm": 1.214076933636336, + "learning_rate": 1.7284977425264755e-05, + "loss": 1.6558, + "step": 928 + }, + { + "epoch": 0.7522267206477733, + "grad_norm": 1.0860938684718011, + "learning_rate": 1.727908652043336e-05, + "loss": 1.6885, + "step": 929 + }, + { + "epoch": 0.7530364372469636, + "grad_norm": 1.101065862505211, + "learning_rate": 1.7273190237836757e-05, + "loss": 1.6578, + "step": 930 + }, + { + "epoch": 0.7538461538461538, + "grad_norm": 1.1342711520778153, + "learning_rate": 1.726728858183109e-05, + "loss": 1.5637, + "step": 931 + }, + { + "epoch": 0.7546558704453441, + "grad_norm": 1.0939363758915137, + "learning_rate": 1.726138155677649e-05, + "loss": 1.6303, + "step": 932 + }, + { + "epoch": 0.7554655870445344, + "grad_norm": 1.0310679045269706, + "learning_rate": 1.725546916703704e-05, + "loss": 1.5678, + "step": 933 + }, + { + "epoch": 0.7562753036437248, + "grad_norm": 1.1728811689558527, + "learning_rate": 1.7249551416980806e-05, + "loss": 1.6611, + "step": 934 + }, + { + "epoch": 0.757085020242915, + "grad_norm": 1.1888012056779975, + "learning_rate": 1.7243628310979793e-05, + "loss": 1.6931, + "step": 935 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 1.1056546162441991, + "learning_rate": 1.7237699853409974e-05, + "loss": 1.6788, + "step": 936 + }, + { + "epoch": 0.7587044534412956, + "grad_norm": 1.1020429695381275, + "learning_rate": 1.7231766048651272e-05, + "loss": 1.6557, + "step": 937 + }, + { + "epoch": 0.7595141700404858, + "grad_norm": 1.174830307710626, + "learning_rate": 1.722582690108757e-05, + "loss": 1.5964, + "step": 938 + }, + { + "epoch": 0.7603238866396761, + "grad_norm": 1.1111495814988965, + "learning_rate": 1.7219882415106685e-05, + "loss": 1.6547, + "step": 939 + }, + { + "epoch": 0.7611336032388664, + "grad_norm": 1.1511962350490403, + "learning_rate": 1.7213932595100384e-05, + "loss": 1.6447, + "step": 940 + }, + { + "epoch": 0.7619433198380566, + "grad_norm": 1.1124431654124347, + "learning_rate": 1.7207977445464374e-05, + "loss": 1.6756, + "step": 941 + }, + { + "epoch": 0.762753036437247, + "grad_norm": 1.094632192703889, + "learning_rate": 1.7202016970598303e-05, + "loss": 1.6216, + "step": 942 + }, + { + "epoch": 0.7635627530364373, + "grad_norm": 1.0848286346166132, + "learning_rate": 1.7196051174905746e-05, + "loss": 1.6162, + "step": 943 + }, + { + "epoch": 0.7643724696356275, + "grad_norm": 1.117835402570124, + "learning_rate": 1.719008006279422e-05, + "loss": 1.6988, + "step": 944 + }, + { + "epoch": 0.7651821862348178, + "grad_norm": 1.0695545232404842, + "learning_rate": 1.7184103638675157e-05, + "loss": 1.6417, + "step": 945 + }, + { + "epoch": 0.7659919028340081, + "grad_norm": 1.2187648019898212, + "learning_rate": 1.7178121906963925e-05, + "loss": 1.5581, + "step": 946 + }, + { + "epoch": 0.7668016194331984, + "grad_norm": 1.1428081923285947, + "learning_rate": 1.71721348720798e-05, + "loss": 1.5958, + "step": 947 + }, + { + "epoch": 0.7676113360323886, + "grad_norm": 1.1258009592099063, + "learning_rate": 1.7166142538445986e-05, + "loss": 1.7054, + "step": 948 + }, + { + "epoch": 0.7684210526315789, + "grad_norm": 1.122110087454184, + "learning_rate": 1.7160144910489602e-05, + "loss": 1.6479, + "step": 949 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 1.2197448941368936, + "learning_rate": 1.715414199264168e-05, + "loss": 1.6362, + "step": 950 + }, + { + "epoch": 0.7700404858299595, + "grad_norm": 1.081986455490488, + "learning_rate": 1.7148133789337145e-05, + "loss": 1.6376, + "step": 951 + }, + { + "epoch": 0.7708502024291498, + "grad_norm": 1.0866654156187887, + "learning_rate": 1.7142120305014848e-05, + "loss": 1.6275, + "step": 952 + }, + { + "epoch": 0.7716599190283401, + "grad_norm": 1.1078330177065072, + "learning_rate": 1.7136101544117526e-05, + "loss": 1.62, + "step": 953 + }, + { + "epoch": 0.7724696356275303, + "grad_norm": 1.0817145396361252, + "learning_rate": 1.7130077511091817e-05, + "loss": 1.6552, + "step": 954 + }, + { + "epoch": 0.7732793522267206, + "grad_norm": 1.105540137190186, + "learning_rate": 1.7124048210388268e-05, + "loss": 1.6459, + "step": 955 + }, + { + "epoch": 0.7740890688259109, + "grad_norm": 1.0847839201754785, + "learning_rate": 1.7118013646461295e-05, + "loss": 1.6759, + "step": 956 + }, + { + "epoch": 0.7748987854251013, + "grad_norm": 1.084427169214924, + "learning_rate": 1.711197382376922e-05, + "loss": 1.6833, + "step": 957 + }, + { + "epoch": 0.7757085020242915, + "grad_norm": 1.131195586447811, + "learning_rate": 1.710592874677424e-05, + "loss": 1.6091, + "step": 958 + }, + { + "epoch": 0.7765182186234818, + "grad_norm": 1.0521712965264904, + "learning_rate": 1.7099878419942444e-05, + "loss": 1.6319, + "step": 959 + }, + { + "epoch": 0.7773279352226721, + "grad_norm": 1.0638585192349859, + "learning_rate": 1.709382284774379e-05, + "loss": 1.6039, + "step": 960 + }, + { + "epoch": 0.7781376518218623, + "grad_norm": 1.1423083079533596, + "learning_rate": 1.7087762034652113e-05, + "loss": 1.672, + "step": 961 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 1.1028125449584352, + "learning_rate": 1.7081695985145124e-05, + "loss": 1.6914, + "step": 962 + }, + { + "epoch": 0.7797570850202429, + "grad_norm": 1.1268022996799139, + "learning_rate": 1.7075624703704404e-05, + "loss": 1.7132, + "step": 963 + }, + { + "epoch": 0.7805668016194331, + "grad_norm": 1.1209367289547245, + "learning_rate": 1.7069548194815387e-05, + "loss": 1.6357, + "step": 964 + }, + { + "epoch": 0.7813765182186235, + "grad_norm": 1.1887708228854563, + "learning_rate": 1.706346646296739e-05, + "loss": 1.5665, + "step": 965 + }, + { + "epoch": 0.7821862348178138, + "grad_norm": 1.118443433275976, + "learning_rate": 1.7057379512653565e-05, + "loss": 1.6763, + "step": 966 + }, + { + "epoch": 0.7829959514170041, + "grad_norm": 1.0454329186401545, + "learning_rate": 1.7051287348370934e-05, + "loss": 1.5866, + "step": 967 + }, + { + "epoch": 0.7838056680161943, + "grad_norm": 1.076636884210294, + "learning_rate": 1.704518997462037e-05, + "loss": 1.6171, + "step": 968 + }, + { + "epoch": 0.7846153846153846, + "grad_norm": 1.02837143519001, + "learning_rate": 1.7039087395906593e-05, + "loss": 1.6411, + "step": 969 + }, + { + "epoch": 0.7854251012145749, + "grad_norm": 1.0924312535153355, + "learning_rate": 1.7032979616738167e-05, + "loss": 1.6169, + "step": 970 + }, + { + "epoch": 0.7862348178137651, + "grad_norm": 1.0621344949493872, + "learning_rate": 1.7026866641627503e-05, + "loss": 1.6543, + "step": 971 + }, + { + "epoch": 0.7870445344129555, + "grad_norm": 1.0840593933228118, + "learning_rate": 1.7020748475090835e-05, + "loss": 1.6604, + "step": 972 + }, + { + "epoch": 0.7878542510121458, + "grad_norm": 1.0324309043206548, + "learning_rate": 1.701462512164826e-05, + "loss": 1.6547, + "step": 973 + }, + { + "epoch": 0.7886639676113361, + "grad_norm": 1.1347770119446707, + "learning_rate": 1.700849658582368e-05, + "loss": 1.6716, + "step": 974 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 1.1772883414899646, + "learning_rate": 1.7002362872144843e-05, + "loss": 1.6968, + "step": 975 + }, + { + "epoch": 0.7902834008097166, + "grad_norm": 1.1298734658918796, + "learning_rate": 1.6996223985143314e-05, + "loss": 1.7227, + "step": 976 + }, + { + "epoch": 0.7910931174089069, + "grad_norm": 1.0893915856547574, + "learning_rate": 1.6990079929354485e-05, + "loss": 1.6051, + "step": 977 + }, + { + "epoch": 0.7919028340080971, + "grad_norm": 1.0564375360450058, + "learning_rate": 1.698393070931756e-05, + "loss": 1.6565, + "step": 978 + }, + { + "epoch": 0.7927125506072874, + "grad_norm": 1.0465087498034826, + "learning_rate": 1.6977776329575566e-05, + "loss": 1.6737, + "step": 979 + }, + { + "epoch": 0.7935222672064778, + "grad_norm": 1.0938693523776055, + "learning_rate": 1.697161679467534e-05, + "loss": 1.6149, + "step": 980 + }, + { + "epoch": 0.794331983805668, + "grad_norm": 1.1328659849792375, + "learning_rate": 1.696545210916752e-05, + "loss": 1.6203, + "step": 981 + }, + { + "epoch": 0.7951417004048583, + "grad_norm": 1.1231511367270903, + "learning_rate": 1.695928227760656e-05, + "loss": 1.7056, + "step": 982 + }, + { + "epoch": 0.7959514170040486, + "grad_norm": 1.1090374560264693, + "learning_rate": 1.6953107304550714e-05, + "loss": 1.5986, + "step": 983 + }, + { + "epoch": 0.7967611336032389, + "grad_norm": 1.0770482845738483, + "learning_rate": 1.694692719456202e-05, + "loss": 1.6314, + "step": 984 + }, + { + "epoch": 0.7975708502024291, + "grad_norm": 1.0762090702083147, + "learning_rate": 1.6940741952206342e-05, + "loss": 1.6346, + "step": 985 + }, + { + "epoch": 0.7983805668016194, + "grad_norm": 1.0669244133657225, + "learning_rate": 1.69345515820533e-05, + "loss": 1.6763, + "step": 986 + }, + { + "epoch": 0.7991902834008097, + "grad_norm": 1.062879048024205, + "learning_rate": 1.6928356088676325e-05, + "loss": 1.7014, + "step": 987 + }, + { + "epoch": 0.8, + "grad_norm": 1.0947855744497017, + "learning_rate": 1.6922155476652625e-05, + "loss": 1.631, + "step": 988 + }, + { + "epoch": 0.8008097165991903, + "grad_norm": 1.034452979238415, + "learning_rate": 1.6915949750563202e-05, + "loss": 1.6141, + "step": 989 + }, + { + "epoch": 0.8016194331983806, + "grad_norm": 1.0485411735651209, + "learning_rate": 1.6909738914992812e-05, + "loss": 1.6518, + "step": 990 + }, + { + "epoch": 0.8024291497975709, + "grad_norm": 1.055581466504577, + "learning_rate": 1.6903522974530005e-05, + "loss": 1.613, + "step": 991 + }, + { + "epoch": 0.8032388663967611, + "grad_norm": 1.0560876471593497, + "learning_rate": 1.6897301933767103e-05, + "loss": 1.7046, + "step": 992 + }, + { + "epoch": 0.8040485829959514, + "grad_norm": 1.1003223798755561, + "learning_rate": 1.6891075797300184e-05, + "loss": 1.6625, + "step": 993 + }, + { + "epoch": 0.8048582995951417, + "grad_norm": 1.1021700834191077, + "learning_rate": 1.6884844569729097e-05, + "loss": 1.6678, + "step": 994 + }, + { + "epoch": 0.805668016194332, + "grad_norm": 1.1166959526421196, + "learning_rate": 1.6878608255657457e-05, + "loss": 1.6381, + "step": 995 + }, + { + "epoch": 0.8064777327935223, + "grad_norm": 1.065512317839742, + "learning_rate": 1.687236685969263e-05, + "loss": 1.6296, + "step": 996 + }, + { + "epoch": 0.8072874493927126, + "grad_norm": 1.1156855620907848, + "learning_rate": 1.6866120386445737e-05, + "loss": 1.6858, + "step": 997 + }, + { + "epoch": 0.8080971659919028, + "grad_norm": 1.0953706109586931, + "learning_rate": 1.6859868840531654e-05, + "loss": 1.7635, + "step": 998 + }, + { + "epoch": 0.8089068825910931, + "grad_norm": 1.0233507628776497, + "learning_rate": 1.6853612226569005e-05, + "loss": 1.6315, + "step": 999 + }, + { + "epoch": 0.8097165991902834, + "grad_norm": 1.1751028012727203, + "learning_rate": 1.6847350549180148e-05, + "loss": 1.6798, + "step": 1000 + }, + { + "epoch": 0.8105263157894737, + "grad_norm": 1.128106775328797, + "learning_rate": 1.68410838129912e-05, + "loss": 1.667, + "step": 1001 + }, + { + "epoch": 0.8113360323886639, + "grad_norm": 1.0812082492824788, + "learning_rate": 1.6834812022632e-05, + "loss": 1.6104, + "step": 1002 + }, + { + "epoch": 0.8121457489878543, + "grad_norm": 1.1490248467998732, + "learning_rate": 1.682853518273612e-05, + "loss": 1.6542, + "step": 1003 + }, + { + "epoch": 0.8129554655870446, + "grad_norm": 1.0715868026423407, + "learning_rate": 1.6822253297940876e-05, + "loss": 1.6435, + "step": 1004 + }, + { + "epoch": 0.8137651821862348, + "grad_norm": 1.117058675593047, + "learning_rate": 1.6815966372887305e-05, + "loss": 1.6082, + "step": 1005 + }, + { + "epoch": 0.8145748987854251, + "grad_norm": 1.1438973084718087, + "learning_rate": 1.6809674412220166e-05, + "loss": 1.6409, + "step": 1006 + }, + { + "epoch": 0.8153846153846154, + "grad_norm": 1.1167195347301986, + "learning_rate": 1.6803377420587935e-05, + "loss": 1.6139, + "step": 1007 + }, + { + "epoch": 0.8161943319838056, + "grad_norm": 1.050529006455886, + "learning_rate": 1.679707540264281e-05, + "loss": 1.6202, + "step": 1008 + }, + { + "epoch": 0.8170040485829959, + "grad_norm": 1.0456426778952734, + "learning_rate": 1.6790768363040704e-05, + "loss": 1.6511, + "step": 1009 + }, + { + "epoch": 0.8178137651821862, + "grad_norm": 1.0744209810818393, + "learning_rate": 1.6784456306441234e-05, + "loss": 1.6343, + "step": 1010 + }, + { + "epoch": 0.8186234817813766, + "grad_norm": 1.0454475493984707, + "learning_rate": 1.6778139237507728e-05, + "loss": 1.6411, + "step": 1011 + }, + { + "epoch": 0.8194331983805668, + "grad_norm": 1.1450988763255063, + "learning_rate": 1.6771817160907214e-05, + "loss": 1.6547, + "step": 1012 + }, + { + "epoch": 0.8202429149797571, + "grad_norm": 1.0820294221210558, + "learning_rate": 1.6765490081310426e-05, + "loss": 1.7049, + "step": 1013 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 1.0795056916204937, + "learning_rate": 1.6759158003391783e-05, + "loss": 1.6503, + "step": 1014 + }, + { + "epoch": 0.8218623481781376, + "grad_norm": 1.0754101028602392, + "learning_rate": 1.675282093182941e-05, + "loss": 1.6015, + "step": 1015 + }, + { + "epoch": 0.8226720647773279, + "grad_norm": 1.0902544138244956, + "learning_rate": 1.674647887130511e-05, + "loss": 1.6625, + "step": 1016 + }, + { + "epoch": 0.8234817813765182, + "grad_norm": 1.0728952819859472, + "learning_rate": 1.674013182650438e-05, + "loss": 1.6318, + "step": 1017 + }, + { + "epoch": 0.8242914979757086, + "grad_norm": 1.087191133019499, + "learning_rate": 1.673377980211639e-05, + "loss": 1.607, + "step": 1018 + }, + { + "epoch": 0.8251012145748988, + "grad_norm": 1.092471584016294, + "learning_rate": 1.6727422802834e-05, + "loss": 1.6151, + "step": 1019 + }, + { + "epoch": 0.8259109311740891, + "grad_norm": 1.0942633386482077, + "learning_rate": 1.672106083335374e-05, + "loss": 1.6538, + "step": 1020 + }, + { + "epoch": 0.8267206477732794, + "grad_norm": 1.0611257052505665, + "learning_rate": 1.671469389837581e-05, + "loss": 1.6813, + "step": 1021 + }, + { + "epoch": 0.8275303643724696, + "grad_norm": 1.1243190686891869, + "learning_rate": 1.6708322002604085e-05, + "loss": 1.6294, + "step": 1022 + }, + { + "epoch": 0.8283400809716599, + "grad_norm": 1.1202657006614518, + "learning_rate": 1.6701945150746094e-05, + "loss": 1.6243, + "step": 1023 + }, + { + "epoch": 0.8291497975708502, + "grad_norm": 1.1727083505566944, + "learning_rate": 1.6695563347513036e-05, + "loss": 1.631, + "step": 1024 + }, + { + "epoch": 0.8299595141700404, + "grad_norm": 1.0959511539538467, + "learning_rate": 1.6689176597619773e-05, + "loss": 1.7506, + "step": 1025 + }, + { + "epoch": 0.8307692307692308, + "grad_norm": 1.1456195238430023, + "learning_rate": 1.6682784905784808e-05, + "loss": 1.6222, + "step": 1026 + }, + { + "epoch": 0.8315789473684211, + "grad_norm": 1.1142846493104133, + "learning_rate": 1.6676388276730305e-05, + "loss": 1.5693, + "step": 1027 + }, + { + "epoch": 0.8323886639676114, + "grad_norm": 1.054064976471123, + "learning_rate": 1.6669986715182064e-05, + "loss": 1.677, + "step": 1028 + }, + { + "epoch": 0.8331983805668016, + "grad_norm": 1.026287560156129, + "learning_rate": 1.6663580225869554e-05, + "loss": 1.711, + "step": 1029 + }, + { + "epoch": 0.8340080971659919, + "grad_norm": 1.1059699325557255, + "learning_rate": 1.6657168813525855e-05, + "loss": 1.6314, + "step": 1030 + }, + { + "epoch": 0.8348178137651822, + "grad_norm": 1.0663812567827384, + "learning_rate": 1.6650752482887698e-05, + "loss": 1.6292, + "step": 1031 + }, + { + "epoch": 0.8356275303643724, + "grad_norm": 1.1308684576404386, + "learning_rate": 1.6644331238695454e-05, + "loss": 1.6018, + "step": 1032 + }, + { + "epoch": 0.8364372469635628, + "grad_norm": 1.0658872985205445, + "learning_rate": 1.6637905085693113e-05, + "loss": 1.603, + "step": 1033 + }, + { + "epoch": 0.8372469635627531, + "grad_norm": 1.0560438364461333, + "learning_rate": 1.6631474028628298e-05, + "loss": 1.6701, + "step": 1034 + }, + { + "epoch": 0.8380566801619433, + "grad_norm": 1.097675248377845, + "learning_rate": 1.662503807225225e-05, + "loss": 1.6673, + "step": 1035 + }, + { + "epoch": 0.8388663967611336, + "grad_norm": 1.1463142445281125, + "learning_rate": 1.6618597221319835e-05, + "loss": 1.6401, + "step": 1036 + }, + { + "epoch": 0.8396761133603239, + "grad_norm": 1.0991588788894604, + "learning_rate": 1.6612151480589526e-05, + "loss": 1.6822, + "step": 1037 + }, + { + "epoch": 0.8404858299595142, + "grad_norm": 1.1432757857765516, + "learning_rate": 1.6605700854823427e-05, + "loss": 1.6552, + "step": 1038 + }, + { + "epoch": 0.8412955465587044, + "grad_norm": 1.0996521956343472, + "learning_rate": 1.659924534878723e-05, + "loss": 1.6155, + "step": 1039 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 1.2092130985611158, + "learning_rate": 1.659278496725024e-05, + "loss": 1.6697, + "step": 1040 + }, + { + "epoch": 0.8429149797570851, + "grad_norm": 1.1859643866916718, + "learning_rate": 1.6586319714985372e-05, + "loss": 1.6514, + "step": 1041 + }, + { + "epoch": 0.8437246963562753, + "grad_norm": 1.096493975856786, + "learning_rate": 1.6579849596769132e-05, + "loss": 1.6892, + "step": 1042 + }, + { + "epoch": 0.8445344129554656, + "grad_norm": 1.041583638770386, + "learning_rate": 1.6573374617381622e-05, + "loss": 1.5834, + "step": 1043 + }, + { + "epoch": 0.8453441295546559, + "grad_norm": 1.256429469555766, + "learning_rate": 1.656689478160653e-05, + "loss": 1.5953, + "step": 1044 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 1.1156384269735415, + "learning_rate": 1.6560410094231144e-05, + "loss": 1.608, + "step": 1045 + }, + { + "epoch": 0.8469635627530364, + "grad_norm": 1.2175373188796819, + "learning_rate": 1.655392056004633e-05, + "loss": 1.7043, + "step": 1046 + }, + { + "epoch": 0.8477732793522267, + "grad_norm": 1.1837512398752925, + "learning_rate": 1.6547426183846527e-05, + "loss": 1.6364, + "step": 1047 + }, + { + "epoch": 0.848582995951417, + "grad_norm": 1.0731112756132954, + "learning_rate": 1.6540926970429768e-05, + "loss": 1.6826, + "step": 1048 + }, + { + "epoch": 0.8493927125506073, + "grad_norm": 1.2861445088088037, + "learning_rate": 1.6534422924597647e-05, + "loss": 1.671, + "step": 1049 + }, + { + "epoch": 0.8502024291497976, + "grad_norm": 1.2667796024188396, + "learning_rate": 1.6527914051155328e-05, + "loss": 1.6239, + "step": 1050 + }, + { + "epoch": 0.8510121457489879, + "grad_norm": 1.0621089700252984, + "learning_rate": 1.652140035491155e-05, + "loss": 1.6278, + "step": 1051 + }, + { + "epoch": 0.8518218623481781, + "grad_norm": 1.1959034368355665, + "learning_rate": 1.6514881840678606e-05, + "loss": 1.6383, + "step": 1052 + }, + { + "epoch": 0.8526315789473684, + "grad_norm": 1.1679490573250864, + "learning_rate": 1.650835851327236e-05, + "loss": 1.6685, + "step": 1053 + }, + { + "epoch": 0.8534412955465587, + "grad_norm": 1.0724625429268337, + "learning_rate": 1.6501830377512214e-05, + "loss": 1.6101, + "step": 1054 + }, + { + "epoch": 0.854251012145749, + "grad_norm": 1.0992880410878538, + "learning_rate": 1.6495297438221145e-05, + "loss": 1.6482, + "step": 1055 + }, + { + "epoch": 0.8550607287449393, + "grad_norm": 1.2082881920964228, + "learning_rate": 1.6488759700225663e-05, + "loss": 1.6947, + "step": 1056 + }, + { + "epoch": 0.8558704453441296, + "grad_norm": 1.064085800343604, + "learning_rate": 1.6482217168355824e-05, + "loss": 1.7083, + "step": 1057 + }, + { + "epoch": 0.8566801619433199, + "grad_norm": 1.2300421861334183, + "learning_rate": 1.647566984744523e-05, + "loss": 1.6707, + "step": 1058 + }, + { + "epoch": 0.8574898785425101, + "grad_norm": 1.168895681590581, + "learning_rate": 1.6469117742331023e-05, + "loss": 1.5947, + "step": 1059 + }, + { + "epoch": 0.8582995951417004, + "grad_norm": 1.0860055758849936, + "learning_rate": 1.6462560857853876e-05, + "loss": 1.6144, + "step": 1060 + }, + { + "epoch": 0.8591093117408907, + "grad_norm": 1.2059795818651675, + "learning_rate": 1.645599919885799e-05, + "loss": 1.7368, + "step": 1061 + }, + { + "epoch": 0.8599190283400809, + "grad_norm": 1.1705528700907784, + "learning_rate": 1.6449432770191104e-05, + "loss": 1.7067, + "step": 1062 + }, + { + "epoch": 0.8607287449392712, + "grad_norm": 1.0305975110654955, + "learning_rate": 1.6442861576704467e-05, + "loss": 1.6808, + "step": 1063 + }, + { + "epoch": 0.8615384615384616, + "grad_norm": 1.0672600681647995, + "learning_rate": 1.6436285623252863e-05, + "loss": 1.6516, + "step": 1064 + }, + { + "epoch": 0.8623481781376519, + "grad_norm": 1.0921123850603622, + "learning_rate": 1.6429704914694573e-05, + "loss": 1.6578, + "step": 1065 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 1.1420424627061119, + "learning_rate": 1.6423119455891412e-05, + "loss": 1.6166, + "step": 1066 + }, + { + "epoch": 0.8639676113360324, + "grad_norm": 1.1974027318652682, + "learning_rate": 1.6416529251708695e-05, + "loss": 1.5715, + "step": 1067 + }, + { + "epoch": 0.8647773279352227, + "grad_norm": 1.0470830216699876, + "learning_rate": 1.6409934307015237e-05, + "loss": 1.6083, + "step": 1068 + }, + { + "epoch": 0.8655870445344129, + "grad_norm": 1.070487613342487, + "learning_rate": 1.6403334626683373e-05, + "loss": 1.596, + "step": 1069 + }, + { + "epoch": 0.8663967611336032, + "grad_norm": 1.1529368703348062, + "learning_rate": 1.6396730215588913e-05, + "loss": 1.7128, + "step": 1070 + }, + { + "epoch": 0.8672064777327935, + "grad_norm": 1.1573095803039568, + "learning_rate": 1.639012107861118e-05, + "loss": 1.5791, + "step": 1071 + }, + { + "epoch": 0.8680161943319838, + "grad_norm": 1.0795488410207628, + "learning_rate": 1.6383507220632983e-05, + "loss": 1.6544, + "step": 1072 + }, + { + "epoch": 0.8688259109311741, + "grad_norm": 1.037145472528182, + "learning_rate": 1.6376888646540617e-05, + "loss": 1.6401, + "step": 1073 + }, + { + "epoch": 0.8696356275303644, + "grad_norm": 1.060717046272381, + "learning_rate": 1.6370265361223864e-05, + "loss": 1.6801, + "step": 1074 + }, + { + "epoch": 0.8704453441295547, + "grad_norm": 1.0964370572935223, + "learning_rate": 1.6363637369575984e-05, + "loss": 1.6636, + "step": 1075 + }, + { + "epoch": 0.8712550607287449, + "grad_norm": 1.0310388617035904, + "learning_rate": 1.6357004676493716e-05, + "loss": 1.5856, + "step": 1076 + }, + { + "epoch": 0.8720647773279352, + "grad_norm": 1.0168478194178103, + "learning_rate": 1.635036728687727e-05, + "loss": 1.5626, + "step": 1077 + }, + { + "epoch": 0.8728744939271255, + "grad_norm": 1.1227525434781638, + "learning_rate": 1.6343725205630335e-05, + "loss": 1.6922, + "step": 1078 + }, + { + "epoch": 0.8736842105263158, + "grad_norm": 1.0084316594907183, + "learning_rate": 1.633707843766005e-05, + "loss": 1.6777, + "step": 1079 + }, + { + "epoch": 0.8744939271255061, + "grad_norm": 1.0733928985684549, + "learning_rate": 1.633042698787703e-05, + "loss": 1.6883, + "step": 1080 + }, + { + "epoch": 0.8753036437246964, + "grad_norm": 1.0420921585152885, + "learning_rate": 1.632377086119534e-05, + "loss": 1.5657, + "step": 1081 + }, + { + "epoch": 0.8761133603238866, + "grad_norm": 1.0479710213227003, + "learning_rate": 1.631711006253251e-05, + "loss": 1.7318, + "step": 1082 + }, + { + "epoch": 0.8769230769230769, + "grad_norm": 1.0675821904258518, + "learning_rate": 1.6310444596809514e-05, + "loss": 1.6281, + "step": 1083 + }, + { + "epoch": 0.8777327935222672, + "grad_norm": 1.0092848547062299, + "learning_rate": 1.6303774468950776e-05, + "loss": 1.5721, + "step": 1084 + }, + { + "epoch": 0.8785425101214575, + "grad_norm": 1.1037917612800165, + "learning_rate": 1.6297099683884163e-05, + "loss": 1.588, + "step": 1085 + }, + { + "epoch": 0.8793522267206477, + "grad_norm": 1.081509721545105, + "learning_rate": 1.629042024654099e-05, + "loss": 1.6159, + "step": 1086 + }, + { + "epoch": 0.8801619433198381, + "grad_norm": 1.0836597364116385, + "learning_rate": 1.6283736161855995e-05, + "loss": 1.6192, + "step": 1087 + }, + { + "epoch": 0.8809716599190284, + "grad_norm": 1.0587543748723958, + "learning_rate": 1.6277047434767364e-05, + "loss": 1.5687, + "step": 1088 + }, + { + "epoch": 0.8817813765182186, + "grad_norm": 1.0747135995302248, + "learning_rate": 1.6270354070216704e-05, + "loss": 1.6513, + "step": 1089 + }, + { + "epoch": 0.8825910931174089, + "grad_norm": 1.0869005500984592, + "learning_rate": 1.626365607314905e-05, + "loss": 1.6559, + "step": 1090 + }, + { + "epoch": 0.8834008097165992, + "grad_norm": 1.0595343275908542, + "learning_rate": 1.625695344851286e-05, + "loss": 1.6598, + "step": 1091 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 1.1107620614268396, + "learning_rate": 1.6250246201260017e-05, + "loss": 1.6897, + "step": 1092 + }, + { + "epoch": 0.8850202429149797, + "grad_norm": 1.1907108746856276, + "learning_rate": 1.624353433634581e-05, + "loss": 1.6426, + "step": 1093 + }, + { + "epoch": 0.8858299595141701, + "grad_norm": 1.046609129170575, + "learning_rate": 1.6236817858728937e-05, + "loss": 1.5677, + "step": 1094 + }, + { + "epoch": 0.8866396761133604, + "grad_norm": 1.0744260551582498, + "learning_rate": 1.6230096773371514e-05, + "loss": 1.6268, + "step": 1095 + }, + { + "epoch": 0.8874493927125506, + "grad_norm": 1.1340503009391705, + "learning_rate": 1.622337108523906e-05, + "loss": 1.6032, + "step": 1096 + }, + { + "epoch": 0.8882591093117409, + "grad_norm": 1.07216583644219, + "learning_rate": 1.621664079930049e-05, + "loss": 1.66, + "step": 1097 + }, + { + "epoch": 0.8890688259109312, + "grad_norm": 1.137011474293611, + "learning_rate": 1.620990592052811e-05, + "loss": 1.6506, + "step": 1098 + }, + { + "epoch": 0.8898785425101214, + "grad_norm": 1.0925701713040545, + "learning_rate": 1.620316645389764e-05, + "loss": 1.6441, + "step": 1099 + }, + { + "epoch": 0.8906882591093117, + "grad_norm": 0.9809230441134872, + "learning_rate": 1.619642240438816e-05, + "loss": 1.6116, + "step": 1100 + }, + { + "epoch": 0.891497975708502, + "grad_norm": 1.0227226563848153, + "learning_rate": 1.618967377698216e-05, + "loss": 1.6869, + "step": 1101 + }, + { + "epoch": 0.8923076923076924, + "grad_norm": 1.0936896377565115, + "learning_rate": 1.6182920576665508e-05, + "loss": 1.6799, + "step": 1102 + }, + { + "epoch": 0.8931174089068826, + "grad_norm": 1.064709684635027, + "learning_rate": 1.6176162808427437e-05, + "loss": 1.6547, + "step": 1103 + }, + { + "epoch": 0.8939271255060729, + "grad_norm": 1.089323924603956, + "learning_rate": 1.6169400477260566e-05, + "loss": 1.6514, + "step": 1104 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 1.051251367154035, + "learning_rate": 1.616263358816089e-05, + "loss": 1.5567, + "step": 1105 + }, + { + "epoch": 0.8955465587044534, + "grad_norm": 1.0955130855844533, + "learning_rate": 1.6155862146127757e-05, + "loss": 1.6892, + "step": 1106 + }, + { + "epoch": 0.8963562753036437, + "grad_norm": 1.2427921629069014, + "learning_rate": 1.6149086156163893e-05, + "loss": 1.6672, + "step": 1107 + }, + { + "epoch": 0.897165991902834, + "grad_norm": 1.0531149381114697, + "learning_rate": 1.6142305623275367e-05, + "loss": 1.6756, + "step": 1108 + }, + { + "epoch": 0.8979757085020242, + "grad_norm": 1.1185947127125788, + "learning_rate": 1.6135520552471625e-05, + "loss": 1.6263, + "step": 1109 + }, + { + "epoch": 0.8987854251012146, + "grad_norm": 1.0114886200304354, + "learning_rate": 1.612873094876545e-05, + "loss": 1.6887, + "step": 1110 + }, + { + "epoch": 0.8995951417004049, + "grad_norm": 1.1380754110775286, + "learning_rate": 1.612193681717298e-05, + "loss": 1.6664, + "step": 1111 + }, + { + "epoch": 0.9004048582995952, + "grad_norm": 1.0844337360811653, + "learning_rate": 1.611513816271369e-05, + "loss": 1.6717, + "step": 1112 + }, + { + "epoch": 0.9012145748987854, + "grad_norm": 1.0684169391575367, + "learning_rate": 1.6108334990410413e-05, + "loss": 1.6056, + "step": 1113 + }, + { + "epoch": 0.9020242914979757, + "grad_norm": 1.0828647321774048, + "learning_rate": 1.610152730528931e-05, + "loss": 1.6916, + "step": 1114 + }, + { + "epoch": 0.902834008097166, + "grad_norm": 1.0539257127891122, + "learning_rate": 1.6094715112379874e-05, + "loss": 1.6259, + "step": 1115 + }, + { + "epoch": 0.9036437246963562, + "grad_norm": 1.0495197445873474, + "learning_rate": 1.6087898416714928e-05, + "loss": 1.6449, + "step": 1116 + }, + { + "epoch": 0.9044534412955466, + "grad_norm": 1.037832176782658, + "learning_rate": 1.608107722333063e-05, + "loss": 1.5811, + "step": 1117 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 1.1204020070782086, + "learning_rate": 1.607425153726645e-05, + "loss": 1.6671, + "step": 1118 + }, + { + "epoch": 0.9060728744939271, + "grad_norm": 1.0947786620111954, + "learning_rate": 1.6067421363565185e-05, + "loss": 1.6438, + "step": 1119 + }, + { + "epoch": 0.9068825910931174, + "grad_norm": 1.1703591478489133, + "learning_rate": 1.6060586707272943e-05, + "loss": 1.5142, + "step": 1120 + }, + { + "epoch": 0.9076923076923077, + "grad_norm": 1.1101060613426852, + "learning_rate": 1.6053747573439147e-05, + "loss": 1.6466, + "step": 1121 + }, + { + "epoch": 0.908502024291498, + "grad_norm": 0.9844797162954405, + "learning_rate": 1.6046903967116532e-05, + "loss": 1.5892, + "step": 1122 + }, + { + "epoch": 0.9093117408906882, + "grad_norm": 1.2066011149235545, + "learning_rate": 1.604005589336112e-05, + "loss": 1.6379, + "step": 1123 + }, + { + "epoch": 0.9101214574898785, + "grad_norm": 1.12551790929863, + "learning_rate": 1.6033203357232255e-05, + "loss": 1.5908, + "step": 1124 + }, + { + "epoch": 0.9109311740890689, + "grad_norm": 1.0539443046535852, + "learning_rate": 1.6026346363792565e-05, + "loss": 1.6407, + "step": 1125 + }, + { + "epoch": 0.9117408906882591, + "grad_norm": 1.2718959974915909, + "learning_rate": 1.6019484918107977e-05, + "loss": 1.6657, + "step": 1126 + }, + { + "epoch": 0.9125506072874494, + "grad_norm": 1.1172864236142332, + "learning_rate": 1.60126190252477e-05, + "loss": 1.5977, + "step": 1127 + }, + { + "epoch": 0.9133603238866397, + "grad_norm": 1.029629174783446, + "learning_rate": 1.600574869028423e-05, + "loss": 1.5827, + "step": 1128 + }, + { + "epoch": 0.91417004048583, + "grad_norm": 1.1276448324458075, + "learning_rate": 1.599887391829336e-05, + "loss": 1.5556, + "step": 1129 + }, + { + "epoch": 0.9149797570850202, + "grad_norm": 1.234475202377491, + "learning_rate": 1.599199471435414e-05, + "loss": 1.6962, + "step": 1130 + }, + { + "epoch": 0.9157894736842105, + "grad_norm": 1.0658020028121022, + "learning_rate": 1.5985111083548905e-05, + "loss": 1.6202, + "step": 1131 + }, + { + "epoch": 0.9165991902834008, + "grad_norm": 1.1895352614746753, + "learning_rate": 1.5978223030963257e-05, + "loss": 1.5915, + "step": 1132 + }, + { + "epoch": 0.9174089068825911, + "grad_norm": 1.1294963341176811, + "learning_rate": 1.5971330561686073e-05, + "loss": 1.6618, + "step": 1133 + }, + { + "epoch": 0.9182186234817814, + "grad_norm": 1.0383726295620532, + "learning_rate": 1.596443368080948e-05, + "loss": 1.7049, + "step": 1134 + }, + { + "epoch": 0.9190283400809717, + "grad_norm": 1.0765024500512697, + "learning_rate": 1.5957532393428872e-05, + "loss": 1.6578, + "step": 1135 + }, + { + "epoch": 0.9198380566801619, + "grad_norm": 1.0435257860758407, + "learning_rate": 1.5950626704642898e-05, + "loss": 1.5491, + "step": 1136 + }, + { + "epoch": 0.9206477732793522, + "grad_norm": 1.040007314475756, + "learning_rate": 1.594371661955346e-05, + "loss": 1.5721, + "step": 1137 + }, + { + "epoch": 0.9214574898785425, + "grad_norm": 1.1078111485151643, + "learning_rate": 1.5936802143265708e-05, + "loss": 1.6908, + "step": 1138 + }, + { + "epoch": 0.9222672064777327, + "grad_norm": 1.1320470811777017, + "learning_rate": 1.592988328088803e-05, + "loss": 1.6745, + "step": 1139 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 1.016464328891757, + "learning_rate": 1.5922960037532057e-05, + "loss": 1.5978, + "step": 1140 + }, + { + "epoch": 0.9238866396761134, + "grad_norm": 1.0843626078556605, + "learning_rate": 1.5916032418312665e-05, + "loss": 1.5708, + "step": 1141 + }, + { + "epoch": 0.9246963562753037, + "grad_norm": 1.1378816169740498, + "learning_rate": 1.5909100428347953e-05, + "loss": 1.7212, + "step": 1142 + }, + { + "epoch": 0.9255060728744939, + "grad_norm": 1.0606143686664404, + "learning_rate": 1.590216407275925e-05, + "loss": 1.5639, + "step": 1143 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 1.3367149432598577, + "learning_rate": 1.5895223356671116e-05, + "loss": 1.6906, + "step": 1144 + }, + { + "epoch": 0.9271255060728745, + "grad_norm": 1.0416656067812, + "learning_rate": 1.588827828521133e-05, + "loss": 1.5926, + "step": 1145 + }, + { + "epoch": 0.9279352226720647, + "grad_norm": 1.0644446557633547, + "learning_rate": 1.5881328863510885e-05, + "loss": 1.6572, + "step": 1146 + }, + { + "epoch": 0.928744939271255, + "grad_norm": 1.0307821440286953, + "learning_rate": 1.5874375096703993e-05, + "loss": 1.6722, + "step": 1147 + }, + { + "epoch": 0.9295546558704454, + "grad_norm": 1.0777429781962335, + "learning_rate": 1.5867416989928077e-05, + "loss": 1.6206, + "step": 1148 + }, + { + "epoch": 0.9303643724696357, + "grad_norm": 1.0968718914039448, + "learning_rate": 1.5860454548323755e-05, + "loss": 1.6033, + "step": 1149 + }, + { + "epoch": 0.9311740890688259, + "grad_norm": 1.094113134525346, + "learning_rate": 1.585348777703486e-05, + "loss": 1.6282, + "step": 1150 + }, + { + "epoch": 0.9319838056680162, + "grad_norm": 1.110942326892508, + "learning_rate": 1.5846516681208425e-05, + "loss": 1.6293, + "step": 1151 + }, + { + "epoch": 0.9327935222672065, + "grad_norm": 1.1089357151037313, + "learning_rate": 1.5839541265994663e-05, + "loss": 1.7625, + "step": 1152 + }, + { + "epoch": 0.9336032388663967, + "grad_norm": 1.0813117854976675, + "learning_rate": 1.5832561536546998e-05, + "loss": 1.6626, + "step": 1153 + }, + { + "epoch": 0.934412955465587, + "grad_norm": 1.0395260288355859, + "learning_rate": 1.5825577498022027e-05, + "loss": 1.6422, + "step": 1154 + }, + { + "epoch": 0.9352226720647774, + "grad_norm": 1.0434934564308764, + "learning_rate": 1.581858915557953e-05, + "loss": 1.6427, + "step": 1155 + }, + { + "epoch": 0.9360323886639677, + "grad_norm": 1.1131287757276462, + "learning_rate": 1.5811596514382474e-05, + "loss": 1.6684, + "step": 1156 + }, + { + "epoch": 0.9368421052631579, + "grad_norm": 1.0535686300577007, + "learning_rate": 1.5804599579597007e-05, + "loss": 1.5772, + "step": 1157 + }, + { + "epoch": 0.9376518218623482, + "grad_norm": 1.0265557890806118, + "learning_rate": 1.5797598356392433e-05, + "loss": 1.6554, + "step": 1158 + }, + { + "epoch": 0.9384615384615385, + "grad_norm": 1.040297515122633, + "learning_rate": 1.5790592849941234e-05, + "loss": 1.6032, + "step": 1159 + }, + { + "epoch": 0.9392712550607287, + "grad_norm": 1.0584403749937723, + "learning_rate": 1.5783583065419054e-05, + "loss": 1.6032, + "step": 1160 + }, + { + "epoch": 0.940080971659919, + "grad_norm": 1.1762205201783962, + "learning_rate": 1.5776569008004705e-05, + "loss": 1.6195, + "step": 1161 + }, + { + "epoch": 0.9408906882591093, + "grad_norm": 1.116940246849939, + "learning_rate": 1.5769550682880143e-05, + "loss": 1.6004, + "step": 1162 + }, + { + "epoch": 0.9417004048582996, + "grad_norm": 1.0651667577950013, + "learning_rate": 1.5762528095230488e-05, + "loss": 1.584, + "step": 1163 + }, + { + "epoch": 0.9425101214574899, + "grad_norm": 1.1032179981000514, + "learning_rate": 1.5755501250244e-05, + "loss": 1.6358, + "step": 1164 + }, + { + "epoch": 0.9433198380566802, + "grad_norm": 1.1087189553464278, + "learning_rate": 1.5748470153112093e-05, + "loss": 1.5852, + "step": 1165 + }, + { + "epoch": 0.9441295546558705, + "grad_norm": 1.0912761006902574, + "learning_rate": 1.574143480902932e-05, + "loss": 1.6718, + "step": 1166 + }, + { + "epoch": 0.9449392712550607, + "grad_norm": 1.0720080749650522, + "learning_rate": 1.5734395223193367e-05, + "loss": 1.637, + "step": 1167 + }, + { + "epoch": 0.945748987854251, + "grad_norm": 1.1016199611874298, + "learning_rate": 1.5727351400805054e-05, + "loss": 1.7111, + "step": 1168 + }, + { + "epoch": 0.9465587044534413, + "grad_norm": 1.0389321277980663, + "learning_rate": 1.572030334706834e-05, + "loss": 1.6109, + "step": 1169 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 1.1153836189672202, + "learning_rate": 1.57132510671903e-05, + "loss": 1.6806, + "step": 1170 + }, + { + "epoch": 0.9481781376518219, + "grad_norm": 1.0852648007480221, + "learning_rate": 1.5706194566381136e-05, + "loss": 1.5954, + "step": 1171 + }, + { + "epoch": 0.9489878542510122, + "grad_norm": 1.1024355588691714, + "learning_rate": 1.5699133849854164e-05, + "loss": 1.7109, + "step": 1172 + }, + { + "epoch": 0.9497975708502024, + "grad_norm": 1.1103223087215968, + "learning_rate": 1.5692068922825826e-05, + "loss": 1.6344, + "step": 1173 + }, + { + "epoch": 0.9506072874493927, + "grad_norm": 1.0273649828440345, + "learning_rate": 1.5684999790515664e-05, + "loss": 1.6134, + "step": 1174 + }, + { + "epoch": 0.951417004048583, + "grad_norm": 1.025710171640773, + "learning_rate": 1.5677926458146327e-05, + "loss": 1.6368, + "step": 1175 + }, + { + "epoch": 0.9522267206477733, + "grad_norm": 1.080001734059546, + "learning_rate": 1.567084893094357e-05, + "loss": 1.6528, + "step": 1176 + }, + { + "epoch": 0.9530364372469635, + "grad_norm": 1.0606923317026444, + "learning_rate": 1.566376721413625e-05, + "loss": 1.5999, + "step": 1177 + }, + { + "epoch": 0.9538461538461539, + "grad_norm": 1.0103172442950479, + "learning_rate": 1.5656681312956316e-05, + "loss": 1.5915, + "step": 1178 + }, + { + "epoch": 0.9546558704453442, + "grad_norm": 1.1057352054726133, + "learning_rate": 1.5649591232638804e-05, + "loss": 1.6497, + "step": 1179 + }, + { + "epoch": 0.9554655870445344, + "grad_norm": 1.1047227327452962, + "learning_rate": 1.5642496978421842e-05, + "loss": 1.6919, + "step": 1180 + }, + { + "epoch": 0.9562753036437247, + "grad_norm": 1.0283713285578344, + "learning_rate": 1.563539855554665e-05, + "loss": 1.6248, + "step": 1181 + }, + { + "epoch": 0.957085020242915, + "grad_norm": 1.0214669851609182, + "learning_rate": 1.5628295969257515e-05, + "loss": 1.623, + "step": 1182 + }, + { + "epoch": 0.9578947368421052, + "grad_norm": 1.0696244337426537, + "learning_rate": 1.5621189224801797e-05, + "loss": 1.6771, + "step": 1183 + }, + { + "epoch": 0.9587044534412955, + "grad_norm": 1.0663459977571883, + "learning_rate": 1.5614078327429947e-05, + "loss": 1.7245, + "step": 1184 + }, + { + "epoch": 0.9595141700404858, + "grad_norm": 1.100787082491128, + "learning_rate": 1.560696328239547e-05, + "loss": 1.6435, + "step": 1185 + }, + { + "epoch": 0.9603238866396762, + "grad_norm": 1.0861397245205235, + "learning_rate": 1.559984409495493e-05, + "loss": 1.5699, + "step": 1186 + }, + { + "epoch": 0.9611336032388664, + "grad_norm": 1.0927150205723406, + "learning_rate": 1.5592720770367967e-05, + "loss": 1.6555, + "step": 1187 + }, + { + "epoch": 0.9619433198380567, + "grad_norm": 1.0739743395413222, + "learning_rate": 1.5585593313897267e-05, + "loss": 1.5801, + "step": 1188 + }, + { + "epoch": 0.962753036437247, + "grad_norm": 1.058740021769692, + "learning_rate": 1.5578461730808575e-05, + "loss": 1.6698, + "step": 1189 + }, + { + "epoch": 0.9635627530364372, + "grad_norm": 1.0525423850505513, + "learning_rate": 1.5571326026370676e-05, + "loss": 1.5322, + "step": 1190 + }, + { + "epoch": 0.9643724696356275, + "grad_norm": 1.075841311496856, + "learning_rate": 1.5564186205855407e-05, + "loss": 1.5897, + "step": 1191 + }, + { + "epoch": 0.9651821862348178, + "grad_norm": 1.1185012350730061, + "learning_rate": 1.5557042274537644e-05, + "loss": 1.632, + "step": 1192 + }, + { + "epoch": 0.965991902834008, + "grad_norm": 1.0818043247678821, + "learning_rate": 1.5549894237695302e-05, + "loss": 1.6112, + "step": 1193 + }, + { + "epoch": 0.9668016194331984, + "grad_norm": 1.0762571974215163, + "learning_rate": 1.5542742100609324e-05, + "loss": 1.5975, + "step": 1194 + }, + { + "epoch": 0.9676113360323887, + "grad_norm": 1.1294759095742624, + "learning_rate": 1.5535585868563688e-05, + "loss": 1.628, + "step": 1195 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 1.0550884938111755, + "learning_rate": 1.552842554684539e-05, + "loss": 1.6819, + "step": 1196 + }, + { + "epoch": 0.9692307692307692, + "grad_norm": 1.1137617349520605, + "learning_rate": 1.5521261140744458e-05, + "loss": 1.5694, + "step": 1197 + }, + { + "epoch": 0.9700404858299595, + "grad_norm": 1.122314201391633, + "learning_rate": 1.551409265555393e-05, + "loss": 1.7246, + "step": 1198 + }, + { + "epoch": 0.9708502024291498, + "grad_norm": 1.0252937720177295, + "learning_rate": 1.5506920096569857e-05, + "loss": 1.5807, + "step": 1199 + }, + { + "epoch": 0.97165991902834, + "grad_norm": 1.066566322393118, + "learning_rate": 1.5499743469091303e-05, + "loss": 1.5806, + "step": 1200 + }, + { + "epoch": 0.9724696356275304, + "grad_norm": 1.1862181447108153, + "learning_rate": 1.5492562778420342e-05, + "loss": 1.6677, + "step": 1201 + }, + { + "epoch": 0.9732793522267207, + "grad_norm": 1.1600571833816153, + "learning_rate": 1.5485378029862034e-05, + "loss": 1.6181, + "step": 1202 + }, + { + "epoch": 0.974089068825911, + "grad_norm": 1.1209706305697864, + "learning_rate": 1.547818922872446e-05, + "loss": 1.595, + "step": 1203 + }, + { + "epoch": 0.9748987854251012, + "grad_norm": 1.2007659516216234, + "learning_rate": 1.547099638031867e-05, + "loss": 1.5757, + "step": 1204 + }, + { + "epoch": 0.9757085020242915, + "grad_norm": 1.087592209286536, + "learning_rate": 1.5463799489958727e-05, + "loss": 1.6574, + "step": 1205 + }, + { + "epoch": 0.9765182186234818, + "grad_norm": 1.0599754950787568, + "learning_rate": 1.5456598562961666e-05, + "loss": 1.6501, + "step": 1206 + }, + { + "epoch": 0.977327935222672, + "grad_norm": 1.2383281394995351, + "learning_rate": 1.544939360464751e-05, + "loss": 1.6212, + "step": 1207 + }, + { + "epoch": 0.9781376518218623, + "grad_norm": 1.1245330174866524, + "learning_rate": 1.5442184620339252e-05, + "loss": 1.6466, + "step": 1208 + }, + { + "epoch": 0.9789473684210527, + "grad_norm": 0.9799617502733805, + "learning_rate": 1.5434971615362875e-05, + "loss": 1.6056, + "step": 1209 + }, + { + "epoch": 0.979757085020243, + "grad_norm": 1.142502619460788, + "learning_rate": 1.542775459504732e-05, + "loss": 1.6629, + "step": 1210 + }, + { + "epoch": 0.9805668016194332, + "grad_norm": 1.0942507084406352, + "learning_rate": 1.5420533564724495e-05, + "loss": 1.6764, + "step": 1211 + }, + { + "epoch": 0.9813765182186235, + "grad_norm": 1.161749663929768, + "learning_rate": 1.5413308529729274e-05, + "loss": 1.6983, + "step": 1212 + }, + { + "epoch": 0.9821862348178138, + "grad_norm": 1.0441007142843908, + "learning_rate": 1.5406079495399495e-05, + "loss": 1.5531, + "step": 1213 + }, + { + "epoch": 0.982995951417004, + "grad_norm": 1.0129014720851601, + "learning_rate": 1.5398846467075937e-05, + "loss": 1.5144, + "step": 1214 + }, + { + "epoch": 0.9838056680161943, + "grad_norm": 1.0916694795654285, + "learning_rate": 1.5391609450102346e-05, + "loss": 1.5713, + "step": 1215 + }, + { + "epoch": 0.9846153846153847, + "grad_norm": 1.0119872612879202, + "learning_rate": 1.5384368449825395e-05, + "loss": 1.6537, + "step": 1216 + }, + { + "epoch": 0.9854251012145749, + "grad_norm": 1.089804356576127, + "learning_rate": 1.5377123471594723e-05, + "loss": 1.6905, + "step": 1217 + }, + { + "epoch": 0.9862348178137652, + "grad_norm": 1.018296330143969, + "learning_rate": 1.536987452076289e-05, + "loss": 1.5738, + "step": 1218 + }, + { + "epoch": 0.9870445344129555, + "grad_norm": 1.1463126951348903, + "learning_rate": 1.5362621602685394e-05, + "loss": 1.5971, + "step": 1219 + }, + { + "epoch": 0.9878542510121457, + "grad_norm": 1.0831047769241162, + "learning_rate": 1.5355364722720674e-05, + "loss": 1.6642, + "step": 1220 + }, + { + "epoch": 0.988663967611336, + "grad_norm": 1.0229539605576596, + "learning_rate": 1.5348103886230086e-05, + "loss": 1.6169, + "step": 1221 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 1.0706268272715531, + "learning_rate": 1.5340839098577912e-05, + "loss": 1.6369, + "step": 1222 + }, + { + "epoch": 0.9902834008097166, + "grad_norm": 1.076282820889626, + "learning_rate": 1.5333570365131353e-05, + "loss": 1.6125, + "step": 1223 + }, + { + "epoch": 0.9910931174089069, + "grad_norm": 1.0521363851747962, + "learning_rate": 1.5326297691260526e-05, + "loss": 1.6126, + "step": 1224 + }, + { + "epoch": 0.9919028340080972, + "grad_norm": 1.028881455867217, + "learning_rate": 1.5319021082338458e-05, + "loss": 1.6541, + "step": 1225 + }, + { + "epoch": 0.9927125506072875, + "grad_norm": 1.04183651822074, + "learning_rate": 1.5311740543741088e-05, + "loss": 1.6238, + "step": 1226 + }, + { + "epoch": 0.9935222672064777, + "grad_norm": 1.0858918280353071, + "learning_rate": 1.5304456080847247e-05, + "loss": 1.6627, + "step": 1227 + }, + { + "epoch": 0.994331983805668, + "grad_norm": 1.0035165826353316, + "learning_rate": 1.5297167699038673e-05, + "loss": 1.6204, + "step": 1228 + }, + { + "epoch": 0.9951417004048583, + "grad_norm": 1.0816188271259373, + "learning_rate": 1.5289875403700005e-05, + "loss": 1.6276, + "step": 1229 + }, + { + "epoch": 0.9959514170040485, + "grad_norm": 1.0599193319828009, + "learning_rate": 1.5282579200218762e-05, + "loss": 1.6398, + "step": 1230 + }, + { + "epoch": 0.9967611336032388, + "grad_norm": 1.0919178042931876, + "learning_rate": 1.5275279093985355e-05, + "loss": 1.5611, + "step": 1231 + }, + { + "epoch": 0.9975708502024292, + "grad_norm": 1.1107755668554948, + "learning_rate": 1.5267975090393078e-05, + "loss": 1.715, + "step": 1232 + }, + { + "epoch": 0.9983805668016195, + "grad_norm": 0.9966294564153475, + "learning_rate": 1.526066719483811e-05, + "loss": 1.6605, + "step": 1233 + }, + { + "epoch": 0.9991902834008097, + "grad_norm": 1.1569636814062747, + "learning_rate": 1.5253355412719498e-05, + "loss": 1.6844, + "step": 1234 + }, + { + "epoch": 1.0, + "grad_norm": 1.0838838624298, + "learning_rate": 1.5246039749439159e-05, + "loss": 1.6023, + "step": 1235 + }, + { + "epoch": 1.0008097165991903, + "grad_norm": 1.3033229459063758, + "learning_rate": 1.5238720210401881e-05, + "loss": 1.4524, + "step": 1236 + }, + { + "epoch": 1.0016194331983805, + "grad_norm": 1.1817666802596205, + "learning_rate": 1.5231396801015321e-05, + "loss": 1.4273, + "step": 1237 + }, + { + "epoch": 1.0024291497975708, + "grad_norm": 1.1157969184839365, + "learning_rate": 1.5224069526689981e-05, + "loss": 1.4376, + "step": 1238 + }, + { + "epoch": 1.003238866396761, + "grad_norm": 1.1746908857400815, + "learning_rate": 1.5216738392839241e-05, + "loss": 1.3419, + "step": 1239 + }, + { + "epoch": 1.0040485829959513, + "grad_norm": 1.5854462006834873, + "learning_rate": 1.5209403404879305e-05, + "loss": 1.3791, + "step": 1240 + }, + { + "epoch": 1.0048582995951416, + "grad_norm": 1.4607487661303684, + "learning_rate": 1.5202064568229242e-05, + "loss": 1.3514, + "step": 1241 + }, + { + "epoch": 1.0056680161943319, + "grad_norm": 1.4180363035835093, + "learning_rate": 1.5194721888310966e-05, + "loss": 1.4033, + "step": 1242 + }, + { + "epoch": 1.0064777327935224, + "grad_norm": 1.3518288634394915, + "learning_rate": 1.5187375370549218e-05, + "loss": 1.3413, + "step": 1243 + }, + { + "epoch": 1.0072874493927126, + "grad_norm": 1.3776243915536928, + "learning_rate": 1.5180025020371585e-05, + "loss": 1.4354, + "step": 1244 + }, + { + "epoch": 1.008097165991903, + "grad_norm": 1.263117229706304, + "learning_rate": 1.5172670843208477e-05, + "loss": 1.3634, + "step": 1245 + }, + { + "epoch": 1.0089068825910932, + "grad_norm": 1.2727019839645528, + "learning_rate": 1.5165312844493146e-05, + "loss": 1.3325, + "step": 1246 + }, + { + "epoch": 1.0097165991902834, + "grad_norm": 1.2423022959844323, + "learning_rate": 1.5157951029661644e-05, + "loss": 1.3249, + "step": 1247 + }, + { + "epoch": 1.0105263157894737, + "grad_norm": 1.3569865807346784, + "learning_rate": 1.5150585404152864e-05, + "loss": 1.388, + "step": 1248 + }, + { + "epoch": 1.011336032388664, + "grad_norm": 1.3839955804790385, + "learning_rate": 1.5143215973408505e-05, + "loss": 1.3716, + "step": 1249 + }, + { + "epoch": 1.0121457489878543, + "grad_norm": 1.281605620473294, + "learning_rate": 1.5135842742873077e-05, + "loss": 1.3419, + "step": 1250 + }, + { + "epoch": 1.0129554655870445, + "grad_norm": 1.240875523498585, + "learning_rate": 1.5128465717993898e-05, + "loss": 1.4467, + "step": 1251 + }, + { + "epoch": 1.0137651821862348, + "grad_norm": 1.2013839184847355, + "learning_rate": 1.5121084904221093e-05, + "loss": 1.3489, + "step": 1252 + }, + { + "epoch": 1.014574898785425, + "grad_norm": 1.224004173475795, + "learning_rate": 1.5113700307007575e-05, + "loss": 1.3784, + "step": 1253 + }, + { + "epoch": 1.0153846153846153, + "grad_norm": 1.2009133574758046, + "learning_rate": 1.510631193180907e-05, + "loss": 1.3149, + "step": 1254 + }, + { + "epoch": 1.0161943319838056, + "grad_norm": 1.2303064249057094, + "learning_rate": 1.5098919784084083e-05, + "loss": 1.3512, + "step": 1255 + }, + { + "epoch": 1.0170040485829959, + "grad_norm": 1.2885247341623098, + "learning_rate": 1.50915238692939e-05, + "loss": 1.3995, + "step": 1256 + }, + { + "epoch": 1.0178137651821861, + "grad_norm": 1.2509468790226062, + "learning_rate": 1.5084124192902612e-05, + "loss": 1.3568, + "step": 1257 + }, + { + "epoch": 1.0186234817813766, + "grad_norm": 1.2329823773251085, + "learning_rate": 1.5076720760377064e-05, + "loss": 1.3742, + "step": 1258 + }, + { + "epoch": 1.019433198380567, + "grad_norm": 1.2514982887751502, + "learning_rate": 1.5069313577186892e-05, + "loss": 1.4078, + "step": 1259 + }, + { + "epoch": 1.0202429149797572, + "grad_norm": 1.1878670599291559, + "learning_rate": 1.5061902648804503e-05, + "loss": 1.3712, + "step": 1260 + }, + { + "epoch": 1.0210526315789474, + "grad_norm": 1.1702105583771456, + "learning_rate": 1.5054487980705059e-05, + "loss": 1.3456, + "step": 1261 + }, + { + "epoch": 1.0218623481781377, + "grad_norm": 1.2390980605393822, + "learning_rate": 1.5047069578366497e-05, + "loss": 1.3626, + "step": 1262 + }, + { + "epoch": 1.022672064777328, + "grad_norm": 1.1903294920255179, + "learning_rate": 1.5039647447269508e-05, + "loss": 1.3562, + "step": 1263 + }, + { + "epoch": 1.0234817813765182, + "grad_norm": 1.210946578195798, + "learning_rate": 1.5032221592897536e-05, + "loss": 1.3543, + "step": 1264 + }, + { + "epoch": 1.0242914979757085, + "grad_norm": 1.1617580200681779, + "learning_rate": 1.502479202073678e-05, + "loss": 1.3388, + "step": 1265 + }, + { + "epoch": 1.0251012145748988, + "grad_norm": 1.2440798916020672, + "learning_rate": 1.5017358736276183e-05, + "loss": 1.378, + "step": 1266 + }, + { + "epoch": 1.025910931174089, + "grad_norm": 1.2628930798261047, + "learning_rate": 1.500992174500743e-05, + "loss": 1.3923, + "step": 1267 + }, + { + "epoch": 1.0267206477732793, + "grad_norm": 1.2359583176315028, + "learning_rate": 1.5002481052424945e-05, + "loss": 1.3872, + "step": 1268 + }, + { + "epoch": 1.0275303643724696, + "grad_norm": 1.1963174169460538, + "learning_rate": 1.499503666402589e-05, + "loss": 1.3951, + "step": 1269 + }, + { + "epoch": 1.0283400809716599, + "grad_norm": 1.124590149714284, + "learning_rate": 1.4987588585310154e-05, + "loss": 1.361, + "step": 1270 + }, + { + "epoch": 1.0291497975708501, + "grad_norm": 1.1617067643230052, + "learning_rate": 1.4980136821780348e-05, + "loss": 1.3614, + "step": 1271 + }, + { + "epoch": 1.0299595141700404, + "grad_norm": 1.6170111738264419, + "learning_rate": 1.497268137894182e-05, + "loss": 1.4219, + "step": 1272 + }, + { + "epoch": 1.0307692307692307, + "grad_norm": 1.2275764161254092, + "learning_rate": 1.4965222262302621e-05, + "loss": 1.3835, + "step": 1273 + }, + { + "epoch": 1.0315789473684212, + "grad_norm": 1.1996462946756752, + "learning_rate": 1.4957759477373519e-05, + "loss": 1.2693, + "step": 1274 + }, + { + "epoch": 1.0323886639676114, + "grad_norm": 1.2125778356953747, + "learning_rate": 1.4950293029668004e-05, + "loss": 1.3383, + "step": 1275 + }, + { + "epoch": 1.0331983805668017, + "grad_norm": 1.2012198190789183, + "learning_rate": 1.4942822924702252e-05, + "loss": 1.374, + "step": 1276 + }, + { + "epoch": 1.034008097165992, + "grad_norm": 1.2130852863124042, + "learning_rate": 1.4935349167995161e-05, + "loss": 1.4402, + "step": 1277 + }, + { + "epoch": 1.0348178137651822, + "grad_norm": 1.1740546729540866, + "learning_rate": 1.4927871765068314e-05, + "loss": 1.3831, + "step": 1278 + }, + { + "epoch": 1.0356275303643725, + "grad_norm": 1.1917159960823642, + "learning_rate": 1.4920390721445993e-05, + "loss": 1.3729, + "step": 1279 + }, + { + "epoch": 1.0364372469635628, + "grad_norm": 1.1733824111242548, + "learning_rate": 1.4912906042655164e-05, + "loss": 1.3949, + "step": 1280 + }, + { + "epoch": 1.037246963562753, + "grad_norm": 1.1752871025521956, + "learning_rate": 1.4905417734225488e-05, + "loss": 1.3553, + "step": 1281 + }, + { + "epoch": 1.0380566801619433, + "grad_norm": 1.1715814660303838, + "learning_rate": 1.4897925801689304e-05, + "loss": 1.3838, + "step": 1282 + }, + { + "epoch": 1.0388663967611336, + "grad_norm": 1.2415703860859453, + "learning_rate": 1.4890430250581622e-05, + "loss": 1.3841, + "step": 1283 + }, + { + "epoch": 1.0396761133603238, + "grad_norm": 1.1825484273450233, + "learning_rate": 1.4882931086440133e-05, + "loss": 1.3573, + "step": 1284 + }, + { + "epoch": 1.040485829959514, + "grad_norm": 1.1504373958278675, + "learning_rate": 1.4875428314805195e-05, + "loss": 1.3256, + "step": 1285 + }, + { + "epoch": 1.0412955465587044, + "grad_norm": 1.1436853666808242, + "learning_rate": 1.4867921941219834e-05, + "loss": 1.3932, + "step": 1286 + }, + { + "epoch": 1.0421052631578946, + "grad_norm": 1.162386050137052, + "learning_rate": 1.4860411971229728e-05, + "loss": 1.3845, + "step": 1287 + }, + { + "epoch": 1.042914979757085, + "grad_norm": 1.2092585781453715, + "learning_rate": 1.485289841038322e-05, + "loss": 1.3369, + "step": 1288 + }, + { + "epoch": 1.0437246963562754, + "grad_norm": 1.145037292414635, + "learning_rate": 1.484538126423131e-05, + "loss": 1.4054, + "step": 1289 + }, + { + "epoch": 1.0445344129554657, + "grad_norm": 1.1686385666583774, + "learning_rate": 1.483786053832763e-05, + "loss": 1.3353, + "step": 1290 + }, + { + "epoch": 1.045344129554656, + "grad_norm": 1.2275739238178558, + "learning_rate": 1.483033623822848e-05, + "loss": 1.4264, + "step": 1291 + }, + { + "epoch": 1.0461538461538462, + "grad_norm": 1.1623896668490261, + "learning_rate": 1.4822808369492778e-05, + "loss": 1.4345, + "step": 1292 + }, + { + "epoch": 1.0469635627530365, + "grad_norm": 1.2296387845097625, + "learning_rate": 1.4815276937682094e-05, + "loss": 1.3207, + "step": 1293 + }, + { + "epoch": 1.0477732793522267, + "grad_norm": 1.176201622341318, + "learning_rate": 1.4807741948360625e-05, + "loss": 1.3585, + "step": 1294 + }, + { + "epoch": 1.048582995951417, + "grad_norm": 1.1610794617830476, + "learning_rate": 1.4800203407095194e-05, + "loss": 1.3479, + "step": 1295 + }, + { + "epoch": 1.0493927125506073, + "grad_norm": 1.1776599630056936, + "learning_rate": 1.4792661319455252e-05, + "loss": 1.4027, + "step": 1296 + }, + { + "epoch": 1.0502024291497976, + "grad_norm": 1.2338228585850806, + "learning_rate": 1.4785115691012866e-05, + "loss": 1.3332, + "step": 1297 + }, + { + "epoch": 1.0510121457489878, + "grad_norm": 1.1880604320942654, + "learning_rate": 1.4777566527342729e-05, + "loss": 1.3304, + "step": 1298 + }, + { + "epoch": 1.051821862348178, + "grad_norm": 1.1817231459155564, + "learning_rate": 1.4770013834022128e-05, + "loss": 1.3714, + "step": 1299 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.185176582407899, + "learning_rate": 1.4762457616630972e-05, + "loss": 1.2879, + "step": 1300 + }, + { + "epoch": 1.0534412955465586, + "grad_norm": 1.1828727222185689, + "learning_rate": 1.4754897880751776e-05, + "loss": 1.2923, + "step": 1301 + }, + { + "epoch": 1.054251012145749, + "grad_norm": 1.1744325557146087, + "learning_rate": 1.474733463196964e-05, + "loss": 1.3548, + "step": 1302 + }, + { + "epoch": 1.0550607287449392, + "grad_norm": 1.160850787157178, + "learning_rate": 1.4739767875872271e-05, + "loss": 1.4059, + "step": 1303 + }, + { + "epoch": 1.0558704453441297, + "grad_norm": 1.176977157825984, + "learning_rate": 1.473219761804996e-05, + "loss": 1.3351, + "step": 1304 + }, + { + "epoch": 1.05668016194332, + "grad_norm": 1.2503348243091958, + "learning_rate": 1.4724623864095595e-05, + "loss": 1.4432, + "step": 1305 + }, + { + "epoch": 1.0574898785425102, + "grad_norm": 1.1872676938264057, + "learning_rate": 1.4717046619604636e-05, + "loss": 1.4101, + "step": 1306 + }, + { + "epoch": 1.0582995951417005, + "grad_norm": 1.168594441423703, + "learning_rate": 1.4709465890175125e-05, + "loss": 1.3947, + "step": 1307 + }, + { + "epoch": 1.0591093117408907, + "grad_norm": 1.1705898387765592, + "learning_rate": 1.4701881681407684e-05, + "loss": 1.3358, + "step": 1308 + }, + { + "epoch": 1.059919028340081, + "grad_norm": 1.1603679171263597, + "learning_rate": 1.46942939989055e-05, + "loss": 1.3784, + "step": 1309 + }, + { + "epoch": 1.0607287449392713, + "grad_norm": 1.164400476925873, + "learning_rate": 1.4686702848274328e-05, + "loss": 1.3856, + "step": 1310 + }, + { + "epoch": 1.0615384615384615, + "grad_norm": 1.1554651486479175, + "learning_rate": 1.4679108235122482e-05, + "loss": 1.3397, + "step": 1311 + }, + { + "epoch": 1.0623481781376518, + "grad_norm": 1.2155869311427083, + "learning_rate": 1.467151016506084e-05, + "loss": 1.3729, + "step": 1312 + }, + { + "epoch": 1.063157894736842, + "grad_norm": 1.2133615435457628, + "learning_rate": 1.4663908643702836e-05, + "loss": 1.3122, + "step": 1313 + }, + { + "epoch": 1.0639676113360323, + "grad_norm": 1.1634564916763344, + "learning_rate": 1.465630367666444e-05, + "loss": 1.3354, + "step": 1314 + }, + { + "epoch": 1.0647773279352226, + "grad_norm": 1.2001955918988352, + "learning_rate": 1.4648695269564182e-05, + "loss": 1.4082, + "step": 1315 + }, + { + "epoch": 1.0655870445344129, + "grad_norm": 1.1379663684306365, + "learning_rate": 1.4641083428023124e-05, + "loss": 1.3493, + "step": 1316 + }, + { + "epoch": 1.0663967611336032, + "grad_norm": 1.1718455198343034, + "learning_rate": 1.4633468157664879e-05, + "loss": 1.3505, + "step": 1317 + }, + { + "epoch": 1.0672064777327934, + "grad_norm": 1.1646759648158063, + "learning_rate": 1.4625849464115571e-05, + "loss": 1.3558, + "step": 1318 + }, + { + "epoch": 1.068016194331984, + "grad_norm": 1.162310000496439, + "learning_rate": 1.4618227353003878e-05, + "loss": 1.3958, + "step": 1319 + }, + { + "epoch": 1.0688259109311742, + "grad_norm": 1.1488822317358849, + "learning_rate": 1.461060182996098e-05, + "loss": 1.4, + "step": 1320 + }, + { + "epoch": 1.0696356275303645, + "grad_norm": 1.2033656269010369, + "learning_rate": 1.4602972900620596e-05, + "loss": 1.3656, + "step": 1321 + }, + { + "epoch": 1.0704453441295547, + "grad_norm": 1.2654857663774528, + "learning_rate": 1.459534057061895e-05, + "loss": 1.314, + "step": 1322 + }, + { + "epoch": 1.071255060728745, + "grad_norm": 1.1409457494007944, + "learning_rate": 1.4587704845594784e-05, + "loss": 1.3914, + "step": 1323 + }, + { + "epoch": 1.0720647773279353, + "grad_norm": 1.1625130696761021, + "learning_rate": 1.4580065731189344e-05, + "loss": 1.3809, + "step": 1324 + }, + { + "epoch": 1.0728744939271255, + "grad_norm": 1.1754811815518604, + "learning_rate": 1.4572423233046386e-05, + "loss": 1.357, + "step": 1325 + }, + { + "epoch": 1.0736842105263158, + "grad_norm": 1.274907802092002, + "learning_rate": 1.456477735681216e-05, + "loss": 1.3543, + "step": 1326 + }, + { + "epoch": 1.074493927125506, + "grad_norm": 1.194810175856374, + "learning_rate": 1.455712810813542e-05, + "loss": 1.3274, + "step": 1327 + }, + { + "epoch": 1.0753036437246963, + "grad_norm": 1.1841749394103487, + "learning_rate": 1.4549475492667395e-05, + "loss": 1.4214, + "step": 1328 + }, + { + "epoch": 1.0761133603238866, + "grad_norm": 1.1616739871602735, + "learning_rate": 1.4541819516061824e-05, + "loss": 1.3143, + "step": 1329 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 1.235456801171255, + "learning_rate": 1.4534160183974908e-05, + "loss": 1.3862, + "step": 1330 + }, + { + "epoch": 1.0777327935222671, + "grad_norm": 1.185526004922521, + "learning_rate": 1.4526497502065343e-05, + "loss": 1.3827, + "step": 1331 + }, + { + "epoch": 1.0785425101214574, + "grad_norm": 1.181146915990783, + "learning_rate": 1.4518831475994287e-05, + "loss": 1.3566, + "step": 1332 + }, + { + "epoch": 1.0793522267206477, + "grad_norm": 1.2073400109649837, + "learning_rate": 1.4511162111425377e-05, + "loss": 1.3772, + "step": 1333 + }, + { + "epoch": 1.0801619433198382, + "grad_norm": 1.2061215113680948, + "learning_rate": 1.450348941402472e-05, + "loss": 1.4011, + "step": 1334 + }, + { + "epoch": 1.0809716599190284, + "grad_norm": 1.2058666314849154, + "learning_rate": 1.4495813389460875e-05, + "loss": 1.3617, + "step": 1335 + }, + { + "epoch": 1.0817813765182187, + "grad_norm": 1.1903692062674975, + "learning_rate": 1.448813404340486e-05, + "loss": 1.2916, + "step": 1336 + }, + { + "epoch": 1.082591093117409, + "grad_norm": 1.2340367866655837, + "learning_rate": 1.4480451381530159e-05, + "loss": 1.2935, + "step": 1337 + }, + { + "epoch": 1.0834008097165992, + "grad_norm": 1.2659821523943215, + "learning_rate": 1.447276540951269e-05, + "loss": 1.362, + "step": 1338 + }, + { + "epoch": 1.0842105263157895, + "grad_norm": 1.2228755854040374, + "learning_rate": 1.4465076133030828e-05, + "loss": 1.3781, + "step": 1339 + }, + { + "epoch": 1.0850202429149798, + "grad_norm": 1.1570938076905548, + "learning_rate": 1.4457383557765385e-05, + "loss": 1.3808, + "step": 1340 + }, + { + "epoch": 1.08582995951417, + "grad_norm": 1.1637617032486594, + "learning_rate": 1.4449687689399607e-05, + "loss": 1.3515, + "step": 1341 + }, + { + "epoch": 1.0866396761133603, + "grad_norm": 1.1678076483156412, + "learning_rate": 1.4441988533619182e-05, + "loss": 1.3391, + "step": 1342 + }, + { + "epoch": 1.0874493927125506, + "grad_norm": 1.2160441877476182, + "learning_rate": 1.4434286096112215e-05, + "loss": 1.3475, + "step": 1343 + }, + { + "epoch": 1.0882591093117409, + "grad_norm": 1.235249585783021, + "learning_rate": 1.4426580382569241e-05, + "loss": 1.2836, + "step": 1344 + }, + { + "epoch": 1.0890688259109311, + "grad_norm": 1.2382294650559509, + "learning_rate": 1.4418871398683227e-05, + "loss": 1.392, + "step": 1345 + }, + { + "epoch": 1.0898785425101214, + "grad_norm": 1.173232332418364, + "learning_rate": 1.4411159150149532e-05, + "loss": 1.3071, + "step": 1346 + }, + { + "epoch": 1.0906882591093117, + "grad_norm": 1.1510674680418143, + "learning_rate": 1.4403443642665946e-05, + "loss": 1.3228, + "step": 1347 + }, + { + "epoch": 1.091497975708502, + "grad_norm": 1.1961923726867185, + "learning_rate": 1.439572488193266e-05, + "loss": 1.3194, + "step": 1348 + }, + { + "epoch": 1.0923076923076924, + "grad_norm": 1.1902807444112073, + "learning_rate": 1.438800287365227e-05, + "loss": 1.3624, + "step": 1349 + }, + { + "epoch": 1.0931174089068827, + "grad_norm": 1.1834064174607142, + "learning_rate": 1.4380277623529766e-05, + "loss": 1.3173, + "step": 1350 + }, + { + "epoch": 1.093927125506073, + "grad_norm": 1.2136583995550188, + "learning_rate": 1.437254913727254e-05, + "loss": 1.3443, + "step": 1351 + }, + { + "epoch": 1.0947368421052632, + "grad_norm": 1.154981299062566, + "learning_rate": 1.4364817420590373e-05, + "loss": 1.3872, + "step": 1352 + }, + { + "epoch": 1.0955465587044535, + "grad_norm": 1.2102553779995304, + "learning_rate": 1.4357082479195435e-05, + "loss": 1.2916, + "step": 1353 + }, + { + "epoch": 1.0963562753036438, + "grad_norm": 1.188609622826024, + "learning_rate": 1.434934431880227e-05, + "loss": 1.3302, + "step": 1354 + }, + { + "epoch": 1.097165991902834, + "grad_norm": 1.200714765859928, + "learning_rate": 1.4341602945127806e-05, + "loss": 1.2963, + "step": 1355 + }, + { + "epoch": 1.0979757085020243, + "grad_norm": 1.1787138315863135, + "learning_rate": 1.4333858363891346e-05, + "loss": 1.3517, + "step": 1356 + }, + { + "epoch": 1.0987854251012146, + "grad_norm": 1.2170954780680936, + "learning_rate": 1.4326110580814563e-05, + "loss": 1.3765, + "step": 1357 + }, + { + "epoch": 1.0995951417004048, + "grad_norm": 1.250816449134447, + "learning_rate": 1.431835960162149e-05, + "loss": 1.3175, + "step": 1358 + }, + { + "epoch": 1.1004048582995951, + "grad_norm": 1.177624560111963, + "learning_rate": 1.4310605432038527e-05, + "loss": 1.3935, + "step": 1359 + }, + { + "epoch": 1.1012145748987854, + "grad_norm": 1.2197836972826388, + "learning_rate": 1.4302848077794427e-05, + "loss": 1.4803, + "step": 1360 + }, + { + "epoch": 1.1020242914979756, + "grad_norm": 1.2127800632997665, + "learning_rate": 1.42950875446203e-05, + "loss": 1.3772, + "step": 1361 + }, + { + "epoch": 1.102834008097166, + "grad_norm": 1.23399617147964, + "learning_rate": 1.4287323838249603e-05, + "loss": 1.3533, + "step": 1362 + }, + { + "epoch": 1.1036437246963562, + "grad_norm": 1.2240074250465314, + "learning_rate": 1.4279556964418135e-05, + "loss": 1.3569, + "step": 1363 + }, + { + "epoch": 1.1044534412955465, + "grad_norm": 1.2564824376072379, + "learning_rate": 1.4271786928864037e-05, + "loss": 1.4501, + "step": 1364 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.1991759525151162, + "learning_rate": 1.426401373732779e-05, + "loss": 1.3907, + "step": 1365 + }, + { + "epoch": 1.1060728744939272, + "grad_norm": 1.2378861157976855, + "learning_rate": 1.4256237395552195e-05, + "loss": 1.3469, + "step": 1366 + }, + { + "epoch": 1.1068825910931175, + "grad_norm": 1.2292213801115794, + "learning_rate": 1.4248457909282391e-05, + "loss": 1.3883, + "step": 1367 + }, + { + "epoch": 1.1076923076923078, + "grad_norm": 1.1914894912948184, + "learning_rate": 1.4240675284265838e-05, + "loss": 1.3029, + "step": 1368 + }, + { + "epoch": 1.108502024291498, + "grad_norm": 1.180343522184691, + "learning_rate": 1.4232889526252316e-05, + "loss": 1.3656, + "step": 1369 + }, + { + "epoch": 1.1093117408906883, + "grad_norm": 1.1905529250584874, + "learning_rate": 1.422510064099391e-05, + "loss": 1.3206, + "step": 1370 + }, + { + "epoch": 1.1101214574898786, + "grad_norm": 1.211461020000152, + "learning_rate": 1.421730863424503e-05, + "loss": 1.2979, + "step": 1371 + }, + { + "epoch": 1.1109311740890688, + "grad_norm": 1.2001565370308576, + "learning_rate": 1.4209513511762381e-05, + "loss": 1.3776, + "step": 1372 + }, + { + "epoch": 1.111740890688259, + "grad_norm": 1.2955199788028005, + "learning_rate": 1.420171527930498e-05, + "loss": 1.3443, + "step": 1373 + }, + { + "epoch": 1.1125506072874494, + "grad_norm": 1.1599118475957895, + "learning_rate": 1.4193913942634122e-05, + "loss": 1.3644, + "step": 1374 + }, + { + "epoch": 1.1133603238866396, + "grad_norm": 1.1585810167212327, + "learning_rate": 1.4186109507513425e-05, + "loss": 1.3979, + "step": 1375 + }, + { + "epoch": 1.11417004048583, + "grad_norm": 1.2720040494303266, + "learning_rate": 1.417830197970877e-05, + "loss": 1.3597, + "step": 1376 + }, + { + "epoch": 1.1149797570850202, + "grad_norm": 1.2098574998597365, + "learning_rate": 1.4170491364988336e-05, + "loss": 1.4478, + "step": 1377 + }, + { + "epoch": 1.1157894736842104, + "grad_norm": 1.2313276250589487, + "learning_rate": 1.416267766912258e-05, + "loss": 1.3725, + "step": 1378 + }, + { + "epoch": 1.1165991902834007, + "grad_norm": 1.1949141861834391, + "learning_rate": 1.4154860897884234e-05, + "loss": 1.394, + "step": 1379 + }, + { + "epoch": 1.117408906882591, + "grad_norm": 1.1798433594200126, + "learning_rate": 1.4147041057048303e-05, + "loss": 1.3759, + "step": 1380 + }, + { + "epoch": 1.1182186234817815, + "grad_norm": 1.2126045567567152, + "learning_rate": 1.4139218152392058e-05, + "loss": 1.4342, + "step": 1381 + }, + { + "epoch": 1.1190283400809717, + "grad_norm": 1.1831352962334098, + "learning_rate": 1.4131392189695037e-05, + "loss": 1.3377, + "step": 1382 + }, + { + "epoch": 1.119838056680162, + "grad_norm": 1.1965883843750686, + "learning_rate": 1.4123563174739036e-05, + "loss": 1.3369, + "step": 1383 + }, + { + "epoch": 1.1206477732793523, + "grad_norm": 1.1612564728192138, + "learning_rate": 1.4115731113308106e-05, + "loss": 1.3275, + "step": 1384 + }, + { + "epoch": 1.1214574898785425, + "grad_norm": 1.2213439759905513, + "learning_rate": 1.4107896011188546e-05, + "loss": 1.3599, + "step": 1385 + }, + { + "epoch": 1.1222672064777328, + "grad_norm": 1.2321562126222285, + "learning_rate": 1.4100057874168906e-05, + "loss": 1.4032, + "step": 1386 + }, + { + "epoch": 1.123076923076923, + "grad_norm": 1.2058709080617371, + "learning_rate": 1.4092216708039974e-05, + "loss": 1.3196, + "step": 1387 + }, + { + "epoch": 1.1238866396761134, + "grad_norm": 1.1731302542020694, + "learning_rate": 1.4084372518594777e-05, + "loss": 1.372, + "step": 1388 + }, + { + "epoch": 1.1246963562753036, + "grad_norm": 1.2473568748894555, + "learning_rate": 1.4076525311628581e-05, + "loss": 1.3703, + "step": 1389 + }, + { + "epoch": 1.125506072874494, + "grad_norm": 1.1445398846114365, + "learning_rate": 1.4068675092938872e-05, + "loss": 1.3289, + "step": 1390 + }, + { + "epoch": 1.1263157894736842, + "grad_norm": 1.178653116534255, + "learning_rate": 1.406082186832537e-05, + "loss": 1.3859, + "step": 1391 + }, + { + "epoch": 1.1271255060728744, + "grad_norm": 1.2540075296621336, + "learning_rate": 1.4052965643590006e-05, + "loss": 1.3663, + "step": 1392 + }, + { + "epoch": 1.1279352226720647, + "grad_norm": 1.278208376958459, + "learning_rate": 1.4045106424536938e-05, + "loss": 1.4342, + "step": 1393 + }, + { + "epoch": 1.128744939271255, + "grad_norm": 1.2422771469402274, + "learning_rate": 1.403724421697253e-05, + "loss": 1.3634, + "step": 1394 + }, + { + "epoch": 1.1295546558704452, + "grad_norm": 1.220480716235926, + "learning_rate": 1.4029379026705352e-05, + "loss": 1.3913, + "step": 1395 + }, + { + "epoch": 1.1303643724696357, + "grad_norm": 1.2008169986282988, + "learning_rate": 1.4021510859546184e-05, + "loss": 1.3316, + "step": 1396 + }, + { + "epoch": 1.131174089068826, + "grad_norm": 1.206940001904269, + "learning_rate": 1.4013639721308004e-05, + "loss": 1.3366, + "step": 1397 + }, + { + "epoch": 1.1319838056680163, + "grad_norm": 1.1864838007376735, + "learning_rate": 1.4005765617805977e-05, + "loss": 1.3345, + "step": 1398 + }, + { + "epoch": 1.1327935222672065, + "grad_norm": 1.1806069165225561, + "learning_rate": 1.3997888554857468e-05, + "loss": 1.3515, + "step": 1399 + }, + { + "epoch": 1.1336032388663968, + "grad_norm": 1.244689296436616, + "learning_rate": 1.3990008538282027e-05, + "loss": 1.4887, + "step": 1400 + }, + { + "epoch": 1.134412955465587, + "grad_norm": 1.2092159259655217, + "learning_rate": 1.3982125573901384e-05, + "loss": 1.3932, + "step": 1401 + }, + { + "epoch": 1.1352226720647773, + "grad_norm": 1.201131078854986, + "learning_rate": 1.3974239667539445e-05, + "loss": 1.3621, + "step": 1402 + }, + { + "epoch": 1.1360323886639676, + "grad_norm": 1.2951035372189361, + "learning_rate": 1.396635082502229e-05, + "loss": 1.3733, + "step": 1403 + }, + { + "epoch": 1.1368421052631579, + "grad_norm": 1.2124240532342052, + "learning_rate": 1.3958459052178175e-05, + "loss": 1.3841, + "step": 1404 + }, + { + "epoch": 1.1376518218623481, + "grad_norm": 1.2078175074517925, + "learning_rate": 1.3950564354837512e-05, + "loss": 1.3062, + "step": 1405 + }, + { + "epoch": 1.1384615384615384, + "grad_norm": 1.2235438631380933, + "learning_rate": 1.3942666738832879e-05, + "loss": 1.3788, + "step": 1406 + }, + { + "epoch": 1.1392712550607287, + "grad_norm": 1.2535687484192757, + "learning_rate": 1.3934766209999012e-05, + "loss": 1.32, + "step": 1407 + }, + { + "epoch": 1.140080971659919, + "grad_norm": 1.1792494735598216, + "learning_rate": 1.3926862774172789e-05, + "loss": 1.3501, + "step": 1408 + }, + { + "epoch": 1.1408906882591092, + "grad_norm": 1.2550576055364107, + "learning_rate": 1.391895643719325e-05, + "loss": 1.3685, + "step": 1409 + }, + { + "epoch": 1.1417004048582995, + "grad_norm": 1.187775539751402, + "learning_rate": 1.391104720490156e-05, + "loss": 1.3756, + "step": 1410 + }, + { + "epoch": 1.14251012145749, + "grad_norm": 1.1716789792191453, + "learning_rate": 1.3903135083141046e-05, + "loss": 1.3657, + "step": 1411 + }, + { + "epoch": 1.1433198380566802, + "grad_norm": 1.1878226281741322, + "learning_rate": 1.389522007775715e-05, + "loss": 1.3513, + "step": 1412 + }, + { + "epoch": 1.1441295546558705, + "grad_norm": 1.172121127796374, + "learning_rate": 1.3887302194597455e-05, + "loss": 1.3747, + "step": 1413 + }, + { + "epoch": 1.1449392712550608, + "grad_norm": 1.1977041128005224, + "learning_rate": 1.3879381439511664e-05, + "loss": 1.3213, + "step": 1414 + }, + { + "epoch": 1.145748987854251, + "grad_norm": 1.1951668935892397, + "learning_rate": 1.387145781835161e-05, + "loss": 1.3693, + "step": 1415 + }, + { + "epoch": 1.1465587044534413, + "grad_norm": 1.2425273987483105, + "learning_rate": 1.3863531336971231e-05, + "loss": 1.3388, + "step": 1416 + }, + { + "epoch": 1.1473684210526316, + "grad_norm": 1.2738201413338597, + "learning_rate": 1.3855602001226596e-05, + "loss": 1.4744, + "step": 1417 + }, + { + "epoch": 1.1481781376518219, + "grad_norm": 1.2067856613703325, + "learning_rate": 1.384766981697586e-05, + "loss": 1.3698, + "step": 1418 + }, + { + "epoch": 1.1489878542510121, + "grad_norm": 1.2309122591312711, + "learning_rate": 1.3839734790079304e-05, + "loss": 1.4008, + "step": 1419 + }, + { + "epoch": 1.1497975708502024, + "grad_norm": 1.1924268380366414, + "learning_rate": 1.3831796926399295e-05, + "loss": 1.4003, + "step": 1420 + }, + { + "epoch": 1.1506072874493927, + "grad_norm": 1.252471702177511, + "learning_rate": 1.3823856231800301e-05, + "loss": 1.3868, + "step": 1421 + }, + { + "epoch": 1.151417004048583, + "grad_norm": 1.209467941700512, + "learning_rate": 1.3815912712148885e-05, + "loss": 1.3534, + "step": 1422 + }, + { + "epoch": 1.1522267206477732, + "grad_norm": 1.1971438481581644, + "learning_rate": 1.3807966373313689e-05, + "loss": 1.355, + "step": 1423 + }, + { + "epoch": 1.1530364372469635, + "grad_norm": 1.2066657341311795, + "learning_rate": 1.380001722116544e-05, + "loss": 1.3531, + "step": 1424 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.1713117606005494, + "learning_rate": 1.3792065261576953e-05, + "loss": 1.3873, + "step": 1425 + }, + { + "epoch": 1.1546558704453442, + "grad_norm": 1.2248765090877878, + "learning_rate": 1.3784110500423104e-05, + "loss": 1.3402, + "step": 1426 + }, + { + "epoch": 1.1554655870445345, + "grad_norm": 1.2378827680661408, + "learning_rate": 1.3776152943580846e-05, + "loss": 1.2761, + "step": 1427 + }, + { + "epoch": 1.1562753036437248, + "grad_norm": 1.1711500146006906, + "learning_rate": 1.3768192596929195e-05, + "loss": 1.377, + "step": 1428 + }, + { + "epoch": 1.157085020242915, + "grad_norm": 1.2549196705935586, + "learning_rate": 1.376022946634923e-05, + "loss": 1.4122, + "step": 1429 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 1.221604782084902, + "learning_rate": 1.3752263557724088e-05, + "loss": 1.4329, + "step": 1430 + }, + { + "epoch": 1.1587044534412956, + "grad_norm": 1.244326599131787, + "learning_rate": 1.3744294876938953e-05, + "loss": 1.4161, + "step": 1431 + }, + { + "epoch": 1.1595141700404858, + "grad_norm": 1.1776858381109803, + "learning_rate": 1.3736323429881056e-05, + "loss": 1.411, + "step": 1432 + }, + { + "epoch": 1.1603238866396761, + "grad_norm": 1.2341697465268895, + "learning_rate": 1.3728349222439682e-05, + "loss": 1.3794, + "step": 1433 + }, + { + "epoch": 1.1611336032388664, + "grad_norm": 1.2025715980532212, + "learning_rate": 1.3720372260506152e-05, + "loss": 1.4168, + "step": 1434 + }, + { + "epoch": 1.1619433198380567, + "grad_norm": 1.2316761185815908, + "learning_rate": 1.3712392549973814e-05, + "loss": 1.4173, + "step": 1435 + }, + { + "epoch": 1.162753036437247, + "grad_norm": 1.1692638075647215, + "learning_rate": 1.370441009673805e-05, + "loss": 1.3268, + "step": 1436 + }, + { + "epoch": 1.1635627530364372, + "grad_norm": 1.2081335987676685, + "learning_rate": 1.3696424906696275e-05, + "loss": 1.3603, + "step": 1437 + }, + { + "epoch": 1.1643724696356275, + "grad_norm": 1.2286436380651753, + "learning_rate": 1.3688436985747922e-05, + "loss": 1.398, + "step": 1438 + }, + { + "epoch": 1.1651821862348177, + "grad_norm": 1.2708424368976516, + "learning_rate": 1.3680446339794436e-05, + "loss": 1.36, + "step": 1439 + }, + { + "epoch": 1.165991902834008, + "grad_norm": 1.2883548798199778, + "learning_rate": 1.3672452974739278e-05, + "loss": 1.4225, + "step": 1440 + }, + { + "epoch": 1.1668016194331985, + "grad_norm": 1.2173917634376092, + "learning_rate": 1.366445689648793e-05, + "loss": 1.315, + "step": 1441 + }, + { + "epoch": 1.1676113360323888, + "grad_norm": 1.242957617527675, + "learning_rate": 1.3656458110947864e-05, + "loss": 1.3515, + "step": 1442 + }, + { + "epoch": 1.168421052631579, + "grad_norm": 1.2488134934449235, + "learning_rate": 1.364845662402855e-05, + "loss": 1.3045, + "step": 1443 + }, + { + "epoch": 1.1692307692307693, + "grad_norm": 1.2204335700413085, + "learning_rate": 1.3640452441641466e-05, + "loss": 1.396, + "step": 1444 + }, + { + "epoch": 1.1700404858299596, + "grad_norm": 1.28192971889865, + "learning_rate": 1.3632445569700078e-05, + "loss": 1.3832, + "step": 1445 + }, + { + "epoch": 1.1708502024291498, + "grad_norm": 1.244268004826121, + "learning_rate": 1.362443601411983e-05, + "loss": 1.3672, + "step": 1446 + }, + { + "epoch": 1.17165991902834, + "grad_norm": 1.155976566029991, + "learning_rate": 1.361642378081816e-05, + "loss": 1.2883, + "step": 1447 + }, + { + "epoch": 1.1724696356275304, + "grad_norm": 1.2611118512015467, + "learning_rate": 1.3608408875714478e-05, + "loss": 1.3181, + "step": 1448 + }, + { + "epoch": 1.1732793522267206, + "grad_norm": 1.1853603357638687, + "learning_rate": 1.3600391304730174e-05, + "loss": 1.3431, + "step": 1449 + }, + { + "epoch": 1.174089068825911, + "grad_norm": 1.1956700418684363, + "learning_rate": 1.3592371073788595e-05, + "loss": 1.4223, + "step": 1450 + }, + { + "epoch": 1.1748987854251012, + "grad_norm": 1.2850373149613217, + "learning_rate": 1.3584348188815066e-05, + "loss": 1.4498, + "step": 1451 + }, + { + "epoch": 1.1757085020242914, + "grad_norm": 1.193949378852388, + "learning_rate": 1.357632265573687e-05, + "loss": 1.3437, + "step": 1452 + }, + { + "epoch": 1.1765182186234817, + "grad_norm": 1.1874525953583595, + "learning_rate": 1.356829448048324e-05, + "loss": 1.3978, + "step": 1453 + }, + { + "epoch": 1.177327935222672, + "grad_norm": 1.194001231503012, + "learning_rate": 1.3560263668985366e-05, + "loss": 1.2924, + "step": 1454 + }, + { + "epoch": 1.1781376518218623, + "grad_norm": 1.1791770992975843, + "learning_rate": 1.355223022717639e-05, + "loss": 1.3411, + "step": 1455 + }, + { + "epoch": 1.1789473684210527, + "grad_norm": 1.2243559430427213, + "learning_rate": 1.3544194160991388e-05, + "loss": 1.4058, + "step": 1456 + }, + { + "epoch": 1.1797570850202428, + "grad_norm": 1.1894939791550536, + "learning_rate": 1.353615547636738e-05, + "loss": 1.3892, + "step": 1457 + }, + { + "epoch": 1.1805668016194333, + "grad_norm": 1.2776607461433331, + "learning_rate": 1.3528114179243322e-05, + "loss": 1.3352, + "step": 1458 + }, + { + "epoch": 1.1813765182186236, + "grad_norm": 1.201017466707103, + "learning_rate": 1.3520070275560093e-05, + "loss": 1.3605, + "step": 1459 + }, + { + "epoch": 1.1821862348178138, + "grad_norm": 1.2221052297543349, + "learning_rate": 1.3512023771260507e-05, + "loss": 1.3831, + "step": 1460 + }, + { + "epoch": 1.182995951417004, + "grad_norm": 1.2535433317816922, + "learning_rate": 1.3503974672289295e-05, + "loss": 1.4064, + "step": 1461 + }, + { + "epoch": 1.1838056680161944, + "grad_norm": 1.25001419317799, + "learning_rate": 1.3495922984593101e-05, + "loss": 1.354, + "step": 1462 + }, + { + "epoch": 1.1846153846153846, + "grad_norm": 1.2089559561561227, + "learning_rate": 1.3487868714120494e-05, + "loss": 1.3245, + "step": 1463 + }, + { + "epoch": 1.185425101214575, + "grad_norm": 1.1679267338188017, + "learning_rate": 1.347981186682193e-05, + "loss": 1.4321, + "step": 1464 + }, + { + "epoch": 1.1862348178137652, + "grad_norm": 1.2411376067872044, + "learning_rate": 1.347175244864979e-05, + "loss": 1.4119, + "step": 1465 + }, + { + "epoch": 1.1870445344129554, + "grad_norm": 1.2595186312439597, + "learning_rate": 1.3463690465558346e-05, + "loss": 1.3369, + "step": 1466 + }, + { + "epoch": 1.1878542510121457, + "grad_norm": 1.1909117279605281, + "learning_rate": 1.3455625923503762e-05, + "loss": 1.3863, + "step": 1467 + }, + { + "epoch": 1.188663967611336, + "grad_norm": 1.2054369811230554, + "learning_rate": 1.344755882844409e-05, + "loss": 1.3767, + "step": 1468 + }, + { + "epoch": 1.1894736842105262, + "grad_norm": 1.2323426762141974, + "learning_rate": 1.3439489186339283e-05, + "loss": 1.377, + "step": 1469 + }, + { + "epoch": 1.1902834008097165, + "grad_norm": 1.2454039577692348, + "learning_rate": 1.3431417003151162e-05, + "loss": 1.3446, + "step": 1470 + }, + { + "epoch": 1.191093117408907, + "grad_norm": 1.2587301942793938, + "learning_rate": 1.3423342284843428e-05, + "loss": 1.3762, + "step": 1471 + }, + { + "epoch": 1.191902834008097, + "grad_norm": 1.2722119186788876, + "learning_rate": 1.3415265037381657e-05, + "loss": 1.3654, + "step": 1472 + }, + { + "epoch": 1.1927125506072875, + "grad_norm": 1.2805355921406205, + "learning_rate": 1.3407185266733294e-05, + "loss": 1.3376, + "step": 1473 + }, + { + "epoch": 1.1935222672064778, + "grad_norm": 1.2081785962123999, + "learning_rate": 1.3399102978867648e-05, + "loss": 1.4041, + "step": 1474 + }, + { + "epoch": 1.194331983805668, + "grad_norm": 1.1925409611242321, + "learning_rate": 1.3391018179755886e-05, + "loss": 1.3684, + "step": 1475 + }, + { + "epoch": 1.1951417004048583, + "grad_norm": 1.227526446454291, + "learning_rate": 1.3382930875371028e-05, + "loss": 1.4175, + "step": 1476 + }, + { + "epoch": 1.1959514170040486, + "grad_norm": 1.1817699551420382, + "learning_rate": 1.3374841071687949e-05, + "loss": 1.3924, + "step": 1477 + }, + { + "epoch": 1.1967611336032389, + "grad_norm": 1.1756054088231052, + "learning_rate": 1.3366748774683376e-05, + "loss": 1.357, + "step": 1478 + }, + { + "epoch": 1.1975708502024291, + "grad_norm": 1.222659746027629, + "learning_rate": 1.3358653990335863e-05, + "loss": 1.3636, + "step": 1479 + }, + { + "epoch": 1.1983805668016194, + "grad_norm": 1.2400190458109341, + "learning_rate": 1.3350556724625809e-05, + "loss": 1.3603, + "step": 1480 + }, + { + "epoch": 1.1991902834008097, + "grad_norm": 1.2088314629672083, + "learning_rate": 1.3342456983535457e-05, + "loss": 1.416, + "step": 1481 + }, + { + "epoch": 1.2, + "grad_norm": 1.2142465353436538, + "learning_rate": 1.3334354773048863e-05, + "loss": 1.3035, + "step": 1482 + }, + { + "epoch": 1.2008097165991902, + "grad_norm": 1.1986889230221165, + "learning_rate": 1.3326250099151911e-05, + "loss": 1.3957, + "step": 1483 + }, + { + "epoch": 1.2016194331983805, + "grad_norm": 1.2413091007676673, + "learning_rate": 1.331814296783231e-05, + "loss": 1.3504, + "step": 1484 + }, + { + "epoch": 1.2024291497975708, + "grad_norm": 1.2256638183999424, + "learning_rate": 1.3310033385079589e-05, + "loss": 1.3678, + "step": 1485 + }, + { + "epoch": 1.2032388663967613, + "grad_norm": 1.2400693672871794, + "learning_rate": 1.330192135688507e-05, + "loss": 1.4066, + "step": 1486 + }, + { + "epoch": 1.2040485829959513, + "grad_norm": 1.2362345111970168, + "learning_rate": 1.3293806889241898e-05, + "loss": 1.3888, + "step": 1487 + }, + { + "epoch": 1.2048582995951418, + "grad_norm": 1.2587009847578705, + "learning_rate": 1.3285689988145011e-05, + "loss": 1.4359, + "step": 1488 + }, + { + "epoch": 1.205668016194332, + "grad_norm": 1.1914663304562276, + "learning_rate": 1.3277570659591159e-05, + "loss": 1.3391, + "step": 1489 + }, + { + "epoch": 1.2064777327935223, + "grad_norm": 1.1834896782723838, + "learning_rate": 1.3269448909578866e-05, + "loss": 1.4029, + "step": 1490 + }, + { + "epoch": 1.2072874493927126, + "grad_norm": 1.1813018458720312, + "learning_rate": 1.3261324744108454e-05, + "loss": 1.415, + "step": 1491 + }, + { + "epoch": 1.2080971659919029, + "grad_norm": 1.1818375702199966, + "learning_rate": 1.3253198169182033e-05, + "loss": 1.3473, + "step": 1492 + }, + { + "epoch": 1.2089068825910931, + "grad_norm": 1.194541380193379, + "learning_rate": 1.3245069190803495e-05, + "loss": 1.3753, + "step": 1493 + }, + { + "epoch": 1.2097165991902834, + "grad_norm": 1.2134975986163172, + "learning_rate": 1.3236937814978493e-05, + "loss": 1.3514, + "step": 1494 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 1.1847950487403296, + "learning_rate": 1.3228804047714462e-05, + "loss": 1.3309, + "step": 1495 + }, + { + "epoch": 1.211336032388664, + "grad_norm": 1.1484211925319983, + "learning_rate": 1.322066789502061e-05, + "loss": 1.3631, + "step": 1496 + }, + { + "epoch": 1.2121457489878542, + "grad_norm": 1.2388971531409982, + "learning_rate": 1.3212529362907894e-05, + "loss": 1.3734, + "step": 1497 + }, + { + "epoch": 1.2129554655870445, + "grad_norm": 1.247111008797553, + "learning_rate": 1.3204388457389032e-05, + "loss": 1.3013, + "step": 1498 + }, + { + "epoch": 1.2137651821862347, + "grad_norm": 1.2555192712705792, + "learning_rate": 1.319624518447851e-05, + "loss": 1.3507, + "step": 1499 + }, + { + "epoch": 1.214574898785425, + "grad_norm": 1.3054090163651753, + "learning_rate": 1.3188099550192537e-05, + "loss": 1.3748, + "step": 1500 + }, + { + "epoch": 1.2153846153846155, + "grad_norm": 1.250883089302353, + "learning_rate": 1.317995156054909e-05, + "loss": 1.3597, + "step": 1501 + }, + { + "epoch": 1.2161943319838056, + "grad_norm": 1.2376049269488396, + "learning_rate": 1.3171801221567872e-05, + "loss": 1.3149, + "step": 1502 + }, + { + "epoch": 1.217004048582996, + "grad_norm": 1.1862482520157116, + "learning_rate": 1.3163648539270333e-05, + "loss": 1.4034, + "step": 1503 + }, + { + "epoch": 1.2178137651821863, + "grad_norm": 1.26463939374948, + "learning_rate": 1.315549351967964e-05, + "loss": 1.3365, + "step": 1504 + }, + { + "epoch": 1.2186234817813766, + "grad_norm": 1.2437748606370247, + "learning_rate": 1.31473361688207e-05, + "loss": 1.4052, + "step": 1505 + }, + { + "epoch": 1.2194331983805669, + "grad_norm": 1.2684663340272135, + "learning_rate": 1.3139176492720137e-05, + "loss": 1.3346, + "step": 1506 + }, + { + "epoch": 1.2202429149797571, + "grad_norm": 1.2311472588728176, + "learning_rate": 1.3131014497406288e-05, + "loss": 1.3399, + "step": 1507 + }, + { + "epoch": 1.2210526315789474, + "grad_norm": 1.262810193750128, + "learning_rate": 1.3122850188909216e-05, + "loss": 1.3896, + "step": 1508 + }, + { + "epoch": 1.2218623481781377, + "grad_norm": 1.227527399226647, + "learning_rate": 1.3114683573260677e-05, + "loss": 1.408, + "step": 1509 + }, + { + "epoch": 1.222672064777328, + "grad_norm": 1.2375397432956372, + "learning_rate": 1.3106514656494147e-05, + "loss": 1.3952, + "step": 1510 + }, + { + "epoch": 1.2234817813765182, + "grad_norm": 1.1550935938629225, + "learning_rate": 1.3098343444644793e-05, + "loss": 1.3888, + "step": 1511 + }, + { + "epoch": 1.2242914979757085, + "grad_norm": 1.1942656359224868, + "learning_rate": 1.3090169943749475e-05, + "loss": 1.3832, + "step": 1512 + }, + { + "epoch": 1.2251012145748987, + "grad_norm": 1.1727975184000596, + "learning_rate": 1.3081994159846753e-05, + "loss": 1.3812, + "step": 1513 + }, + { + "epoch": 1.225910931174089, + "grad_norm": 1.197178362301058, + "learning_rate": 1.3073816098976871e-05, + "loss": 1.3731, + "step": 1514 + }, + { + "epoch": 1.2267206477732793, + "grad_norm": 1.2142805284091027, + "learning_rate": 1.3065635767181748e-05, + "loss": 1.4057, + "step": 1515 + }, + { + "epoch": 1.2275303643724695, + "grad_norm": 1.1711215174781089, + "learning_rate": 1.3057453170504988e-05, + "loss": 1.4099, + "step": 1516 + }, + { + "epoch": 1.2283400809716598, + "grad_norm": 1.183153242000738, + "learning_rate": 1.304926831499187e-05, + "loss": 1.374, + "step": 1517 + }, + { + "epoch": 1.2291497975708503, + "grad_norm": 1.2202202744850468, + "learning_rate": 1.3041081206689335e-05, + "loss": 1.3325, + "step": 1518 + }, + { + "epoch": 1.2299595141700406, + "grad_norm": 1.1858098990909935, + "learning_rate": 1.3032891851645994e-05, + "loss": 1.3086, + "step": 1519 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 1.230959944898959, + "learning_rate": 1.302470025591211e-05, + "loss": 1.3373, + "step": 1520 + }, + { + "epoch": 1.231578947368421, + "grad_norm": 1.1451597949607148, + "learning_rate": 1.3016506425539615e-05, + "loss": 1.3721, + "step": 1521 + }, + { + "epoch": 1.2323886639676114, + "grad_norm": 1.3441292904286972, + "learning_rate": 1.3008310366582081e-05, + "loss": 1.4799, + "step": 1522 + }, + { + "epoch": 1.2331983805668016, + "grad_norm": 1.2341770886622436, + "learning_rate": 1.300011208509473e-05, + "loss": 1.3638, + "step": 1523 + }, + { + "epoch": 1.234008097165992, + "grad_norm": 1.1915183190289644, + "learning_rate": 1.2991911587134416e-05, + "loss": 1.4123, + "step": 1524 + }, + { + "epoch": 1.2348178137651822, + "grad_norm": 1.2162901503483015, + "learning_rate": 1.2983708878759655e-05, + "loss": 1.4574, + "step": 1525 + }, + { + "epoch": 1.2356275303643725, + "grad_norm": 1.2621342134046012, + "learning_rate": 1.2975503966030574e-05, + "loss": 1.3648, + "step": 1526 + }, + { + "epoch": 1.2364372469635627, + "grad_norm": 1.1896912894693648, + "learning_rate": 1.2967296855008932e-05, + "loss": 1.3118, + "step": 1527 + }, + { + "epoch": 1.237246963562753, + "grad_norm": 1.2023777230870145, + "learning_rate": 1.2959087551758121e-05, + "loss": 1.4103, + "step": 1528 + }, + { + "epoch": 1.2380566801619433, + "grad_norm": 1.228550678791859, + "learning_rate": 1.2950876062343147e-05, + "loss": 1.3102, + "step": 1529 + }, + { + "epoch": 1.2388663967611335, + "grad_norm": 1.2650925867678326, + "learning_rate": 1.2942662392830632e-05, + "loss": 1.388, + "step": 1530 + }, + { + "epoch": 1.2396761133603238, + "grad_norm": 1.1719459344716467, + "learning_rate": 1.2934446549288801e-05, + "loss": 1.3515, + "step": 1531 + }, + { + "epoch": 1.240485829959514, + "grad_norm": 1.2442167711303942, + "learning_rate": 1.2926228537787498e-05, + "loss": 1.3685, + "step": 1532 + }, + { + "epoch": 1.2412955465587046, + "grad_norm": 1.192558403731443, + "learning_rate": 1.2918008364398164e-05, + "loss": 1.3777, + "step": 1533 + }, + { + "epoch": 1.2421052631578948, + "grad_norm": 1.187107614370798, + "learning_rate": 1.2909786035193836e-05, + "loss": 1.444, + "step": 1534 + }, + { + "epoch": 1.242914979757085, + "grad_norm": 1.2441337176267466, + "learning_rate": 1.290156155624914e-05, + "loss": 1.3631, + "step": 1535 + }, + { + "epoch": 1.2437246963562754, + "grad_norm": 1.174168892326793, + "learning_rate": 1.2893334933640296e-05, + "loss": 1.3321, + "step": 1536 + }, + { + "epoch": 1.2445344129554656, + "grad_norm": 1.208004365868136, + "learning_rate": 1.2885106173445108e-05, + "loss": 1.363, + "step": 1537 + }, + { + "epoch": 1.245344129554656, + "grad_norm": 1.1950930591538556, + "learning_rate": 1.287687528174295e-05, + "loss": 1.3179, + "step": 1538 + }, + { + "epoch": 1.2461538461538462, + "grad_norm": 1.1790688919955996, + "learning_rate": 1.2868642264614787e-05, + "loss": 1.4061, + "step": 1539 + }, + { + "epoch": 1.2469635627530364, + "grad_norm": 1.1987689806967812, + "learning_rate": 1.286040712814314e-05, + "loss": 1.3569, + "step": 1540 + }, + { + "epoch": 1.2477732793522267, + "grad_norm": 1.1817370751262748, + "learning_rate": 1.2852169878412102e-05, + "loss": 1.3369, + "step": 1541 + }, + { + "epoch": 1.248582995951417, + "grad_norm": 1.2282449878735011, + "learning_rate": 1.2843930521507324e-05, + "loss": 1.3283, + "step": 1542 + }, + { + "epoch": 1.2493927125506072, + "grad_norm": 1.26597563474877, + "learning_rate": 1.2835689063516019e-05, + "loss": 1.4247, + "step": 1543 + }, + { + "epoch": 1.2502024291497975, + "grad_norm": 1.2159532514071452, + "learning_rate": 1.2827445510526945e-05, + "loss": 1.3501, + "step": 1544 + }, + { + "epoch": 1.2510121457489878, + "grad_norm": 1.2015354671386715, + "learning_rate": 1.2819199868630419e-05, + "loss": 1.4067, + "step": 1545 + }, + { + "epoch": 1.2518218623481783, + "grad_norm": 1.1488073343742868, + "learning_rate": 1.2810952143918284e-05, + "loss": 1.3881, + "step": 1546 + }, + { + "epoch": 1.2526315789473683, + "grad_norm": 1.1469365945891747, + "learning_rate": 1.2802702342483941e-05, + "loss": 1.2904, + "step": 1547 + }, + { + "epoch": 1.2534412955465588, + "grad_norm": 1.2210656858397426, + "learning_rate": 1.279445047042231e-05, + "loss": 1.3727, + "step": 1548 + }, + { + "epoch": 1.2542510121457489, + "grad_norm": 1.199892214891525, + "learning_rate": 1.278619653382985e-05, + "loss": 1.4002, + "step": 1549 + }, + { + "epoch": 1.2550607287449393, + "grad_norm": 1.233459919301486, + "learning_rate": 1.2777940538804545e-05, + "loss": 1.4107, + "step": 1550 + }, + { + "epoch": 1.2558704453441296, + "grad_norm": 1.147765149659166, + "learning_rate": 1.2769682491445892e-05, + "loss": 1.3451, + "step": 1551 + }, + { + "epoch": 1.2566801619433199, + "grad_norm": 1.1348340061961697, + "learning_rate": 1.276142239785491e-05, + "loss": 1.2995, + "step": 1552 + }, + { + "epoch": 1.2574898785425102, + "grad_norm": 1.2149990877087464, + "learning_rate": 1.275316026413413e-05, + "loss": 1.2791, + "step": 1553 + }, + { + "epoch": 1.2582995951417004, + "grad_norm": 1.1996055435289459, + "learning_rate": 1.274489609638759e-05, + "loss": 1.3854, + "step": 1554 + }, + { + "epoch": 1.2591093117408907, + "grad_norm": 1.2048053640102194, + "learning_rate": 1.2736629900720832e-05, + "loss": 1.3213, + "step": 1555 + }, + { + "epoch": 1.259919028340081, + "grad_norm": 1.1862492357917775, + "learning_rate": 1.2728361683240889e-05, + "loss": 1.4001, + "step": 1556 + }, + { + "epoch": 1.2607287449392712, + "grad_norm": 1.246276511809421, + "learning_rate": 1.2720091450056293e-05, + "loss": 1.3218, + "step": 1557 + }, + { + "epoch": 1.2615384615384615, + "grad_norm": 1.223220081058916, + "learning_rate": 1.2711819207277071e-05, + "loss": 1.356, + "step": 1558 + }, + { + "epoch": 1.2623481781376518, + "grad_norm": 1.2502562370531392, + "learning_rate": 1.2703544961014727e-05, + "loss": 1.3222, + "step": 1559 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.224324008996058, + "learning_rate": 1.2695268717382242e-05, + "loss": 1.3478, + "step": 1560 + }, + { + "epoch": 1.2639676113360325, + "grad_norm": 1.2105633319160907, + "learning_rate": 1.268699048249408e-05, + "loss": 1.4391, + "step": 1561 + }, + { + "epoch": 1.2647773279352226, + "grad_norm": 1.2837901936738587, + "learning_rate": 1.2678710262466178e-05, + "loss": 1.4009, + "step": 1562 + }, + { + "epoch": 1.265587044534413, + "grad_norm": 1.2548245764229349, + "learning_rate": 1.2670428063415932e-05, + "loss": 1.4045, + "step": 1563 + }, + { + "epoch": 1.266396761133603, + "grad_norm": 1.1561209573769804, + "learning_rate": 1.26621438914622e-05, + "loss": 1.3793, + "step": 1564 + }, + { + "epoch": 1.2672064777327936, + "grad_norm": 1.1691572154518026, + "learning_rate": 1.2653857752725305e-05, + "loss": 1.3663, + "step": 1565 + }, + { + "epoch": 1.2680161943319839, + "grad_norm": 1.2306512954942797, + "learning_rate": 1.2645569653327024e-05, + "loss": 1.3454, + "step": 1566 + }, + { + "epoch": 1.2688259109311741, + "grad_norm": 1.1802175610787244, + "learning_rate": 1.2637279599390569e-05, + "loss": 1.369, + "step": 1567 + }, + { + "epoch": 1.2696356275303644, + "grad_norm": 1.1959566114951496, + "learning_rate": 1.2628987597040605e-05, + "loss": 1.3326, + "step": 1568 + }, + { + "epoch": 1.2704453441295547, + "grad_norm": 1.2258150541786852, + "learning_rate": 1.2620693652403241e-05, + "loss": 1.4098, + "step": 1569 + }, + { + "epoch": 1.271255060728745, + "grad_norm": 1.2413950001716263, + "learning_rate": 1.2612397771606015e-05, + "loss": 1.328, + "step": 1570 + }, + { + "epoch": 1.2720647773279352, + "grad_norm": 1.1898669800873198, + "learning_rate": 1.2604099960777896e-05, + "loss": 1.3843, + "step": 1571 + }, + { + "epoch": 1.2728744939271255, + "grad_norm": 1.2359955799975981, + "learning_rate": 1.2595800226049277e-05, + "loss": 1.4085, + "step": 1572 + }, + { + "epoch": 1.2736842105263158, + "grad_norm": 1.2178561670112904, + "learning_rate": 1.258749857355198e-05, + "loss": 1.2919, + "step": 1573 + }, + { + "epoch": 1.274493927125506, + "grad_norm": 1.2042766779705247, + "learning_rate": 1.2579195009419234e-05, + "loss": 1.3142, + "step": 1574 + }, + { + "epoch": 1.2753036437246963, + "grad_norm": 1.2110686014257106, + "learning_rate": 1.2570889539785683e-05, + "loss": 1.3382, + "step": 1575 + }, + { + "epoch": 1.2761133603238866, + "grad_norm": 1.2052344706519602, + "learning_rate": 1.2562582170787385e-05, + "loss": 1.4298, + "step": 1576 + }, + { + "epoch": 1.2769230769230768, + "grad_norm": 1.243022267671595, + "learning_rate": 1.2554272908561798e-05, + "loss": 1.3593, + "step": 1577 + }, + { + "epoch": 1.2777327935222673, + "grad_norm": 1.2033161377378727, + "learning_rate": 1.2545961759247775e-05, + "loss": 1.3927, + "step": 1578 + }, + { + "epoch": 1.2785425101214574, + "grad_norm": 1.2330632054756852, + "learning_rate": 1.2537648728985565e-05, + "loss": 1.4024, + "step": 1579 + }, + { + "epoch": 1.2793522267206479, + "grad_norm": 1.201478870274986, + "learning_rate": 1.2529333823916807e-05, + "loss": 1.4012, + "step": 1580 + }, + { + "epoch": 1.2801619433198381, + "grad_norm": 1.1664964260293886, + "learning_rate": 1.2521017050184531e-05, + "loss": 1.3771, + "step": 1581 + }, + { + "epoch": 1.2809716599190284, + "grad_norm": 1.180036690598496, + "learning_rate": 1.251269841393314e-05, + "loss": 1.3307, + "step": 1582 + }, + { + "epoch": 1.2817813765182187, + "grad_norm": 1.2026900475737514, + "learning_rate": 1.2504377921308408e-05, + "loss": 1.3842, + "step": 1583 + }, + { + "epoch": 1.282591093117409, + "grad_norm": 1.2152832958337092, + "learning_rate": 1.2496055578457496e-05, + "loss": 1.323, + "step": 1584 + }, + { + "epoch": 1.2834008097165992, + "grad_norm": 1.1654873586192795, + "learning_rate": 1.2487731391528919e-05, + "loss": 1.3791, + "step": 1585 + }, + { + "epoch": 1.2842105263157895, + "grad_norm": 1.2189615995410232, + "learning_rate": 1.2479405366672562e-05, + "loss": 1.4297, + "step": 1586 + }, + { + "epoch": 1.2850202429149797, + "grad_norm": 1.2231303739404338, + "learning_rate": 1.2471077510039665e-05, + "loss": 1.4837, + "step": 1587 + }, + { + "epoch": 1.28582995951417, + "grad_norm": 1.1937220120755236, + "learning_rate": 1.2462747827782818e-05, + "loss": 1.3918, + "step": 1588 + }, + { + "epoch": 1.2866396761133603, + "grad_norm": 1.263374882520897, + "learning_rate": 1.2454416326055964e-05, + "loss": 1.4037, + "step": 1589 + }, + { + "epoch": 1.2874493927125505, + "grad_norm": 1.2138622497968994, + "learning_rate": 1.2446083011014389e-05, + "loss": 1.3822, + "step": 1590 + }, + { + "epoch": 1.2882591093117408, + "grad_norm": 1.1753589986143762, + "learning_rate": 1.2437747888814722e-05, + "loss": 1.3099, + "step": 1591 + }, + { + "epoch": 1.289068825910931, + "grad_norm": 1.195846367618967, + "learning_rate": 1.242941096561492e-05, + "loss": 1.3619, + "step": 1592 + }, + { + "epoch": 1.2898785425101216, + "grad_norm": 1.1675847607196994, + "learning_rate": 1.2421072247574277e-05, + "loss": 1.3881, + "step": 1593 + }, + { + "epoch": 1.2906882591093116, + "grad_norm": 1.2109119056603415, + "learning_rate": 1.2412731740853405e-05, + "loss": 1.3412, + "step": 1594 + }, + { + "epoch": 1.291497975708502, + "grad_norm": 1.1465692161369532, + "learning_rate": 1.2404389451614253e-05, + "loss": 1.3789, + "step": 1595 + }, + { + "epoch": 1.2923076923076924, + "grad_norm": 1.1968403107276433, + "learning_rate": 1.2396045386020066e-05, + "loss": 1.3803, + "step": 1596 + }, + { + "epoch": 1.2931174089068826, + "grad_norm": 1.3220159374383795, + "learning_rate": 1.2387699550235419e-05, + "loss": 1.3292, + "step": 1597 + }, + { + "epoch": 1.293927125506073, + "grad_norm": 1.1851391120160597, + "learning_rate": 1.2379351950426188e-05, + "loss": 1.388, + "step": 1598 + }, + { + "epoch": 1.2947368421052632, + "grad_norm": 1.216804570437914, + "learning_rate": 1.2371002592759553e-05, + "loss": 1.3748, + "step": 1599 + }, + { + "epoch": 1.2955465587044535, + "grad_norm": 1.3015331306057583, + "learning_rate": 1.2362651483403985e-05, + "loss": 1.3344, + "step": 1600 + }, + { + "epoch": 1.2963562753036437, + "grad_norm": 1.1871910644240773, + "learning_rate": 1.2354298628529263e-05, + "loss": 1.3712, + "step": 1601 + }, + { + "epoch": 1.297165991902834, + "grad_norm": 1.2506226744247222, + "learning_rate": 1.2345944034306447e-05, + "loss": 1.3352, + "step": 1602 + }, + { + "epoch": 1.2979757085020243, + "grad_norm": 1.222415145371624, + "learning_rate": 1.2337587706907885e-05, + "loss": 1.4298, + "step": 1603 + }, + { + "epoch": 1.2987854251012145, + "grad_norm": 1.2046074904547404, + "learning_rate": 1.2329229652507199e-05, + "loss": 1.3016, + "step": 1604 + }, + { + "epoch": 1.2995951417004048, + "grad_norm": 1.1789481943544442, + "learning_rate": 1.2320869877279297e-05, + "loss": 1.4301, + "step": 1605 + }, + { + "epoch": 1.300404858299595, + "grad_norm": 1.2312716488423778, + "learning_rate": 1.2312508387400356e-05, + "loss": 1.3756, + "step": 1606 + }, + { + "epoch": 1.3012145748987853, + "grad_norm": 1.1876391454771218, + "learning_rate": 1.230414518904781e-05, + "loss": 1.347, + "step": 1607 + }, + { + "epoch": 1.3020242914979758, + "grad_norm": 1.1818075707743736, + "learning_rate": 1.2295780288400365e-05, + "loss": 1.2704, + "step": 1608 + }, + { + "epoch": 1.3028340080971659, + "grad_norm": 1.230394929800309, + "learning_rate": 1.2287413691637986e-05, + "loss": 1.4056, + "step": 1609 + }, + { + "epoch": 1.3036437246963564, + "grad_norm": 1.2093149673925736, + "learning_rate": 1.2279045404941883e-05, + "loss": 1.355, + "step": 1610 + }, + { + "epoch": 1.3044534412955466, + "grad_norm": 1.2368989128597225, + "learning_rate": 1.2270675434494523e-05, + "loss": 1.3272, + "step": 1611 + }, + { + "epoch": 1.305263157894737, + "grad_norm": 1.2457418382739227, + "learning_rate": 1.2262303786479603e-05, + "loss": 1.3158, + "step": 1612 + }, + { + "epoch": 1.3060728744939272, + "grad_norm": 1.193000113626177, + "learning_rate": 1.2253930467082082e-05, + "loss": 1.3006, + "step": 1613 + }, + { + "epoch": 1.3068825910931174, + "grad_norm": 1.177523460512185, + "learning_rate": 1.2245555482488134e-05, + "loss": 1.4537, + "step": 1614 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 1.20594416334784, + "learning_rate": 1.2237178838885168e-05, + "loss": 1.3691, + "step": 1615 + }, + { + "epoch": 1.308502024291498, + "grad_norm": 1.2486014445170293, + "learning_rate": 1.2228800542461828e-05, + "loss": 1.333, + "step": 1616 + }, + { + "epoch": 1.3093117408906882, + "grad_norm": 1.183684994706002, + "learning_rate": 1.2220420599407965e-05, + "loss": 1.3391, + "step": 1617 + }, + { + "epoch": 1.3101214574898785, + "grad_norm": 1.2014854800059243, + "learning_rate": 1.2212039015914656e-05, + "loss": 1.3952, + "step": 1618 + }, + { + "epoch": 1.3109311740890688, + "grad_norm": 1.2525560938912015, + "learning_rate": 1.2203655798174188e-05, + "loss": 1.3096, + "step": 1619 + }, + { + "epoch": 1.311740890688259, + "grad_norm": 1.1545428943491032, + "learning_rate": 1.2195270952380052e-05, + "loss": 1.3919, + "step": 1620 + }, + { + "epoch": 1.3125506072874493, + "grad_norm": 1.1981415320424176, + "learning_rate": 1.2186884484726948e-05, + "loss": 1.3424, + "step": 1621 + }, + { + "epoch": 1.3133603238866396, + "grad_norm": 1.2551393649132647, + "learning_rate": 1.2178496401410772e-05, + "loss": 1.3471, + "step": 1622 + }, + { + "epoch": 1.31417004048583, + "grad_norm": 1.2509875632018495, + "learning_rate": 1.2170106708628604e-05, + "loss": 1.3454, + "step": 1623 + }, + { + "epoch": 1.3149797570850201, + "grad_norm": 1.2300196893904036, + "learning_rate": 1.2161715412578729e-05, + "loss": 1.3805, + "step": 1624 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 1.1907433595568389, + "learning_rate": 1.215332251946061e-05, + "loss": 1.3729, + "step": 1625 + }, + { + "epoch": 1.3165991902834009, + "grad_norm": 1.16306796314768, + "learning_rate": 1.2144928035474886e-05, + "loss": 1.314, + "step": 1626 + }, + { + "epoch": 1.3174089068825912, + "grad_norm": 1.1442311854910465, + "learning_rate": 1.213653196682337e-05, + "loss": 1.3982, + "step": 1627 + }, + { + "epoch": 1.3182186234817814, + "grad_norm": 1.2255785520913092, + "learning_rate": 1.2128134319709057e-05, + "loss": 1.4047, + "step": 1628 + }, + { + "epoch": 1.3190283400809717, + "grad_norm": 1.144767078265005, + "learning_rate": 1.21197351003361e-05, + "loss": 1.3996, + "step": 1629 + }, + { + "epoch": 1.319838056680162, + "grad_norm": 1.2322890116383336, + "learning_rate": 1.2111334314909811e-05, + "loss": 1.3791, + "step": 1630 + }, + { + "epoch": 1.3206477732793522, + "grad_norm": 1.2107662604876777, + "learning_rate": 1.2102931969636664e-05, + "loss": 1.3275, + "step": 1631 + }, + { + "epoch": 1.3214574898785425, + "grad_norm": 1.2271628627713616, + "learning_rate": 1.2094528070724286e-05, + "loss": 1.3068, + "step": 1632 + }, + { + "epoch": 1.3222672064777328, + "grad_norm": 1.240148364284859, + "learning_rate": 1.2086122624381446e-05, + "loss": 1.3663, + "step": 1633 + }, + { + "epoch": 1.323076923076923, + "grad_norm": 1.2055440099566974, + "learning_rate": 1.2077715636818066e-05, + "loss": 1.4334, + "step": 1634 + }, + { + "epoch": 1.3238866396761133, + "grad_norm": 1.230867923850948, + "learning_rate": 1.2069307114245197e-05, + "loss": 1.3325, + "step": 1635 + }, + { + "epoch": 1.3246963562753036, + "grad_norm": 1.1621119132494113, + "learning_rate": 1.2060897062875027e-05, + "loss": 1.3705, + "step": 1636 + }, + { + "epoch": 1.3255060728744938, + "grad_norm": 1.2483246064419058, + "learning_rate": 1.2052485488920877e-05, + "loss": 1.3473, + "step": 1637 + }, + { + "epoch": 1.3263157894736843, + "grad_norm": 1.2207617901689776, + "learning_rate": 1.2044072398597188e-05, + "loss": 1.3736, + "step": 1638 + }, + { + "epoch": 1.3271255060728744, + "grad_norm": 1.181566130734583, + "learning_rate": 1.2035657798119527e-05, + "loss": 1.3351, + "step": 1639 + }, + { + "epoch": 1.3279352226720649, + "grad_norm": 1.2130403891894281, + "learning_rate": 1.2027241693704567e-05, + "loss": 1.4527, + "step": 1640 + }, + { + "epoch": 1.3287449392712551, + "grad_norm": 1.2379457536405214, + "learning_rate": 1.2018824091570103e-05, + "loss": 1.4002, + "step": 1641 + }, + { + "epoch": 1.3295546558704454, + "grad_norm": 1.2287910940298021, + "learning_rate": 1.2010404997935032e-05, + "loss": 1.4004, + "step": 1642 + }, + { + "epoch": 1.3303643724696357, + "grad_norm": 1.1412833841977206, + "learning_rate": 1.2001984419019353e-05, + "loss": 1.3346, + "step": 1643 + }, + { + "epoch": 1.331174089068826, + "grad_norm": 1.15781602550099, + "learning_rate": 1.1993562361044157e-05, + "loss": 1.4022, + "step": 1644 + }, + { + "epoch": 1.3319838056680162, + "grad_norm": 1.1570840461641474, + "learning_rate": 1.1985138830231638e-05, + "loss": 1.3471, + "step": 1645 + }, + { + "epoch": 1.3327935222672065, + "grad_norm": 1.1957634256697691, + "learning_rate": 1.1976713832805071e-05, + "loss": 1.3155, + "step": 1646 + }, + { + "epoch": 1.3336032388663968, + "grad_norm": 1.1870263239595638, + "learning_rate": 1.1968287374988819e-05, + "loss": 1.3684, + "step": 1647 + }, + { + "epoch": 1.334412955465587, + "grad_norm": 1.2264800114026502, + "learning_rate": 1.1959859463008316e-05, + "loss": 1.3983, + "step": 1648 + }, + { + "epoch": 1.3352226720647773, + "grad_norm": 1.2388487173488427, + "learning_rate": 1.1951430103090079e-05, + "loss": 1.3473, + "step": 1649 + }, + { + "epoch": 1.3360323886639676, + "grad_norm": 1.2542679619160093, + "learning_rate": 1.1942999301461694e-05, + "loss": 1.2761, + "step": 1650 + }, + { + "epoch": 1.3368421052631578, + "grad_norm": 1.1812219242563955, + "learning_rate": 1.1934567064351802e-05, + "loss": 1.3625, + "step": 1651 + }, + { + "epoch": 1.337651821862348, + "grad_norm": 1.2130955096327123, + "learning_rate": 1.192613339799012e-05, + "loss": 1.3876, + "step": 1652 + }, + { + "epoch": 1.3384615384615386, + "grad_norm": 1.261762862681965, + "learning_rate": 1.1917698308607409e-05, + "loss": 1.3848, + "step": 1653 + }, + { + "epoch": 1.3392712550607286, + "grad_norm": 1.187112353472305, + "learning_rate": 1.1909261802435485e-05, + "loss": 1.3847, + "step": 1654 + }, + { + "epoch": 1.3400809716599191, + "grad_norm": 1.1853951778926974, + "learning_rate": 1.1900823885707216e-05, + "loss": 1.3522, + "step": 1655 + }, + { + "epoch": 1.3408906882591092, + "grad_norm": 1.2146857185059894, + "learning_rate": 1.1892384564656499e-05, + "loss": 1.3787, + "step": 1656 + }, + { + "epoch": 1.3417004048582997, + "grad_norm": 1.3043886121989412, + "learning_rate": 1.1883943845518282e-05, + "loss": 1.3538, + "step": 1657 + }, + { + "epoch": 1.34251012145749, + "grad_norm": 1.184170924694241, + "learning_rate": 1.187550173452854e-05, + "loss": 1.3652, + "step": 1658 + }, + { + "epoch": 1.3433198380566802, + "grad_norm": 1.2197338560870996, + "learning_rate": 1.1867058237924276e-05, + "loss": 1.3972, + "step": 1659 + }, + { + "epoch": 1.3441295546558705, + "grad_norm": 1.158381117646045, + "learning_rate": 1.1858613361943518e-05, + "loss": 1.3444, + "step": 1660 + }, + { + "epoch": 1.3449392712550607, + "grad_norm": 1.1488130103833536, + "learning_rate": 1.1850167112825316e-05, + "loss": 1.3224, + "step": 1661 + }, + { + "epoch": 1.345748987854251, + "grad_norm": 1.189395323195293, + "learning_rate": 1.1841719496809725e-05, + "loss": 1.3928, + "step": 1662 + }, + { + "epoch": 1.3465587044534413, + "grad_norm": 1.2004459789438373, + "learning_rate": 1.1833270520137819e-05, + "loss": 1.3517, + "step": 1663 + }, + { + "epoch": 1.3473684210526315, + "grad_norm": 1.204495169303041, + "learning_rate": 1.182482018905167e-05, + "loss": 1.4155, + "step": 1664 + }, + { + "epoch": 1.3481781376518218, + "grad_norm": 1.2633604245127454, + "learning_rate": 1.1816368509794365e-05, + "loss": 1.4195, + "step": 1665 + }, + { + "epoch": 1.348987854251012, + "grad_norm": 1.180591301251816, + "learning_rate": 1.1807915488609968e-05, + "loss": 1.3682, + "step": 1666 + }, + { + "epoch": 1.3497975708502024, + "grad_norm": 1.1375804044964366, + "learning_rate": 1.1799461131743548e-05, + "loss": 1.3662, + "step": 1667 + }, + { + "epoch": 1.3506072874493928, + "grad_norm": 1.1650602887537418, + "learning_rate": 1.179100544544115e-05, + "loss": 1.3532, + "step": 1668 + }, + { + "epoch": 1.351417004048583, + "grad_norm": 1.1656597543760445, + "learning_rate": 1.1782548435949814e-05, + "loss": 1.3439, + "step": 1669 + }, + { + "epoch": 1.3522267206477734, + "grad_norm": 1.1984145456570525, + "learning_rate": 1.177409010951755e-05, + "loss": 1.3584, + "step": 1670 + }, + { + "epoch": 1.3530364372469634, + "grad_norm": 1.2125180037933434, + "learning_rate": 1.1765630472393338e-05, + "loss": 1.3553, + "step": 1671 + }, + { + "epoch": 1.353846153846154, + "grad_norm": 1.278487148224879, + "learning_rate": 1.1757169530827129e-05, + "loss": 1.3817, + "step": 1672 + }, + { + "epoch": 1.3546558704453442, + "grad_norm": 1.2618389577279896, + "learning_rate": 1.1748707291069846e-05, + "loss": 1.4483, + "step": 1673 + }, + { + "epoch": 1.3554655870445345, + "grad_norm": 1.2332866269192326, + "learning_rate": 1.1740243759373358e-05, + "loss": 1.3525, + "step": 1674 + }, + { + "epoch": 1.3562753036437247, + "grad_norm": 1.2162494031574367, + "learning_rate": 1.1731778941990497e-05, + "loss": 1.3197, + "step": 1675 + }, + { + "epoch": 1.357085020242915, + "grad_norm": 1.182031395601063, + "learning_rate": 1.1723312845175041e-05, + "loss": 1.3705, + "step": 1676 + }, + { + "epoch": 1.3578947368421053, + "grad_norm": 1.188685842481188, + "learning_rate": 1.1714845475181716e-05, + "loss": 1.3931, + "step": 1677 + }, + { + "epoch": 1.3587044534412955, + "grad_norm": 1.216181329532205, + "learning_rate": 1.1706376838266185e-05, + "loss": 1.3821, + "step": 1678 + }, + { + "epoch": 1.3595141700404858, + "grad_norm": 1.2693089283894199, + "learning_rate": 1.169790694068505e-05, + "loss": 1.3803, + "step": 1679 + }, + { + "epoch": 1.360323886639676, + "grad_norm": 1.214964553877776, + "learning_rate": 1.1689435788695844e-05, + "loss": 1.3528, + "step": 1680 + }, + { + "epoch": 1.3611336032388663, + "grad_norm": 1.2317976939891357, + "learning_rate": 1.1680963388557028e-05, + "loss": 1.4125, + "step": 1681 + }, + { + "epoch": 1.3619433198380566, + "grad_norm": 1.2844113480291337, + "learning_rate": 1.1672489746527979e-05, + "loss": 1.3941, + "step": 1682 + }, + { + "epoch": 1.362753036437247, + "grad_norm": 1.2279453781778626, + "learning_rate": 1.1664014868869e-05, + "loss": 1.3184, + "step": 1683 + }, + { + "epoch": 1.3635627530364371, + "grad_norm": 1.2537576929676622, + "learning_rate": 1.16555387618413e-05, + "loss": 1.3569, + "step": 1684 + }, + { + "epoch": 1.3643724696356276, + "grad_norm": 1.2833642997336299, + "learning_rate": 1.1647061431707e-05, + "loss": 1.4056, + "step": 1685 + }, + { + "epoch": 1.3651821862348177, + "grad_norm": 1.235092352736005, + "learning_rate": 1.1638582884729127e-05, + "loss": 1.4097, + "step": 1686 + }, + { + "epoch": 1.3659919028340082, + "grad_norm": 1.187634661590241, + "learning_rate": 1.16301031271716e-05, + "loss": 1.4244, + "step": 1687 + }, + { + "epoch": 1.3668016194331984, + "grad_norm": 1.19737827161909, + "learning_rate": 1.1621622165299233e-05, + "loss": 1.3609, + "step": 1688 + }, + { + "epoch": 1.3676113360323887, + "grad_norm": 1.2414142941245716, + "learning_rate": 1.161314000537774e-05, + "loss": 1.4491, + "step": 1689 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.231861969240117, + "learning_rate": 1.1604656653673707e-05, + "loss": 1.3283, + "step": 1690 + }, + { + "epoch": 1.3692307692307693, + "grad_norm": 1.1830853058793631, + "learning_rate": 1.1596172116454609e-05, + "loss": 1.3827, + "step": 1691 + }, + { + "epoch": 1.3700404858299595, + "grad_norm": 1.2656452551880795, + "learning_rate": 1.1587686399988793e-05, + "loss": 1.419, + "step": 1692 + }, + { + "epoch": 1.3708502024291498, + "grad_norm": 1.2452981014868072, + "learning_rate": 1.157919951054548e-05, + "loss": 1.3345, + "step": 1693 + }, + { + "epoch": 1.37165991902834, + "grad_norm": 1.2262829471723262, + "learning_rate": 1.1570711454394759e-05, + "loss": 1.3604, + "step": 1694 + }, + { + "epoch": 1.3724696356275303, + "grad_norm": 1.1921863334032212, + "learning_rate": 1.156222223780757e-05, + "loss": 1.3358, + "step": 1695 + }, + { + "epoch": 1.3732793522267206, + "grad_norm": 1.2818863761177053, + "learning_rate": 1.1553731867055724e-05, + "loss": 1.3675, + "step": 1696 + }, + { + "epoch": 1.3740890688259109, + "grad_norm": 1.2743486168767448, + "learning_rate": 1.1545240348411877e-05, + "loss": 1.4062, + "step": 1697 + }, + { + "epoch": 1.3748987854251011, + "grad_norm": 1.2406034474747183, + "learning_rate": 1.1536747688149537e-05, + "loss": 1.4102, + "step": 1698 + }, + { + "epoch": 1.3757085020242914, + "grad_norm": 1.2104446295299616, + "learning_rate": 1.1528253892543053e-05, + "loss": 1.44, + "step": 1699 + }, + { + "epoch": 1.376518218623482, + "grad_norm": 1.176123338179055, + "learning_rate": 1.1519758967867608e-05, + "loss": 1.4084, + "step": 1700 + }, + { + "epoch": 1.377327935222672, + "grad_norm": 1.2508166417186997, + "learning_rate": 1.1511262920399233e-05, + "loss": 1.3585, + "step": 1701 + }, + { + "epoch": 1.3781376518218624, + "grad_norm": 1.1964180815673577, + "learning_rate": 1.1502765756414776e-05, + "loss": 1.3996, + "step": 1702 + }, + { + "epoch": 1.3789473684210527, + "grad_norm": 1.2382693764014767, + "learning_rate": 1.1494267482191912e-05, + "loss": 1.2927, + "step": 1703 + }, + { + "epoch": 1.379757085020243, + "grad_norm": 1.346059359155808, + "learning_rate": 1.1485768104009141e-05, + "loss": 1.3396, + "step": 1704 + }, + { + "epoch": 1.3805668016194332, + "grad_norm": 1.3195598291408253, + "learning_rate": 1.1477267628145777e-05, + "loss": 1.3325, + "step": 1705 + }, + { + "epoch": 1.3813765182186235, + "grad_norm": 1.278714888738768, + "learning_rate": 1.146876606088194e-05, + "loss": 1.3388, + "step": 1706 + }, + { + "epoch": 1.3821862348178138, + "grad_norm": 1.2060319063156253, + "learning_rate": 1.1460263408498557e-05, + "loss": 1.337, + "step": 1707 + }, + { + "epoch": 1.382995951417004, + "grad_norm": 1.3011345426618277, + "learning_rate": 1.1451759677277367e-05, + "loss": 1.4089, + "step": 1708 + }, + { + "epoch": 1.3838056680161943, + "grad_norm": 1.238105150141394, + "learning_rate": 1.1443254873500897e-05, + "loss": 1.3463, + "step": 1709 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 1.2043600689977974, + "learning_rate": 1.1434749003452467e-05, + "loss": 1.4289, + "step": 1710 + }, + { + "epoch": 1.3854251012145749, + "grad_norm": 1.252328542955408, + "learning_rate": 1.1426242073416183e-05, + "loss": 1.3714, + "step": 1711 + }, + { + "epoch": 1.3862348178137651, + "grad_norm": 1.302041753695186, + "learning_rate": 1.1417734089676939e-05, + "loss": 1.363, + "step": 1712 + }, + { + "epoch": 1.3870445344129554, + "grad_norm": 1.2218471138487301, + "learning_rate": 1.140922505852041e-05, + "loss": 1.3402, + "step": 1713 + }, + { + "epoch": 1.3878542510121457, + "grad_norm": 1.2397585007328973, + "learning_rate": 1.1400714986233035e-05, + "loss": 1.3859, + "step": 1714 + }, + { + "epoch": 1.3886639676113361, + "grad_norm": 1.2723411222710095, + "learning_rate": 1.1392203879102027e-05, + "loss": 1.4312, + "step": 1715 + }, + { + "epoch": 1.3894736842105262, + "grad_norm": 1.2616308378503096, + "learning_rate": 1.1383691743415364e-05, + "loss": 1.3441, + "step": 1716 + }, + { + "epoch": 1.3902834008097167, + "grad_norm": 1.2096630244554334, + "learning_rate": 1.1375178585461788e-05, + "loss": 1.3332, + "step": 1717 + }, + { + "epoch": 1.391093117408907, + "grad_norm": 1.159693781497365, + "learning_rate": 1.136666441153079e-05, + "loss": 1.3252, + "step": 1718 + }, + { + "epoch": 1.3919028340080972, + "grad_norm": 1.1854318858942796, + "learning_rate": 1.1358149227912613e-05, + "loss": 1.3676, + "step": 1719 + }, + { + "epoch": 1.3927125506072875, + "grad_norm": 1.2233170433120137, + "learning_rate": 1.1349633040898246e-05, + "loss": 1.399, + "step": 1720 + }, + { + "epoch": 1.3935222672064778, + "grad_norm": 1.2879065439442032, + "learning_rate": 1.1341115856779423e-05, + "loss": 1.4255, + "step": 1721 + }, + { + "epoch": 1.394331983805668, + "grad_norm": 1.2707729718124967, + "learning_rate": 1.133259768184861e-05, + "loss": 1.3265, + "step": 1722 + }, + { + "epoch": 1.3951417004048583, + "grad_norm": 1.236173964659415, + "learning_rate": 1.1324078522399005e-05, + "loss": 1.4351, + "step": 1723 + }, + { + "epoch": 1.3959514170040486, + "grad_norm": 1.2153770369914687, + "learning_rate": 1.1315558384724537e-05, + "loss": 1.4008, + "step": 1724 + }, + { + "epoch": 1.3967611336032388, + "grad_norm": 1.1899505676690036, + "learning_rate": 1.1307037275119854e-05, + "loss": 1.3839, + "step": 1725 + }, + { + "epoch": 1.397570850202429, + "grad_norm": 1.1928149560420538, + "learning_rate": 1.1298515199880327e-05, + "loss": 1.3546, + "step": 1726 + }, + { + "epoch": 1.3983805668016194, + "grad_norm": 1.2238669730900604, + "learning_rate": 1.1289992165302036e-05, + "loss": 1.2984, + "step": 1727 + }, + { + "epoch": 1.3991902834008096, + "grad_norm": 1.1887889650994987, + "learning_rate": 1.1281468177681767e-05, + "loss": 1.3361, + "step": 1728 + }, + { + "epoch": 1.4, + "grad_norm": 1.2214260231171459, + "learning_rate": 1.1272943243317017e-05, + "loss": 1.3368, + "step": 1729 + }, + { + "epoch": 1.4008097165991904, + "grad_norm": 1.206685962166214, + "learning_rate": 1.1264417368505981e-05, + "loss": 1.377, + "step": 1730 + }, + { + "epoch": 1.4016194331983804, + "grad_norm": 1.2113952130760868, + "learning_rate": 1.1255890559547549e-05, + "loss": 1.3545, + "step": 1731 + }, + { + "epoch": 1.402429149797571, + "grad_norm": 1.1657293085603149, + "learning_rate": 1.1247362822741292e-05, + "loss": 1.3407, + "step": 1732 + }, + { + "epoch": 1.4032388663967612, + "grad_norm": 1.1487734412817017, + "learning_rate": 1.123883416438748e-05, + "loss": 1.2937, + "step": 1733 + }, + { + "epoch": 1.4040485829959515, + "grad_norm": 1.1513182139430822, + "learning_rate": 1.1230304590787059e-05, + "loss": 1.4015, + "step": 1734 + }, + { + "epoch": 1.4048582995951417, + "grad_norm": 1.2294748713620087, + "learning_rate": 1.1221774108241646e-05, + "loss": 1.4282, + "step": 1735 + }, + { + "epoch": 1.405668016194332, + "grad_norm": 1.2580084797725928, + "learning_rate": 1.121324272305353e-05, + "loss": 1.4106, + "step": 1736 + }, + { + "epoch": 1.4064777327935223, + "grad_norm": 1.2246093554948188, + "learning_rate": 1.1204710441525677e-05, + "loss": 1.3874, + "step": 1737 + }, + { + "epoch": 1.4072874493927126, + "grad_norm": 1.2289401792154757, + "learning_rate": 1.119617726996171e-05, + "loss": 1.4159, + "step": 1738 + }, + { + "epoch": 1.4080971659919028, + "grad_norm": 1.1896362142853578, + "learning_rate": 1.1187643214665905e-05, + "loss": 1.3545, + "step": 1739 + }, + { + "epoch": 1.408906882591093, + "grad_norm": 1.1629116832244053, + "learning_rate": 1.117910828194319e-05, + "loss": 1.3602, + "step": 1740 + }, + { + "epoch": 1.4097165991902834, + "grad_norm": 1.2609850112703145, + "learning_rate": 1.117057247809915e-05, + "loss": 1.4164, + "step": 1741 + }, + { + "epoch": 1.4105263157894736, + "grad_norm": 1.220342239587353, + "learning_rate": 1.1162035809440005e-05, + "loss": 1.3738, + "step": 1742 + }, + { + "epoch": 1.411336032388664, + "grad_norm": 1.168299823561213, + "learning_rate": 1.1153498282272626e-05, + "loss": 1.3956, + "step": 1743 + }, + { + "epoch": 1.4121457489878542, + "grad_norm": 1.1755175754663574, + "learning_rate": 1.11449599029045e-05, + "loss": 1.3535, + "step": 1744 + }, + { + "epoch": 1.4129554655870447, + "grad_norm": 1.2143297219911169, + "learning_rate": 1.1136420677643763e-05, + "loss": 1.3979, + "step": 1745 + }, + { + "epoch": 1.4137651821862347, + "grad_norm": 1.1767586889180381, + "learning_rate": 1.1127880612799158e-05, + "loss": 1.4074, + "step": 1746 + }, + { + "epoch": 1.4145748987854252, + "grad_norm": 1.1817264441831037, + "learning_rate": 1.1119339714680062e-05, + "loss": 1.3971, + "step": 1747 + }, + { + "epoch": 1.4153846153846155, + "grad_norm": 1.1851169547567846, + "learning_rate": 1.111079798959646e-05, + "loss": 1.3668, + "step": 1748 + }, + { + "epoch": 1.4161943319838057, + "grad_norm": 1.2004222820473625, + "learning_rate": 1.1102255443858953e-05, + "loss": 1.3868, + "step": 1749 + }, + { + "epoch": 1.417004048582996, + "grad_norm": 1.1884889082407253, + "learning_rate": 1.1093712083778748e-05, + "loss": 1.3551, + "step": 1750 + }, + { + "epoch": 1.4178137651821863, + "grad_norm": 1.1839821452630235, + "learning_rate": 1.108516791566764e-05, + "loss": 1.3438, + "step": 1751 + }, + { + "epoch": 1.4186234817813765, + "grad_norm": 1.2051936032476567, + "learning_rate": 1.1076622945838045e-05, + "loss": 1.3842, + "step": 1752 + }, + { + "epoch": 1.4194331983805668, + "grad_norm": 1.2344360426305427, + "learning_rate": 1.1068077180602953e-05, + "loss": 1.3924, + "step": 1753 + }, + { + "epoch": 1.420242914979757, + "grad_norm": 1.2188954123830977, + "learning_rate": 1.1059530626275948e-05, + "loss": 1.4137, + "step": 1754 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.2359084462091137, + "learning_rate": 1.1050983289171195e-05, + "loss": 1.3869, + "step": 1755 + }, + { + "epoch": 1.4218623481781376, + "grad_norm": 1.2213164143326687, + "learning_rate": 1.1042435175603439e-05, + "loss": 1.3499, + "step": 1756 + }, + { + "epoch": 1.4226720647773279, + "grad_norm": 1.2425021636630276, + "learning_rate": 1.1033886291888004e-05, + "loss": 1.3418, + "step": 1757 + }, + { + "epoch": 1.4234817813765182, + "grad_norm": 1.188641014563366, + "learning_rate": 1.102533664434077e-05, + "loss": 1.3177, + "step": 1758 + }, + { + "epoch": 1.4242914979757084, + "grad_norm": 1.1958395164768627, + "learning_rate": 1.1016786239278188e-05, + "loss": 1.3896, + "step": 1759 + }, + { + "epoch": 1.425101214574899, + "grad_norm": 1.1862097808375378, + "learning_rate": 1.1008235083017272e-05, + "loss": 1.3035, + "step": 1760 + }, + { + "epoch": 1.425910931174089, + "grad_norm": 1.2172514085079498, + "learning_rate": 1.0999683181875591e-05, + "loss": 1.3385, + "step": 1761 + }, + { + "epoch": 1.4267206477732794, + "grad_norm": 1.263795641893233, + "learning_rate": 1.0991130542171255e-05, + "loss": 1.3504, + "step": 1762 + }, + { + "epoch": 1.4275303643724697, + "grad_norm": 1.218807375034323, + "learning_rate": 1.0982577170222934e-05, + "loss": 1.3728, + "step": 1763 + }, + { + "epoch": 1.42834008097166, + "grad_norm": 1.2785005424913352, + "learning_rate": 1.0974023072349824e-05, + "loss": 1.3292, + "step": 1764 + }, + { + "epoch": 1.4291497975708503, + "grad_norm": 1.1955787893390333, + "learning_rate": 1.096546825487167e-05, + "loss": 1.4012, + "step": 1765 + }, + { + "epoch": 1.4299595141700405, + "grad_norm": 1.2472396886736037, + "learning_rate": 1.0956912724108737e-05, + "loss": 1.3221, + "step": 1766 + }, + { + "epoch": 1.4307692307692308, + "grad_norm": 1.2548879430356208, + "learning_rate": 1.0948356486381829e-05, + "loss": 1.3776, + "step": 1767 + }, + { + "epoch": 1.431578947368421, + "grad_norm": 1.242576432624912, + "learning_rate": 1.0939799548012262e-05, + "loss": 1.4025, + "step": 1768 + }, + { + "epoch": 1.4323886639676113, + "grad_norm": 1.2574563622805863, + "learning_rate": 1.0931241915321877e-05, + "loss": 1.412, + "step": 1769 + }, + { + "epoch": 1.4331983805668016, + "grad_norm": 1.1995040638978254, + "learning_rate": 1.092268359463302e-05, + "loss": 1.3246, + "step": 1770 + }, + { + "epoch": 1.4340080971659919, + "grad_norm": 1.2739773809734911, + "learning_rate": 1.0914124592268557e-05, + "loss": 1.3698, + "step": 1771 + }, + { + "epoch": 1.4348178137651821, + "grad_norm": 1.2199023602628671, + "learning_rate": 1.0905564914551847e-05, + "loss": 1.434, + "step": 1772 + }, + { + "epoch": 1.4356275303643724, + "grad_norm": 1.2464548777989193, + "learning_rate": 1.0897004567806754e-05, + "loss": 1.4201, + "step": 1773 + }, + { + "epoch": 1.4364372469635627, + "grad_norm": 1.214101570752936, + "learning_rate": 1.088844355835763e-05, + "loss": 1.3015, + "step": 1774 + }, + { + "epoch": 1.4372469635627532, + "grad_norm": 1.163110661848742, + "learning_rate": 1.0879881892529325e-05, + "loss": 1.3638, + "step": 1775 + }, + { + "epoch": 1.4380566801619432, + "grad_norm": 1.1936498971792766, + "learning_rate": 1.0871319576647166e-05, + "loss": 1.3742, + "step": 1776 + }, + { + "epoch": 1.4388663967611337, + "grad_norm": 1.2451768719667269, + "learning_rate": 1.0862756617036965e-05, + "loss": 1.4094, + "step": 1777 + }, + { + "epoch": 1.4396761133603238, + "grad_norm": 1.1478943380908433, + "learning_rate": 1.085419302002501e-05, + "loss": 1.3577, + "step": 1778 + }, + { + "epoch": 1.4404858299595142, + "grad_norm": 1.2145161220203882, + "learning_rate": 1.0845628791938058e-05, + "loss": 1.3611, + "step": 1779 + }, + { + "epoch": 1.4412955465587045, + "grad_norm": 1.23124963332477, + "learning_rate": 1.0837063939103332e-05, + "loss": 1.3967, + "step": 1780 + }, + { + "epoch": 1.4421052631578948, + "grad_norm": 1.1833240982834867, + "learning_rate": 1.0828498467848515e-05, + "loss": 1.381, + "step": 1781 + }, + { + "epoch": 1.442914979757085, + "grad_norm": 1.2492813677148087, + "learning_rate": 1.0819932384501755e-05, + "loss": 1.3991, + "step": 1782 + }, + { + "epoch": 1.4437246963562753, + "grad_norm": 1.340019879016165, + "learning_rate": 1.081136569539164e-05, + "loss": 1.3585, + "step": 1783 + }, + { + "epoch": 1.4445344129554656, + "grad_norm": 1.2206768859147779, + "learning_rate": 1.0802798406847213e-05, + "loss": 1.4541, + "step": 1784 + }, + { + "epoch": 1.4453441295546559, + "grad_norm": 1.1832342780679208, + "learning_rate": 1.0794230525197959e-05, + "loss": 1.3752, + "step": 1785 + }, + { + "epoch": 1.4461538461538461, + "grad_norm": 1.196053859850085, + "learning_rate": 1.0785662056773805e-05, + "loss": 1.327, + "step": 1786 + }, + { + "epoch": 1.4469635627530364, + "grad_norm": 1.211397922712786, + "learning_rate": 1.0777093007905102e-05, + "loss": 1.4449, + "step": 1787 + }, + { + "epoch": 1.4477732793522267, + "grad_norm": 1.2836918819568495, + "learning_rate": 1.0768523384922635e-05, + "loss": 1.342, + "step": 1788 + }, + { + "epoch": 1.448582995951417, + "grad_norm": 1.2272591940165982, + "learning_rate": 1.0759953194157617e-05, + "loss": 1.4064, + "step": 1789 + }, + { + "epoch": 1.4493927125506074, + "grad_norm": 1.2062079148051859, + "learning_rate": 1.0751382441941677e-05, + "loss": 1.3203, + "step": 1790 + }, + { + "epoch": 1.4502024291497975, + "grad_norm": 1.116389048563294, + "learning_rate": 1.0742811134606856e-05, + "loss": 1.3733, + "step": 1791 + }, + { + "epoch": 1.451012145748988, + "grad_norm": 1.25258756424204, + "learning_rate": 1.0734239278485608e-05, + "loss": 1.3817, + "step": 1792 + }, + { + "epoch": 1.451821862348178, + "grad_norm": 1.208392946627262, + "learning_rate": 1.0725666879910792e-05, + "loss": 1.3108, + "step": 1793 + }, + { + "epoch": 1.4526315789473685, + "grad_norm": 1.1948986892653393, + "learning_rate": 1.071709394521567e-05, + "loss": 1.4184, + "step": 1794 + }, + { + "epoch": 1.4534412955465588, + "grad_norm": 1.1408273455202025, + "learning_rate": 1.0708520480733895e-05, + "loss": 1.4017, + "step": 1795 + }, + { + "epoch": 1.454251012145749, + "grad_norm": 1.1858291893292587, + "learning_rate": 1.0699946492799515e-05, + "loss": 1.3898, + "step": 1796 + }, + { + "epoch": 1.4550607287449393, + "grad_norm": 1.1442203734732623, + "learning_rate": 1.0691371987746968e-05, + "loss": 1.3862, + "step": 1797 + }, + { + "epoch": 1.4558704453441296, + "grad_norm": 1.1796945509151606, + "learning_rate": 1.0682796971911067e-05, + "loss": 1.3721, + "step": 1798 + }, + { + "epoch": 1.4566801619433198, + "grad_norm": 1.2518379890147606, + "learning_rate": 1.0674221451627003e-05, + "loss": 1.4382, + "step": 1799 + }, + { + "epoch": 1.45748987854251, + "grad_norm": 1.1544556302617863, + "learning_rate": 1.0665645433230345e-05, + "loss": 1.3804, + "step": 1800 + }, + { + "epoch": 1.4582995951417004, + "grad_norm": 1.2433303275893108, + "learning_rate": 1.065706892305703e-05, + "loss": 1.4092, + "step": 1801 + }, + { + "epoch": 1.4591093117408906, + "grad_norm": 1.277806150716417, + "learning_rate": 1.0648491927443352e-05, + "loss": 1.3671, + "step": 1802 + }, + { + "epoch": 1.459919028340081, + "grad_norm": 1.2764604355710378, + "learning_rate": 1.0639914452725966e-05, + "loss": 1.3823, + "step": 1803 + }, + { + "epoch": 1.4607287449392712, + "grad_norm": 1.1879055573384147, + "learning_rate": 1.0631336505241885e-05, + "loss": 1.3828, + "step": 1804 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 1.2161612489980715, + "learning_rate": 1.0622758091328469e-05, + "loss": 1.3319, + "step": 1805 + }, + { + "epoch": 1.4623481781376517, + "grad_norm": 1.2261075654968276, + "learning_rate": 1.0614179217323418e-05, + "loss": 1.3419, + "step": 1806 + }, + { + "epoch": 1.4631578947368422, + "grad_norm": 1.2117166223283236, + "learning_rate": 1.0605599889564782e-05, + "loss": 1.3463, + "step": 1807 + }, + { + "epoch": 1.4639676113360323, + "grad_norm": 1.2031717859192184, + "learning_rate": 1.0597020114390932e-05, + "loss": 1.3635, + "step": 1808 + }, + { + "epoch": 1.4647773279352228, + "grad_norm": 1.2031388198613926, + "learning_rate": 1.0588439898140586e-05, + "loss": 1.3451, + "step": 1809 + }, + { + "epoch": 1.465587044534413, + "grad_norm": 1.3474438239487712, + "learning_rate": 1.0579859247152774e-05, + "loss": 1.4206, + "step": 1810 + }, + { + "epoch": 1.4663967611336033, + "grad_norm": 1.2444786342836838, + "learning_rate": 1.0571278167766857e-05, + "loss": 1.3636, + "step": 1811 + }, + { + "epoch": 1.4672064777327936, + "grad_norm": 1.219013237010411, + "learning_rate": 1.0562696666322502e-05, + "loss": 1.3193, + "step": 1812 + }, + { + "epoch": 1.4680161943319838, + "grad_norm": 1.1690703137014555, + "learning_rate": 1.05541147491597e-05, + "loss": 1.315, + "step": 1813 + }, + { + "epoch": 1.468825910931174, + "grad_norm": 1.2063145766276413, + "learning_rate": 1.0545532422618742e-05, + "loss": 1.3397, + "step": 1814 + }, + { + "epoch": 1.4696356275303644, + "grad_norm": 1.2003457207424921, + "learning_rate": 1.0536949693040224e-05, + "loss": 1.3279, + "step": 1815 + }, + { + "epoch": 1.4704453441295546, + "grad_norm": 1.2020614644434036, + "learning_rate": 1.0528366566765032e-05, + "loss": 1.408, + "step": 1816 + }, + { + "epoch": 1.471255060728745, + "grad_norm": 1.281360311469663, + "learning_rate": 1.0519783050134358e-05, + "loss": 1.3833, + "step": 1817 + }, + { + "epoch": 1.4720647773279352, + "grad_norm": 1.2169999342183757, + "learning_rate": 1.0511199149489673e-05, + "loss": 1.3173, + "step": 1818 + }, + { + "epoch": 1.4728744939271254, + "grad_norm": 1.1845396830671333, + "learning_rate": 1.0502614871172736e-05, + "loss": 1.3344, + "step": 1819 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.2867321451496658, + "learning_rate": 1.0494030221525582e-05, + "loss": 1.4286, + "step": 1820 + }, + { + "epoch": 1.474493927125506, + "grad_norm": 1.2064628750455095, + "learning_rate": 1.0485445206890522e-05, + "loss": 1.3356, + "step": 1821 + }, + { + "epoch": 1.4753036437246965, + "grad_norm": 1.228960560736752, + "learning_rate": 1.0476859833610142e-05, + "loss": 1.4522, + "step": 1822 + }, + { + "epoch": 1.4761133603238865, + "grad_norm": 1.2058195619871348, + "learning_rate": 1.046827410802728e-05, + "loss": 1.3681, + "step": 1823 + }, + { + "epoch": 1.476923076923077, + "grad_norm": 1.1852192696221284, + "learning_rate": 1.0459688036485044e-05, + "loss": 1.3427, + "step": 1824 + }, + { + "epoch": 1.4777327935222673, + "grad_norm": 1.20736083607171, + "learning_rate": 1.0451101625326798e-05, + "loss": 1.4147, + "step": 1825 + }, + { + "epoch": 1.4785425101214575, + "grad_norm": 1.2516509317308389, + "learning_rate": 1.0442514880896156e-05, + "loss": 1.3319, + "step": 1826 + }, + { + "epoch": 1.4793522267206478, + "grad_norm": 1.2212025089204959, + "learning_rate": 1.043392780953697e-05, + "loss": 1.34, + "step": 1827 + }, + { + "epoch": 1.480161943319838, + "grad_norm": 1.241429196352147, + "learning_rate": 1.0425340417593341e-05, + "loss": 1.3743, + "step": 1828 + }, + { + "epoch": 1.4809716599190283, + "grad_norm": 1.18843326826917, + "learning_rate": 1.0416752711409612e-05, + "loss": 1.3653, + "step": 1829 + }, + { + "epoch": 1.4817813765182186, + "grad_norm": 1.218215122004799, + "learning_rate": 1.0408164697330348e-05, + "loss": 1.3632, + "step": 1830 + }, + { + "epoch": 1.4825910931174089, + "grad_norm": 1.203207018787504, + "learning_rate": 1.0399576381700346e-05, + "loss": 1.3731, + "step": 1831 + }, + { + "epoch": 1.4834008097165992, + "grad_norm": 1.2359647155470024, + "learning_rate": 1.0390987770864623e-05, + "loss": 1.363, + "step": 1832 + }, + { + "epoch": 1.4842105263157894, + "grad_norm": 1.211501248267098, + "learning_rate": 1.0382398871168421e-05, + "loss": 1.3222, + "step": 1833 + }, + { + "epoch": 1.4850202429149797, + "grad_norm": 1.1936705020590501, + "learning_rate": 1.0373809688957192e-05, + "loss": 1.3877, + "step": 1834 + }, + { + "epoch": 1.48582995951417, + "grad_norm": 1.1937889310278138, + "learning_rate": 1.0365220230576592e-05, + "loss": 1.4284, + "step": 1835 + }, + { + "epoch": 1.4866396761133602, + "grad_norm": 1.1808597278513449, + "learning_rate": 1.035663050237248e-05, + "loss": 1.3622, + "step": 1836 + }, + { + "epoch": 1.4874493927125507, + "grad_norm": 1.1900999195170403, + "learning_rate": 1.0348040510690929e-05, + "loss": 1.4129, + "step": 1837 + }, + { + "epoch": 1.4882591093117408, + "grad_norm": 1.2368390047722708, + "learning_rate": 1.033945026187819e-05, + "loss": 1.3577, + "step": 1838 + }, + { + "epoch": 1.4890688259109313, + "grad_norm": 1.250086709756194, + "learning_rate": 1.0330859762280712e-05, + "loss": 1.3997, + "step": 1839 + }, + { + "epoch": 1.4898785425101215, + "grad_norm": 1.2472030369660945, + "learning_rate": 1.0322269018245128e-05, + "loss": 1.3501, + "step": 1840 + }, + { + "epoch": 1.4906882591093118, + "grad_norm": 1.2452205777512158, + "learning_rate": 1.0313678036118253e-05, + "loss": 1.3399, + "step": 1841 + }, + { + "epoch": 1.491497975708502, + "grad_norm": 1.2034361473861883, + "learning_rate": 1.0305086822247077e-05, + "loss": 1.3746, + "step": 1842 + }, + { + "epoch": 1.4923076923076923, + "grad_norm": 1.2952780810330986, + "learning_rate": 1.0296495382978756e-05, + "loss": 1.3704, + "step": 1843 + }, + { + "epoch": 1.4931174089068826, + "grad_norm": 1.2614416983159662, + "learning_rate": 1.0287903724660617e-05, + "loss": 1.3932, + "step": 1844 + }, + { + "epoch": 1.4939271255060729, + "grad_norm": 1.207803764074404, + "learning_rate": 1.0279311853640157e-05, + "loss": 1.3694, + "step": 1845 + }, + { + "epoch": 1.4947368421052631, + "grad_norm": 1.181956708022318, + "learning_rate": 1.0270719776265017e-05, + "loss": 1.3917, + "step": 1846 + }, + { + "epoch": 1.4955465587044534, + "grad_norm": 1.1444875887339165, + "learning_rate": 1.0262127498882992e-05, + "loss": 1.3259, + "step": 1847 + }, + { + "epoch": 1.4963562753036437, + "grad_norm": 1.2022791631767018, + "learning_rate": 1.0253535027842032e-05, + "loss": 1.3427, + "step": 1848 + }, + { + "epoch": 1.497165991902834, + "grad_norm": 1.2070972768865362, + "learning_rate": 1.024494236949023e-05, + "loss": 1.3986, + "step": 1849 + }, + { + "epoch": 1.4979757085020242, + "grad_norm": 1.2147753961877357, + "learning_rate": 1.0236349530175807e-05, + "loss": 1.3675, + "step": 1850 + }, + { + "epoch": 1.4987854251012145, + "grad_norm": 1.1591467128751487, + "learning_rate": 1.0227756516247127e-05, + "loss": 1.3841, + "step": 1851 + }, + { + "epoch": 1.499595141700405, + "grad_norm": 1.2069756938729441, + "learning_rate": 1.0219163334052682e-05, + "loss": 1.3365, + "step": 1852 + }, + { + "epoch": 1.500404858299595, + "grad_norm": 1.2440005880081049, + "learning_rate": 1.0210569989941085e-05, + "loss": 1.357, + "step": 1853 + }, + { + "epoch": 1.5012145748987855, + "grad_norm": 1.2000119522275203, + "learning_rate": 1.020197649026107e-05, + "loss": 1.3938, + "step": 1854 + }, + { + "epoch": 1.5020242914979756, + "grad_norm": 1.187811020715873, + "learning_rate": 1.019338284136149e-05, + "loss": 1.3288, + "step": 1855 + }, + { + "epoch": 1.502834008097166, + "grad_norm": 1.1684805914260927, + "learning_rate": 1.01847890495913e-05, + "loss": 1.3791, + "step": 1856 + }, + { + "epoch": 1.5036437246963563, + "grad_norm": 1.2362993183012971, + "learning_rate": 1.0176195121299567e-05, + "loss": 1.394, + "step": 1857 + }, + { + "epoch": 1.5044534412955466, + "grad_norm": 1.2284232654390486, + "learning_rate": 1.0167601062835459e-05, + "loss": 1.3728, + "step": 1858 + }, + { + "epoch": 1.5052631578947369, + "grad_norm": 1.2440453874825037, + "learning_rate": 1.0159006880548237e-05, + "loss": 1.3519, + "step": 1859 + }, + { + "epoch": 1.5060728744939271, + "grad_norm": 1.14460238709053, + "learning_rate": 1.015041258078725e-05, + "loss": 1.365, + "step": 1860 + }, + { + "epoch": 1.5068825910931174, + "grad_norm": 1.1391435364818712, + "learning_rate": 1.0141818169901945e-05, + "loss": 1.3191, + "step": 1861 + }, + { + "epoch": 1.5076923076923077, + "grad_norm": 1.1847183299604456, + "learning_rate": 1.013322365424184e-05, + "loss": 1.4322, + "step": 1862 + }, + { + "epoch": 1.508502024291498, + "grad_norm": 1.2242479719103367, + "learning_rate": 1.012462904015654e-05, + "loss": 1.3728, + "step": 1863 + }, + { + "epoch": 1.5093117408906882, + "grad_norm": 1.2141368241612376, + "learning_rate": 1.011603433399571e-05, + "loss": 1.4276, + "step": 1864 + }, + { + "epoch": 1.5101214574898787, + "grad_norm": 1.197406422654741, + "learning_rate": 1.0107439542109097e-05, + "loss": 1.387, + "step": 1865 + }, + { + "epoch": 1.5109311740890687, + "grad_norm": 1.141149626390884, + "learning_rate": 1.0098844670846504e-05, + "loss": 1.3297, + "step": 1866 + }, + { + "epoch": 1.5117408906882592, + "grad_norm": 1.2001491455611746, + "learning_rate": 1.0090249726557795e-05, + "loss": 1.2897, + "step": 1867 + }, + { + "epoch": 1.5125506072874493, + "grad_norm": 1.1667785806675015, + "learning_rate": 1.0081654715592881e-05, + "loss": 1.3459, + "step": 1868 + }, + { + "epoch": 1.5133603238866398, + "grad_norm": 1.1839167724233615, + "learning_rate": 1.007305964430173e-05, + "loss": 1.3219, + "step": 1869 + }, + { + "epoch": 1.5141700404858298, + "grad_norm": 1.2144434763318714, + "learning_rate": 1.0064464519034358e-05, + "loss": 1.3816, + "step": 1870 + }, + { + "epoch": 1.5149797570850203, + "grad_norm": 1.2135857961398115, + "learning_rate": 1.005586934614081e-05, + "loss": 1.4097, + "step": 1871 + }, + { + "epoch": 1.5157894736842106, + "grad_norm": 1.2095532800796946, + "learning_rate": 1.004727413197117e-05, + "loss": 1.332, + "step": 1872 + }, + { + "epoch": 1.5165991902834008, + "grad_norm": 1.2181508610257055, + "learning_rate": 1.0038678882875557e-05, + "loss": 1.3892, + "step": 1873 + }, + { + "epoch": 1.5174089068825911, + "grad_norm": 1.2026433653521167, + "learning_rate": 1.0030083605204115e-05, + "loss": 1.4083, + "step": 1874 + }, + { + "epoch": 1.5182186234817814, + "grad_norm": 1.1773517841295498, + "learning_rate": 1.0021488305307003e-05, + "loss": 1.3584, + "step": 1875 + }, + { + "epoch": 1.5190283400809717, + "grad_norm": 1.231433388400886, + "learning_rate": 1.00128929895344e-05, + "loss": 1.374, + "step": 1876 + }, + { + "epoch": 1.519838056680162, + "grad_norm": 1.2332502916196273, + "learning_rate": 1.0004297664236502e-05, + "loss": 1.4351, + "step": 1877 + }, + { + "epoch": 1.5206477732793522, + "grad_norm": 1.213292815409757, + "learning_rate": 9.9957023357635e-06, + "loss": 1.2996, + "step": 1878 + }, + { + "epoch": 1.5214574898785425, + "grad_norm": 1.2346861234405477, + "learning_rate": 9.9871070104656e-06, + "loss": 1.3471, + "step": 1879 + }, + { + "epoch": 1.522267206477733, + "grad_norm": 1.254827265085309, + "learning_rate": 9.978511694692999e-06, + "loss": 1.3533, + "step": 1880 + }, + { + "epoch": 1.523076923076923, + "grad_norm": 1.1917367114901438, + "learning_rate": 9.969916394795888e-06, + "loss": 1.3383, + "step": 1881 + }, + { + "epoch": 1.5238866396761135, + "grad_norm": 1.180916076508306, + "learning_rate": 9.961321117124444e-06, + "loss": 1.3722, + "step": 1882 + }, + { + "epoch": 1.5246963562753035, + "grad_norm": 1.2010789539898061, + "learning_rate": 9.952725868028831e-06, + "loss": 1.342, + "step": 1883 + }, + { + "epoch": 1.525506072874494, + "grad_norm": 1.2029836828056097, + "learning_rate": 9.944130653859195e-06, + "loss": 1.3714, + "step": 1884 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 1.230235446023344, + "learning_rate": 9.935535480965647e-06, + "loss": 1.419, + "step": 1885 + }, + { + "epoch": 1.5271255060728746, + "grad_norm": 1.2304103134503401, + "learning_rate": 9.92694035569827e-06, + "loss": 1.3459, + "step": 1886 + }, + { + "epoch": 1.5279352226720648, + "grad_norm": 1.2003919942780323, + "learning_rate": 9.918345284407122e-06, + "loss": 1.4122, + "step": 1887 + }, + { + "epoch": 1.528744939271255, + "grad_norm": 1.2639140447939796, + "learning_rate": 9.909750273442208e-06, + "loss": 1.3368, + "step": 1888 + }, + { + "epoch": 1.5295546558704454, + "grad_norm": 1.2269119630065992, + "learning_rate": 9.901155329153498e-06, + "loss": 1.3554, + "step": 1889 + }, + { + "epoch": 1.5303643724696356, + "grad_norm": 1.2657559146957076, + "learning_rate": 9.892560457890907e-06, + "loss": 1.4207, + "step": 1890 + }, + { + "epoch": 1.531174089068826, + "grad_norm": 1.1993508806620081, + "learning_rate": 9.883965666004293e-06, + "loss": 1.3638, + "step": 1891 + }, + { + "epoch": 1.5319838056680162, + "grad_norm": 1.219522089298675, + "learning_rate": 9.875370959843465e-06, + "loss": 1.3768, + "step": 1892 + }, + { + "epoch": 1.5327935222672064, + "grad_norm": 1.2178201278785572, + "learning_rate": 9.866776345758166e-06, + "loss": 1.408, + "step": 1893 + }, + { + "epoch": 1.5336032388663967, + "grad_norm": 1.1451898153111888, + "learning_rate": 9.858181830098058e-06, + "loss": 1.3561, + "step": 1894 + }, + { + "epoch": 1.5344129554655872, + "grad_norm": 1.2469841070465375, + "learning_rate": 9.849587419212751e-06, + "loss": 1.4224, + "step": 1895 + }, + { + "epoch": 1.5352226720647772, + "grad_norm": 1.1845691507885716, + "learning_rate": 9.840993119451768e-06, + "loss": 1.4053, + "step": 1896 + }, + { + "epoch": 1.5360323886639677, + "grad_norm": 1.1330718380072349, + "learning_rate": 9.832398937164545e-06, + "loss": 1.3581, + "step": 1897 + }, + { + "epoch": 1.5368421052631578, + "grad_norm": 1.1691776979372561, + "learning_rate": 9.823804878700434e-06, + "loss": 1.3208, + "step": 1898 + }, + { + "epoch": 1.5376518218623483, + "grad_norm": 1.1868100114985074, + "learning_rate": 9.815210950408703e-06, + "loss": 1.351, + "step": 1899 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 1.1980676569924105, + "learning_rate": 9.806617158638515e-06, + "loss": 1.3409, + "step": 1900 + }, + { + "epoch": 1.5392712550607288, + "grad_norm": 1.2286369763296952, + "learning_rate": 9.798023509738932e-06, + "loss": 1.3736, + "step": 1901 + }, + { + "epoch": 1.5400809716599189, + "grad_norm": 1.230286054787407, + "learning_rate": 9.789430010058918e-06, + "loss": 1.3594, + "step": 1902 + }, + { + "epoch": 1.5408906882591094, + "grad_norm": 1.1833188959041652, + "learning_rate": 9.78083666594732e-06, + "loss": 1.3609, + "step": 1903 + }, + { + "epoch": 1.5417004048582996, + "grad_norm": 1.225456208787112, + "learning_rate": 9.772243483752876e-06, + "loss": 1.3461, + "step": 1904 + }, + { + "epoch": 1.54251012145749, + "grad_norm": 1.1594495050829507, + "learning_rate": 9.763650469824198e-06, + "loss": 1.3852, + "step": 1905 + }, + { + "epoch": 1.5433198380566802, + "grad_norm": 1.2470244297864663, + "learning_rate": 9.755057630509774e-06, + "loss": 1.3146, + "step": 1906 + }, + { + "epoch": 1.5441295546558704, + "grad_norm": 1.2036173468340592, + "learning_rate": 9.746464972157971e-06, + "loss": 1.3576, + "step": 1907 + }, + { + "epoch": 1.5449392712550607, + "grad_norm": 1.235082778436893, + "learning_rate": 9.737872501117013e-06, + "loss": 1.3347, + "step": 1908 + }, + { + "epoch": 1.545748987854251, + "grad_norm": 1.247100018489477, + "learning_rate": 9.729280223734988e-06, + "loss": 1.4114, + "step": 1909 + }, + { + "epoch": 1.5465587044534415, + "grad_norm": 1.1457263494567647, + "learning_rate": 9.720688146359843e-06, + "loss": 1.386, + "step": 1910 + }, + { + "epoch": 1.5473684210526315, + "grad_norm": 1.1729296197232764, + "learning_rate": 9.712096275339381e-06, + "loss": 1.3698, + "step": 1911 + }, + { + "epoch": 1.548178137651822, + "grad_norm": 1.1426028010449, + "learning_rate": 9.703504617021247e-06, + "loss": 1.286, + "step": 1912 + }, + { + "epoch": 1.548987854251012, + "grad_norm": 1.2004758472296264, + "learning_rate": 9.694913177752927e-06, + "loss": 1.4253, + "step": 1913 + }, + { + "epoch": 1.5497975708502025, + "grad_norm": 1.2117453405073084, + "learning_rate": 9.68632196388175e-06, + "loss": 1.2984, + "step": 1914 + }, + { + "epoch": 1.5506072874493926, + "grad_norm": 1.2033634956141823, + "learning_rate": 9.677730981754875e-06, + "loss": 1.2999, + "step": 1915 + }, + { + "epoch": 1.551417004048583, + "grad_norm": 1.2564126564820501, + "learning_rate": 9.669140237719292e-06, + "loss": 1.3905, + "step": 1916 + }, + { + "epoch": 1.5522267206477731, + "grad_norm": 1.2062084382081446, + "learning_rate": 9.660549738121814e-06, + "loss": 1.3715, + "step": 1917 + }, + { + "epoch": 1.5530364372469636, + "grad_norm": 1.2829351534612063, + "learning_rate": 9.651959489309073e-06, + "loss": 1.3494, + "step": 1918 + }, + { + "epoch": 1.5538461538461539, + "grad_norm": 1.2383174742670793, + "learning_rate": 9.643369497627521e-06, + "loss": 1.4225, + "step": 1919 + }, + { + "epoch": 1.5546558704453441, + "grad_norm": 1.222803913479318, + "learning_rate": 9.634779769423412e-06, + "loss": 1.3943, + "step": 1920 + }, + { + "epoch": 1.5554655870445344, + "grad_norm": 1.193502143064031, + "learning_rate": 9.62619031104281e-06, + "loss": 1.3762, + "step": 1921 + }, + { + "epoch": 1.5562753036437247, + "grad_norm": 1.1907724889410738, + "learning_rate": 9.61760112883158e-06, + "loss": 1.3944, + "step": 1922 + }, + { + "epoch": 1.557085020242915, + "grad_norm": 1.198345852396561, + "learning_rate": 9.609012229135379e-06, + "loss": 1.3854, + "step": 1923 + }, + { + "epoch": 1.5578947368421052, + "grad_norm": 1.2182523045236915, + "learning_rate": 9.600423618299659e-06, + "loss": 1.367, + "step": 1924 + }, + { + "epoch": 1.5587044534412957, + "grad_norm": 1.1805065924488671, + "learning_rate": 9.591835302669657e-06, + "loss": 1.3546, + "step": 1925 + }, + { + "epoch": 1.5595141700404858, + "grad_norm": 1.2249464401299464, + "learning_rate": 9.58324728859039e-06, + "loss": 1.3609, + "step": 1926 + }, + { + "epoch": 1.5603238866396762, + "grad_norm": 1.2243122967938407, + "learning_rate": 9.57465958240666e-06, + "loss": 1.3999, + "step": 1927 + }, + { + "epoch": 1.5611336032388663, + "grad_norm": 1.2274491912534922, + "learning_rate": 9.566072190463032e-06, + "loss": 1.306, + "step": 1928 + }, + { + "epoch": 1.5619433198380568, + "grad_norm": 1.2488797733856527, + "learning_rate": 9.557485119103849e-06, + "loss": 1.3739, + "step": 1929 + }, + { + "epoch": 1.5627530364372468, + "grad_norm": 1.2239459163300153, + "learning_rate": 9.548898374673205e-06, + "loss": 1.3517, + "step": 1930 + }, + { + "epoch": 1.5635627530364373, + "grad_norm": 1.2178846415184048, + "learning_rate": 9.540311963514957e-06, + "loss": 1.3731, + "step": 1931 + }, + { + "epoch": 1.5643724696356274, + "grad_norm": 1.265558847290855, + "learning_rate": 9.531725891972725e-06, + "loss": 1.4236, + "step": 1932 + }, + { + "epoch": 1.5651821862348179, + "grad_norm": 1.1681836474926013, + "learning_rate": 9.523140166389864e-06, + "loss": 1.4025, + "step": 1933 + }, + { + "epoch": 1.5659919028340081, + "grad_norm": 1.2040234728629644, + "learning_rate": 9.514554793109477e-06, + "loss": 1.4339, + "step": 1934 + }, + { + "epoch": 1.5668016194331984, + "grad_norm": 1.265710944730449, + "learning_rate": 9.505969778474418e-06, + "loss": 1.3561, + "step": 1935 + }, + { + "epoch": 1.5676113360323887, + "grad_norm": 1.1846379371827758, + "learning_rate": 9.497385128827266e-06, + "loss": 1.3652, + "step": 1936 + }, + { + "epoch": 1.568421052631579, + "grad_norm": 1.174180232851037, + "learning_rate": 9.48880085051033e-06, + "loss": 1.2807, + "step": 1937 + }, + { + "epoch": 1.5692307692307692, + "grad_norm": 1.242422752714555, + "learning_rate": 9.480216949865644e-06, + "loss": 1.375, + "step": 1938 + }, + { + "epoch": 1.5700404858299595, + "grad_norm": 1.239409495392598, + "learning_rate": 9.471633433234972e-06, + "loss": 1.3435, + "step": 1939 + }, + { + "epoch": 1.5708502024291497, + "grad_norm": 1.2564868657700816, + "learning_rate": 9.463050306959782e-06, + "loss": 1.3503, + "step": 1940 + }, + { + "epoch": 1.57165991902834, + "grad_norm": 1.2398521277680479, + "learning_rate": 9.454467577381263e-06, + "loss": 1.3266, + "step": 1941 + }, + { + "epoch": 1.5724696356275305, + "grad_norm": 1.2135937983968192, + "learning_rate": 9.445885250840301e-06, + "loss": 1.3269, + "step": 1942 + }, + { + "epoch": 1.5732793522267206, + "grad_norm": 1.1895520607669423, + "learning_rate": 9.4373033336775e-06, + "loss": 1.3121, + "step": 1943 + }, + { + "epoch": 1.574089068825911, + "grad_norm": 1.1824970891733284, + "learning_rate": 9.428721832233148e-06, + "loss": 1.3569, + "step": 1944 + }, + { + "epoch": 1.574898785425101, + "grad_norm": 1.2023256894687824, + "learning_rate": 9.42014075284723e-06, + "loss": 1.4145, + "step": 1945 + }, + { + "epoch": 1.5757085020242916, + "grad_norm": 1.18100975780351, + "learning_rate": 9.411560101859417e-06, + "loss": 1.3616, + "step": 1946 + }, + { + "epoch": 1.5765182186234816, + "grad_norm": 1.2357755167800077, + "learning_rate": 9.402979885609071e-06, + "loss": 1.3371, + "step": 1947 + }, + { + "epoch": 1.5773279352226721, + "grad_norm": 1.2279972155267704, + "learning_rate": 9.394400110435225e-06, + "loss": 1.3994, + "step": 1948 + }, + { + "epoch": 1.5781376518218624, + "grad_norm": 1.1914709750868986, + "learning_rate": 9.385820782676584e-06, + "loss": 1.3488, + "step": 1949 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 1.1994067475194277, + "learning_rate": 9.377241908671533e-06, + "loss": 1.3631, + "step": 1950 + }, + { + "epoch": 1.579757085020243, + "grad_norm": 1.1718520198213915, + "learning_rate": 9.368663494758115e-06, + "loss": 1.3229, + "step": 1951 + }, + { + "epoch": 1.5805668016194332, + "grad_norm": 1.2111814984055784, + "learning_rate": 9.360085547274036e-06, + "loss": 1.3721, + "step": 1952 + }, + { + "epoch": 1.5813765182186235, + "grad_norm": 1.2166854804513723, + "learning_rate": 9.351508072556651e-06, + "loss": 1.3382, + "step": 1953 + }, + { + "epoch": 1.5821862348178137, + "grad_norm": 1.2328123213604463, + "learning_rate": 9.342931076942973e-06, + "loss": 1.4074, + "step": 1954 + }, + { + "epoch": 1.582995951417004, + "grad_norm": 1.2394612442811164, + "learning_rate": 9.334354566769658e-06, + "loss": 1.3018, + "step": 1955 + }, + { + "epoch": 1.5838056680161943, + "grad_norm": 1.2112567483732097, + "learning_rate": 9.325778548373e-06, + "loss": 1.2831, + "step": 1956 + }, + { + "epoch": 1.5846153846153848, + "grad_norm": 1.1733648170105058, + "learning_rate": 9.317203028088938e-06, + "loss": 1.3761, + "step": 1957 + }, + { + "epoch": 1.5854251012145748, + "grad_norm": 1.2203734468272454, + "learning_rate": 9.308628012253032e-06, + "loss": 1.3634, + "step": 1958 + }, + { + "epoch": 1.5862348178137653, + "grad_norm": 1.189447117782888, + "learning_rate": 9.300053507200487e-06, + "loss": 1.4065, + "step": 1959 + }, + { + "epoch": 1.5870445344129553, + "grad_norm": 1.2206492902377073, + "learning_rate": 9.291479519266108e-06, + "loss": 1.3927, + "step": 1960 + }, + { + "epoch": 1.5878542510121458, + "grad_norm": 1.1783321735346974, + "learning_rate": 9.282906054784333e-06, + "loss": 1.3714, + "step": 1961 + }, + { + "epoch": 1.5886639676113359, + "grad_norm": 1.165535947374695, + "learning_rate": 9.274333120089211e-06, + "loss": 1.3888, + "step": 1962 + }, + { + "epoch": 1.5894736842105264, + "grad_norm": 1.2497678418850922, + "learning_rate": 9.265760721514397e-06, + "loss": 1.3666, + "step": 1963 + }, + { + "epoch": 1.5902834008097166, + "grad_norm": 1.2311086453584528, + "learning_rate": 9.257188865393148e-06, + "loss": 1.411, + "step": 1964 + }, + { + "epoch": 1.591093117408907, + "grad_norm": 1.1853289463831915, + "learning_rate": 9.248617558058328e-06, + "loss": 1.3884, + "step": 1965 + }, + { + "epoch": 1.5919028340080972, + "grad_norm": 1.2149617440803817, + "learning_rate": 9.240046805842383e-06, + "loss": 1.3175, + "step": 1966 + }, + { + "epoch": 1.5927125506072874, + "grad_norm": 1.2303091959416708, + "learning_rate": 9.231476615077366e-06, + "loss": 1.3443, + "step": 1967 + }, + { + "epoch": 1.5935222672064777, + "grad_norm": 1.2403372778652388, + "learning_rate": 9.2229069920949e-06, + "loss": 1.3279, + "step": 1968 + }, + { + "epoch": 1.594331983805668, + "grad_norm": 1.2729706893834039, + "learning_rate": 9.214337943226199e-06, + "loss": 1.4091, + "step": 1969 + }, + { + "epoch": 1.5951417004048583, + "grad_norm": 1.1994129479779139, + "learning_rate": 9.205769474802045e-06, + "loss": 1.3433, + "step": 1970 + }, + { + "epoch": 1.5959514170040485, + "grad_norm": 1.2304449200153609, + "learning_rate": 9.19720159315279e-06, + "loss": 1.2943, + "step": 1971 + }, + { + "epoch": 1.596761133603239, + "grad_norm": 1.2186847444701223, + "learning_rate": 9.188634304608366e-06, + "loss": 1.3831, + "step": 1972 + }, + { + "epoch": 1.597570850202429, + "grad_norm": 1.2840247109912253, + "learning_rate": 9.180067615498251e-06, + "loss": 1.3342, + "step": 1973 + }, + { + "epoch": 1.5983805668016196, + "grad_norm": 1.183188023738025, + "learning_rate": 9.171501532151486e-06, + "loss": 1.3873, + "step": 1974 + }, + { + "epoch": 1.5991902834008096, + "grad_norm": 1.1776750182121347, + "learning_rate": 9.162936060896672e-06, + "loss": 1.3365, + "step": 1975 + }, + { + "epoch": 1.6, + "grad_norm": 1.328668412553802, + "learning_rate": 9.154371208061943e-06, + "loss": 1.3847, + "step": 1976 + }, + { + "epoch": 1.6008097165991901, + "grad_norm": 1.2168461558396328, + "learning_rate": 9.145806979974991e-06, + "loss": 1.3555, + "step": 1977 + }, + { + "epoch": 1.6016194331983806, + "grad_norm": 1.1622150592681906, + "learning_rate": 9.137243382963039e-06, + "loss": 1.3405, + "step": 1978 + }, + { + "epoch": 1.602429149797571, + "grad_norm": 1.2436189606158277, + "learning_rate": 9.128680423352839e-06, + "loss": 1.3591, + "step": 1979 + }, + { + "epoch": 1.6032388663967612, + "grad_norm": 1.2610631910478973, + "learning_rate": 9.12011810747068e-06, + "loss": 1.4099, + "step": 1980 + }, + { + "epoch": 1.6040485829959514, + "grad_norm": 1.2026899264299413, + "learning_rate": 9.111556441642375e-06, + "loss": 1.3482, + "step": 1981 + }, + { + "epoch": 1.6048582995951417, + "grad_norm": 1.2364975494939607, + "learning_rate": 9.10299543219325e-06, + "loss": 1.3467, + "step": 1982 + }, + { + "epoch": 1.605668016194332, + "grad_norm": 1.2106651427399964, + "learning_rate": 9.094435085448153e-06, + "loss": 1.3102, + "step": 1983 + }, + { + "epoch": 1.6064777327935222, + "grad_norm": 1.1715140130298451, + "learning_rate": 9.085875407731444e-06, + "loss": 1.3365, + "step": 1984 + }, + { + "epoch": 1.6072874493927125, + "grad_norm": 1.1983281783955513, + "learning_rate": 9.07731640536698e-06, + "loss": 1.4136, + "step": 1985 + }, + { + "epoch": 1.6080971659919028, + "grad_norm": 1.1502136649316144, + "learning_rate": 9.068758084678126e-06, + "loss": 1.3614, + "step": 1986 + }, + { + "epoch": 1.6089068825910933, + "grad_norm": 1.213144325311362, + "learning_rate": 9.060200451987741e-06, + "loss": 1.4013, + "step": 1987 + }, + { + "epoch": 1.6097165991902833, + "grad_norm": 1.2047368862136312, + "learning_rate": 9.051643513618176e-06, + "loss": 1.3126, + "step": 1988 + }, + { + "epoch": 1.6105263157894738, + "grad_norm": 1.2612831728284535, + "learning_rate": 9.043087275891266e-06, + "loss": 1.3652, + "step": 1989 + }, + { + "epoch": 1.6113360323886639, + "grad_norm": 1.1804234187245097, + "learning_rate": 9.034531745128334e-06, + "loss": 1.3749, + "step": 1990 + }, + { + "epoch": 1.6121457489878543, + "grad_norm": 1.1934129239753541, + "learning_rate": 9.025976927650176e-06, + "loss": 1.3223, + "step": 1991 + }, + { + "epoch": 1.6129554655870444, + "grad_norm": 1.2290844683292121, + "learning_rate": 9.017422829777068e-06, + "loss": 1.3693, + "step": 1992 + }, + { + "epoch": 1.6137651821862349, + "grad_norm": 1.234955363945963, + "learning_rate": 9.008869457828748e-06, + "loss": 1.3732, + "step": 1993 + }, + { + "epoch": 1.6145748987854251, + "grad_norm": 1.2343436283938842, + "learning_rate": 9.000316818124412e-06, + "loss": 1.3267, + "step": 1994 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 1.1943281327624646, + "learning_rate": 8.991764916982731e-06, + "loss": 1.2832, + "step": 1995 + }, + { + "epoch": 1.6161943319838057, + "grad_norm": 1.241814934842723, + "learning_rate": 8.98321376072182e-06, + "loss": 1.4049, + "step": 1996 + }, + { + "epoch": 1.617004048582996, + "grad_norm": 1.2281313938106257, + "learning_rate": 8.974663355659237e-06, + "loss": 1.3636, + "step": 1997 + }, + { + "epoch": 1.6178137651821862, + "grad_norm": 1.2025757332697389, + "learning_rate": 8.966113708111998e-06, + "loss": 1.2603, + "step": 1998 + }, + { + "epoch": 1.6186234817813765, + "grad_norm": 1.209319926631317, + "learning_rate": 8.957564824396561e-06, + "loss": 1.3905, + "step": 1999 + }, + { + "epoch": 1.6194331983805668, + "grad_norm": 1.2048829491205542, + "learning_rate": 8.949016710828808e-06, + "loss": 1.3835, + "step": 2000 + }, + { + "epoch": 1.620242914979757, + "grad_norm": 1.1837603866283808, + "learning_rate": 8.940469373724054e-06, + "loss": 1.3579, + "step": 2001 + }, + { + "epoch": 1.6210526315789475, + "grad_norm": 7.115639597248133, + "learning_rate": 8.93192281939705e-06, + "loss": 1.328, + "step": 2002 + }, + { + "epoch": 1.6218623481781376, + "grad_norm": 1.225986024046339, + "learning_rate": 8.923377054161959e-06, + "loss": 1.3977, + "step": 2003 + }, + { + "epoch": 1.622672064777328, + "grad_norm": 1.173818300527572, + "learning_rate": 8.914832084332363e-06, + "loss": 1.4135, + "step": 2004 + }, + { + "epoch": 1.623481781376518, + "grad_norm": 1.2141976317329137, + "learning_rate": 8.906287916221259e-06, + "loss": 1.3668, + "step": 2005 + }, + { + "epoch": 1.6242914979757086, + "grad_norm": 1.1459187530159702, + "learning_rate": 8.897744556141047e-06, + "loss": 1.3697, + "step": 2006 + }, + { + "epoch": 1.6251012145748986, + "grad_norm": 1.1655094826381835, + "learning_rate": 8.88920201040354e-06, + "loss": 1.3506, + "step": 2007 + }, + { + "epoch": 1.6259109311740891, + "grad_norm": 1.2237829104408862, + "learning_rate": 8.880660285319941e-06, + "loss": 1.4369, + "step": 2008 + }, + { + "epoch": 1.6267206477732794, + "grad_norm": 1.2400129820732735, + "learning_rate": 8.872119387200844e-06, + "loss": 1.3921, + "step": 2009 + }, + { + "epoch": 1.6275303643724697, + "grad_norm": 1.2312502428364493, + "learning_rate": 8.863579322356242e-06, + "loss": 1.2692, + "step": 2010 + }, + { + "epoch": 1.62834008097166, + "grad_norm": 1.2294849309798999, + "learning_rate": 8.855040097095504e-06, + "loss": 1.3047, + "step": 2011 + }, + { + "epoch": 1.6291497975708502, + "grad_norm": 1.224426204175276, + "learning_rate": 8.846501717727378e-06, + "loss": 1.4028, + "step": 2012 + }, + { + "epoch": 1.6299595141700405, + "grad_norm": 1.2489783782355426, + "learning_rate": 8.837964190559998e-06, + "loss": 1.384, + "step": 2013 + }, + { + "epoch": 1.6307692307692307, + "grad_norm": 1.2161261861703765, + "learning_rate": 8.829427521900852e-06, + "loss": 1.4003, + "step": 2014 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 1.1628457277298792, + "learning_rate": 8.820891718056815e-06, + "loss": 1.3563, + "step": 2015 + }, + { + "epoch": 1.6323886639676113, + "grad_norm": 1.1999497209012282, + "learning_rate": 8.8123567853341e-06, + "loss": 1.3742, + "step": 2016 + }, + { + "epoch": 1.6331983805668018, + "grad_norm": 1.1700076641674673, + "learning_rate": 8.803822730038292e-06, + "loss": 1.3734, + "step": 2017 + }, + { + "epoch": 1.6340080971659918, + "grad_norm": 1.184177246603208, + "learning_rate": 8.795289558474325e-06, + "loss": 1.3199, + "step": 2018 + }, + { + "epoch": 1.6348178137651823, + "grad_norm": 1.207218490736053, + "learning_rate": 8.786757276946473e-06, + "loss": 1.4515, + "step": 2019 + }, + { + "epoch": 1.6356275303643724, + "grad_norm": 1.1821552126386663, + "learning_rate": 8.77822589175836e-06, + "loss": 1.383, + "step": 2020 + }, + { + "epoch": 1.6364372469635629, + "grad_norm": 1.2274446623556885, + "learning_rate": 8.769695409212946e-06, + "loss": 1.3733, + "step": 2021 + }, + { + "epoch": 1.637246963562753, + "grad_norm": 1.2480667270847892, + "learning_rate": 8.76116583561252e-06, + "loss": 1.4254, + "step": 2022 + }, + { + "epoch": 1.6380566801619434, + "grad_norm": 1.2019082482183845, + "learning_rate": 8.752637177258708e-06, + "loss": 1.3923, + "step": 2023 + }, + { + "epoch": 1.6388663967611334, + "grad_norm": 1.1636761907843682, + "learning_rate": 8.744109440452455e-06, + "loss": 1.3485, + "step": 2024 + }, + { + "epoch": 1.639676113360324, + "grad_norm": 1.2715956411428166, + "learning_rate": 8.73558263149402e-06, + "loss": 1.2978, + "step": 2025 + }, + { + "epoch": 1.6404858299595142, + "grad_norm": 1.2120052316388328, + "learning_rate": 8.727056756682985e-06, + "loss": 1.4335, + "step": 2026 + }, + { + "epoch": 1.6412955465587045, + "grad_norm": 1.2616855428891294, + "learning_rate": 8.718531822318236e-06, + "loss": 1.3146, + "step": 2027 + }, + { + "epoch": 1.6421052631578947, + "grad_norm": 1.2332469667194608, + "learning_rate": 8.71000783469797e-06, + "loss": 1.4207, + "step": 2028 + }, + { + "epoch": 1.642914979757085, + "grad_norm": 1.184361570240323, + "learning_rate": 8.701484800119678e-06, + "loss": 1.4211, + "step": 2029 + }, + { + "epoch": 1.6437246963562753, + "grad_norm": 1.1901404032712164, + "learning_rate": 8.692962724880148e-06, + "loss": 1.2767, + "step": 2030 + }, + { + "epoch": 1.6445344129554655, + "grad_norm": 1.1932059161816782, + "learning_rate": 8.684441615275465e-06, + "loss": 1.3932, + "step": 2031 + }, + { + "epoch": 1.645344129554656, + "grad_norm": 1.1891787691549471, + "learning_rate": 8.675921477600996e-06, + "loss": 1.3344, + "step": 2032 + }, + { + "epoch": 1.646153846153846, + "grad_norm": 1.2379204694615744, + "learning_rate": 8.667402318151394e-06, + "loss": 1.3101, + "step": 2033 + }, + { + "epoch": 1.6469635627530366, + "grad_norm": 1.1672160458877656, + "learning_rate": 8.65888414322058e-06, + "loss": 1.3827, + "step": 2034 + }, + { + "epoch": 1.6477732793522266, + "grad_norm": 1.2157475649535694, + "learning_rate": 8.650366959101757e-06, + "loss": 1.3832, + "step": 2035 + }, + { + "epoch": 1.648582995951417, + "grad_norm": 1.1812552928529416, + "learning_rate": 8.641850772087392e-06, + "loss": 1.3611, + "step": 2036 + }, + { + "epoch": 1.6493927125506072, + "grad_norm": 1.2769305561908675, + "learning_rate": 8.633335588469215e-06, + "loss": 1.3163, + "step": 2037 + }, + { + "epoch": 1.6502024291497976, + "grad_norm": 1.1910880418512533, + "learning_rate": 8.62482141453821e-06, + "loss": 1.3636, + "step": 2038 + }, + { + "epoch": 1.6510121457489877, + "grad_norm": 1.2422148934633142, + "learning_rate": 8.616308256584636e-06, + "loss": 1.3399, + "step": 2039 + }, + { + "epoch": 1.6518218623481782, + "grad_norm": 1.2039779168904876, + "learning_rate": 8.607796120897978e-06, + "loss": 1.3927, + "step": 2040 + }, + { + "epoch": 1.6526315789473685, + "grad_norm": 1.2033223841568303, + "learning_rate": 8.599285013766969e-06, + "loss": 1.3786, + "step": 2041 + }, + { + "epoch": 1.6534412955465587, + "grad_norm": 1.2064528480180141, + "learning_rate": 8.590774941479594e-06, + "loss": 1.3453, + "step": 2042 + }, + { + "epoch": 1.654251012145749, + "grad_norm": 1.2144333343052642, + "learning_rate": 8.582265910323063e-06, + "loss": 1.3132, + "step": 2043 + }, + { + "epoch": 1.6550607287449393, + "grad_norm": 1.2356297509544136, + "learning_rate": 8.57375792658382e-06, + "loss": 1.384, + "step": 2044 + }, + { + "epoch": 1.6558704453441295, + "grad_norm": 1.2113533133382017, + "learning_rate": 8.565250996547538e-06, + "loss": 1.4371, + "step": 2045 + }, + { + "epoch": 1.6566801619433198, + "grad_norm": 1.19928327777792, + "learning_rate": 8.556745126499104e-06, + "loss": 1.3767, + "step": 2046 + }, + { + "epoch": 1.6574898785425103, + "grad_norm": 1.2044105767224726, + "learning_rate": 8.548240322722634e-06, + "loss": 1.3739, + "step": 2047 + }, + { + "epoch": 1.6582995951417003, + "grad_norm": 1.2310691515165295, + "learning_rate": 8.539736591501444e-06, + "loss": 1.4491, + "step": 2048 + }, + { + "epoch": 1.6591093117408908, + "grad_norm": 1.1954706292971082, + "learning_rate": 8.531233939118064e-06, + "loss": 1.3675, + "step": 2049 + }, + { + "epoch": 1.6599190283400809, + "grad_norm": 1.1942959686441643, + "learning_rate": 8.522732371854228e-06, + "loss": 1.5045, + "step": 2050 + }, + { + "epoch": 1.6607287449392714, + "grad_norm": 1.1935221793122746, + "learning_rate": 8.514231895990862e-06, + "loss": 1.3651, + "step": 2051 + }, + { + "epoch": 1.6615384615384614, + "grad_norm": 1.1971712548588236, + "learning_rate": 8.50573251780809e-06, + "loss": 1.3726, + "step": 2052 + }, + { + "epoch": 1.662348178137652, + "grad_norm": 1.189506111043996, + "learning_rate": 8.497234243585229e-06, + "loss": 1.4117, + "step": 2053 + }, + { + "epoch": 1.663157894736842, + "grad_norm": 1.1995587258339253, + "learning_rate": 8.488737079600767e-06, + "loss": 1.3298, + "step": 2054 + }, + { + "epoch": 1.6639676113360324, + "grad_norm": 1.1893431869171311, + "learning_rate": 8.480241032132394e-06, + "loss": 1.368, + "step": 2055 + }, + { + "epoch": 1.6647773279352227, + "grad_norm": 1.2451970649626896, + "learning_rate": 8.47174610745695e-06, + "loss": 1.3723, + "step": 2056 + }, + { + "epoch": 1.665587044534413, + "grad_norm": 1.19193248474202, + "learning_rate": 8.463252311850466e-06, + "loss": 1.3371, + "step": 2057 + }, + { + "epoch": 1.6663967611336032, + "grad_norm": 1.2609189929314095, + "learning_rate": 8.454759651588127e-06, + "loss": 1.4365, + "step": 2058 + }, + { + "epoch": 1.6672064777327935, + "grad_norm": 1.214748523646701, + "learning_rate": 8.446268132944279e-06, + "loss": 1.3793, + "step": 2059 + }, + { + "epoch": 1.6680161943319838, + "grad_norm": 1.2916028366953496, + "learning_rate": 8.437777762192434e-06, + "loss": 1.4646, + "step": 2060 + }, + { + "epoch": 1.668825910931174, + "grad_norm": 1.217799194319695, + "learning_rate": 8.429288545605248e-06, + "loss": 1.374, + "step": 2061 + }, + { + "epoch": 1.6696356275303643, + "grad_norm": 1.1681254207450982, + "learning_rate": 8.42080048945452e-06, + "loss": 1.3769, + "step": 2062 + }, + { + "epoch": 1.6704453441295546, + "grad_norm": 1.2364124010512334, + "learning_rate": 8.412313600011209e-06, + "loss": 1.3553, + "step": 2063 + }, + { + "epoch": 1.671255060728745, + "grad_norm": 1.237450931928774, + "learning_rate": 8.403827883545393e-06, + "loss": 1.4733, + "step": 2064 + }, + { + "epoch": 1.6720647773279351, + "grad_norm": 1.1760825158488026, + "learning_rate": 8.395343346326295e-06, + "loss": 1.3914, + "step": 2065 + }, + { + "epoch": 1.6728744939271256, + "grad_norm": 1.2383834233283117, + "learning_rate": 8.386859994622266e-06, + "loss": 1.3251, + "step": 2066 + }, + { + "epoch": 1.6736842105263157, + "grad_norm": 1.2192262734981085, + "learning_rate": 8.378377834700769e-06, + "loss": 1.3688, + "step": 2067 + }, + { + "epoch": 1.6744939271255062, + "grad_norm": 1.1753730133070337, + "learning_rate": 8.369896872828406e-06, + "loss": 1.376, + "step": 2068 + }, + { + "epoch": 1.6753036437246962, + "grad_norm": 1.1758350186139863, + "learning_rate": 8.361417115270878e-06, + "loss": 1.3457, + "step": 2069 + }, + { + "epoch": 1.6761133603238867, + "grad_norm": 1.2192998551029626, + "learning_rate": 8.352938568293e-06, + "loss": 1.3951, + "step": 2070 + }, + { + "epoch": 1.676923076923077, + "grad_norm": 1.2527451825200207, + "learning_rate": 8.3444612381587e-06, + "loss": 1.3647, + "step": 2071 + }, + { + "epoch": 1.6777327935222672, + "grad_norm": 1.2447953070318596, + "learning_rate": 8.335985131131002e-06, + "loss": 1.4072, + "step": 2072 + }, + { + "epoch": 1.6785425101214575, + "grad_norm": 1.2245743293643092, + "learning_rate": 8.327510253472023e-06, + "loss": 1.3841, + "step": 2073 + }, + { + "epoch": 1.6793522267206478, + "grad_norm": 1.210563123337093, + "learning_rate": 8.319036611442974e-06, + "loss": 1.3674, + "step": 2074 + }, + { + "epoch": 1.680161943319838, + "grad_norm": 1.215284281361853, + "learning_rate": 8.310564211304159e-06, + "loss": 1.3919, + "step": 2075 + }, + { + "epoch": 1.6809716599190283, + "grad_norm": 1.1606361526638542, + "learning_rate": 8.302093059314955e-06, + "loss": 1.4128, + "step": 2076 + }, + { + "epoch": 1.6817813765182186, + "grad_norm": 1.2280121763912009, + "learning_rate": 8.293623161733819e-06, + "loss": 1.3144, + "step": 2077 + }, + { + "epoch": 1.6825910931174088, + "grad_norm": 1.1672110474120607, + "learning_rate": 8.285154524818288e-06, + "loss": 1.3403, + "step": 2078 + }, + { + "epoch": 1.6834008097165993, + "grad_norm": 1.1959606638614337, + "learning_rate": 8.27668715482496e-06, + "loss": 1.4199, + "step": 2079 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 1.178527090126989, + "learning_rate": 8.268221058009506e-06, + "loss": 1.331, + "step": 2080 + }, + { + "epoch": 1.6850202429149799, + "grad_norm": 1.212039723837128, + "learning_rate": 8.259756240626646e-06, + "loss": 1.3688, + "step": 2081 + }, + { + "epoch": 1.68582995951417, + "grad_norm": 1.2117727419701412, + "learning_rate": 8.251292708930156e-06, + "loss": 1.4021, + "step": 2082 + }, + { + "epoch": 1.6866396761133604, + "grad_norm": 1.1328543998014728, + "learning_rate": 8.242830469172873e-06, + "loss": 1.2837, + "step": 2083 + }, + { + "epoch": 1.6874493927125505, + "grad_norm": 1.174797996844683, + "learning_rate": 8.234369527606667e-06, + "loss": 1.3522, + "step": 2084 + }, + { + "epoch": 1.688259109311741, + "grad_norm": 1.187195276450799, + "learning_rate": 8.225909890482456e-06, + "loss": 1.3685, + "step": 2085 + }, + { + "epoch": 1.6890688259109312, + "grad_norm": 1.2247680029203445, + "learning_rate": 8.217451564050185e-06, + "loss": 1.3567, + "step": 2086 + }, + { + "epoch": 1.6898785425101215, + "grad_norm": 1.1886609297985533, + "learning_rate": 8.20899455455885e-06, + "loss": 1.336, + "step": 2087 + }, + { + "epoch": 1.6906882591093118, + "grad_norm": 1.209073662488914, + "learning_rate": 8.200538868256455e-06, + "loss": 1.3241, + "step": 2088 + }, + { + "epoch": 1.691497975708502, + "grad_norm": 1.2521452085782394, + "learning_rate": 8.192084511390033e-06, + "loss": 1.3865, + "step": 2089 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 1.1899332902396975, + "learning_rate": 8.183631490205636e-06, + "loss": 1.3589, + "step": 2090 + }, + { + "epoch": 1.6931174089068826, + "grad_norm": 1.1695363453646483, + "learning_rate": 8.17517981094833e-06, + "loss": 1.3299, + "step": 2091 + }, + { + "epoch": 1.6939271255060728, + "grad_norm": 1.1873531255692886, + "learning_rate": 8.166729479862185e-06, + "loss": 1.41, + "step": 2092 + }, + { + "epoch": 1.694736842105263, + "grad_norm": 1.177164040688767, + "learning_rate": 8.15828050319028e-06, + "loss": 1.376, + "step": 2093 + }, + { + "epoch": 1.6955465587044536, + "grad_norm": 1.2131772752466448, + "learning_rate": 8.149832887174686e-06, + "loss": 1.3092, + "step": 2094 + }, + { + "epoch": 1.6963562753036436, + "grad_norm": 1.1863052171248314, + "learning_rate": 8.141386638056482e-06, + "loss": 1.3064, + "step": 2095 + }, + { + "epoch": 1.6971659919028341, + "grad_norm": 1.2087525026441543, + "learning_rate": 8.132941762075726e-06, + "loss": 1.3384, + "step": 2096 + }, + { + "epoch": 1.6979757085020242, + "grad_norm": 1.1909825383271502, + "learning_rate": 8.124498265471462e-06, + "loss": 1.3282, + "step": 2097 + }, + { + "epoch": 1.6987854251012147, + "grad_norm": 1.2073382417531069, + "learning_rate": 8.116056154481721e-06, + "loss": 1.4204, + "step": 2098 + }, + { + "epoch": 1.6995951417004047, + "grad_norm": 1.2336753723660032, + "learning_rate": 8.107615435343506e-06, + "loss": 1.4096, + "step": 2099 + }, + { + "epoch": 1.7004048582995952, + "grad_norm": 1.2244965473022886, + "learning_rate": 8.09917611429279e-06, + "loss": 1.3345, + "step": 2100 + }, + { + "epoch": 1.7012145748987855, + "grad_norm": 1.2365934665855882, + "learning_rate": 8.090738197564519e-06, + "loss": 1.3549, + "step": 2101 + }, + { + "epoch": 1.7020242914979757, + "grad_norm": 1.2403759805495438, + "learning_rate": 8.082301691392593e-06, + "loss": 1.4056, + "step": 2102 + }, + { + "epoch": 1.702834008097166, + "grad_norm": 1.23974335524669, + "learning_rate": 8.073866602009883e-06, + "loss": 1.314, + "step": 2103 + }, + { + "epoch": 1.7036437246963563, + "grad_norm": 1.2444897063130003, + "learning_rate": 8.0654329356482e-06, + "loss": 1.3807, + "step": 2104 + }, + { + "epoch": 1.7044534412955465, + "grad_norm": 1.1949535882055378, + "learning_rate": 8.057000698538311e-06, + "loss": 1.3949, + "step": 2105 + }, + { + "epoch": 1.7052631578947368, + "grad_norm": 1.2104242593278358, + "learning_rate": 8.048569896909925e-06, + "loss": 1.2957, + "step": 2106 + }, + { + "epoch": 1.706072874493927, + "grad_norm": 1.242882880007763, + "learning_rate": 8.040140536991688e-06, + "loss": 1.3838, + "step": 2107 + }, + { + "epoch": 1.7068825910931174, + "grad_norm": 1.2414139288923256, + "learning_rate": 8.031712625011186e-06, + "loss": 1.3637, + "step": 2108 + }, + { + "epoch": 1.7076923076923078, + "grad_norm": 1.1934131947311941, + "learning_rate": 8.023286167194934e-06, + "loss": 1.345, + "step": 2109 + }, + { + "epoch": 1.708502024291498, + "grad_norm": 1.2339934690903747, + "learning_rate": 8.014861169768362e-06, + "loss": 1.43, + "step": 2110 + }, + { + "epoch": 1.7093117408906884, + "grad_norm": 1.172849536919364, + "learning_rate": 8.006437638955846e-06, + "loss": 1.4032, + "step": 2111 + }, + { + "epoch": 1.7101214574898784, + "grad_norm": 1.1971353964773175, + "learning_rate": 7.99801558098065e-06, + "loss": 1.3609, + "step": 2112 + }, + { + "epoch": 1.710931174089069, + "grad_norm": 1.210793522412267, + "learning_rate": 7.98959500206497e-06, + "loss": 1.3799, + "step": 2113 + }, + { + "epoch": 1.711740890688259, + "grad_norm": 1.1992770532754535, + "learning_rate": 7.9811759084299e-06, + "loss": 1.3861, + "step": 2114 + }, + { + "epoch": 1.7125506072874495, + "grad_norm": 1.250496757926102, + "learning_rate": 7.972758306295436e-06, + "loss": 1.3585, + "step": 2115 + }, + { + "epoch": 1.7133603238866397, + "grad_norm": 1.2143039563042084, + "learning_rate": 7.964342201880478e-06, + "loss": 1.3448, + "step": 2116 + }, + { + "epoch": 1.71417004048583, + "grad_norm": 1.2503268692965592, + "learning_rate": 7.955927601402817e-06, + "loss": 1.342, + "step": 2117 + }, + { + "epoch": 1.7149797570850203, + "grad_norm": 1.272542261752361, + "learning_rate": 7.947514511079126e-06, + "loss": 1.4221, + "step": 2118 + }, + { + "epoch": 1.7157894736842105, + "grad_norm": 1.2723130848911237, + "learning_rate": 7.939102937124975e-06, + "loss": 1.4166, + "step": 2119 + }, + { + "epoch": 1.7165991902834008, + "grad_norm": 1.2332492013002891, + "learning_rate": 7.930692885754806e-06, + "loss": 1.3435, + "step": 2120 + }, + { + "epoch": 1.717408906882591, + "grad_norm": 1.2106787991881476, + "learning_rate": 7.922284363181937e-06, + "loss": 1.3128, + "step": 2121 + }, + { + "epoch": 1.7182186234817813, + "grad_norm": 1.2133263111046222, + "learning_rate": 7.913877375618555e-06, + "loss": 1.3606, + "step": 2122 + }, + { + "epoch": 1.7190283400809716, + "grad_norm": 1.2393512093973866, + "learning_rate": 7.90547192927572e-06, + "loss": 1.3813, + "step": 2123 + }, + { + "epoch": 1.719838056680162, + "grad_norm": 1.18596615707014, + "learning_rate": 7.897068030363341e-06, + "loss": 1.3718, + "step": 2124 + }, + { + "epoch": 1.7206477732793521, + "grad_norm": 1.1774499718505693, + "learning_rate": 7.888665685090194e-06, + "loss": 1.3439, + "step": 2125 + }, + { + "epoch": 1.7214574898785426, + "grad_norm": 1.196454598660986, + "learning_rate": 7.880264899663901e-06, + "loss": 1.3454, + "step": 2126 + }, + { + "epoch": 1.7222672064777327, + "grad_norm": 1.247233467944127, + "learning_rate": 7.871865680290943e-06, + "loss": 1.3977, + "step": 2127 + }, + { + "epoch": 1.7230769230769232, + "grad_norm": 1.189652568647767, + "learning_rate": 7.863468033176632e-06, + "loss": 1.3581, + "step": 2128 + }, + { + "epoch": 1.7238866396761132, + "grad_norm": 1.1938170625000661, + "learning_rate": 7.855071964525115e-06, + "loss": 1.3333, + "step": 2129 + }, + { + "epoch": 1.7246963562753037, + "grad_norm": 1.1673683171310625, + "learning_rate": 7.846677480539392e-06, + "loss": 1.3601, + "step": 2130 + }, + { + "epoch": 1.725506072874494, + "grad_norm": 1.1775636728244776, + "learning_rate": 7.838284587421273e-06, + "loss": 1.3889, + "step": 2131 + }, + { + "epoch": 1.7263157894736842, + "grad_norm": 1.2347561246730585, + "learning_rate": 7.829893291371399e-06, + "loss": 1.3683, + "step": 2132 + }, + { + "epoch": 1.7271255060728745, + "grad_norm": 1.1960868396656386, + "learning_rate": 7.821503598589234e-06, + "loss": 1.364, + "step": 2133 + }, + { + "epoch": 1.7279352226720648, + "grad_norm": 1.1576414391616703, + "learning_rate": 7.813115515273052e-06, + "loss": 1.3586, + "step": 2134 + }, + { + "epoch": 1.728744939271255, + "grad_norm": 1.2728426715326342, + "learning_rate": 7.80472904761995e-06, + "loss": 1.3417, + "step": 2135 + }, + { + "epoch": 1.7295546558704453, + "grad_norm": 1.196854159805804, + "learning_rate": 7.796344201825816e-06, + "loss": 1.3683, + "step": 2136 + }, + { + "epoch": 1.7303643724696356, + "grad_norm": 1.2227814698116768, + "learning_rate": 7.787960984085346e-06, + "loss": 1.3132, + "step": 2137 + }, + { + "epoch": 1.7311740890688259, + "grad_norm": 1.1904404210188404, + "learning_rate": 7.779579400592039e-06, + "loss": 1.2808, + "step": 2138 + }, + { + "epoch": 1.7319838056680164, + "grad_norm": 1.2071293931299591, + "learning_rate": 7.771199457538177e-06, + "loss": 1.3415, + "step": 2139 + }, + { + "epoch": 1.7327935222672064, + "grad_norm": 1.2589835991554157, + "learning_rate": 7.762821161114834e-06, + "loss": 1.3671, + "step": 2140 + }, + { + "epoch": 1.733603238866397, + "grad_norm": 1.2502098184841257, + "learning_rate": 7.754444517511869e-06, + "loss": 1.3771, + "step": 2141 + }, + { + "epoch": 1.734412955465587, + "grad_norm": 1.194748364690635, + "learning_rate": 7.746069532917918e-06, + "loss": 1.3148, + "step": 2142 + }, + { + "epoch": 1.7352226720647774, + "grad_norm": 1.2615364812188445, + "learning_rate": 7.737696213520397e-06, + "loss": 1.3587, + "step": 2143 + }, + { + "epoch": 1.7360323886639675, + "grad_norm": 1.1791684259343356, + "learning_rate": 7.72932456550548e-06, + "loss": 1.335, + "step": 2144 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.2804937427129839, + "learning_rate": 7.720954595058118e-06, + "loss": 1.3563, + "step": 2145 + }, + { + "epoch": 1.737651821862348, + "grad_norm": 1.2678895788470363, + "learning_rate": 7.712586308362017e-06, + "loss": 1.3297, + "step": 2146 + }, + { + "epoch": 1.7384615384615385, + "grad_norm": 1.1864809688777394, + "learning_rate": 7.704219711599637e-06, + "loss": 1.3111, + "step": 2147 + }, + { + "epoch": 1.7392712550607288, + "grad_norm": 1.1913833583807754, + "learning_rate": 7.695854810952194e-06, + "loss": 1.3823, + "step": 2148 + }, + { + "epoch": 1.740080971659919, + "grad_norm": 1.2454501834374865, + "learning_rate": 7.687491612599651e-06, + "loss": 1.3384, + "step": 2149 + }, + { + "epoch": 1.7408906882591093, + "grad_norm": 1.2311094048424676, + "learning_rate": 7.679130122720704e-06, + "loss": 1.4009, + "step": 2150 + }, + { + "epoch": 1.7417004048582996, + "grad_norm": 1.2146362073312729, + "learning_rate": 7.670770347492804e-06, + "loss": 1.3641, + "step": 2151 + }, + { + "epoch": 1.7425101214574898, + "grad_norm": 1.214516287695011, + "learning_rate": 7.662412293092118e-06, + "loss": 1.3468, + "step": 2152 + }, + { + "epoch": 1.7433198380566801, + "grad_norm": 1.1764628400623898, + "learning_rate": 7.654055965693556e-06, + "loss": 1.3273, + "step": 2153 + }, + { + "epoch": 1.7441295546558706, + "grad_norm": 1.1411724141436934, + "learning_rate": 7.64570137147074e-06, + "loss": 1.3301, + "step": 2154 + }, + { + "epoch": 1.7449392712550607, + "grad_norm": 1.2122093274665457, + "learning_rate": 7.637348516596016e-06, + "loss": 1.3525, + "step": 2155 + }, + { + "epoch": 1.7457489878542511, + "grad_norm": 1.2399842029700836, + "learning_rate": 7.628997407240453e-06, + "loss": 1.3807, + "step": 2156 + }, + { + "epoch": 1.7465587044534412, + "grad_norm": 1.256148734557012, + "learning_rate": 7.620648049573815e-06, + "loss": 1.3519, + "step": 2157 + }, + { + "epoch": 1.7473684210526317, + "grad_norm": 1.2286990308441597, + "learning_rate": 7.61230044976458e-06, + "loss": 1.3506, + "step": 2158 + }, + { + "epoch": 1.7481781376518217, + "grad_norm": 1.2260430970223977, + "learning_rate": 7.603954613979933e-06, + "loss": 1.2832, + "step": 2159 + }, + { + "epoch": 1.7489878542510122, + "grad_norm": 1.1988962090230735, + "learning_rate": 7.59561054838575e-06, + "loss": 1.307, + "step": 2160 + }, + { + "epoch": 1.7497975708502023, + "grad_norm": 1.1876276282662488, + "learning_rate": 7.587268259146596e-06, + "loss": 1.3742, + "step": 2161 + }, + { + "epoch": 1.7506072874493928, + "grad_norm": 1.327874554832877, + "learning_rate": 7.578927752425727e-06, + "loss": 1.3885, + "step": 2162 + }, + { + "epoch": 1.751417004048583, + "grad_norm": 1.2617574575652093, + "learning_rate": 7.570589034385083e-06, + "loss": 1.4301, + "step": 2163 + }, + { + "epoch": 1.7522267206477733, + "grad_norm": 1.1822211358693704, + "learning_rate": 7.562252111185282e-06, + "loss": 1.3268, + "step": 2164 + }, + { + "epoch": 1.7530364372469636, + "grad_norm": 1.2264998801603384, + "learning_rate": 7.5539169889856135e-06, + "loss": 1.3632, + "step": 2165 + }, + { + "epoch": 1.7538461538461538, + "grad_norm": 1.2308240591640838, + "learning_rate": 7.545583673944038e-06, + "loss": 1.3703, + "step": 2166 + }, + { + "epoch": 1.754655870445344, + "grad_norm": 1.2034026004405005, + "learning_rate": 7.537252172217185e-06, + "loss": 1.4068, + "step": 2167 + }, + { + "epoch": 1.7554655870445344, + "grad_norm": 1.2716169869380658, + "learning_rate": 7.528922489960339e-06, + "loss": 1.3546, + "step": 2168 + }, + { + "epoch": 1.7562753036437249, + "grad_norm": 1.1963463371260916, + "learning_rate": 7.52059463332744e-06, + "loss": 1.3827, + "step": 2169 + }, + { + "epoch": 1.757085020242915, + "grad_norm": 1.2214699387119217, + "learning_rate": 7.512268608471083e-06, + "loss": 1.3597, + "step": 2170 + }, + { + "epoch": 1.7578947368421054, + "grad_norm": 1.1980661545941786, + "learning_rate": 7.503944421542508e-06, + "loss": 1.4244, + "step": 2171 + }, + { + "epoch": 1.7587044534412954, + "grad_norm": 1.2346433817261153, + "learning_rate": 7.495622078691597e-06, + "loss": 1.3887, + "step": 2172 + }, + { + "epoch": 1.759514170040486, + "grad_norm": 1.2316791370226217, + "learning_rate": 7.487301586066866e-06, + "loss": 1.4128, + "step": 2173 + }, + { + "epoch": 1.760323886639676, + "grad_norm": 1.247303111408137, + "learning_rate": 7.47898294981547e-06, + "loss": 1.3464, + "step": 2174 + }, + { + "epoch": 1.7611336032388665, + "grad_norm": 1.1858468665595168, + "learning_rate": 7.470666176083193e-06, + "loss": 1.3297, + "step": 2175 + }, + { + "epoch": 1.7619433198380565, + "grad_norm": 1.17298641444367, + "learning_rate": 7.462351271014438e-06, + "loss": 1.3335, + "step": 2176 + }, + { + "epoch": 1.762753036437247, + "grad_norm": 1.2071311078649347, + "learning_rate": 7.454038240752228e-06, + "loss": 1.3957, + "step": 2177 + }, + { + "epoch": 1.7635627530364373, + "grad_norm": 1.2383361119848715, + "learning_rate": 7.4457270914382056e-06, + "loss": 1.3592, + "step": 2178 + }, + { + "epoch": 1.7643724696356275, + "grad_norm": 1.2321882683113468, + "learning_rate": 7.437417829212618e-06, + "loss": 1.3527, + "step": 2179 + }, + { + "epoch": 1.7651821862348178, + "grad_norm": 1.1470511569497224, + "learning_rate": 7.42911046021432e-06, + "loss": 1.3536, + "step": 2180 + }, + { + "epoch": 1.765991902834008, + "grad_norm": 1.1704186219839523, + "learning_rate": 7.420804990580772e-06, + "loss": 1.3738, + "step": 2181 + }, + { + "epoch": 1.7668016194331984, + "grad_norm": 1.155250015703002, + "learning_rate": 7.4125014264480225e-06, + "loss": 1.3488, + "step": 2182 + }, + { + "epoch": 1.7676113360323886, + "grad_norm": 1.2361131817316744, + "learning_rate": 7.404199773950724e-06, + "loss": 1.3283, + "step": 2183 + }, + { + "epoch": 1.768421052631579, + "grad_norm": 1.1572149457442449, + "learning_rate": 7.395900039222108e-06, + "loss": 1.4089, + "step": 2184 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 1.2024459906835356, + "learning_rate": 7.387602228393987e-06, + "loss": 1.3428, + "step": 2185 + }, + { + "epoch": 1.7700404858299597, + "grad_norm": 1.2444788254439085, + "learning_rate": 7.379306347596762e-06, + "loss": 1.3889, + "step": 2186 + }, + { + "epoch": 1.7708502024291497, + "grad_norm": 1.2199415758978813, + "learning_rate": 7.371012402959399e-06, + "loss": 1.3438, + "step": 2187 + }, + { + "epoch": 1.7716599190283402, + "grad_norm": 1.147217425822318, + "learning_rate": 7.362720400609437e-06, + "loss": 1.4056, + "step": 2188 + }, + { + "epoch": 1.7724696356275302, + "grad_norm": 1.2527673146015181, + "learning_rate": 7.354430346672983e-06, + "loss": 1.3813, + "step": 2189 + }, + { + "epoch": 1.7732793522267207, + "grad_norm": 1.261309500485428, + "learning_rate": 7.346142247274695e-06, + "loss": 1.4284, + "step": 2190 + }, + { + "epoch": 1.7740890688259108, + "grad_norm": 1.2320074330261714, + "learning_rate": 7.337856108537802e-06, + "loss": 1.4159, + "step": 2191 + }, + { + "epoch": 1.7748987854251013, + "grad_norm": 1.247234002362416, + "learning_rate": 7.329571936584072e-06, + "loss": 1.4277, + "step": 2192 + }, + { + "epoch": 1.7757085020242915, + "grad_norm": 1.2396500425068102, + "learning_rate": 7.321289737533826e-06, + "loss": 1.2784, + "step": 2193 + }, + { + "epoch": 1.7765182186234818, + "grad_norm": 1.221512450585911, + "learning_rate": 7.313009517505923e-06, + "loss": 1.4277, + "step": 2194 + }, + { + "epoch": 1.777327935222672, + "grad_norm": 1.2207042254565934, + "learning_rate": 7.304731282617762e-06, + "loss": 1.3541, + "step": 2195 + }, + { + "epoch": 1.7781376518218623, + "grad_norm": 1.179448361791702, + "learning_rate": 7.29645503898528e-06, + "loss": 1.3417, + "step": 2196 + }, + { + "epoch": 1.7789473684210526, + "grad_norm": 1.210105556711578, + "learning_rate": 7.288180792722934e-06, + "loss": 1.3518, + "step": 2197 + }, + { + "epoch": 1.7797570850202429, + "grad_norm": 1.2517556094115736, + "learning_rate": 7.279908549943708e-06, + "loss": 1.3949, + "step": 2198 + }, + { + "epoch": 1.7805668016194331, + "grad_norm": 1.213422808224917, + "learning_rate": 7.271638316759116e-06, + "loss": 1.2904, + "step": 2199 + }, + { + "epoch": 1.7813765182186234, + "grad_norm": 1.21215761739376, + "learning_rate": 7.263370099279173e-06, + "loss": 1.282, + "step": 2200 + }, + { + "epoch": 1.782186234817814, + "grad_norm": 1.2357528688986366, + "learning_rate": 7.255103903612413e-06, + "loss": 1.3415, + "step": 2201 + }, + { + "epoch": 1.782995951417004, + "grad_norm": 1.2102242154201936, + "learning_rate": 7.246839735865874e-06, + "loss": 1.3856, + "step": 2202 + }, + { + "epoch": 1.7838056680161944, + "grad_norm": 1.2066789803427276, + "learning_rate": 7.238577602145094e-06, + "loss": 1.3408, + "step": 2203 + }, + { + "epoch": 1.7846153846153845, + "grad_norm": 1.2002015685758425, + "learning_rate": 7.230317508554113e-06, + "loss": 1.3005, + "step": 2204 + }, + { + "epoch": 1.785425101214575, + "grad_norm": 1.2371780286395402, + "learning_rate": 7.2220594611954606e-06, + "loss": 1.3464, + "step": 2205 + }, + { + "epoch": 1.786234817813765, + "grad_norm": 1.2122162279977085, + "learning_rate": 7.21380346617015e-06, + "loss": 1.4097, + "step": 2206 + }, + { + "epoch": 1.7870445344129555, + "grad_norm": 1.1890616495957214, + "learning_rate": 7.20554952957769e-06, + "loss": 1.3381, + "step": 2207 + }, + { + "epoch": 1.7878542510121458, + "grad_norm": 1.2240916448009678, + "learning_rate": 7.197297657516062e-06, + "loss": 1.2961, + "step": 2208 + }, + { + "epoch": 1.788663967611336, + "grad_norm": 1.2291351852794627, + "learning_rate": 7.189047856081719e-06, + "loss": 1.3057, + "step": 2209 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 1.2303308564438198, + "learning_rate": 7.1808001313695855e-06, + "loss": 1.434, + "step": 2210 + }, + { + "epoch": 1.7902834008097166, + "grad_norm": 1.2068008557972785, + "learning_rate": 7.172554489473057e-06, + "loss": 1.371, + "step": 2211 + }, + { + "epoch": 1.7910931174089069, + "grad_norm": 1.236399538012193, + "learning_rate": 7.164310936483986e-06, + "loss": 1.3062, + "step": 2212 + }, + { + "epoch": 1.7919028340080971, + "grad_norm": 1.1998903560290575, + "learning_rate": 7.156069478492679e-06, + "loss": 1.3007, + "step": 2213 + }, + { + "epoch": 1.7927125506072874, + "grad_norm": 1.2525924525988954, + "learning_rate": 7.1478301215878975e-06, + "loss": 1.4231, + "step": 2214 + }, + { + "epoch": 1.7935222672064777, + "grad_norm": 1.2758950545332266, + "learning_rate": 7.1395928718568605e-06, + "loss": 1.359, + "step": 2215 + }, + { + "epoch": 1.7943319838056682, + "grad_norm": 1.2001272107207346, + "learning_rate": 7.131357735385213e-06, + "loss": 1.3625, + "step": 2216 + }, + { + "epoch": 1.7951417004048582, + "grad_norm": 1.2200265505607486, + "learning_rate": 7.123124718257052e-06, + "loss": 1.3997, + "step": 2217 + }, + { + "epoch": 1.7959514170040487, + "grad_norm": 1.2159603840435709, + "learning_rate": 7.114893826554896e-06, + "loss": 1.3696, + "step": 2218 + }, + { + "epoch": 1.7967611336032387, + "grad_norm": 1.2133557545824332, + "learning_rate": 7.106665066359708e-06, + "loss": 1.4013, + "step": 2219 + }, + { + "epoch": 1.7975708502024292, + "grad_norm": 1.2428462413099133, + "learning_rate": 7.098438443750865e-06, + "loss": 1.4124, + "step": 2220 + }, + { + "epoch": 1.7983805668016193, + "grad_norm": 1.1962533108457247, + "learning_rate": 7.0902139648061676e-06, + "loss": 1.3797, + "step": 2221 + }, + { + "epoch": 1.7991902834008098, + "grad_norm": 1.1949448871594797, + "learning_rate": 7.081991635601835e-06, + "loss": 1.3777, + "step": 2222 + }, + { + "epoch": 1.8, + "grad_norm": 1.1739839756203938, + "learning_rate": 7.073771462212502e-06, + "loss": 1.3431, + "step": 2223 + }, + { + "epoch": 1.8008097165991903, + "grad_norm": 1.2246364614759249, + "learning_rate": 7.065553450711202e-06, + "loss": 1.3858, + "step": 2224 + }, + { + "epoch": 1.8016194331983806, + "grad_norm": 1.2541194418394814, + "learning_rate": 7.057337607169373e-06, + "loss": 1.4208, + "step": 2225 + }, + { + "epoch": 1.8024291497975709, + "grad_norm": 1.1625873389342016, + "learning_rate": 7.049123937656855e-06, + "loss": 1.2865, + "step": 2226 + }, + { + "epoch": 1.8032388663967611, + "grad_norm": 1.1789523515967617, + "learning_rate": 7.040912448241881e-06, + "loss": 1.3901, + "step": 2227 + }, + { + "epoch": 1.8040485829959514, + "grad_norm": 1.1706741638509202, + "learning_rate": 7.032703144991071e-06, + "loss": 1.3657, + "step": 2228 + }, + { + "epoch": 1.8048582995951417, + "grad_norm": 1.1921078298881833, + "learning_rate": 7.024496033969432e-06, + "loss": 1.3611, + "step": 2229 + }, + { + "epoch": 1.805668016194332, + "grad_norm": 1.17097174113593, + "learning_rate": 7.016291121240346e-06, + "loss": 1.3633, + "step": 2230 + }, + { + "epoch": 1.8064777327935224, + "grad_norm": 1.1852035894468138, + "learning_rate": 7.0080884128655844e-06, + "loss": 1.3807, + "step": 2231 + }, + { + "epoch": 1.8072874493927125, + "grad_norm": 1.2725267927716697, + "learning_rate": 6.999887914905275e-06, + "loss": 1.4361, + "step": 2232 + }, + { + "epoch": 1.808097165991903, + "grad_norm": 1.220459422861511, + "learning_rate": 6.991689633417922e-06, + "loss": 1.3571, + "step": 2233 + }, + { + "epoch": 1.808906882591093, + "grad_norm": 1.1788740570003207, + "learning_rate": 6.983493574460387e-06, + "loss": 1.3274, + "step": 2234 + }, + { + "epoch": 1.8097165991902835, + "grad_norm": 1.3416750880261363, + "learning_rate": 6.975299744087891e-06, + "loss": 1.3581, + "step": 2235 + }, + { + "epoch": 1.8105263157894735, + "grad_norm": 1.225592957552821, + "learning_rate": 6.967108148354012e-06, + "loss": 1.4076, + "step": 2236 + }, + { + "epoch": 1.811336032388664, + "grad_norm": 1.2485859196199363, + "learning_rate": 6.958918793310669e-06, + "loss": 1.434, + "step": 2237 + }, + { + "epoch": 1.8121457489878543, + "grad_norm": 1.179048945702912, + "learning_rate": 6.950731685008132e-06, + "loss": 1.3373, + "step": 2238 + }, + { + "epoch": 1.8129554655870446, + "grad_norm": 1.208305356954193, + "learning_rate": 6.942546829495014e-06, + "loss": 1.3677, + "step": 2239 + }, + { + "epoch": 1.8137651821862348, + "grad_norm": 1.179294023466567, + "learning_rate": 6.934364232818254e-06, + "loss": 1.3397, + "step": 2240 + }, + { + "epoch": 1.814574898785425, + "grad_norm": 1.238375865893255, + "learning_rate": 6.926183901023134e-06, + "loss": 1.3259, + "step": 2241 + }, + { + "epoch": 1.8153846153846154, + "grad_norm": 1.2004335795147096, + "learning_rate": 6.91800584015325e-06, + "loss": 1.3741, + "step": 2242 + }, + { + "epoch": 1.8161943319838056, + "grad_norm": 1.2101607717425258, + "learning_rate": 6.909830056250527e-06, + "loss": 1.3455, + "step": 2243 + }, + { + "epoch": 1.817004048582996, + "grad_norm": 1.203030571859603, + "learning_rate": 6.901656555355212e-06, + "loss": 1.3705, + "step": 2244 + }, + { + "epoch": 1.8178137651821862, + "grad_norm": 1.276448418257517, + "learning_rate": 6.8934853435058566e-06, + "loss": 1.3609, + "step": 2245 + }, + { + "epoch": 1.8186234817813767, + "grad_norm": 1.1499994230561823, + "learning_rate": 6.8853164267393234e-06, + "loss": 1.3902, + "step": 2246 + }, + { + "epoch": 1.8194331983805667, + "grad_norm": 1.1943909447731087, + "learning_rate": 6.877149811090785e-06, + "loss": 1.2936, + "step": 2247 + }, + { + "epoch": 1.8202429149797572, + "grad_norm": 1.2276088320117058, + "learning_rate": 6.8689855025937124e-06, + "loss": 1.3459, + "step": 2248 + }, + { + "epoch": 1.8210526315789473, + "grad_norm": 1.2131982314479368, + "learning_rate": 6.860823507279868e-06, + "loss": 1.3071, + "step": 2249 + }, + { + "epoch": 1.8218623481781377, + "grad_norm": 1.2662064967702196, + "learning_rate": 6.852663831179303e-06, + "loss": 1.3959, + "step": 2250 + }, + { + "epoch": 1.8226720647773278, + "grad_norm": 1.28182916895796, + "learning_rate": 6.844506480320363e-06, + "loss": 1.3637, + "step": 2251 + }, + { + "epoch": 1.8234817813765183, + "grad_norm": 1.1875870250303597, + "learning_rate": 6.836351460729673e-06, + "loss": 1.354, + "step": 2252 + }, + { + "epoch": 1.8242914979757086, + "grad_norm": 1.2135144065873424, + "learning_rate": 6.828198778432131e-06, + "loss": 1.3731, + "step": 2253 + }, + { + "epoch": 1.8251012145748988, + "grad_norm": 1.190208517610891, + "learning_rate": 6.820048439450913e-06, + "loss": 1.3976, + "step": 2254 + }, + { + "epoch": 1.825910931174089, + "grad_norm": 1.1613715093942938, + "learning_rate": 6.811900449807465e-06, + "loss": 1.3521, + "step": 2255 + }, + { + "epoch": 1.8267206477732794, + "grad_norm": 1.1600854701348962, + "learning_rate": 6.803754815521495e-06, + "loss": 1.3905, + "step": 2256 + }, + { + "epoch": 1.8275303643724696, + "grad_norm": 1.1683331040020393, + "learning_rate": 6.7956115426109695e-06, + "loss": 1.3801, + "step": 2257 + }, + { + "epoch": 1.82834008097166, + "grad_norm": 1.1228286175250644, + "learning_rate": 6.78747063709211e-06, + "loss": 1.3634, + "step": 2258 + }, + { + "epoch": 1.8291497975708502, + "grad_norm": 1.1651839094767948, + "learning_rate": 6.779332104979394e-06, + "loss": 1.3268, + "step": 2259 + }, + { + "epoch": 1.8299595141700404, + "grad_norm": 1.1999740716361973, + "learning_rate": 6.771195952285541e-06, + "loss": 1.3391, + "step": 2260 + }, + { + "epoch": 1.830769230769231, + "grad_norm": 1.1698993424325916, + "learning_rate": 6.763062185021511e-06, + "loss": 1.3434, + "step": 2261 + }, + { + "epoch": 1.831578947368421, + "grad_norm": 1.197859926690675, + "learning_rate": 6.754930809196507e-06, + "loss": 1.3116, + "step": 2262 + }, + { + "epoch": 1.8323886639676115, + "grad_norm": 1.2075936874086974, + "learning_rate": 6.746801830817966e-06, + "loss": 1.3516, + "step": 2263 + }, + { + "epoch": 1.8331983805668015, + "grad_norm": 1.2280800864883632, + "learning_rate": 6.738675255891548e-06, + "loss": 1.3995, + "step": 2264 + }, + { + "epoch": 1.834008097165992, + "grad_norm": 1.186966643343012, + "learning_rate": 6.730551090421137e-06, + "loss": 1.3301, + "step": 2265 + }, + { + "epoch": 1.834817813765182, + "grad_norm": 1.1752482043139705, + "learning_rate": 6.7224293404088445e-06, + "loss": 1.288, + "step": 2266 + }, + { + "epoch": 1.8356275303643725, + "grad_norm": 1.1737943908874262, + "learning_rate": 6.714310011854989e-06, + "loss": 1.379, + "step": 2267 + }, + { + "epoch": 1.8364372469635628, + "grad_norm": 1.1881877312022122, + "learning_rate": 6.7061931107581055e-06, + "loss": 1.3928, + "step": 2268 + }, + { + "epoch": 1.837246963562753, + "grad_norm": 1.2419072015459927, + "learning_rate": 6.698078643114935e-06, + "loss": 1.3845, + "step": 2269 + }, + { + "epoch": 1.8380566801619433, + "grad_norm": 1.1961721466530393, + "learning_rate": 6.689966614920414e-06, + "loss": 1.3745, + "step": 2270 + }, + { + "epoch": 1.8388663967611336, + "grad_norm": 1.151446689287607, + "learning_rate": 6.681857032167689e-06, + "loss": 1.3118, + "step": 2271 + }, + { + "epoch": 1.8396761133603239, + "grad_norm": 1.196438525357998, + "learning_rate": 6.673749900848092e-06, + "loss": 1.3183, + "step": 2272 + }, + { + "epoch": 1.8404858299595142, + "grad_norm": 1.2018986018410842, + "learning_rate": 6.665645226951141e-06, + "loss": 1.4007, + "step": 2273 + }, + { + "epoch": 1.8412955465587044, + "grad_norm": 1.175635606943612, + "learning_rate": 6.657543016464546e-06, + "loss": 1.39, + "step": 2274 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 1.2177369319411393, + "learning_rate": 6.6494432753741935e-06, + "loss": 1.3989, + "step": 2275 + }, + { + "epoch": 1.8429149797570852, + "grad_norm": 1.15435997929181, + "learning_rate": 6.641346009664142e-06, + "loss": 1.3644, + "step": 2276 + }, + { + "epoch": 1.8437246963562752, + "grad_norm": 1.1812983300442905, + "learning_rate": 6.63325122531663e-06, + "loss": 1.3, + "step": 2277 + }, + { + "epoch": 1.8445344129554657, + "grad_norm": 1.1786903009596423, + "learning_rate": 6.62515892831205e-06, + "loss": 1.3513, + "step": 2278 + }, + { + "epoch": 1.8453441295546558, + "grad_norm": 1.1533542073236585, + "learning_rate": 6.6170691246289744e-06, + "loss": 1.3916, + "step": 2279 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 1.122771919065958, + "learning_rate": 6.608981820244116e-06, + "loss": 1.2917, + "step": 2280 + }, + { + "epoch": 1.8469635627530363, + "grad_norm": 1.1565154319571267, + "learning_rate": 6.600897021132353e-06, + "loss": 1.3448, + "step": 2281 + }, + { + "epoch": 1.8477732793522268, + "grad_norm": 1.1994419287429507, + "learning_rate": 6.592814733266708e-06, + "loss": 1.3973, + "step": 2282 + }, + { + "epoch": 1.8485829959514168, + "grad_norm": 1.2145511142569878, + "learning_rate": 6.5847349626183444e-06, + "loss": 1.2976, + "step": 2283 + }, + { + "epoch": 1.8493927125506073, + "grad_norm": 1.2470922439448142, + "learning_rate": 6.576657715156576e-06, + "loss": 1.4211, + "step": 2284 + }, + { + "epoch": 1.8502024291497976, + "grad_norm": 1.1849987198146783, + "learning_rate": 6.568582996848844e-06, + "loss": 1.4217, + "step": 2285 + }, + { + "epoch": 1.8510121457489879, + "grad_norm": 1.2246339182732113, + "learning_rate": 6.560510813660719e-06, + "loss": 1.347, + "step": 2286 + }, + { + "epoch": 1.8518218623481781, + "grad_norm": 1.2511212401796088, + "learning_rate": 6.5524411715559125e-06, + "loss": 1.4248, + "step": 2287 + }, + { + "epoch": 1.8526315789473684, + "grad_norm": 1.2304999920366795, + "learning_rate": 6.544374076496243e-06, + "loss": 1.3445, + "step": 2288 + }, + { + "epoch": 1.8534412955465587, + "grad_norm": 1.2349915243981289, + "learning_rate": 6.536309534441658e-06, + "loss": 1.3398, + "step": 2289 + }, + { + "epoch": 1.854251012145749, + "grad_norm": 1.1884282705027234, + "learning_rate": 6.528247551350213e-06, + "loss": 1.2971, + "step": 2290 + }, + { + "epoch": 1.8550607287449394, + "grad_norm": 1.181801214740644, + "learning_rate": 6.5201881331780725e-06, + "loss": 1.3792, + "step": 2291 + }, + { + "epoch": 1.8558704453441295, + "grad_norm": 1.2135112397667827, + "learning_rate": 6.512131285879513e-06, + "loss": 1.3898, + "step": 2292 + }, + { + "epoch": 1.85668016194332, + "grad_norm": 1.2180427276548653, + "learning_rate": 6.504077015406902e-06, + "loss": 1.3809, + "step": 2293 + }, + { + "epoch": 1.85748987854251, + "grad_norm": 1.1561446887739786, + "learning_rate": 6.496025327710707e-06, + "loss": 1.3113, + "step": 2294 + }, + { + "epoch": 1.8582995951417005, + "grad_norm": 1.2247735536559143, + "learning_rate": 6.487976228739493e-06, + "loss": 1.3531, + "step": 2295 + }, + { + "epoch": 1.8591093117408906, + "grad_norm": 1.2152586819122253, + "learning_rate": 6.4799297244399085e-06, + "loss": 1.3964, + "step": 2296 + }, + { + "epoch": 1.859919028340081, + "grad_norm": 1.2454451424496096, + "learning_rate": 6.471885820756683e-06, + "loss": 1.3904, + "step": 2297 + }, + { + "epoch": 1.860728744939271, + "grad_norm": 1.1655322922036064, + "learning_rate": 6.463844523632622e-06, + "loss": 1.3364, + "step": 2298 + }, + { + "epoch": 1.8615384615384616, + "grad_norm": 1.2996377865771735, + "learning_rate": 6.455805839008615e-06, + "loss": 1.4075, + "step": 2299 + }, + { + "epoch": 1.8623481781376519, + "grad_norm": 1.1964728670717037, + "learning_rate": 6.4477697728236146e-06, + "loss": 1.3517, + "step": 2300 + }, + { + "epoch": 1.8631578947368421, + "grad_norm": 1.16799096604399, + "learning_rate": 6.439736331014637e-06, + "loss": 1.3809, + "step": 2301 + }, + { + "epoch": 1.8639676113360324, + "grad_norm": 1.2252233979220486, + "learning_rate": 6.431705519516763e-06, + "loss": 1.3675, + "step": 2302 + }, + { + "epoch": 1.8647773279352227, + "grad_norm": 1.2104500576735182, + "learning_rate": 6.4236773442631325e-06, + "loss": 1.3805, + "step": 2303 + }, + { + "epoch": 1.865587044534413, + "grad_norm": 1.3193178213753143, + "learning_rate": 6.415651811184935e-06, + "loss": 1.3997, + "step": 2304 + }, + { + "epoch": 1.8663967611336032, + "grad_norm": 1.2018055746906207, + "learning_rate": 6.407628926211409e-06, + "loss": 1.3964, + "step": 2305 + }, + { + "epoch": 1.8672064777327935, + "grad_norm": 1.174760468858027, + "learning_rate": 6.39960869526983e-06, + "loss": 1.3464, + "step": 2306 + }, + { + "epoch": 1.8680161943319837, + "grad_norm": 1.1810773457338066, + "learning_rate": 6.391591124285524e-06, + "loss": 1.2858, + "step": 2307 + }, + { + "epoch": 1.8688259109311742, + "grad_norm": 1.2262009022694145, + "learning_rate": 6.383576219181844e-06, + "loss": 1.3559, + "step": 2308 + }, + { + "epoch": 1.8696356275303643, + "grad_norm": 1.2773594939431656, + "learning_rate": 6.375563985880174e-06, + "loss": 1.3705, + "step": 2309 + }, + { + "epoch": 1.8704453441295548, + "grad_norm": 1.2208108751520061, + "learning_rate": 6.367554430299924e-06, + "loss": 1.3333, + "step": 2310 + }, + { + "epoch": 1.8712550607287448, + "grad_norm": 1.4160184181629654, + "learning_rate": 6.3595475583585344e-06, + "loss": 1.3816, + "step": 2311 + }, + { + "epoch": 1.8720647773279353, + "grad_norm": 1.2094735305859114, + "learning_rate": 6.351543375971453e-06, + "loss": 1.3275, + "step": 2312 + }, + { + "epoch": 1.8728744939271254, + "grad_norm": 1.2855800361668372, + "learning_rate": 6.34354188905214e-06, + "loss": 1.4212, + "step": 2313 + }, + { + "epoch": 1.8736842105263158, + "grad_norm": 1.1929901078467973, + "learning_rate": 6.335543103512072e-06, + "loss": 1.368, + "step": 2314 + }, + { + "epoch": 1.874493927125506, + "grad_norm": 1.1975185842016003, + "learning_rate": 6.327547025260723e-06, + "loss": 1.3372, + "step": 2315 + }, + { + "epoch": 1.8753036437246964, + "grad_norm": 1.2685945017090825, + "learning_rate": 6.319553660205569e-06, + "loss": 1.3461, + "step": 2316 + }, + { + "epoch": 1.8761133603238866, + "grad_norm": 1.1962632720706978, + "learning_rate": 6.3115630142520835e-06, + "loss": 1.404, + "step": 2317 + }, + { + "epoch": 1.876923076923077, + "grad_norm": 1.198552614864501, + "learning_rate": 6.303575093303725e-06, + "loss": 1.4111, + "step": 2318 + }, + { + "epoch": 1.8777327935222672, + "grad_norm": 1.1891573003084546, + "learning_rate": 6.2955899032619515e-06, + "loss": 1.3048, + "step": 2319 + }, + { + "epoch": 1.8785425101214575, + "grad_norm": 1.1495642468155007, + "learning_rate": 6.287607450026189e-06, + "loss": 1.4159, + "step": 2320 + }, + { + "epoch": 1.8793522267206477, + "grad_norm": 1.1707115053399164, + "learning_rate": 6.27962773949385e-06, + "loss": 1.4223, + "step": 2321 + }, + { + "epoch": 1.880161943319838, + "grad_norm": 1.2148777682926812, + "learning_rate": 6.271650777560318e-06, + "loss": 1.3416, + "step": 2322 + }, + { + "epoch": 1.8809716599190285, + "grad_norm": 1.222151610467891, + "learning_rate": 6.263676570118948e-06, + "loss": 1.3497, + "step": 2323 + }, + { + "epoch": 1.8817813765182185, + "grad_norm": 1.1961853962501905, + "learning_rate": 6.2557051230610534e-06, + "loss": 1.3338, + "step": 2324 + }, + { + "epoch": 1.882591093117409, + "grad_norm": 1.1926637799351172, + "learning_rate": 6.247736442275918e-06, + "loss": 1.4051, + "step": 2325 + }, + { + "epoch": 1.883400809716599, + "grad_norm": 1.1918108744163147, + "learning_rate": 6.239770533650771e-06, + "loss": 1.398, + "step": 2326 + }, + { + "epoch": 1.8842105263157896, + "grad_norm": 1.234775751606103, + "learning_rate": 6.231807403070806e-06, + "loss": 1.3671, + "step": 2327 + }, + { + "epoch": 1.8850202429149796, + "grad_norm": 1.2099918714991094, + "learning_rate": 6.223847056419154e-06, + "loss": 1.3623, + "step": 2328 + }, + { + "epoch": 1.88582995951417, + "grad_norm": 1.2291790207100255, + "learning_rate": 6.215889499576898e-06, + "loss": 1.3359, + "step": 2329 + }, + { + "epoch": 1.8866396761133604, + "grad_norm": 1.2504678829644937, + "learning_rate": 6.2079347384230505e-06, + "loss": 1.3474, + "step": 2330 + }, + { + "epoch": 1.8874493927125506, + "grad_norm": 1.1571650257183987, + "learning_rate": 6.199982778834561e-06, + "loss": 1.3704, + "step": 2331 + }, + { + "epoch": 1.888259109311741, + "grad_norm": 1.192980883967285, + "learning_rate": 6.192033626686316e-06, + "loss": 1.3893, + "step": 2332 + }, + { + "epoch": 1.8890688259109312, + "grad_norm": 1.2365704933508046, + "learning_rate": 6.1840872878511215e-06, + "loss": 1.3739, + "step": 2333 + }, + { + "epoch": 1.8898785425101214, + "grad_norm": 1.2430427056052022, + "learning_rate": 6.1761437681997e-06, + "loss": 1.3773, + "step": 2334 + }, + { + "epoch": 1.8906882591093117, + "grad_norm": 1.2140264228246762, + "learning_rate": 6.168203073600706e-06, + "loss": 1.3265, + "step": 2335 + }, + { + "epoch": 1.891497975708502, + "grad_norm": 1.1949388259542468, + "learning_rate": 6.160265209920698e-06, + "loss": 1.4013, + "step": 2336 + }, + { + "epoch": 1.8923076923076922, + "grad_norm": 1.2825641807955082, + "learning_rate": 6.152330183024142e-06, + "loss": 1.3582, + "step": 2337 + }, + { + "epoch": 1.8931174089068827, + "grad_norm": 1.1900913534174808, + "learning_rate": 6.1443979987734086e-06, + "loss": 1.3772, + "step": 2338 + }, + { + "epoch": 1.8939271255060728, + "grad_norm": 1.2005697884400928, + "learning_rate": 6.1364686630287694e-06, + "loss": 1.3625, + "step": 2339 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 1.2630270064813116, + "learning_rate": 6.128542181648395e-06, + "loss": 1.3342, + "step": 2340 + }, + { + "epoch": 1.8955465587044533, + "grad_norm": 1.2002800976397625, + "learning_rate": 6.120618560488341e-06, + "loss": 1.3267, + "step": 2341 + }, + { + "epoch": 1.8963562753036438, + "grad_norm": 1.215442464121004, + "learning_rate": 6.112697805402548e-06, + "loss": 1.3688, + "step": 2342 + }, + { + "epoch": 1.8971659919028339, + "grad_norm": 1.2288864318659574, + "learning_rate": 6.104779922242851e-06, + "loss": 1.3636, + "step": 2343 + }, + { + "epoch": 1.8979757085020244, + "grad_norm": 1.194317008155087, + "learning_rate": 6.096864916858957e-06, + "loss": 1.32, + "step": 2344 + }, + { + "epoch": 1.8987854251012146, + "grad_norm": 1.2124723409709166, + "learning_rate": 6.088952795098442e-06, + "loss": 1.3127, + "step": 2345 + }, + { + "epoch": 1.8995951417004049, + "grad_norm": 1.188500019539976, + "learning_rate": 6.081043562806754e-06, + "loss": 1.388, + "step": 2346 + }, + { + "epoch": 1.9004048582995952, + "grad_norm": 1.1552719513756389, + "learning_rate": 6.073137225827213e-06, + "loss": 1.3228, + "step": 2347 + }, + { + "epoch": 1.9012145748987854, + "grad_norm": 1.2237471760776777, + "learning_rate": 6.065233790000993e-06, + "loss": 1.3489, + "step": 2348 + }, + { + "epoch": 1.9020242914979757, + "grad_norm": 1.1792800865204056, + "learning_rate": 6.057333261167122e-06, + "loss": 1.3479, + "step": 2349 + }, + { + "epoch": 1.902834008097166, + "grad_norm": 1.199559285857203, + "learning_rate": 6.049435645162487e-06, + "loss": 1.3519, + "step": 2350 + }, + { + "epoch": 1.9036437246963562, + "grad_norm": 1.181957404617024, + "learning_rate": 6.041540947821827e-06, + "loss": 1.3371, + "step": 2351 + }, + { + "epoch": 1.9044534412955465, + "grad_norm": 1.2505276548541542, + "learning_rate": 6.0336491749777115e-06, + "loss": 1.4061, + "step": 2352 + }, + { + "epoch": 1.905263157894737, + "grad_norm": 1.195960955129131, + "learning_rate": 6.025760332460558e-06, + "loss": 1.3586, + "step": 2353 + }, + { + "epoch": 1.906072874493927, + "grad_norm": 1.1707714015115926, + "learning_rate": 6.01787442609862e-06, + "loss": 1.3412, + "step": 2354 + }, + { + "epoch": 1.9068825910931175, + "grad_norm": 1.140910772316072, + "learning_rate": 6.009991461717977e-06, + "loss": 1.3314, + "step": 2355 + }, + { + "epoch": 1.9076923076923076, + "grad_norm": 1.23349460378788, + "learning_rate": 6.002111445142533e-06, + "loss": 1.3255, + "step": 2356 + }, + { + "epoch": 1.908502024291498, + "grad_norm": 1.2462129342380746, + "learning_rate": 5.994234382194026e-06, + "loss": 1.3989, + "step": 2357 + }, + { + "epoch": 1.9093117408906881, + "grad_norm": 1.2688670795842787, + "learning_rate": 5.986360278691998e-06, + "loss": 1.3718, + "step": 2358 + }, + { + "epoch": 1.9101214574898786, + "grad_norm": 1.2095702791808156, + "learning_rate": 5.978489140453817e-06, + "loss": 1.3534, + "step": 2359 + }, + { + "epoch": 1.9109311740890689, + "grad_norm": 1.1982546045786988, + "learning_rate": 5.9706209732946495e-06, + "loss": 1.3671, + "step": 2360 + }, + { + "epoch": 1.9117408906882591, + "grad_norm": 1.1822667381374636, + "learning_rate": 5.962755783027473e-06, + "loss": 1.2912, + "step": 2361 + }, + { + "epoch": 1.9125506072874494, + "grad_norm": 1.242784856625613, + "learning_rate": 5.954893575463064e-06, + "loss": 1.369, + "step": 2362 + }, + { + "epoch": 1.9133603238866397, + "grad_norm": 1.2344835123108033, + "learning_rate": 5.9470343564099975e-06, + "loss": 1.3867, + "step": 2363 + }, + { + "epoch": 1.91417004048583, + "grad_norm": 1.2420930503800902, + "learning_rate": 5.939178131674633e-06, + "loss": 1.3344, + "step": 2364 + }, + { + "epoch": 1.9149797570850202, + "grad_norm": 1.2002007408254076, + "learning_rate": 5.931324907061131e-06, + "loss": 1.4089, + "step": 2365 + }, + { + "epoch": 1.9157894736842105, + "grad_norm": 1.1431280050371697, + "learning_rate": 5.92347468837142e-06, + "loss": 1.3774, + "step": 2366 + }, + { + "epoch": 1.9165991902834008, + "grad_norm": 1.2176788699652055, + "learning_rate": 5.915627481405224e-06, + "loss": 1.3993, + "step": 2367 + }, + { + "epoch": 1.9174089068825912, + "grad_norm": 1.1570899200524465, + "learning_rate": 5.907783291960027e-06, + "loss": 1.2658, + "step": 2368 + }, + { + "epoch": 1.9182186234817813, + "grad_norm": 1.2498879079115792, + "learning_rate": 5.899942125831097e-06, + "loss": 1.286, + "step": 2369 + }, + { + "epoch": 1.9190283400809718, + "grad_norm": 1.2146345067146647, + "learning_rate": 5.892103988811457e-06, + "loss": 1.4007, + "step": 2370 + }, + { + "epoch": 1.9198380566801618, + "grad_norm": 1.2400136674029751, + "learning_rate": 5.884268886691898e-06, + "loss": 1.3853, + "step": 2371 + }, + { + "epoch": 1.9206477732793523, + "grad_norm": 1.2266043645344005, + "learning_rate": 5.876436825260967e-06, + "loss": 1.3606, + "step": 2372 + }, + { + "epoch": 1.9214574898785424, + "grad_norm": 1.1901114955712122, + "learning_rate": 5.868607810304967e-06, + "loss": 1.4107, + "step": 2373 + }, + { + "epoch": 1.9222672064777329, + "grad_norm": 1.2389492781589038, + "learning_rate": 5.860781847607943e-06, + "loss": 1.3883, + "step": 2374 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 1.223439005294772, + "learning_rate": 5.852958942951701e-06, + "loss": 1.3636, + "step": 2375 + }, + { + "epoch": 1.9238866396761134, + "grad_norm": 1.1896039830081746, + "learning_rate": 5.845139102115769e-06, + "loss": 1.3259, + "step": 2376 + }, + { + "epoch": 1.9246963562753037, + "grad_norm": 1.2049235269890495, + "learning_rate": 5.837322330877421e-06, + "loss": 1.3567, + "step": 2377 + }, + { + "epoch": 1.925506072874494, + "grad_norm": 1.2157467237161108, + "learning_rate": 5.829508635011667e-06, + "loss": 1.3588, + "step": 2378 + }, + { + "epoch": 1.9263157894736842, + "grad_norm": 1.248188375178168, + "learning_rate": 5.821698020291234e-06, + "loss": 1.3572, + "step": 2379 + }, + { + "epoch": 1.9271255060728745, + "grad_norm": 1.212346730632553, + "learning_rate": 5.8138904924865766e-06, + "loss": 1.3043, + "step": 2380 + }, + { + "epoch": 1.9279352226720647, + "grad_norm": 1.1824421069183566, + "learning_rate": 5.806086057365878e-06, + "loss": 1.3943, + "step": 2381 + }, + { + "epoch": 1.928744939271255, + "grad_norm": 1.1763429489984647, + "learning_rate": 5.798284720695022e-06, + "loss": 1.365, + "step": 2382 + }, + { + "epoch": 1.9295546558704455, + "grad_norm": 1.204932841462832, + "learning_rate": 5.790486488237619e-06, + "loss": 1.3377, + "step": 2383 + }, + { + "epoch": 1.9303643724696355, + "grad_norm": 1.2045578783510957, + "learning_rate": 5.782691365754971e-06, + "loss": 1.4084, + "step": 2384 + }, + { + "epoch": 1.931174089068826, + "grad_norm": 1.2608711906516323, + "learning_rate": 5.774899359006092e-06, + "loss": 1.2661, + "step": 2385 + }, + { + "epoch": 1.931983805668016, + "grad_norm": 1.1990033949590717, + "learning_rate": 5.76711047374769e-06, + "loss": 1.3528, + "step": 2386 + }, + { + "epoch": 1.9327935222672066, + "grad_norm": 1.172948011576386, + "learning_rate": 5.759324715734166e-06, + "loss": 1.4189, + "step": 2387 + }, + { + "epoch": 1.9336032388663966, + "grad_norm": 1.241926186312199, + "learning_rate": 5.7515420907176105e-06, + "loss": 1.4028, + "step": 2388 + }, + { + "epoch": 1.9344129554655871, + "grad_norm": 1.1964066986025503, + "learning_rate": 5.743762604447809e-06, + "loss": 1.3317, + "step": 2389 + }, + { + "epoch": 1.9352226720647774, + "grad_norm": 1.1945494693033618, + "learning_rate": 5.735986262672211e-06, + "loss": 1.341, + "step": 2390 + }, + { + "epoch": 1.9360323886639677, + "grad_norm": 1.1755268375572105, + "learning_rate": 5.728213071135962e-06, + "loss": 1.3911, + "step": 2391 + }, + { + "epoch": 1.936842105263158, + "grad_norm": 1.1804944891484381, + "learning_rate": 5.720443035581867e-06, + "loss": 1.3697, + "step": 2392 + }, + { + "epoch": 1.9376518218623482, + "grad_norm": 1.2046317934373794, + "learning_rate": 5.712676161750399e-06, + "loss": 1.4039, + "step": 2393 + }, + { + "epoch": 1.9384615384615385, + "grad_norm": 1.1637358644282172, + "learning_rate": 5.704912455379703e-06, + "loss": 1.363, + "step": 2394 + }, + { + "epoch": 1.9392712550607287, + "grad_norm": 1.1974299223091525, + "learning_rate": 5.697151922205575e-06, + "loss": 1.3971, + "step": 2395 + }, + { + "epoch": 1.940080971659919, + "grad_norm": 1.157981186836544, + "learning_rate": 5.689394567961477e-06, + "loss": 1.2967, + "step": 2396 + }, + { + "epoch": 1.9408906882591093, + "grad_norm": 1.1670968402428008, + "learning_rate": 5.681640398378514e-06, + "loss": 1.3849, + "step": 2397 + }, + { + "epoch": 1.9417004048582998, + "grad_norm": 1.1292919479938222, + "learning_rate": 5.673889419185439e-06, + "loss": 1.4253, + "step": 2398 + }, + { + "epoch": 1.9425101214574898, + "grad_norm": 1.213702805085859, + "learning_rate": 5.666141636108655e-06, + "loss": 1.3198, + "step": 2399 + }, + { + "epoch": 1.9433198380566803, + "grad_norm": 1.157847450719407, + "learning_rate": 5.658397054872197e-06, + "loss": 1.3175, + "step": 2400 + }, + { + "epoch": 1.9441295546558703, + "grad_norm": 1.2106036490901373, + "learning_rate": 5.650655681197734e-06, + "loss": 1.4105, + "step": 2401 + }, + { + "epoch": 1.9449392712550608, + "grad_norm": 1.219262613527522, + "learning_rate": 5.642917520804569e-06, + "loss": 1.3505, + "step": 2402 + }, + { + "epoch": 1.9457489878542509, + "grad_norm": 1.1742978362533214, + "learning_rate": 5.635182579409626e-06, + "loss": 1.299, + "step": 2403 + }, + { + "epoch": 1.9465587044534414, + "grad_norm": 1.2293963021779448, + "learning_rate": 5.627450862727461e-06, + "loss": 1.3391, + "step": 2404 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 1.2034045616590177, + "learning_rate": 5.619722376470238e-06, + "loss": 1.3877, + "step": 2405 + }, + { + "epoch": 1.948178137651822, + "grad_norm": 1.2272183799034553, + "learning_rate": 5.611997126347732e-06, + "loss": 1.3988, + "step": 2406 + }, + { + "epoch": 1.9489878542510122, + "grad_norm": 1.2695110730414088, + "learning_rate": 5.604275118067341e-06, + "loss": 1.3653, + "step": 2407 + }, + { + "epoch": 1.9497975708502024, + "grad_norm": 1.248909293580144, + "learning_rate": 5.596556357334056e-06, + "loss": 1.3562, + "step": 2408 + }, + { + "epoch": 1.9506072874493927, + "grad_norm": 1.2131088122427793, + "learning_rate": 5.588840849850472e-06, + "loss": 1.3928, + "step": 2409 + }, + { + "epoch": 1.951417004048583, + "grad_norm": 1.1900481644176077, + "learning_rate": 5.581128601316774e-06, + "loss": 1.3777, + "step": 2410 + }, + { + "epoch": 1.9522267206477733, + "grad_norm": 1.2099498350109, + "learning_rate": 5.573419617430758e-06, + "loss": 1.3499, + "step": 2411 + }, + { + "epoch": 1.9530364372469635, + "grad_norm": 1.217148269565451, + "learning_rate": 5.565713903887788e-06, + "loss": 1.2877, + "step": 2412 + }, + { + "epoch": 1.953846153846154, + "grad_norm": 1.2490893741382743, + "learning_rate": 5.558011466380824e-06, + "loss": 1.3768, + "step": 2413 + }, + { + "epoch": 1.954655870445344, + "grad_norm": 1.240461136852775, + "learning_rate": 5.550312310600394e-06, + "loss": 1.4229, + "step": 2414 + }, + { + "epoch": 1.9554655870445345, + "grad_norm": 1.2295726427309241, + "learning_rate": 5.542616442234618e-06, + "loss": 1.3347, + "step": 2415 + }, + { + "epoch": 1.9562753036437246, + "grad_norm": 1.2008815138181963, + "learning_rate": 5.534923866969175e-06, + "loss": 1.4176, + "step": 2416 + }, + { + "epoch": 1.957085020242915, + "grad_norm": 1.1676560157055105, + "learning_rate": 5.527234590487314e-06, + "loss": 1.3314, + "step": 2417 + }, + { + "epoch": 1.9578947368421051, + "grad_norm": 1.1701886277830924, + "learning_rate": 5.5195486184698435e-06, + "loss": 1.363, + "step": 2418 + }, + { + "epoch": 1.9587044534412956, + "grad_norm": 1.1778230913774466, + "learning_rate": 5.511865956595142e-06, + "loss": 1.3683, + "step": 2419 + }, + { + "epoch": 1.9595141700404857, + "grad_norm": 1.173338300441953, + "learning_rate": 5.504186610539131e-06, + "loss": 1.3393, + "step": 2420 + }, + { + "epoch": 1.9603238866396762, + "grad_norm": 1.1767463019339799, + "learning_rate": 5.496510585975285e-06, + "loss": 1.3235, + "step": 2421 + }, + { + "epoch": 1.9611336032388664, + "grad_norm": 1.2644216036741416, + "learning_rate": 5.488837888574623e-06, + "loss": 1.3373, + "step": 2422 + }, + { + "epoch": 1.9619433198380567, + "grad_norm": 1.2103895583574469, + "learning_rate": 5.4811685240057165e-06, + "loss": 1.3983, + "step": 2423 + }, + { + "epoch": 1.962753036437247, + "grad_norm": 1.1846356504436384, + "learning_rate": 5.473502497934663e-06, + "loss": 1.3294, + "step": 2424 + }, + { + "epoch": 1.9635627530364372, + "grad_norm": 1.2235189988456545, + "learning_rate": 5.465839816025093e-06, + "loss": 1.3276, + "step": 2425 + }, + { + "epoch": 1.9643724696356275, + "grad_norm": 1.2314816730763305, + "learning_rate": 5.458180483938179e-06, + "loss": 1.4498, + "step": 2426 + }, + { + "epoch": 1.9651821862348178, + "grad_norm": 1.212596357115965, + "learning_rate": 5.450524507332606e-06, + "loss": 1.3656, + "step": 2427 + }, + { + "epoch": 1.965991902834008, + "grad_norm": 1.1639800229825885, + "learning_rate": 5.442871891864585e-06, + "loss": 1.3776, + "step": 2428 + }, + { + "epoch": 1.9668016194331983, + "grad_norm": 1.1367690723379495, + "learning_rate": 5.435222643187843e-06, + "loss": 1.3242, + "step": 2429 + }, + { + "epoch": 1.9676113360323888, + "grad_norm": 1.2165886607110268, + "learning_rate": 5.427576766953615e-06, + "loss": 1.3591, + "step": 2430 + }, + { + "epoch": 1.9684210526315788, + "grad_norm": 1.1790081897002724, + "learning_rate": 5.419934268810659e-06, + "loss": 1.3848, + "step": 2431 + }, + { + "epoch": 1.9692307692307693, + "grad_norm": 1.2315181532137929, + "learning_rate": 5.412295154405217e-06, + "loss": 1.3431, + "step": 2432 + }, + { + "epoch": 1.9700404858299594, + "grad_norm": 1.2728808844442372, + "learning_rate": 5.4046594293810515e-06, + "loss": 1.3639, + "step": 2433 + }, + { + "epoch": 1.9708502024291499, + "grad_norm": 1.232337828546579, + "learning_rate": 5.397027099379406e-06, + "loss": 1.3697, + "step": 2434 + }, + { + "epoch": 1.97165991902834, + "grad_norm": 1.243179350091489, + "learning_rate": 5.3893981700390215e-06, + "loss": 1.3252, + "step": 2435 + }, + { + "epoch": 1.9724696356275304, + "grad_norm": 1.2371857762137026, + "learning_rate": 5.381772646996128e-06, + "loss": 1.3827, + "step": 2436 + }, + { + "epoch": 1.9732793522267207, + "grad_norm": 1.288958547364644, + "learning_rate": 5.374150535884433e-06, + "loss": 1.3265, + "step": 2437 + }, + { + "epoch": 1.974089068825911, + "grad_norm": 1.2475048960238355, + "learning_rate": 5.3665318423351255e-06, + "loss": 1.2822, + "step": 2438 + }, + { + "epoch": 1.9748987854251012, + "grad_norm": 1.1986060107719656, + "learning_rate": 5.358916571976878e-06, + "loss": 1.3558, + "step": 2439 + }, + { + "epoch": 1.9757085020242915, + "grad_norm": 1.1809389390768783, + "learning_rate": 5.35130473043582e-06, + "loss": 1.3428, + "step": 2440 + }, + { + "epoch": 1.9765182186234818, + "grad_norm": 1.1854316658221693, + "learning_rate": 5.343696323335564e-06, + "loss": 1.4093, + "step": 2441 + }, + { + "epoch": 1.977327935222672, + "grad_norm": 1.211156343447039, + "learning_rate": 5.336091356297168e-06, + "loss": 1.2688, + "step": 2442 + }, + { + "epoch": 1.9781376518218623, + "grad_norm": 1.2978073337950151, + "learning_rate": 5.328489834939162e-06, + "loss": 1.3924, + "step": 2443 + }, + { + "epoch": 1.9789473684210526, + "grad_norm": 1.1739628299787876, + "learning_rate": 5.320891764877522e-06, + "loss": 1.371, + "step": 2444 + }, + { + "epoch": 1.979757085020243, + "grad_norm": 1.198921849736746, + "learning_rate": 5.313297151725679e-06, + "loss": 1.3149, + "step": 2445 + }, + { + "epoch": 1.980566801619433, + "grad_norm": 1.2012656881257282, + "learning_rate": 5.305706001094504e-06, + "loss": 1.3979, + "step": 2446 + }, + { + "epoch": 1.9813765182186236, + "grad_norm": 1.1827392393082905, + "learning_rate": 5.298118318592316e-06, + "loss": 1.3565, + "step": 2447 + }, + { + "epoch": 1.9821862348178136, + "grad_norm": 1.2628168108725613, + "learning_rate": 5.290534109824875e-06, + "loss": 1.3705, + "step": 2448 + }, + { + "epoch": 1.9829959514170041, + "grad_norm": 1.248756806447712, + "learning_rate": 5.282953380395366e-06, + "loss": 1.2367, + "step": 2449 + }, + { + "epoch": 1.9838056680161942, + "grad_norm": 1.2171206230007487, + "learning_rate": 5.275376135904408e-06, + "loss": 1.3403, + "step": 2450 + }, + { + "epoch": 1.9846153846153847, + "grad_norm": 1.289234216018497, + "learning_rate": 5.267802381950042e-06, + "loss": 1.3819, + "step": 2451 + }, + { + "epoch": 1.985425101214575, + "grad_norm": 1.2241063424118177, + "learning_rate": 5.260232124127734e-06, + "loss": 1.3171, + "step": 2452 + }, + { + "epoch": 1.9862348178137652, + "grad_norm": 1.2256066411153002, + "learning_rate": 5.252665368030362e-06, + "loss": 1.4032, + "step": 2453 + }, + { + "epoch": 1.9870445344129555, + "grad_norm": 1.2295354974621036, + "learning_rate": 5.245102119248227e-06, + "loss": 1.3562, + "step": 2454 + }, + { + "epoch": 1.9878542510121457, + "grad_norm": 1.207357620526114, + "learning_rate": 5.2375423833690255e-06, + "loss": 1.3962, + "step": 2455 + }, + { + "epoch": 1.988663967611336, + "grad_norm": 1.2123767685021807, + "learning_rate": 5.229986165977874e-06, + "loss": 1.3524, + "step": 2456 + }, + { + "epoch": 1.9894736842105263, + "grad_norm": 1.1985170410291037, + "learning_rate": 5.222433472657276e-06, + "loss": 1.342, + "step": 2457 + }, + { + "epoch": 1.9902834008097166, + "grad_norm": 1.2549278916904512, + "learning_rate": 5.214884308987136e-06, + "loss": 1.2894, + "step": 2458 + }, + { + "epoch": 1.9910931174089068, + "grad_norm": 1.2721385349454946, + "learning_rate": 5.207338680544754e-06, + "loss": 1.3614, + "step": 2459 + }, + { + "epoch": 1.9919028340080973, + "grad_norm": 1.1730908013315933, + "learning_rate": 5.1997965929048125e-06, + "loss": 1.3449, + "step": 2460 + }, + { + "epoch": 1.9927125506072874, + "grad_norm": 1.237418271403445, + "learning_rate": 5.192258051639378e-06, + "loss": 1.3128, + "step": 2461 + }, + { + "epoch": 1.9935222672064778, + "grad_norm": 1.195751862323348, + "learning_rate": 5.184723062317905e-06, + "loss": 1.3297, + "step": 2462 + }, + { + "epoch": 1.994331983805668, + "grad_norm": 1.1842709296523235, + "learning_rate": 5.177191630507221e-06, + "loss": 1.3214, + "step": 2463 + }, + { + "epoch": 1.9951417004048584, + "grad_norm": 1.1803392844860812, + "learning_rate": 5.169663761771522e-06, + "loss": 1.415, + "step": 2464 + }, + { + "epoch": 1.9959514170040484, + "grad_norm": 1.1998108340136762, + "learning_rate": 5.1621394616723705e-06, + "loss": 1.3449, + "step": 2465 + }, + { + "epoch": 1.996761133603239, + "grad_norm": 1.2193208223683405, + "learning_rate": 5.154618735768695e-06, + "loss": 1.2948, + "step": 2466 + }, + { + "epoch": 1.9975708502024292, + "grad_norm": 1.1939406883857941, + "learning_rate": 5.147101589616783e-06, + "loss": 1.3627, + "step": 2467 + }, + { + "epoch": 1.9983805668016195, + "grad_norm": 1.209031327432166, + "learning_rate": 5.139588028770275e-06, + "loss": 1.3994, + "step": 2468 + }, + { + "epoch": 1.9991902834008097, + "grad_norm": 1.169763485845793, + "learning_rate": 5.13207805878017e-06, + "loss": 1.3228, + "step": 2469 + }, + { + "epoch": 2.0, + "grad_norm": 1.532603626298016, + "learning_rate": 5.124571685194804e-06, + "loss": 1.2018, + "step": 2470 + }, + { + "epoch": 2.0008097165991905, + "grad_norm": 2.854383023719633, + "learning_rate": 5.1170689135598675e-06, + "loss": 1.0829, + "step": 2471 + }, + { + "epoch": 2.0016194331983805, + "grad_norm": 3.050414009521738, + "learning_rate": 5.10956974941838e-06, + "loss": 1.0063, + "step": 2472 + }, + { + "epoch": 2.002429149797571, + "grad_norm": 2.4422306774820672, + "learning_rate": 5.102074198310701e-06, + "loss": 1.0396, + "step": 2473 + }, + { + "epoch": 2.003238866396761, + "grad_norm": 2.0451498041031044, + "learning_rate": 5.094582265774515e-06, + "loss": 0.9888, + "step": 2474 + }, + { + "epoch": 2.0040485829959516, + "grad_norm": 4.046585838573169, + "learning_rate": 5.087093957344841e-06, + "loss": 1.0822, + "step": 2475 + }, + { + "epoch": 2.0048582995951416, + "grad_norm": 4.926775146816468, + "learning_rate": 5.079609278554011e-06, + "loss": 1.0585, + "step": 2476 + }, + { + "epoch": 2.005668016194332, + "grad_norm": 3.3365273107950393, + "learning_rate": 5.07212823493169e-06, + "loss": 0.9847, + "step": 2477 + }, + { + "epoch": 2.006477732793522, + "grad_norm": 2.24516980725426, + "learning_rate": 5.064650832004839e-06, + "loss": 1.0298, + "step": 2478 + }, + { + "epoch": 2.0072874493927126, + "grad_norm": 1.7417896088574096, + "learning_rate": 5.057177075297748e-06, + "loss": 0.9663, + "step": 2479 + }, + { + "epoch": 2.0080971659919027, + "grad_norm": 2.031730299689621, + "learning_rate": 5.049706970332e-06, + "loss": 0.9755, + "step": 2480 + }, + { + "epoch": 2.008906882591093, + "grad_norm": 1.9976555875126931, + "learning_rate": 5.0422405226264825e-06, + "loss": 0.9815, + "step": 2481 + }, + { + "epoch": 2.0097165991902832, + "grad_norm": 1.9153896673542294, + "learning_rate": 5.034777737697384e-06, + "loss": 0.9825, + "step": 2482 + }, + { + "epoch": 2.0105263157894737, + "grad_norm": 1.6395893752150272, + "learning_rate": 5.027318621058182e-06, + "loss": 1.0408, + "step": 2483 + }, + { + "epoch": 2.0113360323886638, + "grad_norm": 1.6423936187066637, + "learning_rate": 5.019863178219653e-06, + "loss": 0.9731, + "step": 2484 + }, + { + "epoch": 2.0121457489878543, + "grad_norm": 1.7371817317654348, + "learning_rate": 5.0124114146898505e-06, + "loss": 0.9819, + "step": 2485 + }, + { + "epoch": 2.0129554655870447, + "grad_norm": 1.7846901106310733, + "learning_rate": 5.004963335974112e-06, + "loss": 1.0394, + "step": 2486 + }, + { + "epoch": 2.013765182186235, + "grad_norm": 1.7793293478267915, + "learning_rate": 4.997518947575058e-06, + "loss": 1.0592, + "step": 2487 + }, + { + "epoch": 2.0145748987854253, + "grad_norm": 1.6437092415729435, + "learning_rate": 4.990078254992574e-06, + "loss": 0.999, + "step": 2488 + }, + { + "epoch": 2.0153846153846153, + "grad_norm": 1.6534586866603884, + "learning_rate": 4.982641263723822e-06, + "loss": 1.0597, + "step": 2489 + }, + { + "epoch": 2.016194331983806, + "grad_norm": 1.5683644605257934, + "learning_rate": 4.9752079792632244e-06, + "loss": 1.0008, + "step": 2490 + }, + { + "epoch": 2.017004048582996, + "grad_norm": 1.6067812383281972, + "learning_rate": 4.967778407102466e-06, + "loss": 0.9898, + "step": 2491 + }, + { + "epoch": 2.0178137651821864, + "grad_norm": 1.6310900165582578, + "learning_rate": 4.960352552730495e-06, + "loss": 0.978, + "step": 2492 + }, + { + "epoch": 2.0186234817813764, + "grad_norm": 1.6529602670760182, + "learning_rate": 4.952930421633506e-06, + "loss": 0.9264, + "step": 2493 + }, + { + "epoch": 2.019433198380567, + "grad_norm": 1.7217510219463308, + "learning_rate": 4.945512019294941e-06, + "loss": 0.9976, + "step": 2494 + }, + { + "epoch": 2.020242914979757, + "grad_norm": 1.7026003567072832, + "learning_rate": 4.938097351195499e-06, + "loss": 0.9611, + "step": 2495 + }, + { + "epoch": 2.0210526315789474, + "grad_norm": 1.6872762681564812, + "learning_rate": 4.9306864228131094e-06, + "loss": 0.9497, + "step": 2496 + }, + { + "epoch": 2.0218623481781375, + "grad_norm": 1.6295705608523692, + "learning_rate": 4.92327923962294e-06, + "loss": 1.054, + "step": 2497 + }, + { + "epoch": 2.022672064777328, + "grad_norm": 1.6156871771493067, + "learning_rate": 4.91587580709739e-06, + "loss": 1.0382, + "step": 2498 + }, + { + "epoch": 2.023481781376518, + "grad_norm": 1.667645461991409, + "learning_rate": 4.9084761307061e-06, + "loss": 1.0005, + "step": 2499 + }, + { + "epoch": 2.0242914979757085, + "grad_norm": 1.6226089763668954, + "learning_rate": 4.9010802159159224e-06, + "loss": 0.9241, + "step": 2500 + }, + { + "epoch": 2.025101214574899, + "grad_norm": 1.6532518189213758, + "learning_rate": 4.893688068190933e-06, + "loss": 1.0058, + "step": 2501 + }, + { + "epoch": 2.025910931174089, + "grad_norm": 1.5598422496487256, + "learning_rate": 4.886299692992425e-06, + "loss": 0.9634, + "step": 2502 + }, + { + "epoch": 2.0267206477732795, + "grad_norm": 1.6760718021088303, + "learning_rate": 4.878915095778911e-06, + "loss": 0.9279, + "step": 2503 + }, + { + "epoch": 2.0275303643724696, + "grad_norm": 1.6857184074433873, + "learning_rate": 4.871534282006105e-06, + "loss": 1.033, + "step": 2504 + }, + { + "epoch": 2.02834008097166, + "grad_norm": 1.6164129806079546, + "learning_rate": 4.864157257126928e-06, + "loss": 1.0426, + "step": 2505 + }, + { + "epoch": 2.02914979757085, + "grad_norm": 1.6723447210528488, + "learning_rate": 4.856784026591497e-06, + "loss": 0.9716, + "step": 2506 + }, + { + "epoch": 2.0299595141700406, + "grad_norm": 1.574208483958483, + "learning_rate": 4.849414595847138e-06, + "loss": 1.0813, + "step": 2507 + }, + { + "epoch": 2.0307692307692307, + "grad_norm": 1.5924598069963187, + "learning_rate": 4.84204897033836e-06, + "loss": 1.0232, + "step": 2508 + }, + { + "epoch": 2.031578947368421, + "grad_norm": 1.6310804523408304, + "learning_rate": 4.834687155506861e-06, + "loss": 0.9985, + "step": 2509 + }, + { + "epoch": 2.032388663967611, + "grad_norm": 1.6173868253735737, + "learning_rate": 4.8273291567915225e-06, + "loss": 0.9707, + "step": 2510 + }, + { + "epoch": 2.0331983805668017, + "grad_norm": 1.665833483770265, + "learning_rate": 4.8199749796284175e-06, + "loss": 0.9886, + "step": 2511 + }, + { + "epoch": 2.0340080971659917, + "grad_norm": 1.5330111731342393, + "learning_rate": 4.812624629450785e-06, + "loss": 1.0049, + "step": 2512 + }, + { + "epoch": 2.0348178137651822, + "grad_norm": 1.6868658667526049, + "learning_rate": 4.805278111689035e-06, + "loss": 0.9885, + "step": 2513 + }, + { + "epoch": 2.0356275303643723, + "grad_norm": 1.7897259905366785, + "learning_rate": 4.797935431770758e-06, + "loss": 0.9826, + "step": 2514 + }, + { + "epoch": 2.0364372469635628, + "grad_norm": 1.7488839728709102, + "learning_rate": 4.790596595120699e-06, + "loss": 1.0691, + "step": 2515 + }, + { + "epoch": 2.0372469635627533, + "grad_norm": 1.6698774320169687, + "learning_rate": 4.783261607160764e-06, + "loss": 1.0223, + "step": 2516 + }, + { + "epoch": 2.0380566801619433, + "grad_norm": 1.6944023574692242, + "learning_rate": 4.775930473310021e-06, + "loss": 0.9541, + "step": 2517 + }, + { + "epoch": 2.038866396761134, + "grad_norm": 1.6030259296495724, + "learning_rate": 4.768603198984683e-06, + "loss": 0.9359, + "step": 2518 + }, + { + "epoch": 2.039676113360324, + "grad_norm": 1.7509997817466008, + "learning_rate": 4.761279789598122e-06, + "loss": 0.9543, + "step": 2519 + }, + { + "epoch": 2.0404858299595143, + "grad_norm": 1.6053230945872223, + "learning_rate": 4.753960250560843e-06, + "loss": 1.0359, + "step": 2520 + }, + { + "epoch": 2.0412955465587044, + "grad_norm": 1.6470550257288128, + "learning_rate": 4.746644587280505e-06, + "loss": 1.0407, + "step": 2521 + }, + { + "epoch": 2.042105263157895, + "grad_norm": 1.5960268921634935, + "learning_rate": 4.739332805161892e-06, + "loss": 1.0435, + "step": 2522 + }, + { + "epoch": 2.042914979757085, + "grad_norm": 1.6787492821611882, + "learning_rate": 4.732024909606923e-06, + "loss": 0.918, + "step": 2523 + }, + { + "epoch": 2.0437246963562754, + "grad_norm": 1.686860082454776, + "learning_rate": 4.7247209060146495e-06, + "loss": 1.0009, + "step": 2524 + }, + { + "epoch": 2.0445344129554655, + "grad_norm": 1.8404325405304682, + "learning_rate": 4.7174207997812436e-06, + "loss": 0.9994, + "step": 2525 + }, + { + "epoch": 2.045344129554656, + "grad_norm": 1.6172176781453194, + "learning_rate": 4.710124596299998e-06, + "loss": 1.0434, + "step": 2526 + }, + { + "epoch": 2.046153846153846, + "grad_norm": 1.603579157288264, + "learning_rate": 4.70283230096133e-06, + "loss": 0.9923, + "step": 2527 + }, + { + "epoch": 2.0469635627530365, + "grad_norm": 1.6474458591059657, + "learning_rate": 4.6955439191527556e-06, + "loss": 0.9799, + "step": 2528 + }, + { + "epoch": 2.0477732793522265, + "grad_norm": 1.6208921660937512, + "learning_rate": 4.688259456258916e-06, + "loss": 1.0971, + "step": 2529 + }, + { + "epoch": 2.048582995951417, + "grad_norm": 1.6056850691607139, + "learning_rate": 4.680978917661544e-06, + "loss": 0.9683, + "step": 2530 + }, + { + "epoch": 2.049392712550607, + "grad_norm": 1.6643700134128743, + "learning_rate": 4.673702308739478e-06, + "loss": 1.0623, + "step": 2531 + }, + { + "epoch": 2.0502024291497976, + "grad_norm": 1.6630243276082293, + "learning_rate": 4.666429634868651e-06, + "loss": 0.9393, + "step": 2532 + }, + { + "epoch": 2.051012145748988, + "grad_norm": 1.6267769934323475, + "learning_rate": 4.659160901422094e-06, + "loss": 1.0042, + "step": 2533 + }, + { + "epoch": 2.051821862348178, + "grad_norm": 1.798215794043288, + "learning_rate": 4.651896113769917e-06, + "loss": 1.0247, + "step": 2534 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 1.5817141672309059, + "learning_rate": 4.6446352772793256e-06, + "loss": 1.045, + "step": 2535 + }, + { + "epoch": 2.0534412955465586, + "grad_norm": 1.589804333120599, + "learning_rate": 4.637378397314607e-06, + "loss": 1.0288, + "step": 2536 + }, + { + "epoch": 2.054251012145749, + "grad_norm": 1.6164730177481579, + "learning_rate": 4.630125479237114e-06, + "loss": 0.9591, + "step": 2537 + }, + { + "epoch": 2.055060728744939, + "grad_norm": 1.5869372349513844, + "learning_rate": 4.622876528405281e-06, + "loss": 1.0036, + "step": 2538 + }, + { + "epoch": 2.0558704453441297, + "grad_norm": 1.6778929953471509, + "learning_rate": 4.615631550174609e-06, + "loss": 1.0513, + "step": 2539 + }, + { + "epoch": 2.0566801619433197, + "grad_norm": 1.5623260111430552, + "learning_rate": 4.608390549897661e-06, + "loss": 1.0336, + "step": 2540 + }, + { + "epoch": 2.05748987854251, + "grad_norm": 1.6803766333569634, + "learning_rate": 4.601153532924064e-06, + "loss": 0.9903, + "step": 2541 + }, + { + "epoch": 2.0582995951417002, + "grad_norm": 1.7893026888075112, + "learning_rate": 4.593920504600508e-06, + "loss": 0.9702, + "step": 2542 + }, + { + "epoch": 2.0591093117408907, + "grad_norm": 1.7193621454208152, + "learning_rate": 4.586691470270725e-06, + "loss": 1.0157, + "step": 2543 + }, + { + "epoch": 2.059919028340081, + "grad_norm": 1.660520678676988, + "learning_rate": 4.579466435275506e-06, + "loss": 0.9825, + "step": 2544 + }, + { + "epoch": 2.0607287449392713, + "grad_norm": 1.654111044168718, + "learning_rate": 4.5722454049526825e-06, + "loss": 0.9855, + "step": 2545 + }, + { + "epoch": 2.0615384615384613, + "grad_norm": 1.6536593350319584, + "learning_rate": 4.565028384637127e-06, + "loss": 0.9827, + "step": 2546 + }, + { + "epoch": 2.062348178137652, + "grad_norm": 1.7206634958914158, + "learning_rate": 4.557815379660749e-06, + "loss": 0.9644, + "step": 2547 + }, + { + "epoch": 2.0631578947368423, + "grad_norm": 1.6167239693996955, + "learning_rate": 4.550606395352496e-06, + "loss": 1.0163, + "step": 2548 + }, + { + "epoch": 2.0639676113360323, + "grad_norm": 1.6089251662105508, + "learning_rate": 4.543401437038335e-06, + "loss": 0.9805, + "step": 2549 + }, + { + "epoch": 2.064777327935223, + "grad_norm": 1.6783012771976722, + "learning_rate": 4.536200510041271e-06, + "loss": 1.0095, + "step": 2550 + }, + { + "epoch": 2.065587044534413, + "grad_norm": 1.7505966956320587, + "learning_rate": 4.5290036196813294e-06, + "loss": 0.9746, + "step": 2551 + }, + { + "epoch": 2.0663967611336034, + "grad_norm": 1.8266001199253628, + "learning_rate": 4.521810771275543e-06, + "loss": 1.0251, + "step": 2552 + }, + { + "epoch": 2.0672064777327934, + "grad_norm": 1.6164027260208844, + "learning_rate": 4.514621970137967e-06, + "loss": 1.0155, + "step": 2553 + }, + { + "epoch": 2.068016194331984, + "grad_norm": 1.7050380636821716, + "learning_rate": 4.507437221579662e-06, + "loss": 0.9616, + "step": 2554 + }, + { + "epoch": 2.068825910931174, + "grad_norm": 1.7946850395276819, + "learning_rate": 4.5002565309087e-06, + "loss": 0.9502, + "step": 2555 + }, + { + "epoch": 2.0696356275303645, + "grad_norm": 1.585579416581573, + "learning_rate": 4.493079903430144e-06, + "loss": 0.998, + "step": 2556 + }, + { + "epoch": 2.0704453441295545, + "grad_norm": 1.6347584013839378, + "learning_rate": 4.485907344446073e-06, + "loss": 1.006, + "step": 2557 + }, + { + "epoch": 2.071255060728745, + "grad_norm": 1.7338467691904824, + "learning_rate": 4.478738859255542e-06, + "loss": 1.0128, + "step": 2558 + }, + { + "epoch": 2.072064777327935, + "grad_norm": 1.8039959794063494, + "learning_rate": 4.4715744531546115e-06, + "loss": 0.9555, + "step": 2559 + }, + { + "epoch": 2.0728744939271255, + "grad_norm": 1.7310213842710942, + "learning_rate": 4.4644141314363165e-06, + "loss": 1.0433, + "step": 2560 + }, + { + "epoch": 2.0736842105263156, + "grad_norm": 1.5892088009452698, + "learning_rate": 4.45725789939068e-06, + "loss": 0.9779, + "step": 2561 + }, + { + "epoch": 2.074493927125506, + "grad_norm": 1.5328100810210468, + "learning_rate": 4.450105762304703e-06, + "loss": 1.057, + "step": 2562 + }, + { + "epoch": 2.0753036437246966, + "grad_norm": 1.5920760677864, + "learning_rate": 4.44295772546236e-06, + "loss": 1.0849, + "step": 2563 + }, + { + "epoch": 2.0761133603238866, + "grad_norm": 1.660029789485431, + "learning_rate": 4.435813794144596e-06, + "loss": 1.0217, + "step": 2564 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 1.665382546622789, + "learning_rate": 4.4286739736293285e-06, + "loss": 0.9562, + "step": 2565 + }, + { + "epoch": 2.077732793522267, + "grad_norm": 1.6552078961930563, + "learning_rate": 4.421538269191427e-06, + "loss": 1.031, + "step": 2566 + }, + { + "epoch": 2.0785425101214576, + "grad_norm": 1.6606164514071944, + "learning_rate": 4.414406686102734e-06, + "loss": 0.9781, + "step": 2567 + }, + { + "epoch": 2.0793522267206477, + "grad_norm": 1.6124589826824705, + "learning_rate": 4.407279229632034e-06, + "loss": 1.0374, + "step": 2568 + }, + { + "epoch": 2.080161943319838, + "grad_norm": 1.7193015635747628, + "learning_rate": 4.400155905045073e-06, + "loss": 1.0336, + "step": 2569 + }, + { + "epoch": 2.080971659919028, + "grad_norm": 1.7315044687603365, + "learning_rate": 4.393036717604536e-06, + "loss": 0.9621, + "step": 2570 + }, + { + "epoch": 2.0817813765182187, + "grad_norm": 1.7098596418635459, + "learning_rate": 4.385921672570054e-06, + "loss": 0.9662, + "step": 2571 + }, + { + "epoch": 2.0825910931174088, + "grad_norm": 1.607788160578827, + "learning_rate": 4.378810775198203e-06, + "loss": 1.0082, + "step": 2572 + }, + { + "epoch": 2.0834008097165992, + "grad_norm": 1.653291357704816, + "learning_rate": 4.371704030742491e-06, + "loss": 1.0356, + "step": 2573 + }, + { + "epoch": 2.0842105263157893, + "grad_norm": 1.6881096442374148, + "learning_rate": 4.36460144445335e-06, + "loss": 1.0342, + "step": 2574 + }, + { + "epoch": 2.08502024291498, + "grad_norm": 1.7125542227995731, + "learning_rate": 4.357503021578158e-06, + "loss": 0.9266, + "step": 2575 + }, + { + "epoch": 2.08582995951417, + "grad_norm": 1.670269245177905, + "learning_rate": 4.3504087673612e-06, + "loss": 1.001, + "step": 2576 + }, + { + "epoch": 2.0866396761133603, + "grad_norm": 1.5784965452737714, + "learning_rate": 4.343318687043691e-06, + "loss": 1.0012, + "step": 2577 + }, + { + "epoch": 2.087449392712551, + "grad_norm": 1.6681995748372629, + "learning_rate": 4.336232785863756e-06, + "loss": 1.0095, + "step": 2578 + }, + { + "epoch": 2.088259109311741, + "grad_norm": 1.748694757073851, + "learning_rate": 4.329151069056432e-06, + "loss": 0.9591, + "step": 2579 + }, + { + "epoch": 2.0890688259109313, + "grad_norm": 1.7719387348638524, + "learning_rate": 4.322073541853677e-06, + "loss": 0.9696, + "step": 2580 + }, + { + "epoch": 2.0898785425101214, + "grad_norm": 1.64114377111076, + "learning_rate": 4.3150002094843415e-06, + "loss": 0.9863, + "step": 2581 + }, + { + "epoch": 2.090688259109312, + "grad_norm": 1.64738596574169, + "learning_rate": 4.307931077174175e-06, + "loss": 1.0393, + "step": 2582 + }, + { + "epoch": 2.091497975708502, + "grad_norm": 1.691860211239468, + "learning_rate": 4.300866150145837e-06, + "loss": 0.9525, + "step": 2583 + }, + { + "epoch": 2.0923076923076924, + "grad_norm": 1.576591959382383, + "learning_rate": 4.293805433618869e-06, + "loss": 1.0705, + "step": 2584 + }, + { + "epoch": 2.0931174089068825, + "grad_norm": 1.5738372098947238, + "learning_rate": 4.286748932809707e-06, + "loss": 1.0264, + "step": 2585 + }, + { + "epoch": 2.093927125506073, + "grad_norm": 1.6490322231201804, + "learning_rate": 4.279696652931663e-06, + "loss": 0.9917, + "step": 2586 + }, + { + "epoch": 2.094736842105263, + "grad_norm": 1.8188564774664804, + "learning_rate": 4.272648599194948e-06, + "loss": 0.9347, + "step": 2587 + }, + { + "epoch": 2.0955465587044535, + "grad_norm": 1.8581675482172395, + "learning_rate": 4.265604776806638e-06, + "loss": 0.9164, + "step": 2588 + }, + { + "epoch": 2.0963562753036435, + "grad_norm": 1.8577584612627356, + "learning_rate": 4.258565190970684e-06, + "loss": 1.0061, + "step": 2589 + }, + { + "epoch": 2.097165991902834, + "grad_norm": 1.743492463005408, + "learning_rate": 4.2515298468879064e-06, + "loss": 1.0067, + "step": 2590 + }, + { + "epoch": 2.097975708502024, + "grad_norm": 1.646875858354436, + "learning_rate": 4.244498749756e-06, + "loss": 1.0357, + "step": 2591 + }, + { + "epoch": 2.0987854251012146, + "grad_norm": 1.7035172799966736, + "learning_rate": 4.237471904769514e-06, + "loss": 1.0574, + "step": 2592 + }, + { + "epoch": 2.099595141700405, + "grad_norm": 1.729516687109549, + "learning_rate": 4.2304493171198605e-06, + "loss": 1.0129, + "step": 2593 + }, + { + "epoch": 2.100404858299595, + "grad_norm": 1.8017575479884655, + "learning_rate": 4.223430991995296e-06, + "loss": 1.0019, + "step": 2594 + }, + { + "epoch": 2.1012145748987856, + "grad_norm": 1.6594942547323506, + "learning_rate": 4.216416934580947e-06, + "loss": 0.9952, + "step": 2595 + }, + { + "epoch": 2.1020242914979756, + "grad_norm": 1.7239899018632698, + "learning_rate": 4.2094071500587695e-06, + "loss": 0.9851, + "step": 2596 + }, + { + "epoch": 2.102834008097166, + "grad_norm": 1.703904052950475, + "learning_rate": 4.202401643607572e-06, + "loss": 1.0284, + "step": 2597 + }, + { + "epoch": 2.103643724696356, + "grad_norm": 1.8593640278653734, + "learning_rate": 4.1954004204029945e-06, + "loss": 0.9578, + "step": 2598 + }, + { + "epoch": 2.1044534412955467, + "grad_norm": 1.856774903804991, + "learning_rate": 4.188403485617526e-06, + "loss": 1.0438, + "step": 2599 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 1.78449027364225, + "learning_rate": 4.181410844420473e-06, + "loss": 0.9355, + "step": 2600 + }, + { + "epoch": 2.106072874493927, + "grad_norm": 1.6445451971076204, + "learning_rate": 4.174422501977976e-06, + "loss": 1.0028, + "step": 2601 + }, + { + "epoch": 2.1068825910931173, + "grad_norm": 1.6462257848218957, + "learning_rate": 4.167438463453003e-06, + "loss": 0.9803, + "step": 2602 + }, + { + "epoch": 2.1076923076923078, + "grad_norm": 1.6907368674533254, + "learning_rate": 4.160458734005337e-06, + "loss": 1.0294, + "step": 2603 + }, + { + "epoch": 2.108502024291498, + "grad_norm": 1.6636161911260379, + "learning_rate": 4.153483318791579e-06, + "loss": 0.9813, + "step": 2604 + }, + { + "epoch": 2.1093117408906883, + "grad_norm": 1.7183152187153377, + "learning_rate": 4.146512222965144e-06, + "loss": 0.9823, + "step": 2605 + }, + { + "epoch": 2.1101214574898783, + "grad_norm": 1.6814396856451228, + "learning_rate": 4.139545451676248e-06, + "loss": 1.032, + "step": 2606 + }, + { + "epoch": 2.110931174089069, + "grad_norm": 1.7645866557280991, + "learning_rate": 4.1325830100719275e-06, + "loss": 1.0356, + "step": 2607 + }, + { + "epoch": 2.1117408906882593, + "grad_norm": 1.6961679753232464, + "learning_rate": 4.125624903296009e-06, + "loss": 1.0811, + "step": 2608 + }, + { + "epoch": 2.1125506072874494, + "grad_norm": 1.6525910699956303, + "learning_rate": 4.118671136489115e-06, + "loss": 0.9451, + "step": 2609 + }, + { + "epoch": 2.11336032388664, + "grad_norm": 1.6360702685596986, + "learning_rate": 4.111721714788671e-06, + "loss": 0.9375, + "step": 2610 + }, + { + "epoch": 2.11417004048583, + "grad_norm": 1.6179640067775025, + "learning_rate": 4.104776643328886e-06, + "loss": 1.0224, + "step": 2611 + }, + { + "epoch": 2.1149797570850204, + "grad_norm": 1.6525238056683458, + "learning_rate": 4.097835927240753e-06, + "loss": 1.0351, + "step": 2612 + }, + { + "epoch": 2.1157894736842104, + "grad_norm": 1.585412724307839, + "learning_rate": 4.090899571652053e-06, + "loss": 0.9931, + "step": 2613 + }, + { + "epoch": 2.116599190283401, + "grad_norm": 1.646625538543497, + "learning_rate": 4.083967581687338e-06, + "loss": 0.9385, + "step": 2614 + }, + { + "epoch": 2.117408906882591, + "grad_norm": 1.6984647506858865, + "learning_rate": 4.077039962467946e-06, + "loss": 0.9629, + "step": 2615 + }, + { + "epoch": 2.1182186234817815, + "grad_norm": 1.6719455263043292, + "learning_rate": 4.070116719111973e-06, + "loss": 1.0239, + "step": 2616 + }, + { + "epoch": 2.1190283400809715, + "grad_norm": 1.7112777230654765, + "learning_rate": 4.063197856734295e-06, + "loss": 0.979, + "step": 2617 + }, + { + "epoch": 2.119838056680162, + "grad_norm": 1.6987541653351368, + "learning_rate": 4.056283380446542e-06, + "loss": 1.0153, + "step": 2618 + }, + { + "epoch": 2.120647773279352, + "grad_norm": 1.6871431309146279, + "learning_rate": 4.049373295357105e-06, + "loss": 1.0376, + "step": 2619 + }, + { + "epoch": 2.1214574898785425, + "grad_norm": 1.6903927451402816, + "learning_rate": 4.042467606571134e-06, + "loss": 0.9581, + "step": 2620 + }, + { + "epoch": 2.1222672064777326, + "grad_norm": 1.6637711833229312, + "learning_rate": 4.0355663191905285e-06, + "loss": 1.0201, + "step": 2621 + }, + { + "epoch": 2.123076923076923, + "grad_norm": 1.7111155440518553, + "learning_rate": 4.028669438313933e-06, + "loss": 0.9114, + "step": 2622 + }, + { + "epoch": 2.1238866396761136, + "grad_norm": 1.7277429546609078, + "learning_rate": 4.0217769690367426e-06, + "loss": 0.9072, + "step": 2623 + }, + { + "epoch": 2.1246963562753036, + "grad_norm": 1.6772060985996178, + "learning_rate": 4.014888916451097e-06, + "loss": 0.9913, + "step": 2624 + }, + { + "epoch": 2.125506072874494, + "grad_norm": 1.6078641618211225, + "learning_rate": 4.008005285645863e-06, + "loss": 1.069, + "step": 2625 + }, + { + "epoch": 2.126315789473684, + "grad_norm": 1.620520443252991, + "learning_rate": 4.001126081706643e-06, + "loss": 0.9733, + "step": 2626 + }, + { + "epoch": 2.1271255060728746, + "grad_norm": 1.6078988933775202, + "learning_rate": 3.994251309715772e-06, + "loss": 1.0605, + "step": 2627 + }, + { + "epoch": 2.1279352226720647, + "grad_norm": 1.6029687010632365, + "learning_rate": 3.9873809747523075e-06, + "loss": 0.9842, + "step": 2628 + }, + { + "epoch": 2.128744939271255, + "grad_norm": 1.705951335796002, + "learning_rate": 3.98051508189203e-06, + "loss": 1.0009, + "step": 2629 + }, + { + "epoch": 2.1295546558704452, + "grad_norm": 1.6635774008101378, + "learning_rate": 3.973653636207437e-06, + "loss": 0.8982, + "step": 2630 + }, + { + "epoch": 2.1303643724696357, + "grad_norm": 1.7082989524349448, + "learning_rate": 3.966796642767745e-06, + "loss": 1.0141, + "step": 2631 + }, + { + "epoch": 2.1311740890688258, + "grad_norm": 1.6653197593597078, + "learning_rate": 3.959944106638881e-06, + "loss": 0.9809, + "step": 2632 + }, + { + "epoch": 2.1319838056680163, + "grad_norm": 1.6678594956056483, + "learning_rate": 3.953096032883473e-06, + "loss": 0.98, + "step": 2633 + }, + { + "epoch": 2.1327935222672063, + "grad_norm": 1.7005284089411195, + "learning_rate": 3.946252426560855e-06, + "loss": 0.9979, + "step": 2634 + }, + { + "epoch": 2.133603238866397, + "grad_norm": 1.6151071121961287, + "learning_rate": 3.939413292727061e-06, + "loss": 0.9968, + "step": 2635 + }, + { + "epoch": 2.134412955465587, + "grad_norm": 1.6806724677037892, + "learning_rate": 3.932578636434822e-06, + "loss": 0.9318, + "step": 2636 + }, + { + "epoch": 2.1352226720647773, + "grad_norm": 1.7311866714728976, + "learning_rate": 3.9257484627335545e-06, + "loss": 0.965, + "step": 2637 + }, + { + "epoch": 2.136032388663968, + "grad_norm": 1.7711121211961232, + "learning_rate": 3.9189227766693715e-06, + "loss": 0.9824, + "step": 2638 + }, + { + "epoch": 2.136842105263158, + "grad_norm": 1.684667995414289, + "learning_rate": 3.912101583285072e-06, + "loss": 0.915, + "step": 2639 + }, + { + "epoch": 2.1376518218623484, + "grad_norm": 1.7191802163538186, + "learning_rate": 3.9052848876201285e-06, + "loss": 0.9592, + "step": 2640 + }, + { + "epoch": 2.1384615384615384, + "grad_norm": 1.6407834559216017, + "learning_rate": 3.898472694710692e-06, + "loss": 0.9821, + "step": 2641 + }, + { + "epoch": 2.139271255060729, + "grad_norm": 1.6128402655438978, + "learning_rate": 3.891665009589588e-06, + "loss": 0.9976, + "step": 2642 + }, + { + "epoch": 2.140080971659919, + "grad_norm": 1.7113632799708938, + "learning_rate": 3.884861837286314e-06, + "loss": 0.9884, + "step": 2643 + }, + { + "epoch": 2.1408906882591094, + "grad_norm": 1.6447541227519287, + "learning_rate": 3.878063182827025e-06, + "loss": 0.9051, + "step": 2644 + }, + { + "epoch": 2.1417004048582995, + "grad_norm": 1.700018398274763, + "learning_rate": 3.8712690512345555e-06, + "loss": 1.0079, + "step": 2645 + }, + { + "epoch": 2.14251012145749, + "grad_norm": 1.7278282489587906, + "learning_rate": 3.8644794475283754e-06, + "loss": 0.9488, + "step": 2646 + }, + { + "epoch": 2.14331983805668, + "grad_norm": 1.7030788584383703, + "learning_rate": 3.857694376724634e-06, + "loss": 0.9766, + "step": 2647 + }, + { + "epoch": 2.1441295546558705, + "grad_norm": 1.6975892598366673, + "learning_rate": 3.850913843836111e-06, + "loss": 0.9645, + "step": 2648 + }, + { + "epoch": 2.1449392712550606, + "grad_norm": 1.6362040151463373, + "learning_rate": 3.844137853872245e-06, + "loss": 0.9809, + "step": 2649 + }, + { + "epoch": 2.145748987854251, + "grad_norm": 1.6598578036651748, + "learning_rate": 3.837366411839114e-06, + "loss": 0.9822, + "step": 2650 + }, + { + "epoch": 2.146558704453441, + "grad_norm": 1.6357704564069009, + "learning_rate": 3.830599522739437e-06, + "loss": 1.0175, + "step": 2651 + }, + { + "epoch": 2.1473684210526316, + "grad_norm": 1.6232730145470107, + "learning_rate": 3.823837191572567e-06, + "loss": 0.9417, + "step": 2652 + }, + { + "epoch": 2.148178137651822, + "grad_norm": 1.6210092712146753, + "learning_rate": 3.817079423334497e-06, + "loss": 0.9656, + "step": 2653 + }, + { + "epoch": 2.148987854251012, + "grad_norm": 1.688514119577297, + "learning_rate": 3.8103262230178395e-06, + "loss": 1.0027, + "step": 2654 + }, + { + "epoch": 2.1497975708502026, + "grad_norm": 1.6972006744474526, + "learning_rate": 3.8035775956118416e-06, + "loss": 1.0177, + "step": 2655 + }, + { + "epoch": 2.1506072874493927, + "grad_norm": 1.6862311012034847, + "learning_rate": 3.7968335461023654e-06, + "loss": 0.9403, + "step": 2656 + }, + { + "epoch": 2.151417004048583, + "grad_norm": 1.6576918628517623, + "learning_rate": 3.790094079471891e-06, + "loss": 0.9654, + "step": 2657 + }, + { + "epoch": 2.152226720647773, + "grad_norm": 1.705286165779682, + "learning_rate": 3.7833592006995144e-06, + "loss": 1.0502, + "step": 2658 + }, + { + "epoch": 2.1530364372469637, + "grad_norm": 1.6153316475940291, + "learning_rate": 3.77662891476094e-06, + "loss": 0.9806, + "step": 2659 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 1.6880698498431286, + "learning_rate": 3.7699032266284863e-06, + "loss": 1.0253, + "step": 2660 + }, + { + "epoch": 2.1546558704453442, + "grad_norm": 1.7777836789038124, + "learning_rate": 3.7631821412710668e-06, + "loss": 0.9982, + "step": 2661 + }, + { + "epoch": 2.1554655870445343, + "grad_norm": 1.6474905913845634, + "learning_rate": 3.7564656636541928e-06, + "loss": 0.9736, + "step": 2662 + }, + { + "epoch": 2.1562753036437248, + "grad_norm": 1.64817525670547, + "learning_rate": 3.7497537987399836e-06, + "loss": 0.9996, + "step": 2663 + }, + { + "epoch": 2.157085020242915, + "grad_norm": 1.6328494859671836, + "learning_rate": 3.7430465514871405e-06, + "loss": 0.9529, + "step": 2664 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 1.5460907995988389, + "learning_rate": 3.736343926850954e-06, + "loss": 0.9776, + "step": 2665 + }, + { + "epoch": 2.1587044534412954, + "grad_norm": 1.7999028275226707, + "learning_rate": 3.729645929783302e-06, + "loss": 0.9444, + "step": 2666 + }, + { + "epoch": 2.159514170040486, + "grad_norm": 1.6674054885247187, + "learning_rate": 3.7229525652326392e-06, + "loss": 0.9698, + "step": 2667 + }, + { + "epoch": 2.1603238866396763, + "grad_norm": 1.7459838166181108, + "learning_rate": 3.7162638381440077e-06, + "loss": 0.9791, + "step": 2668 + }, + { + "epoch": 2.1611336032388664, + "grad_norm": 1.7171429916272887, + "learning_rate": 3.709579753459015e-06, + "loss": 1.0177, + "step": 2669 + }, + { + "epoch": 2.161943319838057, + "grad_norm": 1.729852115207468, + "learning_rate": 3.702900316115836e-06, + "loss": 1.0042, + "step": 2670 + }, + { + "epoch": 2.162753036437247, + "grad_norm": 1.7165892661582856, + "learning_rate": 3.6962255310492256e-06, + "loss": 0.9719, + "step": 2671 + }, + { + "epoch": 2.1635627530364374, + "grad_norm": 1.628850570144987, + "learning_rate": 3.689555403190488e-06, + "loss": 1.0131, + "step": 2672 + }, + { + "epoch": 2.1643724696356275, + "grad_norm": 1.6296844076079475, + "learning_rate": 3.6828899374674933e-06, + "loss": 0.9733, + "step": 2673 + }, + { + "epoch": 2.165182186234818, + "grad_norm": 1.623822465699174, + "learning_rate": 3.67622913880466e-06, + "loss": 1.0049, + "step": 2674 + }, + { + "epoch": 2.165991902834008, + "grad_norm": 1.6705667827629207, + "learning_rate": 3.6695730121229734e-06, + "loss": 1.0427, + "step": 2675 + }, + { + "epoch": 2.1668016194331985, + "grad_norm": 1.6901112337582092, + "learning_rate": 3.6629215623399526e-06, + "loss": 0.9016, + "step": 2676 + }, + { + "epoch": 2.1676113360323885, + "grad_norm": 1.635822547002648, + "learning_rate": 3.6562747943696696e-06, + "loss": 1.0312, + "step": 2677 + }, + { + "epoch": 2.168421052631579, + "grad_norm": 1.773469367884702, + "learning_rate": 3.6496327131227284e-06, + "loss": 0.9799, + "step": 2678 + }, + { + "epoch": 2.169230769230769, + "grad_norm": 1.703119798535184, + "learning_rate": 3.6429953235062853e-06, + "loss": 0.9642, + "step": 2679 + }, + { + "epoch": 2.1700404858299596, + "grad_norm": 1.5950037755310869, + "learning_rate": 3.6363626304240185e-06, + "loss": 0.9954, + "step": 2680 + }, + { + "epoch": 2.1708502024291496, + "grad_norm": 1.6651131112700766, + "learning_rate": 3.629734638776139e-06, + "loss": 0.9598, + "step": 2681 + }, + { + "epoch": 2.17165991902834, + "grad_norm": 1.674961128025693, + "learning_rate": 3.6231113534593833e-06, + "loss": 0.9485, + "step": 2682 + }, + { + "epoch": 2.1724696356275306, + "grad_norm": 1.6927204922354684, + "learning_rate": 3.616492779367018e-06, + "loss": 1.0114, + "step": 2683 + }, + { + "epoch": 2.1732793522267206, + "grad_norm": 1.6141482144444916, + "learning_rate": 3.609878921388822e-06, + "loss": 1.0204, + "step": 2684 + }, + { + "epoch": 2.174089068825911, + "grad_norm": 1.5938306093069654, + "learning_rate": 3.6032697844110896e-06, + "loss": 1.0281, + "step": 2685 + }, + { + "epoch": 2.174898785425101, + "grad_norm": 1.674185209940073, + "learning_rate": 3.596665373316629e-06, + "loss": 1.0156, + "step": 2686 + }, + { + "epoch": 2.1757085020242917, + "grad_norm": 1.652753408219103, + "learning_rate": 3.590065692984762e-06, + "loss": 0.9686, + "step": 2687 + }, + { + "epoch": 2.1765182186234817, + "grad_norm": 1.6554790863907958, + "learning_rate": 3.583470748291309e-06, + "loss": 0.961, + "step": 2688 + }, + { + "epoch": 2.177327935222672, + "grad_norm": 1.7595295916415807, + "learning_rate": 3.5768805441085885e-06, + "loss": 1.0052, + "step": 2689 + }, + { + "epoch": 2.1781376518218623, + "grad_norm": 1.7129389096310346, + "learning_rate": 3.5702950853054284e-06, + "loss": 1.0033, + "step": 2690 + }, + { + "epoch": 2.1789473684210527, + "grad_norm": 1.6958273168912117, + "learning_rate": 3.5637143767471427e-06, + "loss": 1.0284, + "step": 2691 + }, + { + "epoch": 2.179757085020243, + "grad_norm": 1.6851614860716266, + "learning_rate": 3.5571384232955365e-06, + "loss": 0.971, + "step": 2692 + }, + { + "epoch": 2.1805668016194333, + "grad_norm": 1.6326875705295927, + "learning_rate": 3.550567229808901e-06, + "loss": 1.002, + "step": 2693 + }, + { + "epoch": 2.1813765182186233, + "grad_norm": 1.6508788198860427, + "learning_rate": 3.5440008011420103e-06, + "loss": 1.0164, + "step": 2694 + }, + { + "epoch": 2.182186234817814, + "grad_norm": 1.6307505301931884, + "learning_rate": 3.5374391421461273e-06, + "loss": 0.9336, + "step": 2695 + }, + { + "epoch": 2.182995951417004, + "grad_norm": 1.6240785364108545, + "learning_rate": 3.5308822576689805e-06, + "loss": 1.0049, + "step": 2696 + }, + { + "epoch": 2.1838056680161944, + "grad_norm": 1.6543399441514246, + "learning_rate": 3.5243301525547714e-06, + "loss": 1.0455, + "step": 2697 + }, + { + "epoch": 2.184615384615385, + "grad_norm": 1.650315122489001, + "learning_rate": 3.5177828316441797e-06, + "loss": 0.9574, + "step": 2698 + }, + { + "epoch": 2.185425101214575, + "grad_norm": 1.6675806777098334, + "learning_rate": 3.511240299774341e-06, + "loss": 1.005, + "step": 2699 + }, + { + "epoch": 2.1862348178137654, + "grad_norm": 1.6634779623745304, + "learning_rate": 3.5047025617788578e-06, + "loss": 1.0074, + "step": 2700 + }, + { + "epoch": 2.1870445344129554, + "grad_norm": 1.6256164088565501, + "learning_rate": 3.4981696224877893e-06, + "loss": 0.9875, + "step": 2701 + }, + { + "epoch": 2.187854251012146, + "grad_norm": 1.7291877052522024, + "learning_rate": 3.491641486727645e-06, + "loss": 0.9844, + "step": 2702 + }, + { + "epoch": 2.188663967611336, + "grad_norm": 1.7129557045779524, + "learning_rate": 3.4851181593213967e-06, + "loss": 0.9625, + "step": 2703 + }, + { + "epoch": 2.1894736842105265, + "grad_norm": 1.7620888900482394, + "learning_rate": 3.478599645088453e-06, + "loss": 0.9872, + "step": 2704 + }, + { + "epoch": 2.1902834008097165, + "grad_norm": 1.6649502085884391, + "learning_rate": 3.4720859488446744e-06, + "loss": 0.9264, + "step": 2705 + }, + { + "epoch": 2.191093117408907, + "grad_norm": 1.6084607566937745, + "learning_rate": 3.4655770754023574e-06, + "loss": 0.9531, + "step": 2706 + }, + { + "epoch": 2.191902834008097, + "grad_norm": 1.6974794965679043, + "learning_rate": 3.4590730295702356e-06, + "loss": 0.9283, + "step": 2707 + }, + { + "epoch": 2.1927125506072875, + "grad_norm": 1.687339468024081, + "learning_rate": 3.452573816153476e-06, + "loss": 0.9698, + "step": 2708 + }, + { + "epoch": 2.1935222672064776, + "grad_norm": 1.7543324832553966, + "learning_rate": 3.446079439953677e-06, + "loss": 0.9411, + "step": 2709 + }, + { + "epoch": 2.194331983805668, + "grad_norm": 1.711648121979403, + "learning_rate": 3.4395899057688575e-06, + "loss": 0.9817, + "step": 2710 + }, + { + "epoch": 2.195141700404858, + "grad_norm": 1.664633436704711, + "learning_rate": 3.4331052183934687e-06, + "loss": 0.9834, + "step": 2711 + }, + { + "epoch": 2.1959514170040486, + "grad_norm": 1.655215067217286, + "learning_rate": 3.4266253826183805e-06, + "loss": 0.9665, + "step": 2712 + }, + { + "epoch": 2.196761133603239, + "grad_norm": 1.7006734272766477, + "learning_rate": 3.4201504032308695e-06, + "loss": 0.9777, + "step": 2713 + }, + { + "epoch": 2.197570850202429, + "grad_norm": 1.714785930728774, + "learning_rate": 3.41368028501463e-06, + "loss": 0.9656, + "step": 2714 + }, + { + "epoch": 2.1983805668016196, + "grad_norm": 1.6909113867716612, + "learning_rate": 3.407215032749763e-06, + "loss": 1.0659, + "step": 2715 + }, + { + "epoch": 2.1991902834008097, + "grad_norm": 1.68682986907009, + "learning_rate": 3.4007546512127764e-06, + "loss": 0.9103, + "step": 2716 + }, + { + "epoch": 2.2, + "grad_norm": 1.6137573322333, + "learning_rate": 3.3942991451765793e-06, + "loss": 0.9957, + "step": 2717 + }, + { + "epoch": 2.2008097165991902, + "grad_norm": 1.6129056026950326, + "learning_rate": 3.387848519410475e-06, + "loss": 0.9935, + "step": 2718 + }, + { + "epoch": 2.2016194331983807, + "grad_norm": 1.694139432275297, + "learning_rate": 3.3814027786801675e-06, + "loss": 0.9418, + "step": 2719 + }, + { + "epoch": 2.2024291497975708, + "grad_norm": 1.7634838421878205, + "learning_rate": 3.374961927747751e-06, + "loss": 0.9193, + "step": 2720 + }, + { + "epoch": 2.2032388663967613, + "grad_norm": 1.6980171565314421, + "learning_rate": 3.3685259713717034e-06, + "loss": 0.9727, + "step": 2721 + }, + { + "epoch": 2.2040485829959513, + "grad_norm": 1.722930414761989, + "learning_rate": 3.362094914306888e-06, + "loss": 0.93, + "step": 2722 + }, + { + "epoch": 2.204858299595142, + "grad_norm": 1.6124571834590236, + "learning_rate": 3.355668761304548e-06, + "loss": 1.1032, + "step": 2723 + }, + { + "epoch": 2.205668016194332, + "grad_norm": 1.6394333882696688, + "learning_rate": 3.349247517112305e-06, + "loss": 1.0452, + "step": 2724 + }, + { + "epoch": 2.2064777327935223, + "grad_norm": 1.6345795057358297, + "learning_rate": 3.342831186474149e-06, + "loss": 0.9504, + "step": 2725 + }, + { + "epoch": 2.2072874493927124, + "grad_norm": 1.6475621511846046, + "learning_rate": 3.336419774130447e-06, + "loss": 1.0154, + "step": 2726 + }, + { + "epoch": 2.208097165991903, + "grad_norm": 1.6238349459249526, + "learning_rate": 3.3300132848179346e-06, + "loss": 0.9662, + "step": 2727 + }, + { + "epoch": 2.208906882591093, + "grad_norm": 1.6615756750809165, + "learning_rate": 3.3236117232696984e-06, + "loss": 1.0466, + "step": 2728 + }, + { + "epoch": 2.2097165991902834, + "grad_norm": 1.7120719200934522, + "learning_rate": 3.3172150942151947e-06, + "loss": 0.9345, + "step": 2729 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 1.6804653865746528, + "learning_rate": 3.31082340238023e-06, + "loss": 0.9928, + "step": 2730 + }, + { + "epoch": 2.211336032388664, + "grad_norm": 1.7147429622035564, + "learning_rate": 3.3044366524869652e-06, + "loss": 0.9555, + "step": 2731 + }, + { + "epoch": 2.2121457489878544, + "grad_norm": 1.664455141304305, + "learning_rate": 3.2980548492539064e-06, + "loss": 1.0162, + "step": 2732 + }, + { + "epoch": 2.2129554655870445, + "grad_norm": 1.7280945010756663, + "learning_rate": 3.291677997395918e-06, + "loss": 0.9628, + "step": 2733 + }, + { + "epoch": 2.213765182186235, + "grad_norm": 1.673275550114484, + "learning_rate": 3.2853061016241884e-06, + "loss": 1.0242, + "step": 2734 + }, + { + "epoch": 2.214574898785425, + "grad_norm": 1.7272385557412295, + "learning_rate": 3.27893916664626e-06, + "loss": 0.9813, + "step": 2735 + }, + { + "epoch": 2.2153846153846155, + "grad_norm": 1.6681828389749171, + "learning_rate": 3.2725771971660002e-06, + "loss": 0.9479, + "step": 2736 + }, + { + "epoch": 2.2161943319838056, + "grad_norm": 1.6481760708593136, + "learning_rate": 3.266220197883613e-06, + "loss": 0.9863, + "step": 2737 + }, + { + "epoch": 2.217004048582996, + "grad_norm": 1.6193831329668813, + "learning_rate": 3.259868173495626e-06, + "loss": 0.972, + "step": 2738 + }, + { + "epoch": 2.217813765182186, + "grad_norm": 1.6439379984114413, + "learning_rate": 3.2535211286948955e-06, + "loss": 0.9832, + "step": 2739 + }, + { + "epoch": 2.2186234817813766, + "grad_norm": 1.6572224667024693, + "learning_rate": 3.2471790681705928e-06, + "loss": 1.036, + "step": 2740 + }, + { + "epoch": 2.2194331983805666, + "grad_norm": 1.6609689760873279, + "learning_rate": 3.2408419966082195e-06, + "loss": 1.0051, + "step": 2741 + }, + { + "epoch": 2.220242914979757, + "grad_norm": 1.6875131566811306, + "learning_rate": 3.2345099186895758e-06, + "loss": 0.9738, + "step": 2742 + }, + { + "epoch": 2.221052631578947, + "grad_norm": 1.7178275452285305, + "learning_rate": 3.2281828390927873e-06, + "loss": 1.0007, + "step": 2743 + }, + { + "epoch": 2.2218623481781377, + "grad_norm": 1.7172578500910411, + "learning_rate": 3.221860762492275e-06, + "loss": 1.0334, + "step": 2744 + }, + { + "epoch": 2.2226720647773277, + "grad_norm": 1.6450445517301915, + "learning_rate": 3.215543693558769e-06, + "loss": 1.0215, + "step": 2745 + }, + { + "epoch": 2.223481781376518, + "grad_norm": 1.6443211869924201, + "learning_rate": 3.2092316369593e-06, + "loss": 0.9702, + "step": 2746 + }, + { + "epoch": 2.2242914979757087, + "grad_norm": 1.6072557369507485, + "learning_rate": 3.20292459735719e-06, + "loss": 0.9568, + "step": 2747 + }, + { + "epoch": 2.2251012145748987, + "grad_norm": 1.5890263149144106, + "learning_rate": 3.1966225794120666e-06, + "loss": 0.9662, + "step": 2748 + }, + { + "epoch": 2.2259109311740892, + "grad_norm": 1.6626050407972168, + "learning_rate": 3.1903255877798365e-06, + "loss": 0.9184, + "step": 2749 + }, + { + "epoch": 2.2267206477732793, + "grad_norm": 1.6429805630498902, + "learning_rate": 3.1840336271126935e-06, + "loss": 0.9815, + "step": 2750 + }, + { + "epoch": 2.2275303643724698, + "grad_norm": 1.7143845132000823, + "learning_rate": 3.1777467020591236e-06, + "loss": 1.0032, + "step": 2751 + }, + { + "epoch": 2.22834008097166, + "grad_norm": 1.7160771696928598, + "learning_rate": 3.1714648172638827e-06, + "loss": 0.9347, + "step": 2752 + }, + { + "epoch": 2.2291497975708503, + "grad_norm": 1.737109438949758, + "learning_rate": 3.165187977368007e-06, + "loss": 0.9485, + "step": 2753 + }, + { + "epoch": 2.2299595141700403, + "grad_norm": 1.6534753583589024, + "learning_rate": 3.158916187008806e-06, + "loss": 0.9403, + "step": 2754 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 1.6881735304925691, + "learning_rate": 3.152649450819852e-06, + "loss": 0.989, + "step": 2755 + }, + { + "epoch": 2.231578947368421, + "grad_norm": 1.683248270203457, + "learning_rate": 3.146387773431e-06, + "loss": 1.0457, + "step": 2756 + }, + { + "epoch": 2.2323886639676114, + "grad_norm": 1.716360447509341, + "learning_rate": 3.1401311594683494e-06, + "loss": 1.011, + "step": 2757 + }, + { + "epoch": 2.2331983805668014, + "grad_norm": 1.726117425194692, + "learning_rate": 3.1338796135542647e-06, + "loss": 0.9894, + "step": 2758 + }, + { + "epoch": 2.234008097165992, + "grad_norm": 1.722136573207579, + "learning_rate": 3.1276331403073733e-06, + "loss": 0.9064, + "step": 2759 + }, + { + "epoch": 2.234817813765182, + "grad_norm": 1.6672196996268878, + "learning_rate": 3.1213917443425456e-06, + "loss": 0.9917, + "step": 2760 + }, + { + "epoch": 2.2356275303643725, + "grad_norm": 1.7657193112711613, + "learning_rate": 3.1151554302709063e-06, + "loss": 1.0076, + "step": 2761 + }, + { + "epoch": 2.236437246963563, + "grad_norm": 1.6902194564599813, + "learning_rate": 3.108924202699819e-06, + "loss": 1.0375, + "step": 2762 + }, + { + "epoch": 2.237246963562753, + "grad_norm": 1.7990661956680112, + "learning_rate": 3.1026980662328997e-06, + "loss": 0.9798, + "step": 2763 + }, + { + "epoch": 2.2380566801619435, + "grad_norm": 1.6588109292539122, + "learning_rate": 3.096477025469996e-06, + "loss": 0.9786, + "step": 2764 + }, + { + "epoch": 2.2388663967611335, + "grad_norm": 1.7473866019174604, + "learning_rate": 3.0902610850071922e-06, + "loss": 1.001, + "step": 2765 + }, + { + "epoch": 2.239676113360324, + "grad_norm": 1.5963776881440452, + "learning_rate": 3.084050249436802e-06, + "loss": 0.9453, + "step": 2766 + }, + { + "epoch": 2.240485829959514, + "grad_norm": 1.6511854722362378, + "learning_rate": 3.077844523347374e-06, + "loss": 0.9677, + "step": 2767 + }, + { + "epoch": 2.2412955465587046, + "grad_norm": 1.8031304104053283, + "learning_rate": 3.0716439113236785e-06, + "loss": 0.9372, + "step": 2768 + }, + { + "epoch": 2.2421052631578946, + "grad_norm": 1.6716997949617207, + "learning_rate": 3.0654484179467047e-06, + "loss": 0.9719, + "step": 2769 + }, + { + "epoch": 2.242914979757085, + "grad_norm": 1.623759924284454, + "learning_rate": 3.0592580477936606e-06, + "loss": 0.9986, + "step": 2770 + }, + { + "epoch": 2.243724696356275, + "grad_norm": 1.750132587533433, + "learning_rate": 3.0530728054379787e-06, + "loss": 0.9968, + "step": 2771 + }, + { + "epoch": 2.2445344129554656, + "grad_norm": 1.6683052360621997, + "learning_rate": 3.0468926954492907e-06, + "loss": 0.9192, + "step": 2772 + }, + { + "epoch": 2.2453441295546557, + "grad_norm": 1.7517498796441024, + "learning_rate": 3.0407177223934426e-06, + "loss": 0.9623, + "step": 2773 + }, + { + "epoch": 2.246153846153846, + "grad_norm": 1.7300413517444366, + "learning_rate": 3.034547890832481e-06, + "loss": 1.0161, + "step": 2774 + }, + { + "epoch": 2.246963562753036, + "grad_norm": 1.6389215305609568, + "learning_rate": 3.0283832053246644e-06, + "loss": 0.9079, + "step": 2775 + }, + { + "epoch": 2.2477732793522267, + "grad_norm": 1.6760927536723689, + "learning_rate": 3.022223670424437e-06, + "loss": 0.9628, + "step": 2776 + }, + { + "epoch": 2.248582995951417, + "grad_norm": 1.6857597576034187, + "learning_rate": 3.016069290682441e-06, + "loss": 0.9542, + "step": 2777 + }, + { + "epoch": 2.2493927125506072, + "grad_norm": 1.6715075659799083, + "learning_rate": 3.009920070645518e-06, + "loss": 1.048, + "step": 2778 + }, + { + "epoch": 2.2502024291497977, + "grad_norm": 1.6998697843304018, + "learning_rate": 3.0037760148566874e-06, + "loss": 0.9779, + "step": 2779 + }, + { + "epoch": 2.251012145748988, + "grad_norm": 1.7822427598135133, + "learning_rate": 2.99763712785516e-06, + "loss": 0.8956, + "step": 2780 + }, + { + "epoch": 2.2518218623481783, + "grad_norm": 1.6755771799162749, + "learning_rate": 2.9915034141763234e-06, + "loss": 0.9552, + "step": 2781 + }, + { + "epoch": 2.2526315789473683, + "grad_norm": 1.7432840821094, + "learning_rate": 2.9853748783517435e-06, + "loss": 1.0165, + "step": 2782 + }, + { + "epoch": 2.253441295546559, + "grad_norm": 1.7142235267700365, + "learning_rate": 2.9792515249091657e-06, + "loss": 1.0134, + "step": 2783 + }, + { + "epoch": 2.254251012145749, + "grad_norm": 1.7197547849869146, + "learning_rate": 2.973133358372504e-06, + "loss": 1.0114, + "step": 2784 + }, + { + "epoch": 2.2550607287449393, + "grad_norm": 1.6535591134631202, + "learning_rate": 2.967020383261834e-06, + "loss": 1.0338, + "step": 2785 + }, + { + "epoch": 2.2558704453441294, + "grad_norm": 1.5835662400274373, + "learning_rate": 2.960912604093409e-06, + "loss": 1.0032, + "step": 2786 + }, + { + "epoch": 2.25668016194332, + "grad_norm": 1.5736860921183824, + "learning_rate": 2.954810025379633e-06, + "loss": 1.0048, + "step": 2787 + }, + { + "epoch": 2.25748987854251, + "grad_norm": 1.7772332085899527, + "learning_rate": 2.948712651629071e-06, + "loss": 0.9409, + "step": 2788 + }, + { + "epoch": 2.2582995951417004, + "grad_norm": 1.8153302390569306, + "learning_rate": 2.9426204873464414e-06, + "loss": 0.9485, + "step": 2789 + }, + { + "epoch": 2.2591093117408905, + "grad_norm": 1.6435844735478726, + "learning_rate": 2.9365335370326143e-06, + "loss": 0.9743, + "step": 2790 + }, + { + "epoch": 2.259919028340081, + "grad_norm": 1.7393090327566456, + "learning_rate": 2.9304518051846143e-06, + "loss": 0.9736, + "step": 2791 + }, + { + "epoch": 2.2607287449392715, + "grad_norm": 1.6924799124970187, + "learning_rate": 2.924375296295597e-06, + "loss": 0.9779, + "step": 2792 + }, + { + "epoch": 2.2615384615384615, + "grad_norm": 1.6747668468696226, + "learning_rate": 2.9183040148548757e-06, + "loss": 0.9792, + "step": 2793 + }, + { + "epoch": 2.262348178137652, + "grad_norm": 1.8608821257094024, + "learning_rate": 2.9122379653478894e-06, + "loss": 0.9717, + "step": 2794 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.6857679680681223, + "learning_rate": 2.9061771522562143e-06, + "loss": 0.9856, + "step": 2795 + }, + { + "epoch": 2.2639676113360325, + "grad_norm": 1.5974903308608277, + "learning_rate": 2.90012158005756e-06, + "loss": 0.9913, + "step": 2796 + }, + { + "epoch": 2.2647773279352226, + "grad_norm": 1.699823981139351, + "learning_rate": 2.8940712532257633e-06, + "loss": 0.9562, + "step": 2797 + }, + { + "epoch": 2.265587044534413, + "grad_norm": 1.6553947780834652, + "learning_rate": 2.8880261762307837e-06, + "loss": 0.9666, + "step": 2798 + }, + { + "epoch": 2.266396761133603, + "grad_norm": 1.6975659495235669, + "learning_rate": 2.8819863535387083e-06, + "loss": 0.983, + "step": 2799 + }, + { + "epoch": 2.2672064777327936, + "grad_norm": 1.6417408606225983, + "learning_rate": 2.875951789611734e-06, + "loss": 0.9544, + "step": 2800 + }, + { + "epoch": 2.2680161943319836, + "grad_norm": 1.6970530026128983, + "learning_rate": 2.8699224889081825e-06, + "loss": 1.0598, + "step": 2801 + }, + { + "epoch": 2.268825910931174, + "grad_norm": 1.6632904185152642, + "learning_rate": 2.8638984558824777e-06, + "loss": 1.0024, + "step": 2802 + }, + { + "epoch": 2.269635627530364, + "grad_norm": 1.7530525316894194, + "learning_rate": 2.857879694985156e-06, + "loss": 1.0393, + "step": 2803 + }, + { + "epoch": 2.2704453441295547, + "grad_norm": 1.6495544362471581, + "learning_rate": 2.851866210662858e-06, + "loss": 0.9628, + "step": 2804 + }, + { + "epoch": 2.2712550607287447, + "grad_norm": 1.636310387000526, + "learning_rate": 2.8458580073583262e-06, + "loss": 0.9938, + "step": 2805 + }, + { + "epoch": 2.272064777327935, + "grad_norm": 1.6690369023149698, + "learning_rate": 2.839855089510398e-06, + "loss": 0.9759, + "step": 2806 + }, + { + "epoch": 2.2728744939271257, + "grad_norm": 1.7396582090782353, + "learning_rate": 2.8338574615540136e-06, + "loss": 1.0541, + "step": 2807 + }, + { + "epoch": 2.2736842105263158, + "grad_norm": 1.6449963646693437, + "learning_rate": 2.827865127920203e-06, + "loss": 0.9446, + "step": 2808 + }, + { + "epoch": 2.2744939271255062, + "grad_norm": 1.71078655363176, + "learning_rate": 2.821878093036079e-06, + "loss": 0.9266, + "step": 2809 + }, + { + "epoch": 2.2753036437246963, + "grad_norm": 1.722249474010595, + "learning_rate": 2.8158963613248437e-06, + "loss": 0.9671, + "step": 2810 + }, + { + "epoch": 2.276113360323887, + "grad_norm": 1.6729015082048833, + "learning_rate": 2.8099199372057818e-06, + "loss": 0.995, + "step": 2811 + }, + { + "epoch": 2.276923076923077, + "grad_norm": 1.7356714743398183, + "learning_rate": 2.803948825094255e-06, + "loss": 1.0026, + "step": 2812 + }, + { + "epoch": 2.2777327935222673, + "grad_norm": 1.7087100915274263, + "learning_rate": 2.7979830294016985e-06, + "loss": 0.9608, + "step": 2813 + }, + { + "epoch": 2.2785425101214574, + "grad_norm": 1.6622618230635267, + "learning_rate": 2.792022554535625e-06, + "loss": 1.0405, + "step": 2814 + }, + { + "epoch": 2.279352226720648, + "grad_norm": 1.6282728230738879, + "learning_rate": 2.7860674048996174e-06, + "loss": 1.0188, + "step": 2815 + }, + { + "epoch": 2.280161943319838, + "grad_norm": 1.6465016952207205, + "learning_rate": 2.780117584893317e-06, + "loss": 1.0251, + "step": 2816 + }, + { + "epoch": 2.2809716599190284, + "grad_norm": 1.6700201012055247, + "learning_rate": 2.774173098912433e-06, + "loss": 0.9897, + "step": 2817 + }, + { + "epoch": 2.2817813765182184, + "grad_norm": 1.654997034971534, + "learning_rate": 2.76823395134873e-06, + "loss": 0.9931, + "step": 2818 + }, + { + "epoch": 2.282591093117409, + "grad_norm": 1.6463795592129717, + "learning_rate": 2.7623001465900323e-06, + "loss": 0.9646, + "step": 2819 + }, + { + "epoch": 2.283400809716599, + "grad_norm": 1.689096129045378, + "learning_rate": 2.756371689020214e-06, + "loss": 1.037, + "step": 2820 + }, + { + "epoch": 2.2842105263157895, + "grad_norm": 1.739455812749162, + "learning_rate": 2.7504485830191985e-06, + "loss": 1.0553, + "step": 2821 + }, + { + "epoch": 2.28502024291498, + "grad_norm": 1.737347509141222, + "learning_rate": 2.7445308329629593e-06, + "loss": 0.9492, + "step": 2822 + }, + { + "epoch": 2.28582995951417, + "grad_norm": 1.7541768615419884, + "learning_rate": 2.738618443223513e-06, + "loss": 0.954, + "step": 2823 + }, + { + "epoch": 2.2866396761133605, + "grad_norm": 1.680772296035219, + "learning_rate": 2.7327114181689117e-06, + "loss": 1.0036, + "step": 2824 + }, + { + "epoch": 2.2874493927125505, + "grad_norm": 1.7198767387432525, + "learning_rate": 2.7268097621632473e-06, + "loss": 1.0243, + "step": 2825 + }, + { + "epoch": 2.288259109311741, + "grad_norm": 1.7541843187045827, + "learning_rate": 2.7209134795666404e-06, + "loss": 0.9636, + "step": 2826 + }, + { + "epoch": 2.289068825910931, + "grad_norm": 1.7969143994647754, + "learning_rate": 2.715022574735249e-06, + "loss": 0.956, + "step": 2827 + }, + { + "epoch": 2.2898785425101216, + "grad_norm": 1.7235098932288484, + "learning_rate": 2.709137052021248e-06, + "loss": 0.9696, + "step": 2828 + }, + { + "epoch": 2.2906882591093116, + "grad_norm": 1.7147800208396093, + "learning_rate": 2.7032569157728503e-06, + "loss": 0.9945, + "step": 2829 + }, + { + "epoch": 2.291497975708502, + "grad_norm": 1.6634199908382443, + "learning_rate": 2.697382170334275e-06, + "loss": 0.9715, + "step": 2830 + }, + { + "epoch": 2.292307692307692, + "grad_norm": 1.7025316188789756, + "learning_rate": 2.6915128200457706e-06, + "loss": 0.9584, + "step": 2831 + }, + { + "epoch": 2.2931174089068826, + "grad_norm": 1.7135149326400867, + "learning_rate": 2.68564886924359e-06, + "loss": 0.9675, + "step": 2832 + }, + { + "epoch": 2.2939271255060727, + "grad_norm": 1.697368241080622, + "learning_rate": 2.679790322260002e-06, + "loss": 1.0273, + "step": 2833 + }, + { + "epoch": 2.294736842105263, + "grad_norm": 1.6304843444402075, + "learning_rate": 2.673937183423282e-06, + "loss": 0.998, + "step": 2834 + }, + { + "epoch": 2.2955465587044532, + "grad_norm": 1.6737945308046214, + "learning_rate": 2.6680894570577042e-06, + "loss": 0.9654, + "step": 2835 + }, + { + "epoch": 2.2963562753036437, + "grad_norm": 1.6851080909667857, + "learning_rate": 2.6622471474835585e-06, + "loss": 0.9605, + "step": 2836 + }, + { + "epoch": 2.297165991902834, + "grad_norm": 1.7289635381651745, + "learning_rate": 2.6564102590171204e-06, + "loss": 1.0554, + "step": 2837 + }, + { + "epoch": 2.2979757085020243, + "grad_norm": 1.6276718372129562, + "learning_rate": 2.6505787959706607e-06, + "loss": 0.9341, + "step": 2838 + }, + { + "epoch": 2.2987854251012148, + "grad_norm": 1.775958697906154, + "learning_rate": 2.6447527626524504e-06, + "loss": 0.929, + "step": 2839 + }, + { + "epoch": 2.299595141700405, + "grad_norm": 1.6271115167068193, + "learning_rate": 2.638932163366742e-06, + "loss": 1.0366, + "step": 2840 + }, + { + "epoch": 2.3004048582995953, + "grad_norm": 1.7404302828297658, + "learning_rate": 2.633117002413774e-06, + "loss": 1.0187, + "step": 2841 + }, + { + "epoch": 2.3012145748987853, + "grad_norm": 1.7172201475593398, + "learning_rate": 2.6273072840897685e-06, + "loss": 0.9908, + "step": 2842 + }, + { + "epoch": 2.302024291497976, + "grad_norm": 1.7540383203717043, + "learning_rate": 2.6215030126869235e-06, + "loss": 0.9614, + "step": 2843 + }, + { + "epoch": 2.302834008097166, + "grad_norm": 1.7302429828447257, + "learning_rate": 2.6157041924934223e-06, + "loss": 0.9773, + "step": 2844 + }, + { + "epoch": 2.3036437246963564, + "grad_norm": 1.621473177336414, + "learning_rate": 2.6099108277934105e-06, + "loss": 0.9527, + "step": 2845 + }, + { + "epoch": 2.3044534412955464, + "grad_norm": 1.6861502912702049, + "learning_rate": 2.604122922867004e-06, + "loss": 1.0025, + "step": 2846 + }, + { + "epoch": 2.305263157894737, + "grad_norm": 1.6470296585152888, + "learning_rate": 2.5983404819902937e-06, + "loss": 1.0029, + "step": 2847 + }, + { + "epoch": 2.306072874493927, + "grad_norm": 1.732391716550845, + "learning_rate": 2.592563509435325e-06, + "loss": 0.9638, + "step": 2848 + }, + { + "epoch": 2.3068825910931174, + "grad_norm": 1.6830485680896312, + "learning_rate": 2.586792009470107e-06, + "loss": 1.038, + "step": 2849 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 1.7578887787386592, + "learning_rate": 2.581025986358602e-06, + "loss": 0.9445, + "step": 2850 + }, + { + "epoch": 2.308502024291498, + "grad_norm": 1.7346747000095792, + "learning_rate": 2.575265444360733e-06, + "loss": 0.9888, + "step": 2851 + }, + { + "epoch": 2.3093117408906885, + "grad_norm": 1.7156101092464429, + "learning_rate": 2.5695103877323678e-06, + "loss": 1.0043, + "step": 2852 + }, + { + "epoch": 2.3101214574898785, + "grad_norm": 1.6144048959910942, + "learning_rate": 2.563760820725325e-06, + "loss": 1.0411, + "step": 2853 + }, + { + "epoch": 2.310931174089069, + "grad_norm": 1.6919970110753482, + "learning_rate": 2.5580167475873595e-06, + "loss": 0.9792, + "step": 2854 + }, + { + "epoch": 2.311740890688259, + "grad_norm": 1.763114954790134, + "learning_rate": 2.5522781725621814e-06, + "loss": 0.9677, + "step": 2855 + }, + { + "epoch": 2.3125506072874495, + "grad_norm": 1.7245366432759541, + "learning_rate": 2.5465450998894294e-06, + "loss": 0.9699, + "step": 2856 + }, + { + "epoch": 2.3133603238866396, + "grad_norm": 1.7077901989875472, + "learning_rate": 2.540817533804676e-06, + "loss": 1.0007, + "step": 2857 + }, + { + "epoch": 2.31417004048583, + "grad_norm": 1.6847209459419308, + "learning_rate": 2.535095478539428e-06, + "loss": 1.0254, + "step": 2858 + }, + { + "epoch": 2.31497975708502, + "grad_norm": 1.706175440021964, + "learning_rate": 2.529378938321124e-06, + "loss": 0.9536, + "step": 2859 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 1.7691390032285985, + "learning_rate": 2.523667917373125e-06, + "loss": 1.0103, + "step": 2860 + }, + { + "epoch": 2.3165991902834007, + "grad_norm": 1.8077440425603108, + "learning_rate": 2.517962419914712e-06, + "loss": 0.9207, + "step": 2861 + }, + { + "epoch": 2.317408906882591, + "grad_norm": 1.6663947281505385, + "learning_rate": 2.512262450161087e-06, + "loss": 0.9743, + "step": 2862 + }, + { + "epoch": 2.318218623481781, + "grad_norm": 1.6909579843290505, + "learning_rate": 2.5065680123233737e-06, + "loss": 0.9382, + "step": 2863 + }, + { + "epoch": 2.3190283400809717, + "grad_norm": 1.6847267233784076, + "learning_rate": 2.5008791106086015e-06, + "loss": 0.9839, + "step": 2864 + }, + { + "epoch": 2.3198380566801617, + "grad_norm": 1.7283759037557056, + "learning_rate": 2.4951957492197097e-06, + "loss": 0.9233, + "step": 2865 + }, + { + "epoch": 2.3206477732793522, + "grad_norm": 1.6108950115636136, + "learning_rate": 2.4895179323555517e-06, + "loss": 0.9222, + "step": 2866 + }, + { + "epoch": 2.3214574898785427, + "grad_norm": 1.759239040012402, + "learning_rate": 2.483845664210879e-06, + "loss": 1.0203, + "step": 2867 + }, + { + "epoch": 2.3222672064777328, + "grad_norm": 1.6569900658909889, + "learning_rate": 2.478178948976342e-06, + "loss": 1.0225, + "step": 2868 + }, + { + "epoch": 2.3230769230769233, + "grad_norm": 1.6165160420682663, + "learning_rate": 2.4725177908384936e-06, + "loss": 0.9963, + "step": 2869 + }, + { + "epoch": 2.3238866396761133, + "grad_norm": 1.6910680194151018, + "learning_rate": 2.4668621939797745e-06, + "loss": 0.9845, + "step": 2870 + }, + { + "epoch": 2.324696356275304, + "grad_norm": 1.7564925975810206, + "learning_rate": 2.461212162578527e-06, + "loss": 1.0053, + "step": 2871 + }, + { + "epoch": 2.325506072874494, + "grad_norm": 1.6736697804350984, + "learning_rate": 2.455567700808974e-06, + "loss": 1.0172, + "step": 2872 + }, + { + "epoch": 2.3263157894736843, + "grad_norm": 1.6235333292980543, + "learning_rate": 2.4499288128412214e-06, + "loss": 1.0632, + "step": 2873 + }, + { + "epoch": 2.3271255060728744, + "grad_norm": 1.7363325081342043, + "learning_rate": 2.4442955028412672e-06, + "loss": 0.9349, + "step": 2874 + }, + { + "epoch": 2.327935222672065, + "grad_norm": 1.6683416186196407, + "learning_rate": 2.438667774970981e-06, + "loss": 0.9533, + "step": 2875 + }, + { + "epoch": 2.328744939271255, + "grad_norm": 1.7103163002011812, + "learning_rate": 2.433045633388106e-06, + "loss": 0.9585, + "step": 2876 + }, + { + "epoch": 2.3295546558704454, + "grad_norm": 1.6372160886662246, + "learning_rate": 2.4274290822462656e-06, + "loss": 0.9704, + "step": 2877 + }, + { + "epoch": 2.3303643724696355, + "grad_norm": 1.7503426443559542, + "learning_rate": 2.4218181256949434e-06, + "loss": 1.0724, + "step": 2878 + }, + { + "epoch": 2.331174089068826, + "grad_norm": 1.6224949379470495, + "learning_rate": 2.4162127678795045e-06, + "loss": 0.9353, + "step": 2879 + }, + { + "epoch": 2.331983805668016, + "grad_norm": 1.7179059296387311, + "learning_rate": 2.4106130129411608e-06, + "loss": 0.9951, + "step": 2880 + }, + { + "epoch": 2.3327935222672065, + "grad_norm": 1.6820449772558062, + "learning_rate": 2.405018865016999e-06, + "loss": 1.0007, + "step": 2881 + }, + { + "epoch": 2.333603238866397, + "grad_norm": 1.640951194648763, + "learning_rate": 2.3994303282399544e-06, + "loss": 0.9456, + "step": 2882 + }, + { + "epoch": 2.334412955465587, + "grad_norm": 1.6725791781090846, + "learning_rate": 2.3938474067388208e-06, + "loss": 0.9674, + "step": 2883 + }, + { + "epoch": 2.3352226720647775, + "grad_norm": 1.8442889599518857, + "learning_rate": 2.388270104638242e-06, + "loss": 0.9674, + "step": 2884 + }, + { + "epoch": 2.3360323886639676, + "grad_norm": 1.676925947550545, + "learning_rate": 2.3826984260587084e-06, + "loss": 0.9523, + "step": 2885 + }, + { + "epoch": 2.336842105263158, + "grad_norm": 1.717243161461183, + "learning_rate": 2.3771323751165563e-06, + "loss": 0.9958, + "step": 2886 + }, + { + "epoch": 2.337651821862348, + "grad_norm": 1.7024062048223407, + "learning_rate": 2.3715719559239727e-06, + "loss": 1.0246, + "step": 2887 + }, + { + "epoch": 2.3384615384615386, + "grad_norm": 1.6526087755299945, + "learning_rate": 2.3660171725889703e-06, + "loss": 0.9659, + "step": 2888 + }, + { + "epoch": 2.3392712550607286, + "grad_norm": 1.6804957380379042, + "learning_rate": 2.360468029215409e-06, + "loss": 1.0254, + "step": 2889 + }, + { + "epoch": 2.340080971659919, + "grad_norm": 1.6575687386773823, + "learning_rate": 2.354924529902978e-06, + "loss": 1.0599, + "step": 2890 + }, + { + "epoch": 2.340890688259109, + "grad_norm": 1.6411725270506965, + "learning_rate": 2.349386678747194e-06, + "loss": 1.0325, + "step": 2891 + }, + { + "epoch": 2.3417004048582997, + "grad_norm": 1.6593350291881466, + "learning_rate": 2.343854479839405e-06, + "loss": 0.9796, + "step": 2892 + }, + { + "epoch": 2.3425101214574897, + "grad_norm": 1.6411951141419265, + "learning_rate": 2.3383279372667787e-06, + "loss": 0.9762, + "step": 2893 + }, + { + "epoch": 2.34331983805668, + "grad_norm": 1.6675022296883175, + "learning_rate": 2.332807055112306e-06, + "loss": 0.9739, + "step": 2894 + }, + { + "epoch": 2.3441295546558703, + "grad_norm": 1.6845312790330145, + "learning_rate": 2.327291837454799e-06, + "loss": 0.9776, + "step": 2895 + }, + { + "epoch": 2.3449392712550607, + "grad_norm": 1.6631177985983452, + "learning_rate": 2.3217822883688855e-06, + "loss": 0.9856, + "step": 2896 + }, + { + "epoch": 2.3457489878542512, + "grad_norm": 1.7002279127037514, + "learning_rate": 2.316278411924998e-06, + "loss": 1.004, + "step": 2897 + }, + { + "epoch": 2.3465587044534413, + "grad_norm": 1.7044705523610206, + "learning_rate": 2.310780212189384e-06, + "loss": 0.9071, + "step": 2898 + }, + { + "epoch": 2.3473684210526318, + "grad_norm": 1.7588763360318254, + "learning_rate": 2.3052876932240943e-06, + "loss": 0.939, + "step": 2899 + }, + { + "epoch": 2.348178137651822, + "grad_norm": 1.6865532061555057, + "learning_rate": 2.2998008590869838e-06, + "loss": 0.9706, + "step": 2900 + }, + { + "epoch": 2.3489878542510123, + "grad_norm": 1.8099761327981618, + "learning_rate": 2.294319713831705e-06, + "loss": 0.9509, + "step": 2901 + }, + { + "epoch": 2.3497975708502024, + "grad_norm": 1.655965409511294, + "learning_rate": 2.2888442615077145e-06, + "loss": 0.9411, + "step": 2902 + }, + { + "epoch": 2.350607287449393, + "grad_norm": 1.661395531775376, + "learning_rate": 2.2833745061602587e-06, + "loss": 1.011, + "step": 2903 + }, + { + "epoch": 2.351417004048583, + "grad_norm": 1.6324577056900818, + "learning_rate": 2.277910451830373e-06, + "loss": 1.0316, + "step": 2904 + }, + { + "epoch": 2.3522267206477734, + "grad_norm": 1.6836043670245235, + "learning_rate": 2.2724521025548828e-06, + "loss": 1.0146, + "step": 2905 + }, + { + "epoch": 2.3530364372469634, + "grad_norm": 1.7435287774310333, + "learning_rate": 2.2669994623664006e-06, + "loss": 0.9744, + "step": 2906 + }, + { + "epoch": 2.353846153846154, + "grad_norm": 1.7490120988400564, + "learning_rate": 2.2615525352933156e-06, + "loss": 0.9675, + "step": 2907 + }, + { + "epoch": 2.354655870445344, + "grad_norm": 1.6560540854955874, + "learning_rate": 2.256111325359801e-06, + "loss": 0.9834, + "step": 2908 + }, + { + "epoch": 2.3554655870445345, + "grad_norm": 1.6937685129731164, + "learning_rate": 2.250675836585803e-06, + "loss": 1.0068, + "step": 2909 + }, + { + "epoch": 2.3562753036437245, + "grad_norm": 1.6622436941899015, + "learning_rate": 2.245246072987045e-06, + "loss": 0.9918, + "step": 2910 + }, + { + "epoch": 2.357085020242915, + "grad_norm": 1.7854484056765068, + "learning_rate": 2.2398220385750213e-06, + "loss": 1.0529, + "step": 2911 + }, + { + "epoch": 2.3578947368421055, + "grad_norm": 1.6881598706045204, + "learning_rate": 2.234403737356987e-06, + "loss": 0.9479, + "step": 2912 + }, + { + "epoch": 2.3587044534412955, + "grad_norm": 1.7582794873521799, + "learning_rate": 2.228991173335967e-06, + "loss": 1.0117, + "step": 2913 + }, + { + "epoch": 2.3595141700404856, + "grad_norm": 1.6302584416633792, + "learning_rate": 2.2235843505107447e-06, + "loss": 0.9808, + "step": 2914 + }, + { + "epoch": 2.360323886639676, + "grad_norm": 1.741990356965201, + "learning_rate": 2.2181832728758635e-06, + "loss": 0.9415, + "step": 2915 + }, + { + "epoch": 2.3611336032388666, + "grad_norm": 1.7084803772752033, + "learning_rate": 2.2127879444216184e-06, + "loss": 1.0223, + "step": 2916 + }, + { + "epoch": 2.3619433198380566, + "grad_norm": 1.680568814995532, + "learning_rate": 2.2073983691340673e-06, + "loss": 0.9924, + "step": 2917 + }, + { + "epoch": 2.362753036437247, + "grad_norm": 1.6884873524658965, + "learning_rate": 2.202014550995003e-06, + "loss": 0.946, + "step": 2918 + }, + { + "epoch": 2.363562753036437, + "grad_norm": 1.6973802116265153, + "learning_rate": 2.1966364939819797e-06, + "loss": 0.9663, + "step": 2919 + }, + { + "epoch": 2.3643724696356276, + "grad_norm": 1.6489385262541483, + "learning_rate": 2.191264202068286e-06, + "loss": 0.985, + "step": 2920 + }, + { + "epoch": 2.3651821862348177, + "grad_norm": 1.6792446355838644, + "learning_rate": 2.185897679222951e-06, + "loss": 0.9868, + "step": 2921 + }, + { + "epoch": 2.365991902834008, + "grad_norm": 1.6804030489521116, + "learning_rate": 2.180536929410748e-06, + "loss": 1.0269, + "step": 2922 + }, + { + "epoch": 2.3668016194331982, + "grad_norm": 1.6589791003331682, + "learning_rate": 2.1751819565921774e-06, + "loss": 0.9795, + "step": 2923 + }, + { + "epoch": 2.3676113360323887, + "grad_norm": 1.6922330016889355, + "learning_rate": 2.169832764723475e-06, + "loss": 0.9302, + "step": 2924 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 1.617132099924979, + "learning_rate": 2.1644893577566118e-06, + "loss": 0.9114, + "step": 2925 + }, + { + "epoch": 2.3692307692307693, + "grad_norm": 1.683358875672709, + "learning_rate": 2.1591517396392735e-06, + "loss": 1.0196, + "step": 2926 + }, + { + "epoch": 2.3700404858299597, + "grad_norm": 1.662270804025337, + "learning_rate": 2.15381991431488e-06, + "loss": 0.9346, + "step": 2927 + }, + { + "epoch": 2.37085020242915, + "grad_norm": 1.5560767626917962, + "learning_rate": 2.1484938857225636e-06, + "loss": 1.0323, + "step": 2928 + }, + { + "epoch": 2.37165991902834, + "grad_norm": 1.6168560582415574, + "learning_rate": 2.1431736577971763e-06, + "loss": 1.0005, + "step": 2929 + }, + { + "epoch": 2.3724696356275303, + "grad_norm": 1.6890995844263526, + "learning_rate": 2.137859234469286e-06, + "loss": 1.0447, + "step": 2930 + }, + { + "epoch": 2.373279352226721, + "grad_norm": 1.6394854020625869, + "learning_rate": 2.132550619665168e-06, + "loss": 0.9983, + "step": 2931 + }, + { + "epoch": 2.374089068825911, + "grad_norm": 1.6603872171093927, + "learning_rate": 2.1272478173068147e-06, + "loss": 0.9744, + "step": 2932 + }, + { + "epoch": 2.3748987854251014, + "grad_norm": 1.6252945957266938, + "learning_rate": 2.1219508313119163e-06, + "loss": 0.947, + "step": 2933 + }, + { + "epoch": 2.3757085020242914, + "grad_norm": 1.7572254760843087, + "learning_rate": 2.1166596655938676e-06, + "loss": 0.9481, + "step": 2934 + }, + { + "epoch": 2.376518218623482, + "grad_norm": 1.680859166288277, + "learning_rate": 2.1113743240617668e-06, + "loss": 0.9892, + "step": 2935 + }, + { + "epoch": 2.377327935222672, + "grad_norm": 1.7086071225889468, + "learning_rate": 2.1060948106204072e-06, + "loss": 1.0586, + "step": 2936 + }, + { + "epoch": 2.3781376518218624, + "grad_norm": 1.6757697718883062, + "learning_rate": 2.100821129170274e-06, + "loss": 0.9855, + "step": 2937 + }, + { + "epoch": 2.3789473684210525, + "grad_norm": 1.6316446681345318, + "learning_rate": 2.0955532836075445e-06, + "loss": 1.0192, + "step": 2938 + }, + { + "epoch": 2.379757085020243, + "grad_norm": 1.6928291171366985, + "learning_rate": 2.090291277824089e-06, + "loss": 0.974, + "step": 2939 + }, + { + "epoch": 2.380566801619433, + "grad_norm": 1.6550872597895547, + "learning_rate": 2.08503511570746e-06, + "loss": 1.069, + "step": 2940 + }, + { + "epoch": 2.3813765182186235, + "grad_norm": 1.6475664661450247, + "learning_rate": 2.0797848011408906e-06, + "loss": 0.9001, + "step": 2941 + }, + { + "epoch": 2.382186234817814, + "grad_norm": 1.6992711495500126, + "learning_rate": 2.0745403380032947e-06, + "loss": 0.9789, + "step": 2942 + }, + { + "epoch": 2.382995951417004, + "grad_norm": 1.7621872944691108, + "learning_rate": 2.0693017301692698e-06, + "loss": 1.0104, + "step": 2943 + }, + { + "epoch": 2.383805668016194, + "grad_norm": 1.6841266297477848, + "learning_rate": 2.0640689815090777e-06, + "loss": 0.9808, + "step": 2944 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 1.7074153273587775, + "learning_rate": 2.058842095888658e-06, + "loss": 0.9962, + "step": 2945 + }, + { + "epoch": 2.385425101214575, + "grad_norm": 1.6367366387306832, + "learning_rate": 2.053621077169613e-06, + "loss": 0.9874, + "step": 2946 + }, + { + "epoch": 2.386234817813765, + "grad_norm": 1.6204049507354517, + "learning_rate": 2.0484059292092196e-06, + "loss": 1.047, + "step": 2947 + }, + { + "epoch": 2.3870445344129556, + "grad_norm": 1.6516699596226871, + "learning_rate": 2.0431966558604097e-06, + "loss": 1.0036, + "step": 2948 + }, + { + "epoch": 2.3878542510121457, + "grad_norm": 1.630232092481421, + "learning_rate": 2.0379932609717767e-06, + "loss": 1.016, + "step": 2949 + }, + { + "epoch": 2.388663967611336, + "grad_norm": 1.6118486578131621, + "learning_rate": 2.0327957483875693e-06, + "loss": 1.0195, + "step": 2950 + }, + { + "epoch": 2.389473684210526, + "grad_norm": 1.679620212591038, + "learning_rate": 2.0276041219476985e-06, + "loss": 1.0092, + "step": 2951 + }, + { + "epoch": 2.3902834008097167, + "grad_norm": 1.6192037143814413, + "learning_rate": 2.0224183854877165e-06, + "loss": 1.0289, + "step": 2952 + }, + { + "epoch": 2.3910931174089067, + "grad_norm": 1.6913936221204569, + "learning_rate": 2.0172385428388288e-06, + "loss": 0.9022, + "step": 2953 + }, + { + "epoch": 2.3919028340080972, + "grad_norm": 1.712872458206929, + "learning_rate": 2.0120645978278887e-06, + "loss": 0.9932, + "step": 2954 + }, + { + "epoch": 2.3927125506072873, + "grad_norm": 1.6358216619621548, + "learning_rate": 2.006896554277388e-06, + "loss": 0.9879, + "step": 2955 + }, + { + "epoch": 2.3935222672064778, + "grad_norm": 1.7962624248609893, + "learning_rate": 2.0017344160054597e-06, + "loss": 0.9944, + "step": 2956 + }, + { + "epoch": 2.3943319838056683, + "grad_norm": 1.7097327349314781, + "learning_rate": 1.996578186825876e-06, + "loss": 0.9837, + "step": 2957 + }, + { + "epoch": 2.3951417004048583, + "grad_norm": 1.7984710742922152, + "learning_rate": 1.991427870548038e-06, + "loss": 1.0043, + "step": 2958 + }, + { + "epoch": 2.3959514170040483, + "grad_norm": 1.7633179302331299, + "learning_rate": 1.9862834709769897e-06, + "loss": 0.9943, + "step": 2959 + }, + { + "epoch": 2.396761133603239, + "grad_norm": 1.6081821375494785, + "learning_rate": 1.981144991913392e-06, + "loss": 1.0207, + "step": 2960 + }, + { + "epoch": 2.3975708502024293, + "grad_norm": 1.6574008938350164, + "learning_rate": 1.976012437153535e-06, + "loss": 1.0171, + "step": 2961 + }, + { + "epoch": 2.3983805668016194, + "grad_norm": 1.6854338035743857, + "learning_rate": 1.970885810489337e-06, + "loss": 0.9726, + "step": 2962 + }, + { + "epoch": 2.39919028340081, + "grad_norm": 1.6012012823118715, + "learning_rate": 1.9657651157083322e-06, + "loss": 0.9743, + "step": 2963 + }, + { + "epoch": 2.4, + "grad_norm": 3.279317777475881, + "learning_rate": 1.960650356593672e-06, + "loss": 1.0349, + "step": 2964 + }, + { + "epoch": 2.4008097165991904, + "grad_norm": 1.6559506921615998, + "learning_rate": 1.9555415369241228e-06, + "loss": 1.0237, + "step": 2965 + }, + { + "epoch": 2.4016194331983804, + "grad_norm": 1.696563890993013, + "learning_rate": 1.9504386604740632e-06, + "loss": 1.034, + "step": 2966 + }, + { + "epoch": 2.402429149797571, + "grad_norm": 1.6229320190285514, + "learning_rate": 1.9453417310134857e-06, + "loss": 0.9817, + "step": 2967 + }, + { + "epoch": 2.403238866396761, + "grad_norm": 1.717120363215466, + "learning_rate": 1.9402507523079794e-06, + "loss": 0.9337, + "step": 2968 + }, + { + "epoch": 2.4040485829959515, + "grad_norm": 1.8037959085157829, + "learning_rate": 1.9351657281187484e-06, + "loss": 0.9729, + "step": 2969 + }, + { + "epoch": 2.4048582995951415, + "grad_norm": 1.6684409182222344, + "learning_rate": 1.930086662202589e-06, + "loss": 1.0042, + "step": 2970 + }, + { + "epoch": 2.405668016194332, + "grad_norm": 1.698807530710582, + "learning_rate": 1.9250135583119e-06, + "loss": 0.9531, + "step": 2971 + }, + { + "epoch": 2.4064777327935225, + "grad_norm": 1.6713412841686315, + "learning_rate": 1.9199464201946717e-06, + "loss": 0.9905, + "step": 2972 + }, + { + "epoch": 2.4072874493927126, + "grad_norm": 1.6587820441376417, + "learning_rate": 1.9148852515944893e-06, + "loss": 0.9788, + "step": 2973 + }, + { + "epoch": 2.4080971659919026, + "grad_norm": 1.6459864544000329, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.9819, + "step": 2974 + }, + { + "epoch": 2.408906882591093, + "grad_norm": 1.6811137602745856, + "learning_rate": 1.9047808378975485e-06, + "loss": 1.0037, + "step": 2975 + }, + { + "epoch": 2.4097165991902836, + "grad_norm": 1.6452093088766784, + "learning_rate": 1.8997376002658974e-06, + "loss": 0.9404, + "step": 2976 + }, + { + "epoch": 2.4105263157894736, + "grad_norm": 1.7009168710382707, + "learning_rate": 1.894700347081505e-06, + "loss": 0.9519, + "step": 2977 + }, + { + "epoch": 2.411336032388664, + "grad_norm": 1.6634076517052085, + "learning_rate": 1.8896690820658758e-06, + "loss": 1.0487, + "step": 2978 + }, + { + "epoch": 2.412145748987854, + "grad_norm": 1.6737820388111349, + "learning_rate": 1.8846438089360896e-06, + "loss": 1.0075, + "step": 2979 + }, + { + "epoch": 2.4129554655870447, + "grad_norm": 1.7249507058482862, + "learning_rate": 1.8796245314048046e-06, + "loss": 1.0098, + "step": 2980 + }, + { + "epoch": 2.4137651821862347, + "grad_norm": 1.7251783475658493, + "learning_rate": 1.874611253180244e-06, + "loss": 0.9775, + "step": 2981 + }, + { + "epoch": 2.414574898785425, + "grad_norm": 1.6136959740020433, + "learning_rate": 1.8696039779662012e-06, + "loss": 0.9962, + "step": 2982 + }, + { + "epoch": 2.4153846153846152, + "grad_norm": 1.6425732401790396, + "learning_rate": 1.8646027094620345e-06, + "loss": 0.9816, + "step": 2983 + }, + { + "epoch": 2.4161943319838057, + "grad_norm": 1.8229989217932228, + "learning_rate": 1.8596074513626694e-06, + "loss": 0.9151, + "step": 2984 + }, + { + "epoch": 2.417004048582996, + "grad_norm": 1.7061703390123246, + "learning_rate": 1.8546182073585828e-06, + "loss": 1.0206, + "step": 2985 + }, + { + "epoch": 2.4178137651821863, + "grad_norm": 1.6590284413749856, + "learning_rate": 1.8496349811358116e-06, + "loss": 1.0051, + "step": 2986 + }, + { + "epoch": 2.4186234817813768, + "grad_norm": 1.7189358586751335, + "learning_rate": 1.8446577763759478e-06, + "loss": 0.8961, + "step": 2987 + }, + { + "epoch": 2.419433198380567, + "grad_norm": 1.7082228107534516, + "learning_rate": 1.8396865967561317e-06, + "loss": 1.0398, + "step": 2988 + }, + { + "epoch": 2.420242914979757, + "grad_norm": 1.6578774454136742, + "learning_rate": 1.8347214459490548e-06, + "loss": 0.9466, + "step": 2989 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 1.7186094993741328, + "learning_rate": 1.829762327622958e-06, + "loss": 1.0007, + "step": 2990 + }, + { + "epoch": 2.421862348178138, + "grad_norm": 1.7016828071558703, + "learning_rate": 1.8248092454416166e-06, + "loss": 1.0301, + "step": 2991 + }, + { + "epoch": 2.422672064777328, + "grad_norm": 1.7081136775369437, + "learning_rate": 1.8198622030643564e-06, + "loss": 0.9879, + "step": 2992 + }, + { + "epoch": 2.4234817813765184, + "grad_norm": 1.7205639728799287, + "learning_rate": 1.814921204146033e-06, + "loss": 0.9609, + "step": 2993 + }, + { + "epoch": 2.4242914979757084, + "grad_norm": 1.6612183377575551, + "learning_rate": 1.8099862523370415e-06, + "loss": 0.9652, + "step": 2994 + }, + { + "epoch": 2.425101214574899, + "grad_norm": 1.674361206817772, + "learning_rate": 1.805057351283307e-06, + "loss": 0.9703, + "step": 2995 + }, + { + "epoch": 2.425910931174089, + "grad_norm": 1.5856102512600692, + "learning_rate": 1.8001345046262864e-06, + "loss": 0.9733, + "step": 2996 + }, + { + "epoch": 2.4267206477732794, + "grad_norm": 1.63966324613164, + "learning_rate": 1.7952177160029594e-06, + "loss": 1.0133, + "step": 2997 + }, + { + "epoch": 2.4275303643724695, + "grad_norm": 1.6634576125787222, + "learning_rate": 1.7903069890458347e-06, + "loss": 0.9702, + "step": 2998 + }, + { + "epoch": 2.42834008097166, + "grad_norm": 1.7282172413441768, + "learning_rate": 1.7854023273829467e-06, + "loss": 0.8979, + "step": 2999 + }, + { + "epoch": 2.42914979757085, + "grad_norm": 1.6699414779639086, + "learning_rate": 1.7805037346378384e-06, + "loss": 0.9921, + "step": 3000 + }, + { + "epoch": 2.4299595141700405, + "grad_norm": 1.6478445275579376, + "learning_rate": 1.7756112144295745e-06, + "loss": 1.0098, + "step": 3001 + }, + { + "epoch": 2.430769230769231, + "grad_norm": 1.7177993329143486, + "learning_rate": 1.7707247703727325e-06, + "loss": 0.9902, + "step": 3002 + }, + { + "epoch": 2.431578947368421, + "grad_norm": 1.6717118318062545, + "learning_rate": 1.7658444060774028e-06, + "loss": 0.9317, + "step": 3003 + }, + { + "epoch": 2.432388663967611, + "grad_norm": 1.656893954893965, + "learning_rate": 1.7609701251491796e-06, + "loss": 1.0093, + "step": 3004 + }, + { + "epoch": 2.4331983805668016, + "grad_norm": 1.672458797771242, + "learning_rate": 1.756101931189169e-06, + "loss": 0.994, + "step": 3005 + }, + { + "epoch": 2.434008097165992, + "grad_norm": 1.7979249229953589, + "learning_rate": 1.7512398277939735e-06, + "loss": 0.9501, + "step": 3006 + }, + { + "epoch": 2.434817813765182, + "grad_norm": 1.7675347987379522, + "learning_rate": 1.7463838185557024e-06, + "loss": 0.9573, + "step": 3007 + }, + { + "epoch": 2.4356275303643726, + "grad_norm": 1.6829178005541199, + "learning_rate": 1.7415339070619586e-06, + "loss": 0.9926, + "step": 3008 + }, + { + "epoch": 2.4364372469635627, + "grad_norm": 1.7579588916573732, + "learning_rate": 1.73669009689584e-06, + "loss": 0.9126, + "step": 3009 + }, + { + "epoch": 2.437246963562753, + "grad_norm": 1.739729410392981, + "learning_rate": 1.7318523916359376e-06, + "loss": 0.9609, + "step": 3010 + }, + { + "epoch": 2.438056680161943, + "grad_norm": 1.6681245792489996, + "learning_rate": 1.7270207948563323e-06, + "loss": 1.0533, + "step": 3011 + }, + { + "epoch": 2.4388663967611337, + "grad_norm": 1.696281337597073, + "learning_rate": 1.7221953101265888e-06, + "loss": 0.9251, + "step": 3012 + }, + { + "epoch": 2.4396761133603238, + "grad_norm": 1.6189610965490386, + "learning_rate": 1.7173759410117663e-06, + "loss": 0.9526, + "step": 3013 + }, + { + "epoch": 2.4404858299595142, + "grad_norm": 1.724367632228813, + "learning_rate": 1.7125626910723915e-06, + "loss": 0.9634, + "step": 3014 + }, + { + "epoch": 2.4412955465587043, + "grad_norm": 1.6773880195931092, + "learning_rate": 1.7077555638644838e-06, + "loss": 0.9881, + "step": 3015 + }, + { + "epoch": 2.442105263157895, + "grad_norm": 1.7197370339372542, + "learning_rate": 1.7029545629395306e-06, + "loss": 0.9735, + "step": 3016 + }, + { + "epoch": 2.4429149797570853, + "grad_norm": 1.7700548528489848, + "learning_rate": 1.6981596918444953e-06, + "loss": 1.0035, + "step": 3017 + }, + { + "epoch": 2.4437246963562753, + "grad_norm": 1.7118799526231616, + "learning_rate": 1.6933709541218125e-06, + "loss": 1.0057, + "step": 3018 + }, + { + "epoch": 2.4445344129554654, + "grad_norm": 1.6696288935408505, + "learning_rate": 1.6885883533093839e-06, + "loss": 1.0043, + "step": 3019 + }, + { + "epoch": 2.445344129554656, + "grad_norm": 1.6491620840254673, + "learning_rate": 1.6838118929405856e-06, + "loss": 0.9407, + "step": 3020 + }, + { + "epoch": 2.4461538461538463, + "grad_norm": 1.659428974002118, + "learning_rate": 1.6790415765442458e-06, + "loss": 1.0047, + "step": 3021 + }, + { + "epoch": 2.4469635627530364, + "grad_norm": 1.6354681687186046, + "learning_rate": 1.6742774076446578e-06, + "loss": 0.986, + "step": 3022 + }, + { + "epoch": 2.447773279352227, + "grad_norm": 1.6775886642207607, + "learning_rate": 1.6695193897615781e-06, + "loss": 0.9518, + "step": 3023 + }, + { + "epoch": 2.448582995951417, + "grad_norm": 1.6648950365114745, + "learning_rate": 1.6647675264102126e-06, + "loss": 0.9382, + "step": 3024 + }, + { + "epoch": 2.4493927125506074, + "grad_norm": 1.646166538722141, + "learning_rate": 1.660021821101222e-06, + "loss": 0.998, + "step": 3025 + }, + { + "epoch": 2.4502024291497975, + "grad_norm": 1.6268027966670777, + "learning_rate": 1.6552822773407163e-06, + "loss": 1.0009, + "step": 3026 + }, + { + "epoch": 2.451012145748988, + "grad_norm": 1.6955695087440212, + "learning_rate": 1.6505488986302586e-06, + "loss": 1.0315, + "step": 3027 + }, + { + "epoch": 2.451821862348178, + "grad_norm": 1.6851857353319628, + "learning_rate": 1.645821688466851e-06, + "loss": 0.9873, + "step": 3028 + }, + { + "epoch": 2.4526315789473685, + "grad_norm": 1.6305706879900275, + "learning_rate": 1.6411006503429428e-06, + "loss": 0.912, + "step": 3029 + }, + { + "epoch": 2.4534412955465585, + "grad_norm": 1.6588245540634141, + "learning_rate": 1.6363857877464161e-06, + "loss": 1.0123, + "step": 3030 + }, + { + "epoch": 2.454251012145749, + "grad_norm": 1.6911002514254574, + "learning_rate": 1.6316771041606027e-06, + "loss": 0.9738, + "step": 3031 + }, + { + "epoch": 2.455060728744939, + "grad_norm": 1.7042305673840112, + "learning_rate": 1.6269746030642607e-06, + "loss": 0.9485, + "step": 3032 + }, + { + "epoch": 2.4558704453441296, + "grad_norm": 1.620560078412615, + "learning_rate": 1.6222782879315802e-06, + "loss": 0.9963, + "step": 3033 + }, + { + "epoch": 2.4566801619433196, + "grad_norm": 1.6956591304380106, + "learning_rate": 1.6175881622321832e-06, + "loss": 0.9385, + "step": 3034 + }, + { + "epoch": 2.45748987854251, + "grad_norm": 1.710112971818022, + "learning_rate": 1.6129042294311227e-06, + "loss": 0.9377, + "step": 3035 + }, + { + "epoch": 2.4582995951417006, + "grad_norm": 1.7103868534898998, + "learning_rate": 1.6082264929888702e-06, + "loss": 0.997, + "step": 3036 + }, + { + "epoch": 2.4591093117408906, + "grad_norm": 1.5708740432232011, + "learning_rate": 1.603554956361324e-06, + "loss": 0.9684, + "step": 3037 + }, + { + "epoch": 2.459919028340081, + "grad_norm": 1.7023113258556464, + "learning_rate": 1.5988896229997952e-06, + "loss": 0.9485, + "step": 3038 + }, + { + "epoch": 2.460728744939271, + "grad_norm": 1.709049003849265, + "learning_rate": 1.5942304963510236e-06, + "loss": 0.9504, + "step": 3039 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 1.7137018057495825, + "learning_rate": 1.5895775798571523e-06, + "loss": 0.962, + "step": 3040 + }, + { + "epoch": 2.4623481781376517, + "grad_norm": 1.6578218057308711, + "learning_rate": 1.5849308769557393e-06, + "loss": 0.9717, + "step": 3041 + }, + { + "epoch": 2.463157894736842, + "grad_norm": 1.7381084856427103, + "learning_rate": 1.5802903910797584e-06, + "loss": 0.9704, + "step": 3042 + }, + { + "epoch": 2.4639676113360323, + "grad_norm": 1.7855052345593487, + "learning_rate": 1.575656125657583e-06, + "loss": 0.9614, + "step": 3043 + }, + { + "epoch": 2.4647773279352228, + "grad_norm": 1.6487843421840669, + "learning_rate": 1.5710280841129932e-06, + "loss": 1.0039, + "step": 3044 + }, + { + "epoch": 2.465587044534413, + "grad_norm": 1.6882419309228567, + "learning_rate": 1.5664062698651706e-06, + "loss": 0.9755, + "step": 3045 + }, + { + "epoch": 2.4663967611336033, + "grad_norm": 1.6447631511348773, + "learning_rate": 1.5617906863286936e-06, + "loss": 1.026, + "step": 3046 + }, + { + "epoch": 2.4672064777327933, + "grad_norm": 1.7529100807012996, + "learning_rate": 1.5571813369135457e-06, + "loss": 0.9185, + "step": 3047 + }, + { + "epoch": 2.468016194331984, + "grad_norm": 1.7744701731060566, + "learning_rate": 1.5525782250250953e-06, + "loss": 0.9455, + "step": 3048 + }, + { + "epoch": 2.468825910931174, + "grad_norm": 1.7939853067544942, + "learning_rate": 1.5479813540641064e-06, + "loss": 1.0429, + "step": 3049 + }, + { + "epoch": 2.4696356275303644, + "grad_norm": 1.8049689554151258, + "learning_rate": 1.5433907274267357e-06, + "loss": 0.9599, + "step": 3050 + }, + { + "epoch": 2.470445344129555, + "grad_norm": 1.7257816506826475, + "learning_rate": 1.538806348504519e-06, + "loss": 1.0007, + "step": 3051 + }, + { + "epoch": 2.471255060728745, + "grad_norm": 1.6882725120061068, + "learning_rate": 1.534228220684384e-06, + "loss": 0.9825, + "step": 3052 + }, + { + "epoch": 2.4720647773279354, + "grad_norm": 1.5881294637631733, + "learning_rate": 1.5296563473486325e-06, + "loss": 0.9745, + "step": 3053 + }, + { + "epoch": 2.4728744939271254, + "grad_norm": 1.730530663215343, + "learning_rate": 1.525090731874951e-06, + "loss": 0.9413, + "step": 3054 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 1.669751163835487, + "learning_rate": 1.5205313776364028e-06, + "loss": 0.9574, + "step": 3055 + }, + { + "epoch": 2.474493927125506, + "grad_norm": 1.6752509320040634, + "learning_rate": 1.5159782880014207e-06, + "loss": 1.0109, + "step": 3056 + }, + { + "epoch": 2.4753036437246965, + "grad_norm": 1.6347253102331993, + "learning_rate": 1.511431466333817e-06, + "loss": 0.9337, + "step": 3057 + }, + { + "epoch": 2.4761133603238865, + "grad_norm": 1.6962317193949117, + "learning_rate": 1.506890915992766e-06, + "loss": 1.0413, + "step": 3058 + }, + { + "epoch": 2.476923076923077, + "grad_norm": 1.6430785203789686, + "learning_rate": 1.5023566403328105e-06, + "loss": 0.9546, + "step": 3059 + }, + { + "epoch": 2.477732793522267, + "grad_norm": 1.6664785121693346, + "learning_rate": 1.4978286427038602e-06, + "loss": 0.9703, + "step": 3060 + }, + { + "epoch": 2.4785425101214575, + "grad_norm": 1.614002773301002, + "learning_rate": 1.4933069264511834e-06, + "loss": 1.029, + "step": 3061 + }, + { + "epoch": 2.4793522267206476, + "grad_norm": 1.6894913124402384, + "learning_rate": 1.488791494915408e-06, + "loss": 0.8944, + "step": 3062 + }, + { + "epoch": 2.480161943319838, + "grad_norm": 1.5775586891733708, + "learning_rate": 1.4842823514325244e-06, + "loss": 0.9362, + "step": 3063 + }, + { + "epoch": 2.480971659919028, + "grad_norm": 1.645201425602457, + "learning_rate": 1.4797794993338676e-06, + "loss": 0.9915, + "step": 3064 + }, + { + "epoch": 2.4817813765182186, + "grad_norm": 1.6765332328829647, + "learning_rate": 1.4752829419461357e-06, + "loss": 1.0843, + "step": 3065 + }, + { + "epoch": 2.482591093117409, + "grad_norm": 1.718267617032678, + "learning_rate": 1.4707926825913676e-06, + "loss": 0.9818, + "step": 3066 + }, + { + "epoch": 2.483400809716599, + "grad_norm": 1.6812421248362865, + "learning_rate": 1.4663087245869512e-06, + "loss": 0.9961, + "step": 3067 + }, + { + "epoch": 2.4842105263157896, + "grad_norm": 1.6802970912002102, + "learning_rate": 1.4618310712456218e-06, + "loss": 0.9768, + "step": 3068 + }, + { + "epoch": 2.4850202429149797, + "grad_norm": 1.7391114674842005, + "learning_rate": 1.4573597258754546e-06, + "loss": 0.9987, + "step": 3069 + }, + { + "epoch": 2.48582995951417, + "grad_norm": 1.694339682778352, + "learning_rate": 1.4528946917798603e-06, + "loss": 0.9906, + "step": 3070 + }, + { + "epoch": 2.4866396761133602, + "grad_norm": 1.7363838335149249, + "learning_rate": 1.448435972257597e-06, + "loss": 0.9866, + "step": 3071 + }, + { + "epoch": 2.4874493927125507, + "grad_norm": 1.6870306102440977, + "learning_rate": 1.4439835706027526e-06, + "loss": 0.9262, + "step": 3072 + }, + { + "epoch": 2.4882591093117408, + "grad_norm": 1.6692336890689885, + "learning_rate": 1.4395374901047443e-06, + "loss": 0.9166, + "step": 3073 + }, + { + "epoch": 2.4890688259109313, + "grad_norm": 1.6886739622081277, + "learning_rate": 1.4350977340483218e-06, + "loss": 1.0644, + "step": 3074 + }, + { + "epoch": 2.4898785425101213, + "grad_norm": 1.6364093438608744, + "learning_rate": 1.4306643057135638e-06, + "loss": 1.0401, + "step": 3075 + }, + { + "epoch": 2.490688259109312, + "grad_norm": 1.6954035597211397, + "learning_rate": 1.4262372083758714e-06, + "loss": 0.981, + "step": 3076 + }, + { + "epoch": 2.491497975708502, + "grad_norm": 1.6508448957202095, + "learning_rate": 1.4218164453059669e-06, + "loss": 1.0243, + "step": 3077 + }, + { + "epoch": 2.4923076923076923, + "grad_norm": 1.7298928655955168, + "learning_rate": 1.4174020197699e-06, + "loss": 0.9714, + "step": 3078 + }, + { + "epoch": 2.4931174089068824, + "grad_norm": 1.668595707670543, + "learning_rate": 1.4129939350290312e-06, + "loss": 1.01, + "step": 3079 + }, + { + "epoch": 2.493927125506073, + "grad_norm": 1.7117470225490021, + "learning_rate": 1.4085921943400416e-06, + "loss": 0.961, + "step": 3080 + }, + { + "epoch": 2.4947368421052634, + "grad_norm": 1.6420783440308226, + "learning_rate": 1.404196800954921e-06, + "loss": 0.954, + "step": 3081 + }, + { + "epoch": 2.4955465587044534, + "grad_norm": 1.6312366446037767, + "learning_rate": 1.3998077581209712e-06, + "loss": 0.9967, + "step": 3082 + }, + { + "epoch": 2.496356275303644, + "grad_norm": 1.7256120345016341, + "learning_rate": 1.3954250690808036e-06, + "loss": 0.9527, + "step": 3083 + }, + { + "epoch": 2.497165991902834, + "grad_norm": 1.6410665457677764, + "learning_rate": 1.3910487370723347e-06, + "loss": 1.0166, + "step": 3084 + }, + { + "epoch": 2.4979757085020244, + "grad_norm": 1.683999810079675, + "learning_rate": 1.3866787653287804e-06, + "loss": 0.8987, + "step": 3085 + }, + { + "epoch": 2.4987854251012145, + "grad_norm": 1.7049093242978388, + "learning_rate": 1.3823151570786653e-06, + "loss": 0.9651, + "step": 3086 + }, + { + "epoch": 2.499595141700405, + "grad_norm": 1.7012673477415141, + "learning_rate": 1.3779579155458089e-06, + "loss": 0.9147, + "step": 3087 + }, + { + "epoch": 2.500404858299595, + "grad_norm": 1.7058323956163846, + "learning_rate": 1.3736070439493277e-06, + "loss": 0.9477, + "step": 3088 + }, + { + "epoch": 2.5012145748987855, + "grad_norm": 1.7742651035823065, + "learning_rate": 1.369262545503629e-06, + "loss": 0.9419, + "step": 3089 + }, + { + "epoch": 2.5020242914979756, + "grad_norm": 1.5760497645036358, + "learning_rate": 1.3649244234184157e-06, + "loss": 0.9739, + "step": 3090 + }, + { + "epoch": 2.502834008097166, + "grad_norm": 1.637791167494247, + "learning_rate": 1.3605926808986758e-06, + "loss": 0.9618, + "step": 3091 + }, + { + "epoch": 2.5036437246963565, + "grad_norm": 1.6641954657060896, + "learning_rate": 1.3562673211446863e-06, + "loss": 0.992, + "step": 3092 + }, + { + "epoch": 2.5044534412955466, + "grad_norm": 1.687575151546589, + "learning_rate": 1.3519483473520124e-06, + "loss": 0.946, + "step": 3093 + }, + { + "epoch": 2.5052631578947366, + "grad_norm": 1.6264510448768228, + "learning_rate": 1.3476357627114945e-06, + "loss": 1.0184, + "step": 3094 + }, + { + "epoch": 2.506072874493927, + "grad_norm": 1.630753363552384, + "learning_rate": 1.3433295704092586e-06, + "loss": 0.9742, + "step": 3095 + }, + { + "epoch": 2.5068825910931176, + "grad_norm": 1.7544663565446108, + "learning_rate": 1.3390297736267033e-06, + "loss": 0.9339, + "step": 3096 + }, + { + "epoch": 2.5076923076923077, + "grad_norm": 1.6250326610562258, + "learning_rate": 1.3347363755405064e-06, + "loss": 0.9355, + "step": 3097 + }, + { + "epoch": 2.5085020242914977, + "grad_norm": 1.7027349777509002, + "learning_rate": 1.3304493793226135e-06, + "loss": 0.9723, + "step": 3098 + }, + { + "epoch": 2.509311740890688, + "grad_norm": 1.8050244837715836, + "learning_rate": 1.3261687881402464e-06, + "loss": 0.99, + "step": 3099 + }, + { + "epoch": 2.5101214574898787, + "grad_norm": 1.7168877357885066, + "learning_rate": 1.3218946051558867e-06, + "loss": 1.0173, + "step": 3100 + }, + { + "epoch": 2.5109311740890687, + "grad_norm": 1.7435696786750583, + "learning_rate": 1.3176268335272934e-06, + "loss": 1.0018, + "step": 3101 + }, + { + "epoch": 2.5117408906882592, + "grad_norm": 1.5938621762485412, + "learning_rate": 1.3133654764074765e-06, + "loss": 1.111, + "step": 3102 + }, + { + "epoch": 2.5125506072874493, + "grad_norm": 1.6757330684573215, + "learning_rate": 1.3091105369447166e-06, + "loss": 0.9999, + "step": 3103 + }, + { + "epoch": 2.5133603238866398, + "grad_norm": 1.7130683949058882, + "learning_rate": 1.3048620182825478e-06, + "loss": 1.0611, + "step": 3104 + }, + { + "epoch": 2.51417004048583, + "grad_norm": 1.7070694868968392, + "learning_rate": 1.3006199235597628e-06, + "loss": 0.9069, + "step": 3105 + }, + { + "epoch": 2.5149797570850203, + "grad_norm": 1.6521586885628545, + "learning_rate": 1.2963842559104045e-06, + "loss": 1.0559, + "step": 3106 + }, + { + "epoch": 2.515789473684211, + "grad_norm": 1.6872800067468905, + "learning_rate": 1.29215501846377e-06, + "loss": 1.0301, + "step": 3107 + }, + { + "epoch": 2.516599190283401, + "grad_norm": 1.6209272379948023, + "learning_rate": 1.2879322143444095e-06, + "loss": 0.9728, + "step": 3108 + }, + { + "epoch": 2.517408906882591, + "grad_norm": 1.6880406144910838, + "learning_rate": 1.2837158466721155e-06, + "loss": 0.982, + "step": 3109 + }, + { + "epoch": 2.5182186234817814, + "grad_norm": 1.7156607604203853, + "learning_rate": 1.279505918561923e-06, + "loss": 1.0017, + "step": 3110 + }, + { + "epoch": 2.519028340080972, + "grad_norm": 1.6981687300306323, + "learning_rate": 1.2753024331241193e-06, + "loss": 0.9594, + "step": 3111 + }, + { + "epoch": 2.519838056680162, + "grad_norm": 1.7067120833158638, + "learning_rate": 1.2711053934642225e-06, + "loss": 0.9487, + "step": 3112 + }, + { + "epoch": 2.520647773279352, + "grad_norm": 1.689789662984842, + "learning_rate": 1.2669148026829915e-06, + "loss": 1.0139, + "step": 3113 + }, + { + "epoch": 2.5214574898785425, + "grad_norm": 1.6470500284421414, + "learning_rate": 1.2627306638764213e-06, + "loss": 1.0077, + "step": 3114 + }, + { + "epoch": 2.522267206477733, + "grad_norm": 1.5826496661345488, + "learning_rate": 1.2585529801357377e-06, + "loss": 1.0494, + "step": 3115 + }, + { + "epoch": 2.523076923076923, + "grad_norm": 1.742074564535538, + "learning_rate": 1.2543817545474036e-06, + "loss": 1.0169, + "step": 3116 + }, + { + "epoch": 2.5238866396761135, + "grad_norm": 1.7660057671815605, + "learning_rate": 1.250216990193105e-06, + "loss": 0.9641, + "step": 3117 + }, + { + "epoch": 2.5246963562753035, + "grad_norm": 1.652494097329047, + "learning_rate": 1.246058690149755e-06, + "loss": 0.9172, + "step": 3118 + }, + { + "epoch": 2.525506072874494, + "grad_norm": 1.6461968449457183, + "learning_rate": 1.2419068574894943e-06, + "loss": 1.0005, + "step": 3119 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 1.6624721041701427, + "learning_rate": 1.2377614952796825e-06, + "loss": 0.9542, + "step": 3120 + }, + { + "epoch": 2.5271255060728746, + "grad_norm": 1.7097282118454058, + "learning_rate": 1.2336226065828993e-06, + "loss": 0.9636, + "step": 3121 + }, + { + "epoch": 2.527935222672065, + "grad_norm": 1.7315637749359838, + "learning_rate": 1.2294901944569394e-06, + "loss": 0.9522, + "step": 3122 + }, + { + "epoch": 2.528744939271255, + "grad_norm": 1.68876777571607, + "learning_rate": 1.22536426195482e-06, + "loss": 0.9978, + "step": 3123 + }, + { + "epoch": 2.529554655870445, + "grad_norm": 1.7501789467762574, + "learning_rate": 1.2212448121247643e-06, + "loss": 0.9859, + "step": 3124 + }, + { + "epoch": 2.5303643724696356, + "grad_norm": 1.6915559058052043, + "learning_rate": 1.217131848010209e-06, + "loss": 0.9187, + "step": 3125 + }, + { + "epoch": 2.531174089068826, + "grad_norm": 1.7489759930129678, + "learning_rate": 1.2130253726497954e-06, + "loss": 0.9737, + "step": 3126 + }, + { + "epoch": 2.531983805668016, + "grad_norm": 1.6670553247236088, + "learning_rate": 1.2089253890773789e-06, + "loss": 0.8828, + "step": 3127 + }, + { + "epoch": 2.532793522267206, + "grad_norm": 1.6651695693818536, + "learning_rate": 1.204831900322011e-06, + "loss": 0.9528, + "step": 3128 + }, + { + "epoch": 2.5336032388663967, + "grad_norm": 1.6250021225860003, + "learning_rate": 1.200744909407946e-06, + "loss": 0.9981, + "step": 3129 + }, + { + "epoch": 2.534412955465587, + "grad_norm": 1.7704071649167492, + "learning_rate": 1.196664419354644e-06, + "loss": 0.9586, + "step": 3130 + }, + { + "epoch": 2.5352226720647772, + "grad_norm": 1.732509606852866, + "learning_rate": 1.1925904331767545e-06, + "loss": 0.9765, + "step": 3131 + }, + { + "epoch": 2.5360323886639677, + "grad_norm": 1.7830253671280303, + "learning_rate": 1.1885229538841259e-06, + "loss": 0.9934, + "step": 3132 + }, + { + "epoch": 2.536842105263158, + "grad_norm": 1.7228909788826947, + "learning_rate": 1.1844619844817995e-06, + "loss": 0.9926, + "step": 3133 + }, + { + "epoch": 2.5376518218623483, + "grad_norm": 1.7301843770006853, + "learning_rate": 1.1804075279700023e-06, + "loss": 0.9108, + "step": 3134 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 1.7297103310761568, + "learning_rate": 1.176359587344158e-06, + "loss": 0.9898, + "step": 3135 + }, + { + "epoch": 2.539271255060729, + "grad_norm": 1.6692620409074408, + "learning_rate": 1.17231816559487e-06, + "loss": 0.9766, + "step": 3136 + }, + { + "epoch": 2.540080971659919, + "grad_norm": 1.7138975481366907, + "learning_rate": 1.168283265707927e-06, + "loss": 1.0342, + "step": 3137 + }, + { + "epoch": 2.5408906882591094, + "grad_norm": 1.8684377760837239, + "learning_rate": 1.1642548906643003e-06, + "loss": 0.917, + "step": 3138 + }, + { + "epoch": 2.5417004048582994, + "grad_norm": 1.6682784606850616, + "learning_rate": 1.160233043440141e-06, + "loss": 0.9585, + "step": 3139 + }, + { + "epoch": 2.54251012145749, + "grad_norm": 1.646760129502982, + "learning_rate": 1.1562177270067766e-06, + "loss": 0.9977, + "step": 3140 + }, + { + "epoch": 2.5433198380566804, + "grad_norm": 1.7761301320650948, + "learning_rate": 1.1522089443307083e-06, + "loss": 1.0222, + "step": 3141 + }, + { + "epoch": 2.5441295546558704, + "grad_norm": 1.713346287093232, + "learning_rate": 1.1482066983736095e-06, + "loss": 1.0214, + "step": 3142 + }, + { + "epoch": 2.5449392712550605, + "grad_norm": 1.739109874170232, + "learning_rate": 1.1442109920923317e-06, + "loss": 0.9595, + "step": 3143 + }, + { + "epoch": 2.545748987854251, + "grad_norm": 1.797935688363354, + "learning_rate": 1.1402218284388845e-06, + "loss": 0.9891, + "step": 3144 + }, + { + "epoch": 2.5465587044534415, + "grad_norm": 1.7311448213719867, + "learning_rate": 1.1362392103604536e-06, + "loss": 0.9481, + "step": 3145 + }, + { + "epoch": 2.5473684210526315, + "grad_norm": 1.6344396564565618, + "learning_rate": 1.132263140799381e-06, + "loss": 0.9162, + "step": 3146 + }, + { + "epoch": 2.548178137651822, + "grad_norm": 1.7616052113712068, + "learning_rate": 1.1282936226931762e-06, + "loss": 0.947, + "step": 3147 + }, + { + "epoch": 2.548987854251012, + "grad_norm": 1.671278873181062, + "learning_rate": 1.124330658974503e-06, + "loss": 0.9724, + "step": 3148 + }, + { + "epoch": 2.5497975708502025, + "grad_norm": 1.624337775639022, + "learning_rate": 1.120374252571188e-06, + "loss": 0.9633, + "step": 3149 + }, + { + "epoch": 2.5506072874493926, + "grad_norm": 1.7342053818211673, + "learning_rate": 1.1164244064062101e-06, + "loss": 0.9351, + "step": 3150 + }, + { + "epoch": 2.551417004048583, + "grad_norm": 1.7210916677441777, + "learning_rate": 1.112481123397704e-06, + "loss": 0.936, + "step": 3151 + }, + { + "epoch": 2.552226720647773, + "grad_norm": 1.7346383956750289, + "learning_rate": 1.1085444064589523e-06, + "loss": 0.9564, + "step": 3152 + }, + { + "epoch": 2.5530364372469636, + "grad_norm": 1.7122232488976015, + "learning_rate": 1.104614258498392e-06, + "loss": 1.0299, + "step": 3153 + }, + { + "epoch": 2.5538461538461537, + "grad_norm": 1.6894970983661606, + "learning_rate": 1.1006906824196006e-06, + "loss": 1.052, + "step": 3154 + }, + { + "epoch": 2.554655870445344, + "grad_norm": 1.7405016755106155, + "learning_rate": 1.0967736811213048e-06, + "loss": 0.9761, + "step": 3155 + }, + { + "epoch": 2.5554655870445346, + "grad_norm": 1.6635809223324118, + "learning_rate": 1.0928632574973718e-06, + "loss": 0.9735, + "step": 3156 + }, + { + "epoch": 2.5562753036437247, + "grad_norm": 1.5865674939483103, + "learning_rate": 1.0889594144368088e-06, + "loss": 0.9703, + "step": 3157 + }, + { + "epoch": 2.5570850202429147, + "grad_norm": 1.7886384841861689, + "learning_rate": 1.0850621548237604e-06, + "loss": 0.953, + "step": 3158 + }, + { + "epoch": 2.557894736842105, + "grad_norm": 1.65444440912227, + "learning_rate": 1.081171481537513e-06, + "loss": 1.0416, + "step": 3159 + }, + { + "epoch": 2.5587044534412957, + "grad_norm": 1.6010494418210732, + "learning_rate": 1.0772873974524833e-06, + "loss": 0.978, + "step": 3160 + }, + { + "epoch": 2.5595141700404858, + "grad_norm": 1.753931513386066, + "learning_rate": 1.0734099054382186e-06, + "loss": 1.0272, + "step": 3161 + }, + { + "epoch": 2.5603238866396762, + "grad_norm": 1.7002625406870047, + "learning_rate": 1.069539008359397e-06, + "loss": 0.9895, + "step": 3162 + }, + { + "epoch": 2.5611336032388663, + "grad_norm": 1.696885279537435, + "learning_rate": 1.0656747090758246e-06, + "loss": 1.042, + "step": 3163 + }, + { + "epoch": 2.561943319838057, + "grad_norm": 1.6725123732842846, + "learning_rate": 1.061817010442433e-06, + "loss": 0.9873, + "step": 3164 + }, + { + "epoch": 2.562753036437247, + "grad_norm": 1.744422716922843, + "learning_rate": 1.0579659153092759e-06, + "loss": 0.9915, + "step": 3165 + }, + { + "epoch": 2.5635627530364373, + "grad_norm": 1.6982287591882945, + "learning_rate": 1.0541214265215328e-06, + "loss": 0.993, + "step": 3166 + }, + { + "epoch": 2.5643724696356274, + "grad_norm": 1.6580163858315626, + "learning_rate": 1.0502835469194961e-06, + "loss": 1.0379, + "step": 3167 + }, + { + "epoch": 2.565182186234818, + "grad_norm": 1.6503684640252827, + "learning_rate": 1.0464522793385822e-06, + "loss": 0.9683, + "step": 3168 + }, + { + "epoch": 2.565991902834008, + "grad_norm": 1.6600515587726554, + "learning_rate": 1.0426276266093172e-06, + "loss": 1.0482, + "step": 3169 + }, + { + "epoch": 2.5668016194331984, + "grad_norm": 1.7091294278437665, + "learning_rate": 1.0388095915573427e-06, + "loss": 1.0417, + "step": 3170 + }, + { + "epoch": 2.567611336032389, + "grad_norm": 1.6555794090881855, + "learning_rate": 1.034998177003409e-06, + "loss": 0.9302, + "step": 3171 + }, + { + "epoch": 2.568421052631579, + "grad_norm": 1.7961476476441376, + "learning_rate": 1.0311933857633772e-06, + "loss": 0.967, + "step": 3172 + }, + { + "epoch": 2.569230769230769, + "grad_norm": 1.6778244264366644, + "learning_rate": 1.027395220648213e-06, + "loss": 1.0396, + "step": 3173 + }, + { + "epoch": 2.5700404858299595, + "grad_norm": 1.6859023669000333, + "learning_rate": 1.0236036844639897e-06, + "loss": 0.9793, + "step": 3174 + }, + { + "epoch": 2.57085020242915, + "grad_norm": 1.7334612254526203, + "learning_rate": 1.0198187800118842e-06, + "loss": 0.9165, + "step": 3175 + }, + { + "epoch": 2.57165991902834, + "grad_norm": 1.726216684645843, + "learning_rate": 1.0160405100881699e-06, + "loss": 1.0163, + "step": 3176 + }, + { + "epoch": 2.5724696356275305, + "grad_norm": 1.6800195699709957, + "learning_rate": 1.0122688774842194e-06, + "loss": 0.9615, + "step": 3177 + }, + { + "epoch": 2.5732793522267206, + "grad_norm": 1.7496662315340423, + "learning_rate": 1.0085038849865025e-06, + "loss": 0.9433, + "step": 3178 + }, + { + "epoch": 2.574089068825911, + "grad_norm": 1.669757711256767, + "learning_rate": 1.0047455353765845e-06, + "loss": 0.998, + "step": 3179 + }, + { + "epoch": 2.574898785425101, + "grad_norm": 1.7649644449020268, + "learning_rate": 1.0009938314311186e-06, + "loss": 0.8943, + "step": 3180 + }, + { + "epoch": 2.5757085020242916, + "grad_norm": 1.665726033058726, + "learning_rate": 9.972487759218551e-07, + "loss": 0.9636, + "step": 3181 + }, + { + "epoch": 2.5765182186234816, + "grad_norm": 1.6714613239053895, + "learning_rate": 9.935103716156258e-07, + "loss": 0.9908, + "step": 3182 + }, + { + "epoch": 2.577327935222672, + "grad_norm": 1.780013775907417, + "learning_rate": 9.897786212743543e-07, + "loss": 0.9168, + "step": 3183 + }, + { + "epoch": 2.578137651821862, + "grad_norm": 1.6637866101217857, + "learning_rate": 9.860535276550443e-07, + "loss": 0.9549, + "step": 3184 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 1.6430785762195612, + "learning_rate": 9.82335093509782e-07, + "loss": 0.9702, + "step": 3185 + }, + { + "epoch": 2.579757085020243, + "grad_norm": 1.7132270223349593, + "learning_rate": 9.786233215857354e-07, + "loss": 0.9692, + "step": 3186 + }, + { + "epoch": 2.580566801619433, + "grad_norm": 1.7060451332011317, + "learning_rate": 9.74918214625149e-07, + "loss": 0.9941, + "step": 3187 + }, + { + "epoch": 2.5813765182186232, + "grad_norm": 1.6254697719748301, + "learning_rate": 9.712197753653418e-07, + "loss": 1.0413, + "step": 3188 + }, + { + "epoch": 2.5821862348178137, + "grad_norm": 1.6890779172726746, + "learning_rate": 9.675280065387117e-07, + "loss": 0.9546, + "step": 3189 + }, + { + "epoch": 2.582995951417004, + "grad_norm": 1.6665698649055485, + "learning_rate": 9.638429108727232e-07, + "loss": 0.9145, + "step": 3190 + }, + { + "epoch": 2.5838056680161943, + "grad_norm": 1.7455050077057428, + "learning_rate": 9.601644910899144e-07, + "loss": 0.9475, + "step": 3191 + }, + { + "epoch": 2.5846153846153848, + "grad_norm": 1.707769647747537, + "learning_rate": 9.56492749907889e-07, + "loss": 0.9925, + "step": 3192 + }, + { + "epoch": 2.585425101214575, + "grad_norm": 1.7626431911020435, + "learning_rate": 9.528276900393185e-07, + "loss": 0.9197, + "step": 3193 + }, + { + "epoch": 2.5862348178137653, + "grad_norm": 1.6723942662699407, + "learning_rate": 9.491693141919345e-07, + "loss": 0.9899, + "step": 3194 + }, + { + "epoch": 2.5870445344129553, + "grad_norm": 1.66701720323838, + "learning_rate": 9.455176250685338e-07, + "loss": 1.0288, + "step": 3195 + }, + { + "epoch": 2.587854251012146, + "grad_norm": 1.6539257270825336, + "learning_rate": 9.418726253669741e-07, + "loss": 1.0232, + "step": 3196 + }, + { + "epoch": 2.588663967611336, + "grad_norm": 1.5465095186573317, + "learning_rate": 9.38234317780169e-07, + "loss": 0.9658, + "step": 3197 + }, + { + "epoch": 2.5894736842105264, + "grad_norm": 1.7029852892332802, + "learning_rate": 9.346027049960849e-07, + "loss": 0.9513, + "step": 3198 + }, + { + "epoch": 2.5902834008097164, + "grad_norm": 1.831209877056402, + "learning_rate": 9.309777896977501e-07, + "loss": 0.9345, + "step": 3199 + }, + { + "epoch": 2.591093117408907, + "grad_norm": 1.6872660132013921, + "learning_rate": 9.27359574563238e-07, + "loss": 0.9709, + "step": 3200 + }, + { + "epoch": 2.5919028340080974, + "grad_norm": 1.6893163114944878, + "learning_rate": 9.237480622656736e-07, + "loss": 0.9725, + "step": 3201 + }, + { + "epoch": 2.5927125506072874, + "grad_norm": 1.6058163315797132, + "learning_rate": 9.201432554732304e-07, + "loss": 1.0278, + "step": 3202 + }, + { + "epoch": 2.5935222672064775, + "grad_norm": 1.692466394304998, + "learning_rate": 9.165451568491257e-07, + "loss": 1.0333, + "step": 3203 + }, + { + "epoch": 2.594331983805668, + "grad_norm": 1.640592263798185, + "learning_rate": 9.129537690516277e-07, + "loss": 0.9587, + "step": 3204 + }, + { + "epoch": 2.5951417004048585, + "grad_norm": 1.6407337256128884, + "learning_rate": 9.093690947340406e-07, + "loss": 0.937, + "step": 3205 + }, + { + "epoch": 2.5959514170040485, + "grad_norm": 1.738587051999838, + "learning_rate": 9.057911365447058e-07, + "loss": 0.9966, + "step": 3206 + }, + { + "epoch": 2.596761133603239, + "grad_norm": 1.6250090533624972, + "learning_rate": 9.022198971270124e-07, + "loss": 0.9348, + "step": 3207 + }, + { + "epoch": 2.597570850202429, + "grad_norm": 1.643745528654048, + "learning_rate": 8.986553791193775e-07, + "loss": 1.0099, + "step": 3208 + }, + { + "epoch": 2.5983805668016196, + "grad_norm": 1.6883666457612738, + "learning_rate": 8.950975851552568e-07, + "loss": 0.9812, + "step": 3209 + }, + { + "epoch": 2.5991902834008096, + "grad_norm": 1.7178288521933882, + "learning_rate": 8.915465178631344e-07, + "loss": 0.9968, + "step": 3210 + }, + { + "epoch": 2.6, + "grad_norm": 1.6723879917548437, + "learning_rate": 8.880021798665295e-07, + "loss": 0.9755, + "step": 3211 + }, + { + "epoch": 2.60080971659919, + "grad_norm": 1.705852934667388, + "learning_rate": 8.844645737839874e-07, + "loss": 0.9901, + "step": 3212 + }, + { + "epoch": 2.6016194331983806, + "grad_norm": 1.7322533252061503, + "learning_rate": 8.809337022290787e-07, + "loss": 0.938, + "step": 3213 + }, + { + "epoch": 2.6024291497975707, + "grad_norm": 1.6855599969482573, + "learning_rate": 8.774095678103978e-07, + "loss": 0.9581, + "step": 3214 + }, + { + "epoch": 2.603238866396761, + "grad_norm": 1.6274201556331367, + "learning_rate": 8.738921731315686e-07, + "loss": 1.0798, + "step": 3215 + }, + { + "epoch": 2.6040485829959517, + "grad_norm": 1.61403911744669, + "learning_rate": 8.70381520791227e-07, + "loss": 1.0021, + "step": 3216 + }, + { + "epoch": 2.6048582995951417, + "grad_norm": 1.678255519779955, + "learning_rate": 8.668776133830315e-07, + "loss": 0.9717, + "step": 3217 + }, + { + "epoch": 2.6056680161943317, + "grad_norm": 1.627162580752701, + "learning_rate": 8.633804534956591e-07, + "loss": 0.8836, + "step": 3218 + }, + { + "epoch": 2.6064777327935222, + "grad_norm": 1.6220265843378787, + "learning_rate": 8.598900437127999e-07, + "loss": 0.9926, + "step": 3219 + }, + { + "epoch": 2.6072874493927127, + "grad_norm": 1.637954735309685, + "learning_rate": 8.564063866131567e-07, + "loss": 1.0443, + "step": 3220 + }, + { + "epoch": 2.6080971659919028, + "grad_norm": 1.6447001385429936, + "learning_rate": 8.529294847704428e-07, + "loss": 0.8968, + "step": 3221 + }, + { + "epoch": 2.6089068825910933, + "grad_norm": 1.732595510365215, + "learning_rate": 8.494593407533814e-07, + "loss": 0.9386, + "step": 3222 + }, + { + "epoch": 2.6097165991902833, + "grad_norm": 1.6660686105327704, + "learning_rate": 8.459959571257071e-07, + "loss": 1.0091, + "step": 3223 + }, + { + "epoch": 2.610526315789474, + "grad_norm": 1.7013721994565274, + "learning_rate": 8.425393364461542e-07, + "loss": 0.9747, + "step": 3224 + }, + { + "epoch": 2.611336032388664, + "grad_norm": 1.7819007792713526, + "learning_rate": 8.390894812684602e-07, + "loss": 0.9579, + "step": 3225 + }, + { + "epoch": 2.6121457489878543, + "grad_norm": 1.5960953357473817, + "learning_rate": 8.356463941413717e-07, + "loss": 1.0558, + "step": 3226 + }, + { + "epoch": 2.6129554655870444, + "grad_norm": 1.7717830181251943, + "learning_rate": 8.322100776086272e-07, + "loss": 1.0088, + "step": 3227 + }, + { + "epoch": 2.613765182186235, + "grad_norm": 1.7439967991496321, + "learning_rate": 8.287805342089672e-07, + "loss": 0.9053, + "step": 3228 + }, + { + "epoch": 2.614574898785425, + "grad_norm": 1.670184838895179, + "learning_rate": 8.253577664761259e-07, + "loss": 1.0389, + "step": 3229 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 1.7399555221858654, + "learning_rate": 8.219417769388316e-07, + "loss": 0.9939, + "step": 3230 + }, + { + "epoch": 2.616194331983806, + "grad_norm": 1.6749694700524644, + "learning_rate": 8.1853256812081e-07, + "loss": 1.0025, + "step": 3231 + }, + { + "epoch": 2.617004048582996, + "grad_norm": 1.7379585443686947, + "learning_rate": 8.151301425407699e-07, + "loss": 0.9861, + "step": 3232 + }, + { + "epoch": 2.617813765182186, + "grad_norm": 1.6065227038635674, + "learning_rate": 8.117345027124146e-07, + "loss": 1.0168, + "step": 3233 + }, + { + "epoch": 2.6186234817813765, + "grad_norm": 1.7012911860488888, + "learning_rate": 8.083456511444309e-07, + "loss": 1.0043, + "step": 3234 + }, + { + "epoch": 2.619433198380567, + "grad_norm": 1.716691520394183, + "learning_rate": 8.049635903404907e-07, + "loss": 0.9689, + "step": 3235 + }, + { + "epoch": 2.620242914979757, + "grad_norm": 1.7278598183805447, + "learning_rate": 8.015883227992505e-07, + "loss": 1.027, + "step": 3236 + }, + { + "epoch": 2.6210526315789475, + "grad_norm": 1.7367934858252325, + "learning_rate": 7.982198510143457e-07, + "loss": 0.9283, + "step": 3237 + }, + { + "epoch": 2.6218623481781376, + "grad_norm": 1.6429152171351453, + "learning_rate": 7.948581774743902e-07, + "loss": 0.9303, + "step": 3238 + }, + { + "epoch": 2.622672064777328, + "grad_norm": 1.7790796815176042, + "learning_rate": 7.915033046629817e-07, + "loss": 0.9561, + "step": 3239 + }, + { + "epoch": 2.623481781376518, + "grad_norm": 1.686258952709826, + "learning_rate": 7.881552350586863e-07, + "loss": 0.8947, + "step": 3240 + }, + { + "epoch": 2.6242914979757086, + "grad_norm": 1.758970152066431, + "learning_rate": 7.848139711350489e-07, + "loss": 1.048, + "step": 3241 + }, + { + "epoch": 2.6251012145748986, + "grad_norm": 1.6233206462939287, + "learning_rate": 7.814795153605825e-07, + "loss": 1.0282, + "step": 3242 + }, + { + "epoch": 2.625910931174089, + "grad_norm": 1.5853805998689159, + "learning_rate": 7.781518701987734e-07, + "loss": 1.0069, + "step": 3243 + }, + { + "epoch": 2.626720647773279, + "grad_norm": 1.849575708030945, + "learning_rate": 7.748310381080749e-07, + "loss": 0.904, + "step": 3244 + }, + { + "epoch": 2.6275303643724697, + "grad_norm": 1.655980362780037, + "learning_rate": 7.715170215419043e-07, + "loss": 0.988, + "step": 3245 + }, + { + "epoch": 2.62834008097166, + "grad_norm": 1.6899812614311005, + "learning_rate": 7.682098229486478e-07, + "loss": 0.9808, + "step": 3246 + }, + { + "epoch": 2.62914979757085, + "grad_norm": 1.6541647003198807, + "learning_rate": 7.649094447716532e-07, + "loss": 0.961, + "step": 3247 + }, + { + "epoch": 2.6299595141700403, + "grad_norm": 1.6996182835373292, + "learning_rate": 7.616158894492298e-07, + "loss": 0.9375, + "step": 3248 + }, + { + "epoch": 2.6307692307692307, + "grad_norm": 1.644339997790146, + "learning_rate": 7.583291594146458e-07, + "loss": 1.0002, + "step": 3249 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 1.7094644080217443, + "learning_rate": 7.550492570961243e-07, + "loss": 0.9905, + "step": 3250 + }, + { + "epoch": 2.6323886639676113, + "grad_norm": 1.724822229310986, + "learning_rate": 7.517761849168481e-07, + "loss": 0.9991, + "step": 3251 + }, + { + "epoch": 2.6331983805668018, + "grad_norm": 1.6081123719870767, + "learning_rate": 7.485099452949507e-07, + "loss": 0.9177, + "step": 3252 + }, + { + "epoch": 2.634008097165992, + "grad_norm": 1.7226834341746038, + "learning_rate": 7.452505406435184e-07, + "loss": 0.9244, + "step": 3253 + }, + { + "epoch": 2.6348178137651823, + "grad_norm": 1.6628814186493102, + "learning_rate": 7.419979733705929e-07, + "loss": 1.0378, + "step": 3254 + }, + { + "epoch": 2.6356275303643724, + "grad_norm": 1.738063003473798, + "learning_rate": 7.387522458791552e-07, + "loss": 0.9352, + "step": 3255 + }, + { + "epoch": 2.636437246963563, + "grad_norm": 1.680315939042461, + "learning_rate": 7.355133605671417e-07, + "loss": 1.0976, + "step": 3256 + }, + { + "epoch": 2.637246963562753, + "grad_norm": 1.665843160940704, + "learning_rate": 7.322813198274303e-07, + "loss": 0.9783, + "step": 3257 + }, + { + "epoch": 2.6380566801619434, + "grad_norm": 1.702934773134636, + "learning_rate": 7.290561260478401e-07, + "loss": 1.012, + "step": 3258 + }, + { + "epoch": 2.6388663967611334, + "grad_norm": 1.6427499028473984, + "learning_rate": 7.258377816111339e-07, + "loss": 1.0654, + "step": 3259 + }, + { + "epoch": 2.639676113360324, + "grad_norm": 1.681106134788699, + "learning_rate": 7.226262888950153e-07, + "loss": 0.9816, + "step": 3260 + }, + { + "epoch": 2.6404858299595144, + "grad_norm": 1.6644509739713758, + "learning_rate": 7.194216502721219e-07, + "loss": 1.0321, + "step": 3261 + }, + { + "epoch": 2.6412955465587045, + "grad_norm": 1.6556000089922558, + "learning_rate": 7.16223868110032e-07, + "loss": 0.9842, + "step": 3262 + }, + { + "epoch": 2.6421052631578945, + "grad_norm": 1.6874491415426638, + "learning_rate": 7.130329447712581e-07, + "loss": 0.9057, + "step": 3263 + }, + { + "epoch": 2.642914979757085, + "grad_norm": 1.7091112739486392, + "learning_rate": 7.098488826132422e-07, + "loss": 1.008, + "step": 3264 + }, + { + "epoch": 2.6437246963562755, + "grad_norm": 1.6178389429525835, + "learning_rate": 7.066716839883592e-07, + "loss": 0.9924, + "step": 3265 + }, + { + "epoch": 2.6445344129554655, + "grad_norm": 1.6485423266523553, + "learning_rate": 7.035013512439126e-07, + "loss": 1.054, + "step": 3266 + }, + { + "epoch": 2.645344129554656, + "grad_norm": 1.661813731422273, + "learning_rate": 7.003378867221344e-07, + "loss": 0.9126, + "step": 3267 + }, + { + "epoch": 2.646153846153846, + "grad_norm": 1.6864743875444674, + "learning_rate": 6.971812927601806e-07, + "loss": 0.9689, + "step": 3268 + }, + { + "epoch": 2.6469635627530366, + "grad_norm": 1.7764344486102688, + "learning_rate": 6.940315716901347e-07, + "loss": 0.9694, + "step": 3269 + }, + { + "epoch": 2.6477732793522266, + "grad_norm": 1.7426314403267598, + "learning_rate": 6.908887258389974e-07, + "loss": 1.0287, + "step": 3270 + }, + { + "epoch": 2.648582995951417, + "grad_norm": 1.6967622835020153, + "learning_rate": 6.877527575286958e-07, + "loss": 0.8432, + "step": 3271 + }, + { + "epoch": 2.649392712550607, + "grad_norm": 1.659074402231809, + "learning_rate": 6.846236690760721e-07, + "loss": 0.931, + "step": 3272 + }, + { + "epoch": 2.6502024291497976, + "grad_norm": 1.6674943656562866, + "learning_rate": 6.815014627928862e-07, + "loss": 0.9656, + "step": 3273 + }, + { + "epoch": 2.6510121457489877, + "grad_norm": 1.710449217756733, + "learning_rate": 6.783861409858128e-07, + "loss": 0.9369, + "step": 3274 + }, + { + "epoch": 2.651821862348178, + "grad_norm": 1.7486026332376843, + "learning_rate": 6.752777059564431e-07, + "loss": 0.9058, + "step": 3275 + }, + { + "epoch": 2.6526315789473687, + "grad_norm": 1.7290326137066063, + "learning_rate": 6.721761600012766e-07, + "loss": 0.9418, + "step": 3276 + }, + { + "epoch": 2.6534412955465587, + "grad_norm": 1.6974512106097555, + "learning_rate": 6.690815054117283e-07, + "loss": 0.9965, + "step": 3277 + }, + { + "epoch": 2.6542510121457488, + "grad_norm": 1.570906783647935, + "learning_rate": 6.659937444741149e-07, + "loss": 0.9696, + "step": 3278 + }, + { + "epoch": 2.6550607287449393, + "grad_norm": 1.680749541696145, + "learning_rate": 6.629128794696694e-07, + "loss": 0.8977, + "step": 3279 + }, + { + "epoch": 2.6558704453441297, + "grad_norm": 1.6928204758699041, + "learning_rate": 6.598389126745209e-07, + "loss": 1.0138, + "step": 3280 + }, + { + "epoch": 2.65668016194332, + "grad_norm": 1.5916031288683266, + "learning_rate": 6.567718463597061e-07, + "loss": 1.0033, + "step": 3281 + }, + { + "epoch": 2.6574898785425103, + "grad_norm": 1.7010292729133476, + "learning_rate": 6.537116827911649e-07, + "loss": 0.9784, + "step": 3282 + }, + { + "epoch": 2.6582995951417003, + "grad_norm": 1.7137596107033524, + "learning_rate": 6.506584242297332e-07, + "loss": 0.9155, + "step": 3283 + }, + { + "epoch": 2.659109311740891, + "grad_norm": 1.744228981789455, + "learning_rate": 6.476120729311531e-07, + "loss": 0.9438, + "step": 3284 + }, + { + "epoch": 2.659919028340081, + "grad_norm": 1.6938026901844712, + "learning_rate": 6.445726311460553e-07, + "loss": 1.0437, + "step": 3285 + }, + { + "epoch": 2.6607287449392714, + "grad_norm": 1.7064562907469736, + "learning_rate": 6.415401011199707e-07, + "loss": 0.9533, + "step": 3286 + }, + { + "epoch": 2.6615384615384614, + "grad_norm": 1.653652412179681, + "learning_rate": 6.385144850933222e-07, + "loss": 0.9629, + "step": 3287 + }, + { + "epoch": 2.662348178137652, + "grad_norm": 1.7289904638695122, + "learning_rate": 6.354957853014254e-07, + "loss": 1.017, + "step": 3288 + }, + { + "epoch": 2.663157894736842, + "grad_norm": 1.7152833008239363, + "learning_rate": 6.324840039744862e-07, + "loss": 0.9085, + "step": 3289 + }, + { + "epoch": 2.6639676113360324, + "grad_norm": 1.7289503260777432, + "learning_rate": 6.29479143337598e-07, + "loss": 0.8867, + "step": 3290 + }, + { + "epoch": 2.664777327935223, + "grad_norm": 1.6648250831416591, + "learning_rate": 6.264812056107406e-07, + "loss": 0.9589, + "step": 3291 + }, + { + "epoch": 2.665587044534413, + "grad_norm": 1.7548308959687973, + "learning_rate": 6.234901930087822e-07, + "loss": 0.9672, + "step": 3292 + }, + { + "epoch": 2.666396761133603, + "grad_norm": 1.6680110599487112, + "learning_rate": 6.205061077414743e-07, + "loss": 0.9872, + "step": 3293 + }, + { + "epoch": 2.6672064777327935, + "grad_norm": 1.7292774474924442, + "learning_rate": 6.175289520134464e-07, + "loss": 0.9107, + "step": 3294 + }, + { + "epoch": 2.668016194331984, + "grad_norm": 1.7431371411748449, + "learning_rate": 6.145587280242138e-07, + "loss": 1.009, + "step": 3295 + }, + { + "epoch": 2.668825910931174, + "grad_norm": 1.787321743542622, + "learning_rate": 6.115954379681666e-07, + "loss": 0.9275, + "step": 3296 + }, + { + "epoch": 2.669635627530364, + "grad_norm": 1.6827076549424453, + "learning_rate": 6.086390840345758e-07, + "loss": 0.9042, + "step": 3297 + }, + { + "epoch": 2.6704453441295546, + "grad_norm": 1.6938249525960054, + "learning_rate": 6.05689668407582e-07, + "loss": 1.0097, + "step": 3298 + }, + { + "epoch": 2.671255060728745, + "grad_norm": 1.745931288902666, + "learning_rate": 6.027471932662087e-07, + "loss": 1.028, + "step": 3299 + }, + { + "epoch": 2.672064777327935, + "grad_norm": 1.6780881861216097, + "learning_rate": 5.99811660784344e-07, + "loss": 1.0207, + "step": 3300 + }, + { + "epoch": 2.6728744939271256, + "grad_norm": 1.666413076144872, + "learning_rate": 5.968830731307507e-07, + "loss": 0.9445, + "step": 3301 + }, + { + "epoch": 2.6736842105263157, + "grad_norm": 1.6645120361386307, + "learning_rate": 5.93961432469058e-07, + "loss": 0.9988, + "step": 3302 + }, + { + "epoch": 2.674493927125506, + "grad_norm": 1.6392363775063152, + "learning_rate": 5.910467409577669e-07, + "loss": 0.8894, + "step": 3303 + }, + { + "epoch": 2.675303643724696, + "grad_norm": 1.6878734747415562, + "learning_rate": 5.881390007502397e-07, + "loss": 0.9665, + "step": 3304 + }, + { + "epoch": 2.6761133603238867, + "grad_norm": 1.7768958238131116, + "learning_rate": 5.852382139947077e-07, + "loss": 0.9143, + "step": 3305 + }, + { + "epoch": 2.676923076923077, + "grad_norm": 1.5994762220145167, + "learning_rate": 5.82344382834259e-07, + "loss": 1.0549, + "step": 3306 + }, + { + "epoch": 2.6777327935222672, + "grad_norm": 1.7374717323879252, + "learning_rate": 5.7945750940685e-07, + "loss": 1.0182, + "step": 3307 + }, + { + "epoch": 2.6785425101214573, + "grad_norm": 1.650625300217036, + "learning_rate": 5.765775958452935e-07, + "loss": 1.0116, + "step": 3308 + }, + { + "epoch": 2.6793522267206478, + "grad_norm": 1.7121361852833241, + "learning_rate": 5.737046442772576e-07, + "loss": 1.0502, + "step": 3309 + }, + { + "epoch": 2.6801619433198383, + "grad_norm": 1.6502806678346016, + "learning_rate": 5.708386568252688e-07, + "loss": 0.9503, + "step": 3310 + }, + { + "epoch": 2.6809716599190283, + "grad_norm": 1.686124777598768, + "learning_rate": 5.679796356067135e-07, + "loss": 0.9896, + "step": 3311 + }, + { + "epoch": 2.6817813765182184, + "grad_norm": 1.6024788294961974, + "learning_rate": 5.651275827338242e-07, + "loss": 1.0105, + "step": 3312 + }, + { + "epoch": 2.682591093117409, + "grad_norm": 1.7206476831624593, + "learning_rate": 5.622825003136878e-07, + "loss": 0.977, + "step": 3313 + }, + { + "epoch": 2.6834008097165993, + "grad_norm": 1.6455934468913418, + "learning_rate": 5.594443904482439e-07, + "loss": 0.9792, + "step": 3314 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 1.7190031808580448, + "learning_rate": 5.566132552342784e-07, + "loss": 1.0892, + "step": 3315 + }, + { + "epoch": 2.68502024291498, + "grad_norm": 1.6275442282090031, + "learning_rate": 5.53789096763423e-07, + "loss": 0.9998, + "step": 3316 + }, + { + "epoch": 2.68582995951417, + "grad_norm": 1.6723953815880728, + "learning_rate": 5.509719171221583e-07, + "loss": 0.9851, + "step": 3317 + }, + { + "epoch": 2.6866396761133604, + "grad_norm": 1.5877924488779105, + "learning_rate": 5.481617183918053e-07, + "loss": 0.9935, + "step": 3318 + }, + { + "epoch": 2.6874493927125505, + "grad_norm": 1.725319551891715, + "learning_rate": 5.45358502648532e-07, + "loss": 0.9045, + "step": 3319 + }, + { + "epoch": 2.688259109311741, + "grad_norm": 1.6281167669786418, + "learning_rate": 5.425622719633428e-07, + "loss": 0.997, + "step": 3320 + }, + { + "epoch": 2.6890688259109314, + "grad_norm": 1.6635114803247797, + "learning_rate": 5.397730284020863e-07, + "loss": 1.04, + "step": 3321 + }, + { + "epoch": 2.6898785425101215, + "grad_norm": 1.6162289487244115, + "learning_rate": 5.369907740254454e-07, + "loss": 0.9811, + "step": 3322 + }, + { + "epoch": 2.6906882591093115, + "grad_norm": 1.7327330282080413, + "learning_rate": 5.342155108889391e-07, + "loss": 0.9866, + "step": 3323 + }, + { + "epoch": 2.691497975708502, + "grad_norm": 1.673161064229558, + "learning_rate": 5.31447241042925e-07, + "loss": 0.9979, + "step": 3324 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 1.5832573352945025, + "learning_rate": 5.286859665325905e-07, + "loss": 0.9559, + "step": 3325 + }, + { + "epoch": 2.6931174089068826, + "grad_norm": 1.6724549773177813, + "learning_rate": 5.259316893979549e-07, + "loss": 0.9106, + "step": 3326 + }, + { + "epoch": 2.6939271255060726, + "grad_norm": 1.6314795157115118, + "learning_rate": 5.231844116738716e-07, + "loss": 0.972, + "step": 3327 + }, + { + "epoch": 2.694736842105263, + "grad_norm": 1.6811218193411817, + "learning_rate": 5.204441353900169e-07, + "loss": 0.9769, + "step": 3328 + }, + { + "epoch": 2.6955465587044536, + "grad_norm": 1.728655218184744, + "learning_rate": 5.177108625709026e-07, + "loss": 1.0373, + "step": 3329 + }, + { + "epoch": 2.6963562753036436, + "grad_norm": 1.652166979844383, + "learning_rate": 5.149845952358589e-07, + "loss": 1.0337, + "step": 3330 + }, + { + "epoch": 2.697165991902834, + "grad_norm": 1.668823613835129, + "learning_rate": 5.122653353990437e-07, + "loss": 0.9601, + "step": 3331 + }, + { + "epoch": 2.697975708502024, + "grad_norm": 1.619860141890713, + "learning_rate": 5.095530850694375e-07, + "loss": 1.0214, + "step": 3332 + }, + { + "epoch": 2.6987854251012147, + "grad_norm": 1.7472110705893544, + "learning_rate": 5.068478462508409e-07, + "loss": 0.9791, + "step": 3333 + }, + { + "epoch": 2.6995951417004047, + "grad_norm": 1.6679499678127792, + "learning_rate": 5.04149620941875e-07, + "loss": 0.9824, + "step": 3334 + }, + { + "epoch": 2.700404858299595, + "grad_norm": 1.6663417046865165, + "learning_rate": 5.014584111359811e-07, + "loss": 0.9539, + "step": 3335 + }, + { + "epoch": 2.7012145748987857, + "grad_norm": 1.672587987397243, + "learning_rate": 4.987742188214162e-07, + "loss": 0.9183, + "step": 3336 + }, + { + "epoch": 2.7020242914979757, + "grad_norm": 1.7188596938545961, + "learning_rate": 4.960970459812542e-07, + "loss": 0.9772, + "step": 3337 + }, + { + "epoch": 2.702834008097166, + "grad_norm": 1.6817958789804741, + "learning_rate": 4.934268945933784e-07, + "loss": 0.9823, + "step": 3338 + }, + { + "epoch": 2.7036437246963563, + "grad_norm": 1.6819764284995349, + "learning_rate": 4.907637666304898e-07, + "loss": 0.9582, + "step": 3339 + }, + { + "epoch": 2.7044534412955468, + "grad_norm": 1.6643127350965405, + "learning_rate": 4.881076640600979e-07, + "loss": 0.9677, + "step": 3340 + }, + { + "epoch": 2.705263157894737, + "grad_norm": 1.777174635148692, + "learning_rate": 4.854585888445218e-07, + "loss": 0.9946, + "step": 3341 + }, + { + "epoch": 2.706072874493927, + "grad_norm": 1.6692254557052726, + "learning_rate": 4.828165429408926e-07, + "loss": 1.0203, + "step": 3342 + }, + { + "epoch": 2.7068825910931174, + "grad_norm": 1.7053185479226727, + "learning_rate": 4.801815283011413e-07, + "loss": 0.9442, + "step": 3343 + }, + { + "epoch": 2.707692307692308, + "grad_norm": 1.700154883033202, + "learning_rate": 4.775535468720105e-07, + "loss": 0.9464, + "step": 3344 + }, + { + "epoch": 2.708502024291498, + "grad_norm": 1.7405575127592974, + "learning_rate": 4.7493260059504497e-07, + "loss": 0.9651, + "step": 3345 + }, + { + "epoch": 2.7093117408906884, + "grad_norm": 1.6499300506429417, + "learning_rate": 4.7231869140658804e-07, + "loss": 0.9186, + "step": 3346 + }, + { + "epoch": 2.7101214574898784, + "grad_norm": 1.684334696306486, + "learning_rate": 4.6971182123779045e-07, + "loss": 0.8985, + "step": 3347 + }, + { + "epoch": 2.710931174089069, + "grad_norm": 1.6675095321437483, + "learning_rate": 4.6711199201459833e-07, + "loss": 1.0096, + "step": 3348 + }, + { + "epoch": 2.711740890688259, + "grad_norm": 1.7918664382749507, + "learning_rate": 4.645192056577541e-07, + "loss": 1.0911, + "step": 3349 + }, + { + "epoch": 2.7125506072874495, + "grad_norm": 1.7239165785172468, + "learning_rate": 4.6193346408280216e-07, + "loss": 1.0048, + "step": 3350 + }, + { + "epoch": 2.71336032388664, + "grad_norm": 1.6745541046010592, + "learning_rate": 4.5935476920008213e-07, + "loss": 0.9741, + "step": 3351 + }, + { + "epoch": 2.71417004048583, + "grad_norm": 1.770541331198052, + "learning_rate": 4.5678312291472347e-07, + "loss": 0.9229, + "step": 3352 + }, + { + "epoch": 2.71497975708502, + "grad_norm": 1.7454452776343088, + "learning_rate": 4.542185271266486e-07, + "loss": 0.9203, + "step": 3353 + }, + { + "epoch": 2.7157894736842105, + "grad_norm": 1.6546940699373445, + "learning_rate": 4.516609837305741e-07, + "loss": 0.9511, + "step": 3354 + }, + { + "epoch": 2.716599190283401, + "grad_norm": 1.673530449759611, + "learning_rate": 4.491104946160052e-07, + "loss": 0.9926, + "step": 3355 + }, + { + "epoch": 2.717408906882591, + "grad_norm": 1.6784803821715035, + "learning_rate": 4.465670616672313e-07, + "loss": 0.951, + "step": 3356 + }, + { + "epoch": 2.718218623481781, + "grad_norm": 1.7286011969705144, + "learning_rate": 4.440306867633359e-07, + "loss": 0.9863, + "step": 3357 + }, + { + "epoch": 2.7190283400809716, + "grad_norm": 1.759405070641066, + "learning_rate": 4.4150137177818243e-07, + "loss": 1.0233, + "step": 3358 + }, + { + "epoch": 2.719838056680162, + "grad_norm": 1.7032342660835975, + "learning_rate": 4.389791185804237e-07, + "loss": 0.9514, + "step": 3359 + }, + { + "epoch": 2.720647773279352, + "grad_norm": 1.7760045827311535, + "learning_rate": 4.3646392903348823e-07, + "loss": 0.9509, + "step": 3360 + }, + { + "epoch": 2.7214574898785426, + "grad_norm": 1.703842102243129, + "learning_rate": 4.3395580499559276e-07, + "loss": 0.9985, + "step": 3361 + }, + { + "epoch": 2.7222672064777327, + "grad_norm": 1.7058090054545807, + "learning_rate": 4.3145474831972845e-07, + "loss": 0.994, + "step": 3362 + }, + { + "epoch": 2.723076923076923, + "grad_norm": 1.704324426436363, + "learning_rate": 4.2896076085367056e-07, + "loss": 0.9599, + "step": 3363 + }, + { + "epoch": 2.723886639676113, + "grad_norm": 1.6980076718179375, + "learning_rate": 4.264738444399652e-07, + "loss": 0.9828, + "step": 3364 + }, + { + "epoch": 2.7246963562753037, + "grad_norm": 1.6087858125711858, + "learning_rate": 4.2399400091594154e-07, + "loss": 1.0022, + "step": 3365 + }, + { + "epoch": 2.725506072874494, + "grad_norm": 1.6883922800088609, + "learning_rate": 4.2152123211369633e-07, + "loss": 0.9971, + "step": 3366 + }, + { + "epoch": 2.7263157894736842, + "grad_norm": 1.7249610106754352, + "learning_rate": 4.1905553986010707e-07, + "loss": 0.9756, + "step": 3367 + }, + { + "epoch": 2.7271255060728743, + "grad_norm": 1.650676985334564, + "learning_rate": 4.165969259768177e-07, + "loss": 0.9695, + "step": 3368 + }, + { + "epoch": 2.727935222672065, + "grad_norm": 1.7286467505133185, + "learning_rate": 4.1414539228024297e-07, + "loss": 1.0002, + "step": 3369 + }, + { + "epoch": 2.7287449392712553, + "grad_norm": 1.7055124440922338, + "learning_rate": 4.117009405815686e-07, + "loss": 0.9834, + "step": 3370 + }, + { + "epoch": 2.7295546558704453, + "grad_norm": 1.7056058182983371, + "learning_rate": 4.0926357268674667e-07, + "loss": 0.9676, + "step": 3371 + }, + { + "epoch": 2.7303643724696354, + "grad_norm": 1.7942169407381743, + "learning_rate": 4.068332903964978e-07, + "loss": 1.0179, + "step": 3372 + }, + { + "epoch": 2.731174089068826, + "grad_norm": 1.7569046659689378, + "learning_rate": 4.0441009550630683e-07, + "loss": 0.9497, + "step": 3373 + }, + { + "epoch": 2.7319838056680164, + "grad_norm": 1.7915957106720466, + "learning_rate": 4.0199398980641955e-07, + "loss": 1.0083, + "step": 3374 + }, + { + "epoch": 2.7327935222672064, + "grad_norm": 1.6604774269182196, + "learning_rate": 3.9958497508185036e-07, + "loss": 0.9262, + "step": 3375 + }, + { + "epoch": 2.733603238866397, + "grad_norm": 1.7981493121093652, + "learning_rate": 3.9718305311236996e-07, + "loss": 0.9811, + "step": 3376 + }, + { + "epoch": 2.734412955465587, + "grad_norm": 1.7140059177794198, + "learning_rate": 3.9478822567251e-07, + "loss": 1.02, + "step": 3377 + }, + { + "epoch": 2.7352226720647774, + "grad_norm": 1.7037447623006154, + "learning_rate": 3.924004945315618e-07, + "loss": 1.0029, + "step": 3378 + }, + { + "epoch": 2.7360323886639675, + "grad_norm": 1.6598577339137044, + "learning_rate": 3.900198614535711e-07, + "loss": 0.9981, + "step": 3379 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 1.643004118226858, + "learning_rate": 3.8764632819734526e-07, + "loss": 0.9708, + "step": 3380 + }, + { + "epoch": 2.737651821862348, + "grad_norm": 1.6338597557404355, + "learning_rate": 3.852798965164406e-07, + "loss": 1.0315, + "step": 3381 + }, + { + "epoch": 2.7384615384615385, + "grad_norm": 1.6815159903047672, + "learning_rate": 3.8292056815916965e-07, + "loss": 0.9838, + "step": 3382 + }, + { + "epoch": 2.7392712550607285, + "grad_norm": 1.6880628778391817, + "learning_rate": 3.805683448685971e-07, + "loss": 0.9882, + "step": 3383 + }, + { + "epoch": 2.740080971659919, + "grad_norm": 1.7337464917612249, + "learning_rate": 3.782232283825371e-07, + "loss": 0.9449, + "step": 3384 + }, + { + "epoch": 2.7408906882591095, + "grad_norm": 1.618990796172125, + "learning_rate": 3.758852204335539e-07, + "loss": 0.9886, + "step": 3385 + }, + { + "epoch": 2.7417004048582996, + "grad_norm": 1.7016005187057022, + "learning_rate": 3.735543227489591e-07, + "loss": 1.032, + "step": 3386 + }, + { + "epoch": 2.7425101214574896, + "grad_norm": 1.6521289220479534, + "learning_rate": 3.712305370508151e-07, + "loss": 0.9023, + "step": 3387 + }, + { + "epoch": 2.74331983805668, + "grad_norm": 1.6818920788449179, + "learning_rate": 3.6891386505592543e-07, + "loss": 0.9534, + "step": 3388 + }, + { + "epoch": 2.7441295546558706, + "grad_norm": 1.6371230429182253, + "learning_rate": 3.6660430847583973e-07, + "loss": 1.0219, + "step": 3389 + }, + { + "epoch": 2.7449392712550607, + "grad_norm": 1.6513777881762906, + "learning_rate": 3.643018690168487e-07, + "loss": 1.0237, + "step": 3390 + }, + { + "epoch": 2.745748987854251, + "grad_norm": 1.702844558572882, + "learning_rate": 3.620065483799917e-07, + "loss": 1.0022, + "step": 3391 + }, + { + "epoch": 2.746558704453441, + "grad_norm": 1.6684488841014413, + "learning_rate": 3.5971834826104114e-07, + "loss": 1.0313, + "step": 3392 + }, + { + "epoch": 2.7473684210526317, + "grad_norm": 1.678173521295049, + "learning_rate": 3.5743727035051245e-07, + "loss": 0.9862, + "step": 3393 + }, + { + "epoch": 2.7481781376518217, + "grad_norm": 1.6963824088714325, + "learning_rate": 3.551633163336565e-07, + "loss": 0.9448, + "step": 3394 + }, + { + "epoch": 2.748987854251012, + "grad_norm": 1.650614217397481, + "learning_rate": 3.5289648789046616e-07, + "loss": 0.9745, + "step": 3395 + }, + { + "epoch": 2.7497975708502023, + "grad_norm": 1.6241513485984185, + "learning_rate": 3.5063678669566616e-07, + "loss": 0.9782, + "step": 3396 + }, + { + "epoch": 2.7506072874493928, + "grad_norm": 1.7366532492678954, + "learning_rate": 3.4838421441871553e-07, + "loss": 0.9545, + "step": 3397 + }, + { + "epoch": 2.751417004048583, + "grad_norm": 1.643649947081736, + "learning_rate": 3.4613877272380526e-07, + "loss": 0.9422, + "step": 3398 + }, + { + "epoch": 2.7522267206477733, + "grad_norm": 1.7397550528137697, + "learning_rate": 3.4390046326986506e-07, + "loss": 0.9788, + "step": 3399 + }, + { + "epoch": 2.753036437246964, + "grad_norm": 1.7012047353401265, + "learning_rate": 3.4166928771054653e-07, + "loss": 1.0243, + "step": 3400 + }, + { + "epoch": 2.753846153846154, + "grad_norm": 1.6224734532349014, + "learning_rate": 3.394452476942367e-07, + "loss": 1.0144, + "step": 3401 + }, + { + "epoch": 2.754655870445344, + "grad_norm": 1.6480456374209134, + "learning_rate": 3.37228344864049e-07, + "loss": 0.9859, + "step": 3402 + }, + { + "epoch": 2.7554655870445344, + "grad_norm": 1.6460293515314257, + "learning_rate": 3.350185808578232e-07, + "loss": 1.0034, + "step": 3403 + }, + { + "epoch": 2.756275303643725, + "grad_norm": 1.757720445735907, + "learning_rate": 3.328159573081258e-07, + "loss": 1.0231, + "step": 3404 + }, + { + "epoch": 2.757085020242915, + "grad_norm": 1.7467262631474252, + "learning_rate": 3.3062047584224934e-07, + "loss": 1.0036, + "step": 3405 + }, + { + "epoch": 2.7578947368421054, + "grad_norm": 1.665170966625211, + "learning_rate": 3.284321380822053e-07, + "loss": 1.0402, + "step": 3406 + }, + { + "epoch": 2.7587044534412954, + "grad_norm": 1.6956185677978453, + "learning_rate": 3.262509456447327e-07, + "loss": 0.998, + "step": 3407 + }, + { + "epoch": 2.759514170040486, + "grad_norm": 1.7519136605866905, + "learning_rate": 3.240769001412891e-07, + "loss": 0.9528, + "step": 3408 + }, + { + "epoch": 2.760323886639676, + "grad_norm": 1.7155229720938194, + "learning_rate": 3.21910003178052e-07, + "loss": 0.9239, + "step": 3409 + }, + { + "epoch": 2.7611336032388665, + "grad_norm": 1.6753569936900223, + "learning_rate": 3.197502563559185e-07, + "loss": 1.0247, + "step": 3410 + }, + { + "epoch": 2.7619433198380565, + "grad_norm": 1.6291004521416133, + "learning_rate": 3.1759766127050116e-07, + "loss": 1.0405, + "step": 3411 + }, + { + "epoch": 2.762753036437247, + "grad_norm": 1.7193595883451167, + "learning_rate": 3.1545221951213125e-07, + "loss": 0.9007, + "step": 3412 + }, + { + "epoch": 2.763562753036437, + "grad_norm": 1.7346002461970236, + "learning_rate": 3.13313932665853e-07, + "loss": 0.922, + "step": 3413 + }, + { + "epoch": 2.7643724696356275, + "grad_norm": 1.6724531692395965, + "learning_rate": 3.1118280231142496e-07, + "loss": 0.9998, + "step": 3414 + }, + { + "epoch": 2.765182186234818, + "grad_norm": 1.7916657747046403, + "learning_rate": 3.0905883002332213e-07, + "loss": 0.9282, + "step": 3415 + }, + { + "epoch": 2.765991902834008, + "grad_norm": 1.6206762833247699, + "learning_rate": 3.069420173707249e-07, + "loss": 1.0229, + "step": 3416 + }, + { + "epoch": 2.766801619433198, + "grad_norm": 1.7079187810682614, + "learning_rate": 3.048323659175301e-07, + "loss": 0.9421, + "step": 3417 + }, + { + "epoch": 2.7676113360323886, + "grad_norm": 1.6489425364151793, + "learning_rate": 3.027298772223419e-07, + "loss": 1.0534, + "step": 3418 + }, + { + "epoch": 2.768421052631579, + "grad_norm": 1.6128333988404466, + "learning_rate": 3.006345528384691e-07, + "loss": 0.9396, + "step": 3419 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 1.6925085208629478, + "learning_rate": 2.985463943139322e-07, + "loss": 0.9452, + "step": 3420 + }, + { + "epoch": 2.7700404858299597, + "grad_norm": 1.728805527613981, + "learning_rate": 2.96465403191456e-07, + "loss": 0.9629, + "step": 3421 + }, + { + "epoch": 2.7708502024291497, + "grad_norm": 1.6026531892520557, + "learning_rate": 2.943915810084685e-07, + "loss": 0.9992, + "step": 3422 + }, + { + "epoch": 2.77165991902834, + "grad_norm": 1.7066029483605916, + "learning_rate": 2.923249292971042e-07, + "loss": 1.0369, + "step": 3423 + }, + { + "epoch": 2.7724696356275302, + "grad_norm": 1.6385263272251447, + "learning_rate": 2.9026544958419833e-07, + "loss": 1.0205, + "step": 3424 + }, + { + "epoch": 2.7732793522267207, + "grad_norm": 1.6324065649477524, + "learning_rate": 2.882131433912883e-07, + "loss": 0.9805, + "step": 3425 + }, + { + "epoch": 2.7740890688259108, + "grad_norm": 1.6647087033829902, + "learning_rate": 2.8616801223461e-07, + "loss": 1.0149, + "step": 3426 + }, + { + "epoch": 2.7748987854251013, + "grad_norm": 1.7242997046116755, + "learning_rate": 2.841300576250994e-07, + "loss": 0.9302, + "step": 3427 + }, + { + "epoch": 2.7757085020242913, + "grad_norm": 1.6351854391159, + "learning_rate": 2.8209928106839204e-07, + "loss": 0.9689, + "step": 3428 + }, + { + "epoch": 2.776518218623482, + "grad_norm": 1.6547619822957484, + "learning_rate": 2.800756840648178e-07, + "loss": 1.041, + "step": 3429 + }, + { + "epoch": 2.7773279352226723, + "grad_norm": 1.6907136500019693, + "learning_rate": 2.7805926810940297e-07, + "loss": 1.0214, + "step": 3430 + }, + { + "epoch": 2.7781376518218623, + "grad_norm": 1.6858591321540695, + "learning_rate": 2.7605003469187044e-07, + "loss": 1.0106, + "step": 3431 + }, + { + "epoch": 2.7789473684210524, + "grad_norm": 1.761735692393002, + "learning_rate": 2.74047985296636e-07, + "loss": 1.0111, + "step": 3432 + }, + { + "epoch": 2.779757085020243, + "grad_norm": 1.6840626486171097, + "learning_rate": 2.720531214028055e-07, + "loss": 0.9443, + "step": 3433 + }, + { + "epoch": 2.7805668016194334, + "grad_norm": 1.7212890453015592, + "learning_rate": 2.700654444841777e-07, + "loss": 0.9862, + "step": 3434 + }, + { + "epoch": 2.7813765182186234, + "grad_norm": 1.6376530049169284, + "learning_rate": 2.6808495600924355e-07, + "loss": 0.9878, + "step": 3435 + }, + { + "epoch": 2.782186234817814, + "grad_norm": 1.6855350369784343, + "learning_rate": 2.661116574411793e-07, + "loss": 0.9569, + "step": 3436 + }, + { + "epoch": 2.782995951417004, + "grad_norm": 1.774069519379109, + "learning_rate": 2.6414555023785204e-07, + "loss": 0.9306, + "step": 3437 + }, + { + "epoch": 2.7838056680161944, + "grad_norm": 1.592262691990277, + "learning_rate": 2.6218663585181547e-07, + "loss": 1.0135, + "step": 3438 + }, + { + "epoch": 2.7846153846153845, + "grad_norm": 1.7023463434984625, + "learning_rate": 2.602349157303108e-07, + "loss": 0.9985, + "step": 3439 + }, + { + "epoch": 2.785425101214575, + "grad_norm": 1.7075079740626764, + "learning_rate": 2.582903913152612e-07, + "loss": 0.9859, + "step": 3440 + }, + { + "epoch": 2.786234817813765, + "grad_norm": 1.6394777254757238, + "learning_rate": 2.563530640432732e-07, + "loss": 0.9818, + "step": 3441 + }, + { + "epoch": 2.7870445344129555, + "grad_norm": 1.6722333217908798, + "learning_rate": 2.5442293534564067e-07, + "loss": 0.9843, + "step": 3442 + }, + { + "epoch": 2.7878542510121456, + "grad_norm": 1.64564423365469, + "learning_rate": 2.525000066483352e-07, + "loss": 0.9853, + "step": 3443 + }, + { + "epoch": 2.788663967611336, + "grad_norm": 1.5885870007756162, + "learning_rate": 2.5058427937200816e-07, + "loss": 0.9864, + "step": 3444 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.6567088157067262, + "learning_rate": 2.4867575493199515e-07, + "loss": 0.9909, + "step": 3445 + }, + { + "epoch": 2.7902834008097166, + "grad_norm": 1.7241493918176931, + "learning_rate": 2.467744347383072e-07, + "loss": 0.9471, + "step": 3446 + }, + { + "epoch": 2.7910931174089066, + "grad_norm": 1.5953145152003532, + "learning_rate": 2.44880320195634e-07, + "loss": 1.0425, + "step": 3447 + }, + { + "epoch": 2.791902834008097, + "grad_norm": 1.704422723526784, + "learning_rate": 2.4299341270333955e-07, + "loss": 0.9508, + "step": 3448 + }, + { + "epoch": 2.7927125506072876, + "grad_norm": 1.6615202773394242, + "learning_rate": 2.4111371365546643e-07, + "loss": 0.9808, + "step": 3449 + }, + { + "epoch": 2.7935222672064777, + "grad_norm": 1.6622420527909882, + "learning_rate": 2.392412244407294e-07, + "loss": 0.9952, + "step": 3450 + }, + { + "epoch": 2.794331983805668, + "grad_norm": 1.7028019856102616, + "learning_rate": 2.373759464425174e-07, + "loss": 1.0009, + "step": 3451 + }, + { + "epoch": 2.795141700404858, + "grad_norm": 1.6318324035656953, + "learning_rate": 2.3551788103889027e-07, + "loss": 0.9645, + "step": 3452 + }, + { + "epoch": 2.7959514170040487, + "grad_norm": 1.6683299909476288, + "learning_rate": 2.3366702960258336e-07, + "loss": 1.0226, + "step": 3453 + }, + { + "epoch": 2.7967611336032387, + "grad_norm": 1.7972866276194772, + "learning_rate": 2.3182339350099724e-07, + "loss": 0.9662, + "step": 3454 + }, + { + "epoch": 2.7975708502024292, + "grad_norm": 1.7250223932824822, + "learning_rate": 2.2998697409620573e-07, + "loss": 1.0532, + "step": 3455 + }, + { + "epoch": 2.7983805668016193, + "grad_norm": 1.6934825177542452, + "learning_rate": 2.2815777274495022e-07, + "loss": 1.0051, + "step": 3456 + }, + { + "epoch": 2.7991902834008098, + "grad_norm": 1.7033740545061948, + "learning_rate": 2.2633579079863632e-07, + "loss": 0.997, + "step": 3457 + }, + { + "epoch": 2.8, + "grad_norm": 1.7292274929656826, + "learning_rate": 2.2452102960334064e-07, + "loss": 0.9835, + "step": 3458 + }, + { + "epoch": 2.8008097165991903, + "grad_norm": 1.6514262457771633, + "learning_rate": 2.2271349049979962e-07, + "loss": 0.9233, + "step": 3459 + }, + { + "epoch": 2.801619433198381, + "grad_norm": 1.7415649309506156, + "learning_rate": 2.2091317482342056e-07, + "loss": 0.9879, + "step": 3460 + }, + { + "epoch": 2.802429149797571, + "grad_norm": 1.6947139567148872, + "learning_rate": 2.1912008390426953e-07, + "loss": 0.9346, + "step": 3461 + }, + { + "epoch": 2.803238866396761, + "grad_norm": 1.760629650452238, + "learning_rate": 2.1733421906707464e-07, + "loss": 1.0005, + "step": 3462 + }, + { + "epoch": 2.8040485829959514, + "grad_norm": 1.6060526892387195, + "learning_rate": 2.1555558163122935e-07, + "loss": 1.075, + "step": 3463 + }, + { + "epoch": 2.804858299595142, + "grad_norm": 1.6926722698944767, + "learning_rate": 2.137841729107848e-07, + "loss": 0.8934, + "step": 3464 + }, + { + "epoch": 2.805668016194332, + "grad_norm": 1.6918582251550736, + "learning_rate": 2.1201999421445074e-07, + "loss": 0.9724, + "step": 3465 + }, + { + "epoch": 2.8064777327935224, + "grad_norm": 1.6163610863078754, + "learning_rate": 2.1026304684559685e-07, + "loss": 1.0206, + "step": 3466 + }, + { + "epoch": 2.8072874493927125, + "grad_norm": 1.6797477934507816, + "learning_rate": 2.0851333210225032e-07, + "loss": 0.9478, + "step": 3467 + }, + { + "epoch": 2.808097165991903, + "grad_norm": 1.6836631767653443, + "learning_rate": 2.0677085127709495e-07, + "loss": 1.0166, + "step": 3468 + }, + { + "epoch": 2.808906882591093, + "grad_norm": 1.7134147411086924, + "learning_rate": 2.0503560565747092e-07, + "loss": 0.9374, + "step": 3469 + }, + { + "epoch": 2.8097165991902835, + "grad_norm": 1.7189098388876702, + "learning_rate": 2.0330759652536835e-07, + "loss": 0.9788, + "step": 3470 + }, + { + "epoch": 2.8105263157894735, + "grad_norm": 1.6505141468009865, + "learning_rate": 2.0158682515743933e-07, + "loss": 0.9734, + "step": 3471 + }, + { + "epoch": 2.811336032388664, + "grad_norm": 1.6506249870105763, + "learning_rate": 1.9987329282498024e-07, + "loss": 0.9544, + "step": 3472 + }, + { + "epoch": 2.812145748987854, + "grad_norm": 1.7316946412284435, + "learning_rate": 1.9816700079394625e-07, + "loss": 1.0232, + "step": 3473 + }, + { + "epoch": 2.8129554655870446, + "grad_norm": 1.6533763703707396, + "learning_rate": 1.964679503249367e-07, + "loss": 0.9491, + "step": 3474 + }, + { + "epoch": 2.813765182186235, + "grad_norm": 1.76162742728525, + "learning_rate": 1.9477614267320867e-07, + "loss": 1.0347, + "step": 3475 + }, + { + "epoch": 2.814574898785425, + "grad_norm": 1.6828209980534072, + "learning_rate": 1.9309157908866116e-07, + "loss": 0.9581, + "step": 3476 + }, + { + "epoch": 2.815384615384615, + "grad_norm": 1.674090593565143, + "learning_rate": 1.9141426081584537e-07, + "loss": 0.9604, + "step": 3477 + }, + { + "epoch": 2.8161943319838056, + "grad_norm": 1.725348808312188, + "learning_rate": 1.8974418909395774e-07, + "loss": 1.0223, + "step": 3478 + }, + { + "epoch": 2.817004048582996, + "grad_norm": 1.6949407572974873, + "learning_rate": 1.880813651568425e-07, + "loss": 1.0619, + "step": 3479 + }, + { + "epoch": 2.817813765182186, + "grad_norm": 1.662538589931312, + "learning_rate": 1.8642579023298913e-07, + "loss": 1.0112, + "step": 3480 + }, + { + "epoch": 2.8186234817813767, + "grad_norm": 1.7612232629543667, + "learning_rate": 1.8477746554552922e-07, + "loss": 0.9445, + "step": 3481 + }, + { + "epoch": 2.8194331983805667, + "grad_norm": 1.6705151929988862, + "learning_rate": 1.831363923122409e-07, + "loss": 0.9485, + "step": 3482 + }, + { + "epoch": 2.820242914979757, + "grad_norm": 1.6573033708453293, + "learning_rate": 1.815025717455432e-07, + "loss": 0.9873, + "step": 3483 + }, + { + "epoch": 2.8210526315789473, + "grad_norm": 1.6767470480305524, + "learning_rate": 1.7987600505249726e-07, + "loss": 0.9749, + "step": 3484 + }, + { + "epoch": 2.8218623481781377, + "grad_norm": 1.7183296390160356, + "learning_rate": 1.7825669343480624e-07, + "loss": 0.9317, + "step": 3485 + }, + { + "epoch": 2.822672064777328, + "grad_norm": 1.6844074820085622, + "learning_rate": 1.7664463808880983e-07, + "loss": 0.9173, + "step": 3486 + }, + { + "epoch": 2.8234817813765183, + "grad_norm": 1.6126455021944093, + "learning_rate": 1.7503984020549203e-07, + "loss": 0.9177, + "step": 3487 + }, + { + "epoch": 2.8242914979757083, + "grad_norm": 1.674650005216152, + "learning_rate": 1.7344230097047111e-07, + "loss": 0.9383, + "step": 3488 + }, + { + "epoch": 2.825101214574899, + "grad_norm": 1.7018972026288255, + "learning_rate": 1.7185202156400294e-07, + "loss": 0.9821, + "step": 3489 + }, + { + "epoch": 2.8259109311740893, + "grad_norm": 1.6593676997323308, + "learning_rate": 1.7026900316098217e-07, + "loss": 0.9953, + "step": 3490 + }, + { + "epoch": 2.8267206477732794, + "grad_norm": 1.6763973087070592, + "learning_rate": 1.6869324693093768e-07, + "loss": 0.97, + "step": 3491 + }, + { + "epoch": 2.8275303643724694, + "grad_norm": 1.6756347235552076, + "learning_rate": 1.6712475403803164e-07, + "loss": 1.0831, + "step": 3492 + }, + { + "epoch": 2.82834008097166, + "grad_norm": 1.6961902060855358, + "learning_rate": 1.655635256410615e-07, + "loss": 1.0187, + "step": 3493 + }, + { + "epoch": 2.8291497975708504, + "grad_norm": 1.7106926602042642, + "learning_rate": 1.6400956289345903e-07, + "loss": 0.9455, + "step": 3494 + }, + { + "epoch": 2.8299595141700404, + "grad_norm": 1.6339795546096871, + "learning_rate": 1.6246286694328594e-07, + "loss": 1.0447, + "step": 3495 + }, + { + "epoch": 2.830769230769231, + "grad_norm": 1.6545843031250616, + "learning_rate": 1.6092343893323593e-07, + "loss": 0.9805, + "step": 3496 + }, + { + "epoch": 2.831578947368421, + "grad_norm": 1.6809638570842909, + "learning_rate": 1.5939128000063364e-07, + "loss": 0.9301, + "step": 3497 + }, + { + "epoch": 2.8323886639676115, + "grad_norm": 1.699507326131043, + "learning_rate": 1.5786639127743363e-07, + "loss": 0.9757, + "step": 3498 + }, + { + "epoch": 2.8331983805668015, + "grad_norm": 1.7076118973222212, + "learning_rate": 1.5634877389021695e-07, + "loss": 0.9538, + "step": 3499 + }, + { + "epoch": 2.834008097165992, + "grad_norm": 1.6599897623636726, + "learning_rate": 1.5483842896019675e-07, + "loss": 1.0356, + "step": 3500 + }, + { + "epoch": 2.834817813765182, + "grad_norm": 1.71624063763336, + "learning_rate": 1.5333535760320929e-07, + "loss": 0.96, + "step": 3501 + }, + { + "epoch": 2.8356275303643725, + "grad_norm": 1.7240409170070614, + "learning_rate": 1.518395609297185e-07, + "loss": 0.9491, + "step": 3502 + }, + { + "epoch": 2.8364372469635626, + "grad_norm": 1.6299987076331925, + "learning_rate": 1.5035104004481604e-07, + "loss": 1.0197, + "step": 3503 + }, + { + "epoch": 2.837246963562753, + "grad_norm": 1.5841940758881194, + "learning_rate": 1.4886979604821328e-07, + "loss": 0.9758, + "step": 3504 + }, + { + "epoch": 2.8380566801619436, + "grad_norm": 1.6462756907053675, + "learning_rate": 1.473958300342504e-07, + "loss": 0.975, + "step": 3505 + }, + { + "epoch": 2.8388663967611336, + "grad_norm": 1.6923662140676001, + "learning_rate": 1.4592914309188965e-07, + "loss": 0.9598, + "step": 3506 + }, + { + "epoch": 2.8396761133603237, + "grad_norm": 1.71758711743743, + "learning_rate": 1.4446973630471207e-07, + "loss": 0.9187, + "step": 3507 + }, + { + "epoch": 2.840485829959514, + "grad_norm": 1.7314496796265455, + "learning_rate": 1.4301761075092402e-07, + "loss": 0.9947, + "step": 3508 + }, + { + "epoch": 2.8412955465587046, + "grad_norm": 1.6658482977303863, + "learning_rate": 1.415727675033507e-07, + "loss": 0.9994, + "step": 3509 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 1.652120289375282, + "learning_rate": 1.401352076294371e-07, + "loss": 1.0019, + "step": 3510 + }, + { + "epoch": 2.842914979757085, + "grad_norm": 1.7489773597175606, + "learning_rate": 1.3870493219124814e-07, + "loss": 1.0033, + "step": 3511 + }, + { + "epoch": 2.8437246963562752, + "grad_norm": 1.703873872469804, + "learning_rate": 1.3728194224546742e-07, + "loss": 0.9716, + "step": 3512 + }, + { + "epoch": 2.8445344129554657, + "grad_norm": 1.7380413078833634, + "learning_rate": 1.35866238843394e-07, + "loss": 0.9694, + "step": 3513 + }, + { + "epoch": 2.8453441295546558, + "grad_norm": 1.7289000733110216, + "learning_rate": 1.3445782303094568e-07, + "loss": 0.9484, + "step": 3514 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 1.748270696926121, + "learning_rate": 1.3305669584865565e-07, + "loss": 0.9799, + "step": 3515 + }, + { + "epoch": 2.8469635627530363, + "grad_norm": 1.6413747236212781, + "learning_rate": 1.3166285833167147e-07, + "loss": 1.0151, + "step": 3516 + }, + { + "epoch": 2.847773279352227, + "grad_norm": 1.6667240650346242, + "learning_rate": 1.3027631150975606e-07, + "loss": 0.9758, + "step": 3517 + }, + { + "epoch": 2.848582995951417, + "grad_norm": 1.6872730298067318, + "learning_rate": 1.2889705640728445e-07, + "loss": 0.9228, + "step": 3518 + }, + { + "epoch": 2.8493927125506073, + "grad_norm": 1.6295638968820234, + "learning_rate": 1.275250940432471e-07, + "loss": 0.979, + "step": 3519 + }, + { + "epoch": 2.850202429149798, + "grad_norm": 1.6588363373893749, + "learning_rate": 1.261604254312454e-07, + "loss": 1.0192, + "step": 3520 + }, + { + "epoch": 2.851012145748988, + "grad_norm": 1.6683421475461793, + "learning_rate": 1.248030515794907e-07, + "loss": 0.9543, + "step": 3521 + }, + { + "epoch": 2.851821862348178, + "grad_norm": 1.6524719881626193, + "learning_rate": 1.2345297349080852e-07, + "loss": 0.9376, + "step": 3522 + }, + { + "epoch": 2.8526315789473684, + "grad_norm": 1.698816182274864, + "learning_rate": 1.221101921626311e-07, + "loss": 0.969, + "step": 3523 + }, + { + "epoch": 2.853441295546559, + "grad_norm": 1.6990536494185664, + "learning_rate": 1.2077470858699925e-07, + "loss": 0.9081, + "step": 3524 + }, + { + "epoch": 2.854251012145749, + "grad_norm": 1.6689902626693056, + "learning_rate": 1.1944652375056597e-07, + "loss": 0.9413, + "step": 3525 + }, + { + "epoch": 2.8550607287449394, + "grad_norm": 1.7541568188456584, + "learning_rate": 1.1812563863458859e-07, + "loss": 0.9368, + "step": 3526 + }, + { + "epoch": 2.8558704453441295, + "grad_norm": 1.6900489394163216, + "learning_rate": 1.1681205421493425e-07, + "loss": 1.0227, + "step": 3527 + }, + { + "epoch": 2.85668016194332, + "grad_norm": 1.6593250577589962, + "learning_rate": 1.1550577146207331e-07, + "loss": 0.9822, + "step": 3528 + }, + { + "epoch": 2.85748987854251, + "grad_norm": 1.706412070441469, + "learning_rate": 1.1420679134108382e-07, + "loss": 0.9934, + "step": 3529 + }, + { + "epoch": 2.8582995951417005, + "grad_norm": 1.6579862881360155, + "learning_rate": 1.1291511481164807e-07, + "loss": 0.9466, + "step": 3530 + }, + { + "epoch": 2.8591093117408906, + "grad_norm": 1.7170922289715347, + "learning_rate": 1.1163074282805165e-07, + "loss": 0.9103, + "step": 3531 + }, + { + "epoch": 2.859919028340081, + "grad_norm": 1.7114518472316398, + "learning_rate": 1.1035367633918436e-07, + "loss": 0.9184, + "step": 3532 + }, + { + "epoch": 2.860728744939271, + "grad_norm": 1.731323195336544, + "learning_rate": 1.0908391628854042e-07, + "loss": 0.9779, + "step": 3533 + }, + { + "epoch": 2.8615384615384616, + "grad_norm": 1.7111499506843564, + "learning_rate": 1.0782146361421275e-07, + "loss": 1.0177, + "step": 3534 + }, + { + "epoch": 2.862348178137652, + "grad_norm": 1.6337915289731522, + "learning_rate": 1.0656631924889749e-07, + "loss": 0.974, + "step": 3535 + }, + { + "epoch": 2.863157894736842, + "grad_norm": 1.6468027663173628, + "learning_rate": 1.0531848411989287e-07, + "loss": 1.0301, + "step": 3536 + }, + { + "epoch": 2.863967611336032, + "grad_norm": 1.704350995042486, + "learning_rate": 1.0407795914909258e-07, + "loss": 1.0112, + "step": 3537 + }, + { + "epoch": 2.8647773279352227, + "grad_norm": 1.6680112193827918, + "learning_rate": 1.0284474525299459e-07, + "loss": 1.0021, + "step": 3538 + }, + { + "epoch": 2.865587044534413, + "grad_norm": 1.630227489137973, + "learning_rate": 1.0161884334269234e-07, + "loss": 0.9871, + "step": 3539 + }, + { + "epoch": 2.866396761133603, + "grad_norm": 1.7269095533127516, + "learning_rate": 1.0040025432387801e-07, + "loss": 0.9789, + "step": 3540 + }, + { + "epoch": 2.8672064777327932, + "grad_norm": 1.645077368110772, + "learning_rate": 9.918897909684144e-08, + "loss": 0.9977, + "step": 3541 + }, + { + "epoch": 2.8680161943319837, + "grad_norm": 1.6531908723812594, + "learning_rate": 9.798501855646792e-08, + "loss": 1.0315, + "step": 3542 + }, + { + "epoch": 2.8688259109311742, + "grad_norm": 1.6773021644682622, + "learning_rate": 9.678837359224148e-08, + "loss": 0.9943, + "step": 3543 + }, + { + "epoch": 2.8696356275303643, + "grad_norm": 1.6110461386436772, + "learning_rate": 9.559904508823825e-08, + "loss": 0.9716, + "step": 3544 + }, + { + "epoch": 2.8704453441295548, + "grad_norm": 1.547369809807511, + "learning_rate": 9.441703392313095e-08, + "loss": 0.9595, + "step": 3545 + }, + { + "epoch": 2.871255060728745, + "grad_norm": 1.6684471743402367, + "learning_rate": 9.324234097018436e-08, + "loss": 0.9031, + "step": 3546 + }, + { + "epoch": 2.8720647773279353, + "grad_norm": 1.6770696827512974, + "learning_rate": 9.20749670972576e-08, + "loss": 0.9372, + "step": 3547 + }, + { + "epoch": 2.8728744939271254, + "grad_norm": 1.7263050739430634, + "learning_rate": 9.091491316680411e-08, + "loss": 0.997, + "step": 3548 + }, + { + "epoch": 2.873684210526316, + "grad_norm": 1.6963472723142492, + "learning_rate": 8.976218003586722e-08, + "loss": 1.0112, + "step": 3549 + }, + { + "epoch": 2.8744939271255063, + "grad_norm": 1.7236666081560383, + "learning_rate": 8.861676855608237e-08, + "loss": 0.9723, + "step": 3550 + }, + { + "epoch": 2.8753036437246964, + "grad_norm": 1.6959061896936787, + "learning_rate": 8.747867957367595e-08, + "loss": 0.9236, + "step": 3551 + }, + { + "epoch": 2.8761133603238864, + "grad_norm": 1.7019359454703777, + "learning_rate": 8.634791392946429e-08, + "loss": 0.9874, + "step": 3552 + }, + { + "epoch": 2.876923076923077, + "grad_norm": 1.6598850043781408, + "learning_rate": 8.522447245885356e-08, + "loss": 0.9448, + "step": 3553 + }, + { + "epoch": 2.8777327935222674, + "grad_norm": 1.657178469902909, + "learning_rate": 8.410835599183875e-08, + "loss": 1.0182, + "step": 3554 + }, + { + "epoch": 2.8785425101214575, + "grad_norm": 1.7487204603764595, + "learning_rate": 8.299956535300135e-08, + "loss": 0.9883, + "step": 3555 + }, + { + "epoch": 2.8793522267206475, + "grad_norm": 1.7130380837811467, + "learning_rate": 8.189810136151388e-08, + "loss": 0.9205, + "step": 3556 + }, + { + "epoch": 2.880161943319838, + "grad_norm": 1.7416692432911214, + "learning_rate": 8.08039648311354e-08, + "loss": 1.0081, + "step": 3557 + }, + { + "epoch": 2.8809716599190285, + "grad_norm": 1.7940488637799301, + "learning_rate": 7.971715657020706e-08, + "loss": 1.0193, + "step": 3558 + }, + { + "epoch": 2.8817813765182185, + "grad_norm": 1.7643531171720135, + "learning_rate": 7.863767738166217e-08, + "loss": 0.9777, + "step": 3559 + }, + { + "epoch": 2.882591093117409, + "grad_norm": 1.7549243652435764, + "learning_rate": 7.756552806301498e-08, + "loss": 0.9255, + "step": 3560 + }, + { + "epoch": 2.883400809716599, + "grad_norm": 1.6037364745875238, + "learning_rate": 7.650070940636634e-08, + "loss": 1.0126, + "step": 3561 + }, + { + "epoch": 2.8842105263157896, + "grad_norm": 1.6938156063789074, + "learning_rate": 7.54432221984014e-08, + "loss": 1.001, + "step": 3562 + }, + { + "epoch": 2.8850202429149796, + "grad_norm": 1.640394353127388, + "learning_rate": 7.43930672203863e-08, + "loss": 1.0106, + "step": 3563 + }, + { + "epoch": 2.88582995951417, + "grad_norm": 1.682827146515058, + "learning_rate": 7.335024524817492e-08, + "loss": 0.9672, + "step": 3564 + }, + { + "epoch": 2.8866396761133606, + "grad_norm": 1.64220479848791, + "learning_rate": 7.23147570521987e-08, + "loss": 1.0053, + "step": 3565 + }, + { + "epoch": 2.8874493927125506, + "grad_norm": 1.6800144926826894, + "learning_rate": 7.128660339747239e-08, + "loss": 1.0534, + "step": 3566 + }, + { + "epoch": 2.8882591093117407, + "grad_norm": 1.6706178385744694, + "learning_rate": 7.026578504359394e-08, + "loss": 0.9858, + "step": 3567 + }, + { + "epoch": 2.889068825910931, + "grad_norm": 1.6725170356062997, + "learning_rate": 6.925230274474003e-08, + "loss": 0.9447, + "step": 3568 + }, + { + "epoch": 2.8898785425101217, + "grad_norm": 1.7484788389494295, + "learning_rate": 6.824615724966843e-08, + "loss": 0.982, + "step": 3569 + }, + { + "epoch": 2.8906882591093117, + "grad_norm": 1.7412794412751307, + "learning_rate": 6.724734930171561e-08, + "loss": 0.9185, + "step": 3570 + }, + { + "epoch": 2.8914979757085018, + "grad_norm": 1.7112532122718904, + "learning_rate": 6.625587963879909e-08, + "loss": 0.884, + "step": 3571 + }, + { + "epoch": 2.8923076923076922, + "grad_norm": 1.669951486754946, + "learning_rate": 6.527174899341071e-08, + "loss": 1.0064, + "step": 3572 + }, + { + "epoch": 2.8931174089068827, + "grad_norm": 1.780604450503182, + "learning_rate": 6.429495809262554e-08, + "loss": 0.9273, + "step": 3573 + }, + { + "epoch": 2.893927125506073, + "grad_norm": 1.6302873462180967, + "learning_rate": 6.332550765809075e-08, + "loss": 0.9506, + "step": 3574 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 1.7710784892224958, + "learning_rate": 6.236339840603677e-08, + "loss": 0.9671, + "step": 3575 + }, + { + "epoch": 2.8955465587044533, + "grad_norm": 1.7228628615096542, + "learning_rate": 6.140863104726391e-08, + "loss": 0.9461, + "step": 3576 + }, + { + "epoch": 2.896356275303644, + "grad_norm": 1.7105420499553026, + "learning_rate": 6.046120628715124e-08, + "loss": 0.9828, + "step": 3577 + }, + { + "epoch": 2.897165991902834, + "grad_norm": 1.6456141765558494, + "learning_rate": 5.952112482565442e-08, + "loss": 0.9371, + "step": 3578 + }, + { + "epoch": 2.8979757085020244, + "grad_norm": 1.629221588967243, + "learning_rate": 5.858838735730232e-08, + "loss": 0.9767, + "step": 3579 + }, + { + "epoch": 2.898785425101215, + "grad_norm": 1.7148433089723767, + "learning_rate": 5.766299457119817e-08, + "loss": 0.9423, + "step": 3580 + }, + { + "epoch": 2.899595141700405, + "grad_norm": 1.6770711797146378, + "learning_rate": 5.674494715101841e-08, + "loss": 0.9575, + "step": 3581 + }, + { + "epoch": 2.900404858299595, + "grad_norm": 1.7228669637843592, + "learning_rate": 5.583424577501273e-08, + "loss": 0.982, + "step": 3582 + }, + { + "epoch": 2.9012145748987854, + "grad_norm": 1.6410483294411622, + "learning_rate": 5.4930891116007355e-08, + "loss": 0.9662, + "step": 3583 + }, + { + "epoch": 2.902024291497976, + "grad_norm": 1.695947041652728, + "learning_rate": 5.40348838413951e-08, + "loss": 0.9336, + "step": 3584 + }, + { + "epoch": 2.902834008097166, + "grad_norm": 1.7092686814192066, + "learning_rate": 5.3146224613144225e-08, + "loss": 1.0737, + "step": 3585 + }, + { + "epoch": 2.903643724696356, + "grad_norm": 1.7080949259326936, + "learning_rate": 5.2264914087792885e-08, + "loss": 0.9843, + "step": 3586 + }, + { + "epoch": 2.9044534412955465, + "grad_norm": 1.6572515892039745, + "learning_rate": 5.139095291645024e-08, + "loss": 1.0179, + "step": 3587 + }, + { + "epoch": 2.905263157894737, + "grad_norm": 1.6690887807850592, + "learning_rate": 5.052434174479759e-08, + "loss": 1.0066, + "step": 3588 + }, + { + "epoch": 2.906072874493927, + "grad_norm": 1.6532884012275588, + "learning_rate": 4.966508121308167e-08, + "loss": 0.9931, + "step": 3589 + }, + { + "epoch": 2.9068825910931175, + "grad_norm": 1.7133322607154833, + "learning_rate": 4.8813171956123565e-08, + "loss": 0.9854, + "step": 3590 + }, + { + "epoch": 2.9076923076923076, + "grad_norm": 1.6567745485844279, + "learning_rate": 4.796861460330982e-08, + "loss": 1.0568, + "step": 3591 + }, + { + "epoch": 2.908502024291498, + "grad_norm": 1.7343588764303524, + "learning_rate": 4.713140977859687e-08, + "loss": 0.9529, + "step": 3592 + }, + { + "epoch": 2.909311740890688, + "grad_norm": 1.6208499139806953, + "learning_rate": 4.630155810050885e-08, + "loss": 1.0186, + "step": 3593 + }, + { + "epoch": 2.9101214574898786, + "grad_norm": 1.6493435535547711, + "learning_rate": 4.547906018213866e-08, + "loss": 1.0102, + "step": 3594 + }, + { + "epoch": 2.910931174089069, + "grad_norm": 1.6692153459998045, + "learning_rate": 4.4663916631143554e-08, + "loss": 0.9606, + "step": 3595 + }, + { + "epoch": 2.911740890688259, + "grad_norm": 1.6721684961284737, + "learning_rate": 4.3856128049749594e-08, + "loss": 0.9796, + "step": 3596 + }, + { + "epoch": 2.912550607287449, + "grad_norm": 1.7315280769395167, + "learning_rate": 4.3055695034747155e-08, + "loss": 0.9748, + "step": 3597 + }, + { + "epoch": 2.9133603238866397, + "grad_norm": 1.6378741930140095, + "learning_rate": 4.226261817749544e-08, + "loss": 0.9593, + "step": 3598 + }, + { + "epoch": 2.91417004048583, + "grad_norm": 1.7322062631852746, + "learning_rate": 4.147689806391575e-08, + "loss": 1.0119, + "step": 3599 + }, + { + "epoch": 2.91497975708502, + "grad_norm": 1.7138380672008324, + "learning_rate": 4.069853527449596e-08, + "loss": 0.988, + "step": 3600 + }, + { + "epoch": 2.9157894736842103, + "grad_norm": 1.7209134121305623, + "learning_rate": 3.9927530384288314e-08, + "loss": 0.9688, + "step": 3601 + }, + { + "epoch": 2.9165991902834008, + "grad_norm": 1.7412002078631348, + "learning_rate": 3.916388396290716e-08, + "loss": 0.9903, + "step": 3602 + }, + { + "epoch": 2.9174089068825912, + "grad_norm": 1.7072796682597313, + "learning_rate": 3.840759657453452e-08, + "loss": 0.9591, + "step": 3603 + }, + { + "epoch": 2.9182186234817813, + "grad_norm": 1.6648056164804599, + "learning_rate": 3.7658668777910135e-08, + "loss": 0.9767, + "step": 3604 + }, + { + "epoch": 2.919028340080972, + "grad_norm": 1.7237893699815425, + "learning_rate": 3.691710112634139e-08, + "loss": 1.0162, + "step": 3605 + }, + { + "epoch": 2.919838056680162, + "grad_norm": 1.7961044154386043, + "learning_rate": 3.61828941676956e-08, + "loss": 1.0186, + "step": 3606 + }, + { + "epoch": 2.9206477732793523, + "grad_norm": 1.6497363776121279, + "learning_rate": 3.54560484444022e-08, + "loss": 0.909, + "step": 3607 + }, + { + "epoch": 2.9214574898785424, + "grad_norm": 1.747468367170849, + "learning_rate": 3.473656449345275e-08, + "loss": 0.89, + "step": 3608 + }, + { + "epoch": 2.922267206477733, + "grad_norm": 1.762439292957918, + "learning_rate": 3.402444284639872e-08, + "loss": 0.9111, + "step": 3609 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 1.7149546863394658, + "learning_rate": 3.3319684029354815e-08, + "loss": 0.9661, + "step": 3610 + }, + { + "epoch": 2.9238866396761134, + "grad_norm": 1.6630865806186383, + "learning_rate": 3.262228856299343e-08, + "loss": 0.9029, + "step": 3611 + }, + { + "epoch": 2.9246963562753034, + "grad_norm": 1.7363529145465648, + "learning_rate": 3.193225696254798e-08, + "loss": 0.9848, + "step": 3612 + }, + { + "epoch": 2.925506072874494, + "grad_norm": 1.6797985549185608, + "learning_rate": 3.124958973781178e-08, + "loss": 1.0399, + "step": 3613 + }, + { + "epoch": 2.9263157894736844, + "grad_norm": 1.6469873014958698, + "learning_rate": 3.057428739313695e-08, + "loss": 0.9589, + "step": 3614 + }, + { + "epoch": 2.9271255060728745, + "grad_norm": 1.7250247951219637, + "learning_rate": 2.9906350427435505e-08, + "loss": 0.937, + "step": 3615 + }, + { + "epoch": 2.9279352226720645, + "grad_norm": 1.6796115243111611, + "learning_rate": 2.924577933417716e-08, + "loss": 0.9864, + "step": 3616 + }, + { + "epoch": 2.928744939271255, + "grad_norm": 1.6550805457651048, + "learning_rate": 2.85925746013882e-08, + "loss": 1.0157, + "step": 3617 + }, + { + "epoch": 2.9295546558704455, + "grad_norm": 1.7491759234899906, + "learning_rate": 2.7946736711654822e-08, + "loss": 1.004, + "step": 3618 + }, + { + "epoch": 2.9303643724696355, + "grad_norm": 1.64099300892061, + "learning_rate": 2.7308266142119788e-08, + "loss": 1.0059, + "step": 3619 + }, + { + "epoch": 2.931174089068826, + "grad_norm": 1.6299619725247847, + "learning_rate": 2.667716336448356e-08, + "loss": 0.9822, + "step": 3620 + }, + { + "epoch": 2.931983805668016, + "grad_norm": 1.6619338478776728, + "learning_rate": 2.605342884500206e-08, + "loss": 0.9879, + "step": 3621 + }, + { + "epoch": 2.9327935222672066, + "grad_norm": 1.6869289651046775, + "learning_rate": 2.5437063044488895e-08, + "loss": 0.9443, + "step": 3622 + }, + { + "epoch": 2.9336032388663966, + "grad_norm": 1.650516004800676, + "learning_rate": 2.4828066418314256e-08, + "loss": 0.9792, + "step": 3623 + }, + { + "epoch": 2.934412955465587, + "grad_norm": 1.714896614796574, + "learning_rate": 2.422643941640046e-08, + "loss": 0.9162, + "step": 3624 + }, + { + "epoch": 2.9352226720647776, + "grad_norm": 1.614786417424853, + "learning_rate": 2.3632182483228628e-08, + "loss": 0.9934, + "step": 3625 + }, + { + "epoch": 2.9360323886639677, + "grad_norm": 1.7199405245576622, + "learning_rate": 2.3045296057834232e-08, + "loss": 0.9467, + "step": 3626 + }, + { + "epoch": 2.9368421052631577, + "grad_norm": 1.689626777761019, + "learning_rate": 2.2465780573807105e-08, + "loss": 0.9896, + "step": 3627 + }, + { + "epoch": 2.937651821862348, + "grad_norm": 1.618965258362145, + "learning_rate": 2.1893636459289213e-08, + "loss": 0.9987, + "step": 3628 + }, + { + "epoch": 2.9384615384615387, + "grad_norm": 1.6953456161595155, + "learning_rate": 2.132886413698243e-08, + "loss": 0.9242, + "step": 3629 + }, + { + "epoch": 2.9392712550607287, + "grad_norm": 1.640775504856468, + "learning_rate": 2.077146402413521e-08, + "loss": 0.9815, + "step": 3630 + }, + { + "epoch": 2.9400809716599188, + "grad_norm": 1.630716994540923, + "learning_rate": 2.0221436532555928e-08, + "loss": 0.9879, + "step": 3631 + }, + { + "epoch": 2.9408906882591093, + "grad_norm": 1.6746857067688603, + "learning_rate": 1.9678782068600633e-08, + "loss": 0.9805, + "step": 3632 + }, + { + "epoch": 2.9417004048582998, + "grad_norm": 1.692709178383313, + "learning_rate": 1.9143501033181965e-08, + "loss": 0.9926, + "step": 3633 + }, + { + "epoch": 2.94251012145749, + "grad_norm": 1.722482470681197, + "learning_rate": 1.8615593821763587e-08, + "loss": 1.0066, + "step": 3634 + }, + { + "epoch": 2.9433198380566803, + "grad_norm": 1.6955658283586104, + "learning_rate": 1.80950608243613e-08, + "loss": 1.0168, + "step": 3635 + }, + { + "epoch": 2.9441295546558703, + "grad_norm": 1.6593146288029987, + "learning_rate": 1.758190242554303e-08, + "loss": 0.9471, + "step": 3636 + }, + { + "epoch": 2.944939271255061, + "grad_norm": 1.6437181186269008, + "learning_rate": 1.7076119004429958e-08, + "loss": 1.0134, + "step": 3637 + }, + { + "epoch": 2.945748987854251, + "grad_norm": 1.7191306396154686, + "learning_rate": 1.657771093469096e-08, + "loss": 0.8814, + "step": 3638 + }, + { + "epoch": 2.9465587044534414, + "grad_norm": 1.6761533860830595, + "learning_rate": 1.6086678584550374e-08, + "loss": 0.9793, + "step": 3639 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 1.704443447654609, + "learning_rate": 1.5603022316780235e-08, + "loss": 0.9372, + "step": 3640 + }, + { + "epoch": 2.948178137651822, + "grad_norm": 1.7083386239114013, + "learning_rate": 1.5126742488703604e-08, + "loss": 0.9712, + "step": 3641 + }, + { + "epoch": 2.948987854251012, + "grad_norm": 1.6835922657350433, + "learning_rate": 1.4657839452195677e-08, + "loss": 0.9932, + "step": 3642 + }, + { + "epoch": 2.9497975708502024, + "grad_norm": 1.7376719503586473, + "learning_rate": 1.4196313553680453e-08, + "loss": 0.8903, + "step": 3643 + }, + { + "epoch": 2.950607287449393, + "grad_norm": 1.7489663276342433, + "learning_rate": 1.3742165134130736e-08, + "loss": 0.9544, + "step": 3644 + }, + { + "epoch": 2.951417004048583, + "grad_norm": 1.6975365462225342, + "learning_rate": 1.329539452907036e-08, + "loss": 0.9856, + "step": 3645 + }, + { + "epoch": 2.952226720647773, + "grad_norm": 1.7039651291858493, + "learning_rate": 1.285600206857196e-08, + "loss": 0.9986, + "step": 3646 + }, + { + "epoch": 2.9530364372469635, + "grad_norm": 1.6540752577120743, + "learning_rate": 1.2423988077258087e-08, + "loss": 1.0064, + "step": 3647 + }, + { + "epoch": 2.953846153846154, + "grad_norm": 1.563653194661081, + "learning_rate": 1.1999352874297876e-08, + "loss": 1.0462, + "step": 3648 + }, + { + "epoch": 2.954655870445344, + "grad_norm": 1.6611119980537208, + "learning_rate": 1.1582096773410379e-08, + "loss": 1.0157, + "step": 3649 + }, + { + "epoch": 2.9554655870445345, + "grad_norm": 1.6209215418673872, + "learning_rate": 1.117222008286456e-08, + "loss": 1.0152, + "step": 3650 + }, + { + "epoch": 2.9562753036437246, + "grad_norm": 1.651092000152933, + "learning_rate": 1.0769723105474861e-08, + "loss": 0.9698, + "step": 3651 + }, + { + "epoch": 2.957085020242915, + "grad_norm": 1.6427078147132284, + "learning_rate": 1.0374606138605636e-08, + "loss": 0.9562, + "step": 3652 + }, + { + "epoch": 2.957894736842105, + "grad_norm": 1.6736772750275095, + "learning_rate": 9.986869474166716e-09, + "loss": 0.9024, + "step": 3653 + }, + { + "epoch": 2.9587044534412956, + "grad_norm": 1.744834463851239, + "learning_rate": 9.606513398617846e-09, + "loss": 0.8867, + "step": 3654 + }, + { + "epoch": 2.9595141700404857, + "grad_norm": 1.58327630324171, + "learning_rate": 9.233538192963132e-09, + "loss": 1.0089, + "step": 3655 + }, + { + "epoch": 2.960323886639676, + "grad_norm": 1.6438964342520213, + "learning_rate": 8.867944132757711e-09, + "loss": 0.9828, + "step": 3656 + }, + { + "epoch": 2.961133603238866, + "grad_norm": 1.7224600116058253, + "learning_rate": 8.50973148809997e-09, + "loss": 0.9382, + "step": 3657 + }, + { + "epoch": 2.9619433198380567, + "grad_norm": 1.6459642766819444, + "learning_rate": 8.158900523635993e-09, + "loss": 0.9835, + "step": 3658 + }, + { + "epoch": 2.962753036437247, + "grad_norm": 1.6777112954645979, + "learning_rate": 7.815451498559557e-09, + "loss": 0.9928, + "step": 3659 + }, + { + "epoch": 2.9635627530364372, + "grad_norm": 1.7128316731395987, + "learning_rate": 7.479384666608802e-09, + "loss": 0.9999, + "step": 3660 + }, + { + "epoch": 2.9643724696356273, + "grad_norm": 1.7331941430919588, + "learning_rate": 7.150700276068457e-09, + "loss": 0.9244, + "step": 3661 + }, + { + "epoch": 2.9651821862348178, + "grad_norm": 1.6425820505456277, + "learning_rate": 6.82939856977094e-09, + "loss": 1.0678, + "step": 3662 + }, + { + "epoch": 2.9659919028340083, + "grad_norm": 1.6923771797910705, + "learning_rate": 6.515479785091927e-09, + "loss": 0.9698, + "step": 3663 + }, + { + "epoch": 2.9668016194331983, + "grad_norm": 1.6765814114028854, + "learning_rate": 6.208944153953678e-09, + "loss": 0.9733, + "step": 3664 + }, + { + "epoch": 2.967611336032389, + "grad_norm": 1.631450848919302, + "learning_rate": 5.909791902823925e-09, + "loss": 0.9317, + "step": 3665 + }, + { + "epoch": 2.968421052631579, + "grad_norm": 1.6990001884177788, + "learning_rate": 5.618023252714766e-09, + "loss": 0.9603, + "step": 3666 + }, + { + "epoch": 2.9692307692307693, + "grad_norm": 1.6821450995668512, + "learning_rate": 5.333638419184883e-09, + "loss": 0.9634, + "step": 3667 + }, + { + "epoch": 2.9700404858299594, + "grad_norm": 1.6746580159744173, + "learning_rate": 5.056637612336212e-09, + "loss": 0.8863, + "step": 3668 + }, + { + "epoch": 2.97085020242915, + "grad_norm": 1.6902742756024816, + "learning_rate": 4.787021036816164e-09, + "loss": 0.9718, + "step": 3669 + }, + { + "epoch": 2.97165991902834, + "grad_norm": 1.7087829075461096, + "learning_rate": 4.524788891816512e-09, + "loss": 0.9482, + "step": 3670 + }, + { + "epoch": 2.9724696356275304, + "grad_norm": 1.7218111890670291, + "learning_rate": 4.269941371073394e-09, + "loss": 0.9897, + "step": 3671 + }, + { + "epoch": 2.9732793522267205, + "grad_norm": 1.6710781319892993, + "learning_rate": 4.022478662867313e-09, + "loss": 0.9778, + "step": 3672 + }, + { + "epoch": 2.974089068825911, + "grad_norm": 1.6227351235688, + "learning_rate": 3.782400950023135e-09, + "loss": 0.9521, + "step": 3673 + }, + { + "epoch": 2.9748987854251014, + "grad_norm": 1.6542823286693975, + "learning_rate": 3.5497084099100907e-09, + "loss": 0.9691, + "step": 3674 + }, + { + "epoch": 2.9757085020242915, + "grad_norm": 1.6462032546016088, + "learning_rate": 3.3244012144395545e-09, + "loss": 0.9743, + "step": 3675 + }, + { + "epoch": 2.9765182186234815, + "grad_norm": 1.6774459355985822, + "learning_rate": 3.1064795300683735e-09, + "loss": 1.0424, + "step": 3676 + }, + { + "epoch": 2.977327935222672, + "grad_norm": 1.6990599030124527, + "learning_rate": 2.8959435177955407e-09, + "loss": 0.9464, + "step": 3677 + }, + { + "epoch": 2.9781376518218625, + "grad_norm": 1.6902993214670898, + "learning_rate": 2.692793333165522e-09, + "loss": 1.0075, + "step": 3678 + }, + { + "epoch": 2.9789473684210526, + "grad_norm": 1.623677495815907, + "learning_rate": 2.4970291262649272e-09, + "loss": 0.9633, + "step": 3679 + }, + { + "epoch": 2.979757085020243, + "grad_norm": 1.7265662360747522, + "learning_rate": 2.3086510417225093e-09, + "loss": 1.0392, + "step": 3680 + }, + { + "epoch": 2.980566801619433, + "grad_norm": 1.6751044099068313, + "learning_rate": 2.1276592187124966e-09, + "loss": 0.9776, + "step": 3681 + }, + { + "epoch": 2.9813765182186236, + "grad_norm": 1.6716882137403286, + "learning_rate": 1.95405379095126e-09, + "loss": 0.9679, + "step": 3682 + }, + { + "epoch": 2.9821862348178136, + "grad_norm": 1.7059793709856448, + "learning_rate": 1.7878348866962047e-09, + "loss": 0.9525, + "step": 3683 + }, + { + "epoch": 2.982995951417004, + "grad_norm": 1.6552987765137663, + "learning_rate": 1.6290026287513194e-09, + "loss": 1.0131, + "step": 3684 + }, + { + "epoch": 2.983805668016194, + "grad_norm": 1.6707647453986292, + "learning_rate": 1.4775571344605167e-09, + "loss": 0.9977, + "step": 3685 + }, + { + "epoch": 2.9846153846153847, + "grad_norm": 1.6942525860442892, + "learning_rate": 1.3334985157109625e-09, + "loss": 0.9585, + "step": 3686 + }, + { + "epoch": 2.9854251012145747, + "grad_norm": 1.6490557158214814, + "learning_rate": 1.1968268789330773e-09, + "loss": 0.9482, + "step": 3687 + }, + { + "epoch": 2.986234817813765, + "grad_norm": 1.6249913718833546, + "learning_rate": 1.0675423250994244e-09, + "loss": 0.9683, + "step": 3688 + }, + { + "epoch": 2.9870445344129557, + "grad_norm": 1.6778588925327733, + "learning_rate": 9.456449497247112e-10, + "loss": 0.9532, + "step": 3689 + }, + { + "epoch": 2.9878542510121457, + "grad_norm": 1.6962491724057556, + "learning_rate": 8.311348428657884e-10, + "loss": 0.9811, + "step": 3690 + }, + { + "epoch": 2.988663967611336, + "grad_norm": 1.6716685263614628, + "learning_rate": 7.240120891238711e-10, + "loss": 0.9267, + "step": 3691 + }, + { + "epoch": 2.9894736842105263, + "grad_norm": 1.7528045485583463, + "learning_rate": 6.242767676400974e-10, + "loss": 0.9343, + "step": 3692 + }, + { + "epoch": 2.9902834008097168, + "grad_norm": 1.7066732550694934, + "learning_rate": 5.31928952098859e-10, + "loss": 0.956, + "step": 3693 + }, + { + "epoch": 2.991093117408907, + "grad_norm": 1.6307704468251862, + "learning_rate": 4.469687107255816e-10, + "loss": 0.984, + "step": 3694 + }, + { + "epoch": 2.9919028340080973, + "grad_norm": 1.7099555082281752, + "learning_rate": 3.6939610628894396e-10, + "loss": 0.9458, + "step": 3695 + }, + { + "epoch": 2.9927125506072874, + "grad_norm": 1.6724491743800372, + "learning_rate": 2.9921119609976903e-10, + "loss": 0.958, + "step": 3696 + }, + { + "epoch": 2.993522267206478, + "grad_norm": 1.7205791609019359, + "learning_rate": 2.364140320110231e-10, + "loss": 0.8848, + "step": 3697 + }, + { + "epoch": 2.994331983805668, + "grad_norm": 1.7073529607261773, + "learning_rate": 1.8100466041559573e-10, + "loss": 0.9302, + "step": 3698 + }, + { + "epoch": 2.9951417004048584, + "grad_norm": 1.6422918196631109, + "learning_rate": 1.3298312225074051e-10, + "loss": 0.9791, + "step": 3699 + }, + { + "epoch": 2.9959514170040484, + "grad_norm": 1.7056433896621659, + "learning_rate": 9.234945299363418e-11, + "loss": 0.9875, + "step": 3700 + }, + { + "epoch": 2.996761133603239, + "grad_norm": 1.6180923945102257, + "learning_rate": 5.910368266470734e-11, + "loss": 0.9499, + "step": 3701 + }, + { + "epoch": 2.997570850202429, + "grad_norm": 1.6943866994325132, + "learning_rate": 3.324583582653418e-11, + "loss": 1.0206, + "step": 3702 + }, + { + "epoch": 2.9983805668016195, + "grad_norm": 1.5972662423702781, + "learning_rate": 1.477593158272228e-11, + "loss": 1.0339, + "step": 3703 + }, + { + "epoch": 2.99919028340081, + "grad_norm": 1.6264801708756609, + "learning_rate": 3.693983577912619e-12, + "loss": 0.9317, + "step": 3704 + }, + { + "epoch": 3.0, + "grad_norm": 1.6339826687405365, + "learning_rate": 0.0, + "loss": 0.9082, + "step": 3705 + } + ], + "logging_steps": 1, + "max_steps": 3705, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 618, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.96481039548416e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}