{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-05, "grad_norm": 1.2723733186721802, "learning_rate": 3e-06, "loss": 10.8324, "step": 1 }, { "epoch": 2e-05, "grad_norm": 1.2627283334732056, "learning_rate": 6e-06, "loss": 10.8325, "step": 2 }, { "epoch": 3e-05, "grad_norm": 1.2797267436981201, "learning_rate": 9e-06, "loss": 10.8328, "step": 3 }, { "epoch": 4e-05, "grad_norm": 1.2568743228912354, "learning_rate": 1.2e-05, "loss": 10.8307, "step": 4 }, { "epoch": 5e-05, "grad_norm": 1.26583731174469, "learning_rate": 1.5e-05, "loss": 10.8273, "step": 5 }, { "epoch": 6e-05, "grad_norm": 1.268788456916809, "learning_rate": 1.8e-05, "loss": 10.8268, "step": 6 }, { "epoch": 7e-05, "grad_norm": 1.215290904045105, "learning_rate": 2.1000000000000002e-05, "loss": 10.8151, "step": 7 }, { "epoch": 8e-05, "grad_norm": 1.1221675872802734, "learning_rate": 2.4e-05, "loss": 10.794, "step": 8 }, { "epoch": 9e-05, "grad_norm": 1.0951769351959229, "learning_rate": 2.7e-05, "loss": 10.7888, "step": 9 }, { "epoch": 0.0001, "grad_norm": 1.105851650238037, "learning_rate": 3e-05, "loss": 10.7765, "step": 10 }, { "epoch": 0.00011, "grad_norm": 1.0884467363357544, "learning_rate": 3.2999999999999996e-05, "loss": 10.763, "step": 11 }, { "epoch": 0.00012, "grad_norm": 1.0829719305038452, "learning_rate": 3.6e-05, "loss": 10.7508, "step": 12 }, { "epoch": 0.00013, "grad_norm": 1.0506291389465332, "learning_rate": 3.9e-05, "loss": 10.7324, "step": 13 }, { "epoch": 0.00014, "grad_norm": 1.037864089012146, "learning_rate": 4.2000000000000004e-05, "loss": 10.7208, "step": 14 }, { "epoch": 0.00015, "grad_norm": 1.0115288496017456, "learning_rate": 4.4999999999999996e-05, "loss": 10.7117, "step": 15 }, { "epoch": 0.00016, "grad_norm": 0.9676744341850281, "learning_rate": 4.8e-05, "loss": 10.6952, "step": 16 }, { "epoch": 0.00017, "grad_norm": 0.9355509877204895, "learning_rate": 5.1000000000000006e-05, "loss": 10.6792, "step": 17 }, { "epoch": 0.00018, "grad_norm": 0.9286826848983765, "learning_rate": 5.4e-05, "loss": 10.6649, "step": 18 }, { "epoch": 0.00019, "grad_norm": 0.9110698699951172, "learning_rate": 5.7e-05, "loss": 10.6512, "step": 19 }, { "epoch": 0.0002, "grad_norm": 0.9136782288551331, "learning_rate": 6e-05, "loss": 10.6404, "step": 20 }, { "epoch": 0.00021, "grad_norm": 0.8971966505050659, "learning_rate": 6.3e-05, "loss": 10.6274, "step": 21 }, { "epoch": 0.00022, "grad_norm": 0.8972620368003845, "learning_rate": 6.599999999999999e-05, "loss": 10.6136, "step": 22 }, { "epoch": 0.00023, "grad_norm": 0.8984483480453491, "learning_rate": 6.9e-05, "loss": 10.6016, "step": 23 }, { "epoch": 0.00024, "grad_norm": 0.8967456817626953, "learning_rate": 7.2e-05, "loss": 10.5894, "step": 24 }, { "epoch": 0.00025, "grad_norm": 0.8972211480140686, "learning_rate": 7.500000000000001e-05, "loss": 10.5752, "step": 25 }, { "epoch": 0.00026, "grad_norm": 0.9028114080429077, "learning_rate": 7.8e-05, "loss": 10.5611, "step": 26 }, { "epoch": 0.00027, "grad_norm": 0.8926876783370972, "learning_rate": 8.1e-05, "loss": 10.5491, "step": 27 }, { "epoch": 0.00028, "grad_norm": 0.8921052813529968, "learning_rate": 8.400000000000001e-05, "loss": 10.536, "step": 28 }, { "epoch": 0.00029, "grad_norm": 0.8942669034004211, "learning_rate": 8.7e-05, "loss": 10.5219, "step": 29 }, { "epoch": 0.0003, "grad_norm": 0.9005073308944702, "learning_rate": 8.999999999999999e-05, "loss": 10.5056, "step": 30 }, { "epoch": 0.00031, "grad_norm": 0.8994124531745911, "learning_rate": 9.3e-05, "loss": 10.491, "step": 31 }, { "epoch": 0.00032, "grad_norm": 0.8968571424484253, "learning_rate": 9.6e-05, "loss": 10.4763, "step": 32 }, { "epoch": 0.00033, "grad_norm": 0.8976972103118896, "learning_rate": 9.900000000000001e-05, "loss": 10.4597, "step": 33 }, { "epoch": 0.00034, "grad_norm": 0.8977769017219543, "learning_rate": 0.00010200000000000001, "loss": 10.4427, "step": 34 }, { "epoch": 0.00035, "grad_norm": 0.902169406414032, "learning_rate": 0.00010500000000000002, "loss": 10.4252, "step": 35 }, { "epoch": 0.00036, "grad_norm": 0.8990501165390015, "learning_rate": 0.000108, "loss": 10.4079, "step": 36 }, { "epoch": 0.00037, "grad_norm": 0.8933607935905457, "learning_rate": 0.000111, "loss": 10.39, "step": 37 }, { "epoch": 0.00038, "grad_norm": 0.8925058245658875, "learning_rate": 0.000114, "loss": 10.3704, "step": 38 }, { "epoch": 0.00039, "grad_norm": 0.8942745923995972, "learning_rate": 0.000117, "loss": 10.3512, "step": 39 }, { "epoch": 0.0004, "grad_norm": 0.8984111547470093, "learning_rate": 0.00012, "loss": 10.3286, "step": 40 }, { "epoch": 0.00041, "grad_norm": 0.8943851590156555, "learning_rate": 0.000123, "loss": 10.3097, "step": 41 }, { "epoch": 0.00042, "grad_norm": 0.8935915231704712, "learning_rate": 0.000126, "loss": 10.2894, "step": 42 }, { "epoch": 0.00043, "grad_norm": 0.8975799679756165, "learning_rate": 0.000129, "loss": 10.2654, "step": 43 }, { "epoch": 0.00044, "grad_norm": 0.8982045650482178, "learning_rate": 0.00013199999999999998, "loss": 10.2433, "step": 44 }, { "epoch": 0.00045, "grad_norm": 0.9000449180603027, "learning_rate": 0.000135, "loss": 10.2204, "step": 45 }, { "epoch": 0.00046, "grad_norm": 0.8900250792503357, "learning_rate": 0.000138, "loss": 10.1983, "step": 46 }, { "epoch": 0.00047, "grad_norm": 0.8965498805046082, "learning_rate": 0.000141, "loss": 10.1723, "step": 47 }, { "epoch": 0.00048, "grad_norm": 0.8975719213485718, "learning_rate": 0.000144, "loss": 10.149, "step": 48 }, { "epoch": 0.00049, "grad_norm": 0.8933398127555847, "learning_rate": 0.000147, "loss": 10.1239, "step": 49 }, { "epoch": 0.0005, "grad_norm": 0.8988479375839233, "learning_rate": 0.00015000000000000001, "loss": 10.0972, "step": 50 }, { "epoch": 0.00051, "grad_norm": 0.8983374834060669, "learning_rate": 0.000153, "loss": 10.0711, "step": 51 }, { "epoch": 0.00052, "grad_norm": 0.8959178328514099, "learning_rate": 0.000156, "loss": 10.0437, "step": 52 }, { "epoch": 0.00053, "grad_norm": 0.8871957063674927, "learning_rate": 0.000159, "loss": 10.0204, "step": 53 }, { "epoch": 0.00054, "grad_norm": 0.9051761627197266, "learning_rate": 0.000162, "loss": 9.9878, "step": 54 }, { "epoch": 0.00055, "grad_norm": 0.8952219486236572, "learning_rate": 0.000165, "loss": 9.963, "step": 55 }, { "epoch": 0.00056, "grad_norm": 0.890164315700531, "learning_rate": 0.00016800000000000002, "loss": 9.9341, "step": 56 }, { "epoch": 0.00057, "grad_norm": 0.8922548890113831, "learning_rate": 0.000171, "loss": 9.9069, "step": 57 }, { "epoch": 0.00058, "grad_norm": 0.8901249766349792, "learning_rate": 0.000174, "loss": 9.882, "step": 58 }, { "epoch": 0.00059, "grad_norm": 0.8989579677581787, "learning_rate": 0.000177, "loss": 9.8502, "step": 59 }, { "epoch": 0.0006, "grad_norm": 0.8829832673072815, "learning_rate": 0.00017999999999999998, "loss": 9.8242, "step": 60 }, { "epoch": 0.00061, "grad_norm": 0.8862175345420837, "learning_rate": 0.000183, "loss": 9.7955, "step": 61 }, { "epoch": 0.00062, "grad_norm": 0.8893216848373413, "learning_rate": 0.000186, "loss": 9.7648, "step": 62 }, { "epoch": 0.00063, "grad_norm": 0.8881028294563293, "learning_rate": 0.000189, "loss": 9.7373, "step": 63 }, { "epoch": 0.00064, "grad_norm": 0.8868633508682251, "learning_rate": 0.000192, "loss": 9.7068, "step": 64 }, { "epoch": 0.00065, "grad_norm": 0.8924434185028076, "learning_rate": 0.00019500000000000002, "loss": 9.6743, "step": 65 }, { "epoch": 0.00066, "grad_norm": 0.8872416019439697, "learning_rate": 0.00019800000000000002, "loss": 9.6503, "step": 66 }, { "epoch": 0.00067, "grad_norm": 0.8866419196128845, "learning_rate": 0.000201, "loss": 9.62, "step": 67 }, { "epoch": 0.00068, "grad_norm": 0.8931677937507629, "learning_rate": 0.00020400000000000003, "loss": 9.5881, "step": 68 }, { "epoch": 0.00069, "grad_norm": 0.879610002040863, "learning_rate": 0.00020700000000000002, "loss": 9.5597, "step": 69 }, { "epoch": 0.0007, "grad_norm": 0.8912403583526611, "learning_rate": 0.00021000000000000004, "loss": 9.5236, "step": 70 }, { "epoch": 0.00071, "grad_norm": 0.8847392797470093, "learning_rate": 0.00021299999999999997, "loss": 9.4965, "step": 71 }, { "epoch": 0.00072, "grad_norm": 0.8860267996788025, "learning_rate": 0.000216, "loss": 9.4698, "step": 72 }, { "epoch": 0.00073, "grad_norm": 0.886963963508606, "learning_rate": 0.00021899999999999998, "loss": 9.4354, "step": 73 }, { "epoch": 0.00074, "grad_norm": 0.8829045295715332, "learning_rate": 0.000222, "loss": 9.4089, "step": 74 }, { "epoch": 0.00075, "grad_norm": 0.8822581768035889, "learning_rate": 0.000225, "loss": 9.3746, "step": 75 }, { "epoch": 0.00076, "grad_norm": 0.8918945789337158, "learning_rate": 0.000228, "loss": 9.3485, "step": 76 }, { "epoch": 0.00077, "grad_norm": 0.8851014375686646, "learning_rate": 0.000231, "loss": 9.3206, "step": 77 }, { "epoch": 0.00078, "grad_norm": 0.8782386183738708, "learning_rate": 0.000234, "loss": 9.2909, "step": 78 }, { "epoch": 0.00079, "grad_norm": 0.8847852349281311, "learning_rate": 0.00023700000000000001, "loss": 9.2539, "step": 79 }, { "epoch": 0.0008, "grad_norm": 0.8866625428199768, "learning_rate": 0.00024, "loss": 9.2184, "step": 80 }, { "epoch": 0.00081, "grad_norm": 0.8906494975090027, "learning_rate": 0.00024300000000000002, "loss": 9.1857, "step": 81 }, { "epoch": 0.00082, "grad_norm": 0.8960816264152527, "learning_rate": 0.000246, "loss": 9.1611, "step": 82 }, { "epoch": 0.00083, "grad_norm": 0.8946034908294678, "learning_rate": 0.00024900000000000004, "loss": 9.1247, "step": 83 }, { "epoch": 0.00084, "grad_norm": 0.8957815170288086, "learning_rate": 0.000252, "loss": 9.0939, "step": 84 }, { "epoch": 0.00085, "grad_norm": 0.8891732692718506, "learning_rate": 0.000255, "loss": 9.0702, "step": 85 }, { "epoch": 0.00086, "grad_norm": 0.8951762318611145, "learning_rate": 0.000258, "loss": 9.0346, "step": 86 }, { "epoch": 0.00087, "grad_norm": 0.8848313093185425, "learning_rate": 0.000261, "loss": 9.0125, "step": 87 }, { "epoch": 0.00088, "grad_norm": 0.8860167860984802, "learning_rate": 0.00026399999999999997, "loss": 8.9751, "step": 88 }, { "epoch": 0.00089, "grad_norm": 0.8798444271087646, "learning_rate": 0.000267, "loss": 8.9486, "step": 89 }, { "epoch": 0.0009, "grad_norm": 0.8837233185768127, "learning_rate": 0.00027, "loss": 8.9221, "step": 90 }, { "epoch": 0.00091, "grad_norm": 0.879225492477417, "learning_rate": 0.000273, "loss": 8.8916, "step": 91 }, { "epoch": 0.00092, "grad_norm": 0.8843948245048523, "learning_rate": 0.000276, "loss": 8.8596, "step": 92 }, { "epoch": 0.00093, "grad_norm": 0.882838785648346, "learning_rate": 0.000279, "loss": 8.8297, "step": 93 }, { "epoch": 0.00094, "grad_norm": 0.8830418586730957, "learning_rate": 0.000282, "loss": 8.8034, "step": 94 }, { "epoch": 0.00095, "grad_norm": 0.8770962357521057, "learning_rate": 0.000285, "loss": 8.7743, "step": 95 }, { "epoch": 0.00096, "grad_norm": 0.8804563879966736, "learning_rate": 0.000288, "loss": 8.7444, "step": 96 }, { "epoch": 0.00097, "grad_norm": 0.8753241300582886, "learning_rate": 0.000291, "loss": 8.7211, "step": 97 }, { "epoch": 0.00098, "grad_norm": 0.8762865662574768, "learning_rate": 0.000294, "loss": 8.6826, "step": 98 }, { "epoch": 0.00099, "grad_norm": 0.8762408494949341, "learning_rate": 0.000297, "loss": 8.6601, "step": 99 }, { "epoch": 0.001, "grad_norm": 0.8741625547409058, "learning_rate": 0.00030000000000000003, "loss": 8.6324, "step": 100 }, { "epoch": 0.00101, "grad_norm": 0.8789051175117493, "learning_rate": 0.00030300000000000005, "loss": 8.5981, "step": 101 }, { "epoch": 0.00102, "grad_norm": 0.8656250834465027, "learning_rate": 0.000306, "loss": 8.5772, "step": 102 }, { "epoch": 0.00103, "grad_norm": 0.8743636012077332, "learning_rate": 0.000309, "loss": 8.5519, "step": 103 }, { "epoch": 0.00104, "grad_norm": 0.8708215951919556, "learning_rate": 0.000312, "loss": 8.5298, "step": 104 }, { "epoch": 0.00105, "grad_norm": 0.8859707713127136, "learning_rate": 0.000315, "loss": 8.5033, "step": 105 }, { "epoch": 0.00106, "grad_norm": 0.9068981409072876, "learning_rate": 0.000318, "loss": 8.4736, "step": 106 }, { "epoch": 0.00107, "grad_norm": 0.9245584011077881, "learning_rate": 0.000321, "loss": 8.4478, "step": 107 }, { "epoch": 0.00108, "grad_norm": 0.9128947257995605, "learning_rate": 0.000324, "loss": 8.4188, "step": 108 }, { "epoch": 0.00109, "grad_norm": 0.853670060634613, "learning_rate": 0.000327, "loss": 8.3963, "step": 109 }, { "epoch": 0.0011, "grad_norm": 0.8760496377944946, "learning_rate": 0.00033, "loss": 8.3734, "step": 110 }, { "epoch": 0.00111, "grad_norm": 0.9078761339187622, "learning_rate": 0.000333, "loss": 8.3444, "step": 111 }, { "epoch": 0.00112, "grad_norm": 0.866322934627533, "learning_rate": 0.00033600000000000004, "loss": 8.3207, "step": 112 }, { "epoch": 0.00113, "grad_norm": 0.8490086197853088, "learning_rate": 0.000339, "loss": 8.2796, "step": 113 }, { "epoch": 0.00114, "grad_norm": 0.8713237047195435, "learning_rate": 0.000342, "loss": 8.2694, "step": 114 }, { "epoch": 0.00115, "grad_norm": 0.8709179162979126, "learning_rate": 0.00034500000000000004, "loss": 8.2404, "step": 115 }, { "epoch": 0.00116, "grad_norm": 0.8300504684448242, "learning_rate": 0.000348, "loss": 8.214, "step": 116 }, { "epoch": 0.00117, "grad_norm": 0.8302497863769531, "learning_rate": 0.000351, "loss": 8.1818, "step": 117 }, { "epoch": 0.00118, "grad_norm": 0.8652266263961792, "learning_rate": 0.000354, "loss": 8.1647, "step": 118 }, { "epoch": 0.00119, "grad_norm": 0.9069057106971741, "learning_rate": 0.000357, "loss": 8.1514, "step": 119 }, { "epoch": 0.0012, "grad_norm": 0.9525896906852722, "learning_rate": 0.00035999999999999997, "loss": 8.1239, "step": 120 }, { "epoch": 0.00121, "grad_norm": 0.9400830268859863, "learning_rate": 0.000363, "loss": 8.095, "step": 121 }, { "epoch": 0.00122, "grad_norm": 0.8189533948898315, "learning_rate": 0.000366, "loss": 8.0717, "step": 122 }, { "epoch": 0.00123, "grad_norm": 0.8198633790016174, "learning_rate": 0.000369, "loss": 8.0459, "step": 123 }, { "epoch": 0.00124, "grad_norm": 0.9460069537162781, "learning_rate": 0.000372, "loss": 8.0261, "step": 124 }, { "epoch": 0.00125, "grad_norm": 1.0734294652938843, "learning_rate": 0.000375, "loss": 8.002, "step": 125 }, { "epoch": 0.00126, "grad_norm": 0.9635769724845886, "learning_rate": 0.000378, "loss": 7.9871, "step": 126 }, { "epoch": 0.00127, "grad_norm": 0.7875692844390869, "learning_rate": 0.000381, "loss": 7.9581, "step": 127 }, { "epoch": 0.00128, "grad_norm": 0.8865201473236084, "learning_rate": 0.000384, "loss": 7.9374, "step": 128 }, { "epoch": 0.00129, "grad_norm": 0.9998716115951538, "learning_rate": 0.00038700000000000003, "loss": 7.9265, "step": 129 }, { "epoch": 0.0013, "grad_norm": 0.8098431825637817, "learning_rate": 0.00039000000000000005, "loss": 7.8932, "step": 130 }, { "epoch": 0.00131, "grad_norm": 0.8202345967292786, "learning_rate": 0.000393, "loss": 7.8568, "step": 131 }, { "epoch": 0.00132, "grad_norm": 0.9445962905883789, "learning_rate": 0.00039600000000000003, "loss": 7.8481, "step": 132 }, { "epoch": 0.00133, "grad_norm": 0.8225625157356262, "learning_rate": 0.00039900000000000005, "loss": 7.8198, "step": 133 }, { "epoch": 0.00134, "grad_norm": 0.8087729811668396, "learning_rate": 0.000402, "loss": 7.8072, "step": 134 }, { "epoch": 0.00135, "grad_norm": 0.7232753038406372, "learning_rate": 0.00040500000000000003, "loss": 7.7727, "step": 135 }, { "epoch": 0.00136, "grad_norm": 0.8383559584617615, "learning_rate": 0.00040800000000000005, "loss": 7.7588, "step": 136 }, { "epoch": 0.00137, "grad_norm": 0.8329391479492188, "learning_rate": 0.000411, "loss": 7.738, "step": 137 }, { "epoch": 0.00138, "grad_norm": 0.8072112202644348, "learning_rate": 0.00041400000000000003, "loss": 7.7102, "step": 138 }, { "epoch": 0.00139, "grad_norm": 0.7906625270843506, "learning_rate": 0.00041700000000000005, "loss": 7.6947, "step": 139 }, { "epoch": 0.0014, "grad_norm": 0.7997886538505554, "learning_rate": 0.00042000000000000007, "loss": 7.6753, "step": 140 }, { "epoch": 0.00141, "grad_norm": 0.9642479419708252, "learning_rate": 0.000423, "loss": 7.6647, "step": 141 }, { "epoch": 0.00142, "grad_norm": 0.8051616549491882, "learning_rate": 0.00042599999999999995, "loss": 7.6463, "step": 142 }, { "epoch": 0.00143, "grad_norm": 0.7901502251625061, "learning_rate": 0.00042899999999999997, "loss": 7.6185, "step": 143 }, { "epoch": 0.00144, "grad_norm": 0.6530913710594177, "learning_rate": 0.000432, "loss": 7.5947, "step": 144 }, { "epoch": 0.00145, "grad_norm": 0.6823164820671082, "learning_rate": 0.000435, "loss": 7.5887, "step": 145 }, { "epoch": 0.00146, "grad_norm": 0.7043561935424805, "learning_rate": 0.00043799999999999997, "loss": 7.5506, "step": 146 }, { "epoch": 0.00147, "grad_norm": 0.660875141620636, "learning_rate": 0.000441, "loss": 7.5403, "step": 147 }, { "epoch": 0.00148, "grad_norm": 0.6651095747947693, "learning_rate": 0.000444, "loss": 7.527, "step": 148 }, { "epoch": 0.00149, "grad_norm": 0.594466507434845, "learning_rate": 0.00044699999999999997, "loss": 7.5177, "step": 149 }, { "epoch": 0.0015, "grad_norm": 0.640634298324585, "learning_rate": 0.00045, "loss": 7.4872, "step": 150 }, { "epoch": 0.00151, "grad_norm": 0.6399310231208801, "learning_rate": 0.000453, "loss": 7.4669, "step": 151 }, { "epoch": 0.00152, "grad_norm": 0.6032711863517761, "learning_rate": 0.000456, "loss": 7.4595, "step": 152 }, { "epoch": 0.00153, "grad_norm": 0.8105739951133728, "learning_rate": 0.000459, "loss": 7.4504, "step": 153 }, { "epoch": 0.00154, "grad_norm": 0.9096337556838989, "learning_rate": 0.000462, "loss": 7.4401, "step": 154 }, { "epoch": 0.00155, "grad_norm": 0.8464334607124329, "learning_rate": 0.000465, "loss": 7.4179, "step": 155 }, { "epoch": 0.00156, "grad_norm": 0.8338698148727417, "learning_rate": 0.000468, "loss": 7.392, "step": 156 }, { "epoch": 0.00157, "grad_norm": 0.6862301230430603, "learning_rate": 0.000471, "loss": 7.3898, "step": 157 }, { "epoch": 0.00158, "grad_norm": 0.6174972057342529, "learning_rate": 0.00047400000000000003, "loss": 7.3642, "step": 158 }, { "epoch": 0.00159, "grad_norm": 0.7215908765792847, "learning_rate": 0.000477, "loss": 7.3548, "step": 159 }, { "epoch": 0.0016, "grad_norm": 0.7243570685386658, "learning_rate": 0.00048, "loss": 7.3363, "step": 160 }, { "epoch": 0.00161, "grad_norm": 0.5998020172119141, "learning_rate": 0.00048300000000000003, "loss": 7.3105, "step": 161 }, { "epoch": 0.00162, "grad_norm": 0.6509896516799927, "learning_rate": 0.00048600000000000005, "loss": 7.3157, "step": 162 }, { "epoch": 0.00163, "grad_norm": 0.645023763179779, "learning_rate": 0.0004890000000000001, "loss": 7.2995, "step": 163 }, { "epoch": 0.00164, "grad_norm": 0.47143545746803284, "learning_rate": 0.000492, "loss": 7.2948, "step": 164 }, { "epoch": 0.00165, "grad_norm": 0.6931191086769104, "learning_rate": 0.000495, "loss": 7.2744, "step": 165 }, { "epoch": 0.00166, "grad_norm": 0.5705839991569519, "learning_rate": 0.0004980000000000001, "loss": 7.2462, "step": 166 }, { "epoch": 0.00167, "grad_norm": 0.5678435564041138, "learning_rate": 0.000501, "loss": 7.2388, "step": 167 }, { "epoch": 0.00168, "grad_norm": 0.4481411278247833, "learning_rate": 0.000504, "loss": 7.2152, "step": 168 }, { "epoch": 0.00169, "grad_norm": 0.5297079086303711, "learning_rate": 0.0005070000000000001, "loss": 7.2111, "step": 169 }, { "epoch": 0.0017, "grad_norm": 0.5522683262825012, "learning_rate": 0.00051, "loss": 7.1957, "step": 170 }, { "epoch": 0.00171, "grad_norm": 0.4941532611846924, "learning_rate": 0.000513, "loss": 7.1764, "step": 171 }, { "epoch": 0.00172, "grad_norm": 0.40358924865722656, "learning_rate": 0.000516, "loss": 7.1752, "step": 172 }, { "epoch": 0.00173, "grad_norm": 0.43254554271698, "learning_rate": 0.0005189999999999999, "loss": 7.1608, "step": 173 }, { "epoch": 0.00174, "grad_norm": 0.49833717942237854, "learning_rate": 0.000522, "loss": 7.1779, "step": 174 }, { "epoch": 0.00175, "grad_norm": 0.469341903924942, "learning_rate": 0.000525, "loss": 7.1406, "step": 175 }, { "epoch": 0.00176, "grad_norm": 0.46195128560066223, "learning_rate": 0.0005279999999999999, "loss": 7.1165, "step": 176 }, { "epoch": 0.00177, "grad_norm": 0.4399634003639221, "learning_rate": 0.000531, "loss": 7.1206, "step": 177 }, { "epoch": 0.00178, "grad_norm": 0.42299556732177734, "learning_rate": 0.000534, "loss": 7.1134, "step": 178 }, { "epoch": 0.00179, "grad_norm": 0.3987540602684021, "learning_rate": 0.000537, "loss": 7.1017, "step": 179 }, { "epoch": 0.0018, "grad_norm": 0.37395715713500977, "learning_rate": 0.00054, "loss": 7.074, "step": 180 }, { "epoch": 0.00181, "grad_norm": 0.3351408541202545, "learning_rate": 0.000543, "loss": 7.0613, "step": 181 }, { "epoch": 0.00182, "grad_norm": 0.3589305281639099, "learning_rate": 0.000546, "loss": 7.0589, "step": 182 }, { "epoch": 0.00183, "grad_norm": 0.38710907101631165, "learning_rate": 0.000549, "loss": 7.0486, "step": 183 }, { "epoch": 0.00184, "grad_norm": 0.4560106098651886, "learning_rate": 0.000552, "loss": 7.0384, "step": 184 }, { "epoch": 0.00185, "grad_norm": 0.8466277718544006, "learning_rate": 0.000555, "loss": 7.0199, "step": 185 }, { "epoch": 0.00186, "grad_norm": 1.4107517004013062, "learning_rate": 0.000558, "loss": 7.0664, "step": 186 }, { "epoch": 0.00187, "grad_norm": 0.5632089376449585, "learning_rate": 0.000561, "loss": 7.004, "step": 187 }, { "epoch": 0.00188, "grad_norm": 1.07405686378479, "learning_rate": 0.000564, "loss": 6.9945, "step": 188 }, { "epoch": 0.00189, "grad_norm": 1.1665420532226562, "learning_rate": 0.000567, "loss": 7.0095, "step": 189 }, { "epoch": 0.0019, "grad_norm": 0.4235672950744629, "learning_rate": 0.00057, "loss": 6.962, "step": 190 }, { "epoch": 0.00191, "grad_norm": 1.2953448295593262, "learning_rate": 0.000573, "loss": 6.9853, "step": 191 }, { "epoch": 0.00192, "grad_norm": 0.5110867023468018, "learning_rate": 0.000576, "loss": 6.9512, "step": 192 }, { "epoch": 0.00193, "grad_norm": 0.7966066002845764, "learning_rate": 0.000579, "loss": 6.9656, "step": 193 }, { "epoch": 0.00194, "grad_norm": 0.5008851289749146, "learning_rate": 0.000582, "loss": 6.9458, "step": 194 }, { "epoch": 0.00195, "grad_norm": 0.714582085609436, "learning_rate": 0.000585, "loss": 6.9325, "step": 195 }, { "epoch": 0.00196, "grad_norm": 0.48010018467903137, "learning_rate": 0.000588, "loss": 6.917, "step": 196 }, { "epoch": 0.00197, "grad_norm": 0.5283955335617065, "learning_rate": 0.000591, "loss": 6.9149, "step": 197 }, { "epoch": 0.00198, "grad_norm": 0.5033705830574036, "learning_rate": 0.000594, "loss": 6.9041, "step": 198 }, { "epoch": 0.00199, "grad_norm": 0.40711161494255066, "learning_rate": 0.0005970000000000001, "loss": 6.8819, "step": 199 }, { "epoch": 0.002, "grad_norm": 0.4253259003162384, "learning_rate": 0.0006000000000000001, "loss": 6.8842, "step": 200 }, { "epoch": 0.00201, "grad_norm": 0.347766637802124, "learning_rate": 0.000603, "loss": 6.8566, "step": 201 }, { "epoch": 0.00202, "grad_norm": 0.4044833779335022, "learning_rate": 0.0006060000000000001, "loss": 6.8448, "step": 202 }, { "epoch": 0.00203, "grad_norm": 0.3598291575908661, "learning_rate": 0.0006090000000000001, "loss": 6.8489, "step": 203 }, { "epoch": 0.00204, "grad_norm": 0.35803648829460144, "learning_rate": 0.000612, "loss": 6.8337, "step": 204 }, { "epoch": 0.00205, "grad_norm": 0.3630695044994354, "learning_rate": 0.000615, "loss": 6.82, "step": 205 }, { "epoch": 0.00206, "grad_norm": 0.3439967930316925, "learning_rate": 0.000618, "loss": 6.8125, "step": 206 }, { "epoch": 0.00207, "grad_norm": 0.3219742476940155, "learning_rate": 0.000621, "loss": 6.8026, "step": 207 }, { "epoch": 0.00208, "grad_norm": 0.42236459255218506, "learning_rate": 0.000624, "loss": 6.8006, "step": 208 }, { "epoch": 0.00209, "grad_norm": 0.44003602862358093, "learning_rate": 0.000627, "loss": 6.7802, "step": 209 }, { "epoch": 0.0021, "grad_norm": 0.6629877686500549, "learning_rate": 0.00063, "loss": 6.7737, "step": 210 }, { "epoch": 0.00211, "grad_norm": 0.9569016695022583, "learning_rate": 0.000633, "loss": 6.7754, "step": 211 }, { "epoch": 0.00212, "grad_norm": 1.1293443441390991, "learning_rate": 0.000636, "loss": 6.7746, "step": 212 }, { "epoch": 0.00213, "grad_norm": 0.43011096119880676, "learning_rate": 0.000639, "loss": 6.7385, "step": 213 }, { "epoch": 0.00214, "grad_norm": 0.6478229761123657, "learning_rate": 0.000642, "loss": 6.7415, "step": 214 }, { "epoch": 0.00215, "grad_norm": 0.6323032975196838, "learning_rate": 0.000645, "loss": 6.7384, "step": 215 }, { "epoch": 0.00216, "grad_norm": 0.441693514585495, "learning_rate": 0.000648, "loss": 6.7285, "step": 216 }, { "epoch": 0.00217, "grad_norm": 0.5594473481178284, "learning_rate": 0.000651, "loss": 6.7033, "step": 217 }, { "epoch": 0.00218, "grad_norm": 0.5135915279388428, "learning_rate": 0.000654, "loss": 6.7073, "step": 218 }, { "epoch": 0.00219, "grad_norm": 0.4307027757167816, "learning_rate": 0.000657, "loss": 6.6782, "step": 219 }, { "epoch": 0.0022, "grad_norm": 0.45137229561805725, "learning_rate": 0.00066, "loss": 6.6953, "step": 220 }, { "epoch": 0.00221, "grad_norm": 0.4729914963245392, "learning_rate": 0.0006630000000000001, "loss": 6.6737, "step": 221 }, { "epoch": 0.00222, "grad_norm": 0.47246506810188293, "learning_rate": 0.000666, "loss": 6.6615, "step": 222 }, { "epoch": 0.00223, "grad_norm": 0.36316192150115967, "learning_rate": 0.000669, "loss": 6.6543, "step": 223 }, { "epoch": 0.00224, "grad_norm": 0.4332623779773712, "learning_rate": 0.0006720000000000001, "loss": 6.6525, "step": 224 }, { "epoch": 0.00225, "grad_norm": 0.41814228892326355, "learning_rate": 0.000675, "loss": 6.639, "step": 225 }, { "epoch": 0.00226, "grad_norm": 0.42956992983818054, "learning_rate": 0.000678, "loss": 6.634, "step": 226 }, { "epoch": 0.00227, "grad_norm": 0.40267884731292725, "learning_rate": 0.0006810000000000001, "loss": 6.6072, "step": 227 }, { "epoch": 0.00228, "grad_norm": 0.4361991882324219, "learning_rate": 0.000684, "loss": 6.6099, "step": 228 }, { "epoch": 0.00229, "grad_norm": 0.47655290365219116, "learning_rate": 0.000687, "loss": 6.6086, "step": 229 }, { "epoch": 0.0023, "grad_norm": 0.5011177659034729, "learning_rate": 0.0006900000000000001, "loss": 6.5846, "step": 230 }, { "epoch": 0.00231, "grad_norm": 0.5389447212219238, "learning_rate": 0.000693, "loss": 6.5719, "step": 231 }, { "epoch": 0.00232, "grad_norm": 0.5394959449768066, "learning_rate": 0.000696, "loss": 6.5809, "step": 232 }, { "epoch": 0.00233, "grad_norm": 0.49784839153289795, "learning_rate": 0.0006990000000000001, "loss": 6.5675, "step": 233 }, { "epoch": 0.00234, "grad_norm": 0.42049404978752136, "learning_rate": 0.000702, "loss": 6.5581, "step": 234 }, { "epoch": 0.00235, "grad_norm": 0.5810425281524658, "learning_rate": 0.000705, "loss": 6.5463, "step": 235 }, { "epoch": 0.00236, "grad_norm": 0.6429721117019653, "learning_rate": 0.000708, "loss": 6.5306, "step": 236 }, { "epoch": 0.00237, "grad_norm": 0.6626091003417969, "learning_rate": 0.0007109999999999999, "loss": 6.5402, "step": 237 }, { "epoch": 0.00238, "grad_norm": 0.5873957872390747, "learning_rate": 0.000714, "loss": 6.5299, "step": 238 }, { "epoch": 0.00239, "grad_norm": 0.4890768527984619, "learning_rate": 0.000717, "loss": 6.5085, "step": 239 }, { "epoch": 0.0024, "grad_norm": 0.6101468801498413, "learning_rate": 0.0007199999999999999, "loss": 6.5266, "step": 240 }, { "epoch": 0.00241, "grad_norm": 0.5837545394897461, "learning_rate": 0.000723, "loss": 6.5125, "step": 241 }, { "epoch": 0.00242, "grad_norm": 0.38101619482040405, "learning_rate": 0.000726, "loss": 6.481, "step": 242 }, { "epoch": 0.00243, "grad_norm": 0.5516716837882996, "learning_rate": 0.000729, "loss": 6.4712, "step": 243 }, { "epoch": 0.00244, "grad_norm": 0.6402163505554199, "learning_rate": 0.000732, "loss": 6.4787, "step": 244 }, { "epoch": 0.00245, "grad_norm": 0.661472737789154, "learning_rate": 0.000735, "loss": 6.4666, "step": 245 }, { "epoch": 0.00246, "grad_norm": 0.8242950439453125, "learning_rate": 0.000738, "loss": 6.457, "step": 246 }, { "epoch": 0.00247, "grad_norm": 0.8979260921478271, "learning_rate": 0.000741, "loss": 6.4532, "step": 247 }, { "epoch": 0.00248, "grad_norm": 0.7822521924972534, "learning_rate": 0.000744, "loss": 6.4595, "step": 248 }, { "epoch": 0.00249, "grad_norm": 0.5830389261245728, "learning_rate": 0.000747, "loss": 6.4336, "step": 249 }, { "epoch": 0.0025, "grad_norm": 0.5053834319114685, "learning_rate": 0.00075, "loss": 6.436, "step": 250 }, { "epoch": 0.00251, "grad_norm": 0.6169653534889221, "learning_rate": 0.000753, "loss": 6.4018, "step": 251 }, { "epoch": 0.00252, "grad_norm": 0.693821370601654, "learning_rate": 0.000756, "loss": 6.411, "step": 252 }, { "epoch": 0.00253, "grad_norm": 0.7529676556587219, "learning_rate": 0.000759, "loss": 6.4053, "step": 253 }, { "epoch": 0.00254, "grad_norm": 0.7882714867591858, "learning_rate": 0.000762, "loss": 6.3993, "step": 254 }, { "epoch": 0.00255, "grad_norm": 0.8540387153625488, "learning_rate": 0.0007650000000000001, "loss": 6.399, "step": 255 }, { "epoch": 0.00256, "grad_norm": 0.8460838198661804, "learning_rate": 0.000768, "loss": 6.3805, "step": 256 }, { "epoch": 0.00257, "grad_norm": 0.7798411250114441, "learning_rate": 0.000771, "loss": 6.383, "step": 257 }, { "epoch": 0.00258, "grad_norm": 0.7431406378746033, "learning_rate": 0.0007740000000000001, "loss": 6.362, "step": 258 }, { "epoch": 0.00259, "grad_norm": 0.9195736646652222, "learning_rate": 0.000777, "loss": 6.3634, "step": 259 }, { "epoch": 0.0026, "grad_norm": 0.9989089965820312, "learning_rate": 0.0007800000000000001, "loss": 6.3605, "step": 260 }, { "epoch": 0.00261, "grad_norm": 0.8553241491317749, "learning_rate": 0.0007830000000000001, "loss": 6.3627, "step": 261 }, { "epoch": 0.00262, "grad_norm": 0.8126974701881409, "learning_rate": 0.000786, "loss": 6.3448, "step": 262 }, { "epoch": 0.00263, "grad_norm": 0.5453911423683167, "learning_rate": 0.0007890000000000001, "loss": 6.3416, "step": 263 }, { "epoch": 0.00264, "grad_norm": 0.5339396595954895, "learning_rate": 0.0007920000000000001, "loss": 6.3252, "step": 264 }, { "epoch": 0.00265, "grad_norm": 0.561769425868988, "learning_rate": 0.000795, "loss": 6.3181, "step": 265 }, { "epoch": 0.00266, "grad_norm": 0.47888532280921936, "learning_rate": 0.0007980000000000001, "loss": 6.3105, "step": 266 }, { "epoch": 0.00267, "grad_norm": 0.4935484528541565, "learning_rate": 0.0008010000000000001, "loss": 6.3049, "step": 267 }, { "epoch": 0.00268, "grad_norm": 0.4065157175064087, "learning_rate": 0.000804, "loss": 6.2969, "step": 268 }, { "epoch": 0.00269, "grad_norm": 0.5361817479133606, "learning_rate": 0.0008070000000000001, "loss": 6.2818, "step": 269 }, { "epoch": 0.0027, "grad_norm": 0.6360214352607727, "learning_rate": 0.0008100000000000001, "loss": 6.2858, "step": 270 }, { "epoch": 0.00271, "grad_norm": 0.6580653190612793, "learning_rate": 0.000813, "loss": 6.2898, "step": 271 }, { "epoch": 0.00272, "grad_norm": 0.719866931438446, "learning_rate": 0.0008160000000000001, "loss": 6.2764, "step": 272 }, { "epoch": 0.00273, "grad_norm": 0.726635754108429, "learning_rate": 0.0008190000000000001, "loss": 6.2654, "step": 273 }, { "epoch": 0.00274, "grad_norm": 0.5728192925453186, "learning_rate": 0.000822, "loss": 6.2437, "step": 274 }, { "epoch": 0.00275, "grad_norm": 0.469969242811203, "learning_rate": 0.0008250000000000001, "loss": 6.2475, "step": 275 }, { "epoch": 0.00276, "grad_norm": 0.5783148407936096, "learning_rate": 0.0008280000000000001, "loss": 6.2405, "step": 276 }, { "epoch": 0.00277, "grad_norm": 0.5995691418647766, "learning_rate": 0.0008310000000000001, "loss": 6.2356, "step": 277 }, { "epoch": 0.00278, "grad_norm": 0.5173709988594055, "learning_rate": 0.0008340000000000001, "loss": 6.2181, "step": 278 }, { "epoch": 0.00279, "grad_norm": 0.47013920545578003, "learning_rate": 0.0008370000000000001, "loss": 6.2175, "step": 279 }, { "epoch": 0.0028, "grad_norm": 0.38072702288627625, "learning_rate": 0.0008400000000000001, "loss": 6.1988, "step": 280 }, { "epoch": 0.00281, "grad_norm": 0.44907790422439575, "learning_rate": 0.0008430000000000001, "loss": 6.1893, "step": 281 }, { "epoch": 0.00282, "grad_norm": 0.40965142846107483, "learning_rate": 0.000846, "loss": 6.1869, "step": 282 }, { "epoch": 0.00283, "grad_norm": 0.48822489380836487, "learning_rate": 0.0008489999999999999, "loss": 6.1783, "step": 283 }, { "epoch": 0.00284, "grad_norm": 0.726660966873169, "learning_rate": 0.0008519999999999999, "loss": 6.1771, "step": 284 }, { "epoch": 0.00285, "grad_norm": 1.0991517305374146, "learning_rate": 0.000855, "loss": 6.1879, "step": 285 }, { "epoch": 0.00286, "grad_norm": 0.9898068904876709, "learning_rate": 0.0008579999999999999, "loss": 6.1694, "step": 286 }, { "epoch": 0.00287, "grad_norm": 0.9177366495132446, "learning_rate": 0.000861, "loss": 6.1687, "step": 287 }, { "epoch": 0.00288, "grad_norm": 1.3172835111618042, "learning_rate": 0.000864, "loss": 6.175, "step": 288 }, { "epoch": 0.00289, "grad_norm": 1.0531185865402222, "learning_rate": 0.0008669999999999999, "loss": 6.1733, "step": 289 }, { "epoch": 0.0029, "grad_norm": 0.9814063310623169, "learning_rate": 0.00087, "loss": 6.1665, "step": 290 }, { "epoch": 0.00291, "grad_norm": 0.7696391344070435, "learning_rate": 0.000873, "loss": 6.1455, "step": 291 }, { "epoch": 0.00292, "grad_norm": 0.8531065583229065, "learning_rate": 0.0008759999999999999, "loss": 6.15, "step": 292 }, { "epoch": 0.00293, "grad_norm": 0.7760049700737, "learning_rate": 0.000879, "loss": 6.1273, "step": 293 }, { "epoch": 0.00294, "grad_norm": 0.7517282366752625, "learning_rate": 0.000882, "loss": 6.1432, "step": 294 }, { "epoch": 0.00295, "grad_norm": 0.5758442282676697, "learning_rate": 0.0008849999999999999, "loss": 6.106, "step": 295 }, { "epoch": 0.00296, "grad_norm": 0.5470280647277832, "learning_rate": 0.000888, "loss": 6.1152, "step": 296 }, { "epoch": 0.00297, "grad_norm": 0.46315857768058777, "learning_rate": 0.000891, "loss": 6.0848, "step": 297 }, { "epoch": 0.00298, "grad_norm": 0.52577805519104, "learning_rate": 0.0008939999999999999, "loss": 6.0996, "step": 298 }, { "epoch": 0.00299, "grad_norm": 0.5289214253425598, "learning_rate": 0.000897, "loss": 6.0753, "step": 299 }, { "epoch": 0.003, "grad_norm": 0.39721718430519104, "learning_rate": 0.0009, "loss": 6.0729, "step": 300 }, { "epoch": 0.00301, "grad_norm": 0.42188870906829834, "learning_rate": 0.0009029999999999999, "loss": 6.0649, "step": 301 }, { "epoch": 0.00302, "grad_norm": 0.49844104051589966, "learning_rate": 0.000906, "loss": 6.0504, "step": 302 }, { "epoch": 0.00303, "grad_norm": 0.5113502144813538, "learning_rate": 0.000909, "loss": 6.0629, "step": 303 }, { "epoch": 0.00304, "grad_norm": 0.6390882730484009, "learning_rate": 0.000912, "loss": 6.0593, "step": 304 }, { "epoch": 0.00305, "grad_norm": 0.8851528763771057, "learning_rate": 0.000915, "loss": 6.0668, "step": 305 }, { "epoch": 0.00306, "grad_norm": 0.9017530083656311, "learning_rate": 0.000918, "loss": 6.0552, "step": 306 }, { "epoch": 0.00307, "grad_norm": 0.563444197177887, "learning_rate": 0.000921, "loss": 6.0386, "step": 307 }, { "epoch": 0.00308, "grad_norm": 0.6002116203308105, "learning_rate": 0.000924, "loss": 6.0248, "step": 308 }, { "epoch": 0.00309, "grad_norm": 0.7035393118858337, "learning_rate": 0.000927, "loss": 6.0412, "step": 309 }, { "epoch": 0.0031, "grad_norm": 1.01050865650177, "learning_rate": 0.00093, "loss": 6.0314, "step": 310 }, { "epoch": 0.00311, "grad_norm": 1.16908860206604, "learning_rate": 0.000933, "loss": 6.0397, "step": 311 }, { "epoch": 0.00312, "grad_norm": 0.6785458922386169, "learning_rate": 0.000936, "loss": 6.0006, "step": 312 }, { "epoch": 0.00313, "grad_norm": 0.7975029349327087, "learning_rate": 0.0009390000000000001, "loss": 6.0267, "step": 313 }, { "epoch": 0.00314, "grad_norm": 0.6784241795539856, "learning_rate": 0.000942, "loss": 6.0253, "step": 314 }, { "epoch": 0.00315, "grad_norm": 0.5287242531776428, "learning_rate": 0.000945, "loss": 5.9989, "step": 315 }, { "epoch": 0.00316, "grad_norm": 0.5889810919761658, "learning_rate": 0.0009480000000000001, "loss": 5.9738, "step": 316 }, { "epoch": 0.00317, "grad_norm": 0.8596201539039612, "learning_rate": 0.000951, "loss": 5.9963, "step": 317 }, { "epoch": 0.00318, "grad_norm": 1.220719575881958, "learning_rate": 0.000954, "loss": 6.0052, "step": 318 }, { "epoch": 0.00319, "grad_norm": 0.7490801215171814, "learning_rate": 0.0009570000000000001, "loss": 5.9868, "step": 319 }, { "epoch": 0.0032, "grad_norm": 0.6210083365440369, "learning_rate": 0.00096, "loss": 5.9801, "step": 320 }, { "epoch": 0.00321, "grad_norm": 0.5945920944213867, "learning_rate": 0.000963, "loss": 5.97, "step": 321 }, { "epoch": 0.00322, "grad_norm": 0.6791667342185974, "learning_rate": 0.0009660000000000001, "loss": 5.9544, "step": 322 }, { "epoch": 0.00323, "grad_norm": 0.9637515544891357, "learning_rate": 0.000969, "loss": 5.9627, "step": 323 }, { "epoch": 0.00324, "grad_norm": 1.1402119398117065, "learning_rate": 0.0009720000000000001, "loss": 5.972, "step": 324 }, { "epoch": 0.00325, "grad_norm": 1.0057023763656616, "learning_rate": 0.0009750000000000001, "loss": 5.9423, "step": 325 }, { "epoch": 0.00326, "grad_norm": 0.5953328609466553, "learning_rate": 0.0009780000000000001, "loss": 5.942, "step": 326 }, { "epoch": 0.00327, "grad_norm": 0.7124008536338806, "learning_rate": 0.000981, "loss": 5.9465, "step": 327 }, { "epoch": 0.00328, "grad_norm": 0.7318410277366638, "learning_rate": 0.000984, "loss": 5.9241, "step": 328 }, { "epoch": 0.00329, "grad_norm": 0.6503687500953674, "learning_rate": 0.000987, "loss": 5.922, "step": 329 }, { "epoch": 0.0033, "grad_norm": 0.6151977181434631, "learning_rate": 0.00099, "loss": 5.9159, "step": 330 }, { "epoch": 0.00331, "grad_norm": 0.49422070384025574, "learning_rate": 0.0009930000000000002, "loss": 5.9144, "step": 331 }, { "epoch": 0.00332, "grad_norm": 0.563934326171875, "learning_rate": 0.0009960000000000001, "loss": 5.9008, "step": 332 }, { "epoch": 0.00333, "grad_norm": 0.5146680474281311, "learning_rate": 0.000999, "loss": 5.8812, "step": 333 }, { "epoch": 0.00334, "grad_norm": 0.5699781775474548, "learning_rate": 0.001002, "loss": 5.8922, "step": 334 }, { "epoch": 0.00335, "grad_norm": 0.628279983997345, "learning_rate": 0.001005, "loss": 5.8934, "step": 335 }, { "epoch": 0.00336, "grad_norm": 0.638155996799469, "learning_rate": 0.001008, "loss": 5.8854, "step": 336 }, { "epoch": 0.00337, "grad_norm": 0.5850276947021484, "learning_rate": 0.0010110000000000002, "loss": 5.8631, "step": 337 }, { "epoch": 0.00338, "grad_norm": 0.5985286831855774, "learning_rate": 0.0010140000000000001, "loss": 5.879, "step": 338 }, { "epoch": 0.00339, "grad_norm": 0.9502546787261963, "learning_rate": 0.0010170000000000001, "loss": 5.8893, "step": 339 }, { "epoch": 0.0034, "grad_norm": 1.3471951484680176, "learning_rate": 0.00102, "loss": 5.8789, "step": 340 }, { "epoch": 0.00341, "grad_norm": 0.6621291041374207, "learning_rate": 0.001023, "loss": 5.8449, "step": 341 }, { "epoch": 0.00342, "grad_norm": 0.8053567409515381, "learning_rate": 0.001026, "loss": 5.8551, "step": 342 }, { "epoch": 0.00343, "grad_norm": 1.0873545408248901, "learning_rate": 0.0010290000000000002, "loss": 5.8656, "step": 343 }, { "epoch": 0.00344, "grad_norm": 1.1315717697143555, "learning_rate": 0.001032, "loss": 5.8613, "step": 344 }, { "epoch": 0.00345, "grad_norm": 0.7909812927246094, "learning_rate": 0.001035, "loss": 5.8449, "step": 345 }, { "epoch": 0.00346, "grad_norm": 0.8379076719284058, "learning_rate": 0.0010379999999999999, "loss": 5.8518, "step": 346 }, { "epoch": 0.00347, "grad_norm": 0.5857303738594055, "learning_rate": 0.001041, "loss": 5.8133, "step": 347 }, { "epoch": 0.00348, "grad_norm": 0.6455392241477966, "learning_rate": 0.001044, "loss": 5.8262, "step": 348 }, { "epoch": 0.00349, "grad_norm": 0.6315843462944031, "learning_rate": 0.001047, "loss": 5.8046, "step": 349 }, { "epoch": 0.0035, "grad_norm": 0.6185011863708496, "learning_rate": 0.00105, "loss": 5.8282, "step": 350 }, { "epoch": 0.00351, "grad_norm": 0.49840763211250305, "learning_rate": 0.001053, "loss": 5.8234, "step": 351 }, { "epoch": 0.00352, "grad_norm": 0.43951740860939026, "learning_rate": 0.0010559999999999999, "loss": 5.7984, "step": 352 }, { "epoch": 0.00353, "grad_norm": 0.4452185034751892, "learning_rate": 0.001059, "loss": 5.7993, "step": 353 }, { "epoch": 0.00354, "grad_norm": 0.43185603618621826, "learning_rate": 0.001062, "loss": 5.7703, "step": 354 }, { "epoch": 0.00355, "grad_norm": 0.4053448736667633, "learning_rate": 0.001065, "loss": 5.7848, "step": 355 }, { "epoch": 0.00356, "grad_norm": 0.48363247513771057, "learning_rate": 0.001068, "loss": 5.777, "step": 356 }, { "epoch": 0.00357, "grad_norm": 0.5601730346679688, "learning_rate": 0.001071, "loss": 5.7693, "step": 357 }, { "epoch": 0.00358, "grad_norm": 0.7239671349525452, "learning_rate": 0.001074, "loss": 5.7638, "step": 358 }, { "epoch": 0.00359, "grad_norm": 0.8531132936477661, "learning_rate": 0.001077, "loss": 5.76, "step": 359 }, { "epoch": 0.0036, "grad_norm": 1.0332695245742798, "learning_rate": 0.00108, "loss": 5.7701, "step": 360 }, { "epoch": 0.00361, "grad_norm": 1.3098387718200684, "learning_rate": 0.001083, "loss": 5.7755, "step": 361 }, { "epoch": 0.00362, "grad_norm": 0.7546947598457336, "learning_rate": 0.001086, "loss": 5.7474, "step": 362 }, { "epoch": 0.00363, "grad_norm": 0.6924042701721191, "learning_rate": 0.001089, "loss": 5.7394, "step": 363 }, { "epoch": 0.00364, "grad_norm": 0.87959223985672, "learning_rate": 0.001092, "loss": 5.7603, "step": 364 }, { "epoch": 0.00365, "grad_norm": 1.037275791168213, "learning_rate": 0.001095, "loss": 5.7581, "step": 365 }, { "epoch": 0.00366, "grad_norm": 1.056171178817749, "learning_rate": 0.001098, "loss": 5.7464, "step": 366 }, { "epoch": 0.00367, "grad_norm": 1.0635496377944946, "learning_rate": 0.001101, "loss": 5.7478, "step": 367 }, { "epoch": 0.00368, "grad_norm": 0.9701796174049377, "learning_rate": 0.001104, "loss": 5.7478, "step": 368 }, { "epoch": 0.00369, "grad_norm": 0.7430213689804077, "learning_rate": 0.001107, "loss": 5.745, "step": 369 }, { "epoch": 0.0037, "grad_norm": 0.7348084449768066, "learning_rate": 0.00111, "loss": 5.7264, "step": 370 }, { "epoch": 0.00371, "grad_norm": 0.8778790831565857, "learning_rate": 0.001113, "loss": 5.6986, "step": 371 }, { "epoch": 0.00372, "grad_norm": 1.160132884979248, "learning_rate": 0.001116, "loss": 5.7343, "step": 372 }, { "epoch": 0.00373, "grad_norm": 0.8288450241088867, "learning_rate": 0.001119, "loss": 5.7145, "step": 373 }, { "epoch": 0.00374, "grad_norm": 0.7645081281661987, "learning_rate": 0.001122, "loss": 5.7142, "step": 374 }, { "epoch": 0.00375, "grad_norm": 0.8342962265014648, "learning_rate": 0.0011250000000000001, "loss": 5.7215, "step": 375 }, { "epoch": 0.00376, "grad_norm": 0.8966416716575623, "learning_rate": 0.001128, "loss": 5.7142, "step": 376 }, { "epoch": 0.00377, "grad_norm": 1.1411352157592773, "learning_rate": 0.001131, "loss": 5.7202, "step": 377 }, { "epoch": 0.00378, "grad_norm": 1.0639731884002686, "learning_rate": 0.001134, "loss": 5.7166, "step": 378 }, { "epoch": 0.00379, "grad_norm": 1.0386251211166382, "learning_rate": 0.001137, "loss": 5.701, "step": 379 }, { "epoch": 0.0038, "grad_norm": 0.8551567196846008, "learning_rate": 0.00114, "loss": 5.7166, "step": 380 }, { "epoch": 0.00381, "grad_norm": 1.171457290649414, "learning_rate": 0.0011430000000000001, "loss": 5.693, "step": 381 }, { "epoch": 0.00382, "grad_norm": 0.9382472634315491, "learning_rate": 0.001146, "loss": 5.6768, "step": 382 }, { "epoch": 0.00383, "grad_norm": 1.011130452156067, "learning_rate": 0.001149, "loss": 5.6945, "step": 383 }, { "epoch": 0.00384, "grad_norm": 0.7897657155990601, "learning_rate": 0.001152, "loss": 5.6883, "step": 384 }, { "epoch": 0.00385, "grad_norm": 0.6210044622421265, "learning_rate": 0.001155, "loss": 5.673, "step": 385 }, { "epoch": 0.00386, "grad_norm": 0.5515205264091492, "learning_rate": 0.001158, "loss": 5.6723, "step": 386 }, { "epoch": 0.00387, "grad_norm": 0.5881383419036865, "learning_rate": 0.0011610000000000001, "loss": 5.6493, "step": 387 }, { "epoch": 0.00388, "grad_norm": 0.4862520396709442, "learning_rate": 0.001164, "loss": 5.6435, "step": 388 }, { "epoch": 0.00389, "grad_norm": 0.4490566551685333, "learning_rate": 0.001167, "loss": 5.649, "step": 389 }, { "epoch": 0.0039, "grad_norm": 0.4499252140522003, "learning_rate": 0.00117, "loss": 5.6299, "step": 390 }, { "epoch": 0.00391, "grad_norm": 0.3959794044494629, "learning_rate": 0.001173, "loss": 5.6342, "step": 391 }, { "epoch": 0.00392, "grad_norm": 0.41570809483528137, "learning_rate": 0.001176, "loss": 5.6441, "step": 392 }, { "epoch": 0.00393, "grad_norm": 0.4579019248485565, "learning_rate": 0.0011790000000000001, "loss": 5.615, "step": 393 }, { "epoch": 0.00394, "grad_norm": 0.5718971490859985, "learning_rate": 0.001182, "loss": 5.6031, "step": 394 }, { "epoch": 0.00395, "grad_norm": 0.8492469191551208, "learning_rate": 0.001185, "loss": 5.5961, "step": 395 }, { "epoch": 0.00396, "grad_norm": 1.173663854598999, "learning_rate": 0.001188, "loss": 5.6242, "step": 396 }, { "epoch": 0.00397, "grad_norm": 0.828730046749115, "learning_rate": 0.001191, "loss": 5.6071, "step": 397 }, { "epoch": 0.00398, "grad_norm": 1.2022807598114014, "learning_rate": 0.0011940000000000002, "loss": 5.6278, "step": 398 }, { "epoch": 0.00399, "grad_norm": 0.9577529430389404, "learning_rate": 0.0011970000000000001, "loss": 5.6234, "step": 399 }, { "epoch": 0.004, "grad_norm": 1.0115303993225098, "learning_rate": 0.0012000000000000001, "loss": 5.6174, "step": 400 }, { "epoch": 0.00401, "grad_norm": 0.9447324872016907, "learning_rate": 0.001203, "loss": 5.594, "step": 401 }, { "epoch": 0.00402, "grad_norm": 0.8248692154884338, "learning_rate": 0.001206, "loss": 5.5987, "step": 402 }, { "epoch": 0.00403, "grad_norm": 1.0206302404403687, "learning_rate": 0.001209, "loss": 5.6026, "step": 403 }, { "epoch": 0.00404, "grad_norm": 1.090654730796814, "learning_rate": 0.0012120000000000002, "loss": 5.6036, "step": 404 }, { "epoch": 0.00405, "grad_norm": 0.84739750623703, "learning_rate": 0.0012150000000000002, "loss": 5.5901, "step": 405 }, { "epoch": 0.00406, "grad_norm": 0.9469770193099976, "learning_rate": 0.0012180000000000001, "loss": 5.5817, "step": 406 }, { "epoch": 0.00407, "grad_norm": 0.9283969402313232, "learning_rate": 0.0012209999999999999, "loss": 5.5746, "step": 407 }, { "epoch": 0.00408, "grad_norm": 0.8949937224388123, "learning_rate": 0.001224, "loss": 5.5962, "step": 408 }, { "epoch": 0.00409, "grad_norm": 0.8844306468963623, "learning_rate": 0.001227, "loss": 5.573, "step": 409 }, { "epoch": 0.0041, "grad_norm": 0.8062122464179993, "learning_rate": 0.00123, "loss": 5.5618, "step": 410 }, { "epoch": 0.00411, "grad_norm": 0.8730618357658386, "learning_rate": 0.001233, "loss": 5.5872, "step": 411 }, { "epoch": 0.00412, "grad_norm": 0.665547788143158, "learning_rate": 0.001236, "loss": 5.5644, "step": 412 }, { "epoch": 0.00413, "grad_norm": 0.8426138758659363, "learning_rate": 0.0012389999999999999, "loss": 5.5674, "step": 413 }, { "epoch": 0.00414, "grad_norm": 1.051952838897705, "learning_rate": 0.001242, "loss": 5.5591, "step": 414 }, { "epoch": 0.00415, "grad_norm": 0.7785534858703613, "learning_rate": 0.001245, "loss": 5.5542, "step": 415 }, { "epoch": 0.00416, "grad_norm": 0.5227160453796387, "learning_rate": 0.001248, "loss": 5.5279, "step": 416 }, { "epoch": 0.00417, "grad_norm": 0.5770328044891357, "learning_rate": 0.001251, "loss": 5.5611, "step": 417 }, { "epoch": 0.00418, "grad_norm": 0.4929839074611664, "learning_rate": 0.001254, "loss": 5.5305, "step": 418 }, { "epoch": 0.00419, "grad_norm": 0.4660792648792267, "learning_rate": 0.0012569999999999999, "loss": 5.5168, "step": 419 }, { "epoch": 0.0042, "grad_norm": 0.5160586833953857, "learning_rate": 0.00126, "loss": 5.5326, "step": 420 }, { "epoch": 0.00421, "grad_norm": 0.5846797823905945, "learning_rate": 0.001263, "loss": 5.5249, "step": 421 }, { "epoch": 0.00422, "grad_norm": 0.6270997524261475, "learning_rate": 0.001266, "loss": 5.5159, "step": 422 }, { "epoch": 0.00423, "grad_norm": 0.6081735491752625, "learning_rate": 0.001269, "loss": 5.5118, "step": 423 }, { "epoch": 0.00424, "grad_norm": 0.557420551776886, "learning_rate": 0.001272, "loss": 5.5105, "step": 424 }, { "epoch": 0.00425, "grad_norm": 0.821638286113739, "learning_rate": 0.001275, "loss": 5.5176, "step": 425 }, { "epoch": 0.00426, "grad_norm": 1.0497279167175293, "learning_rate": 0.001278, "loss": 5.5294, "step": 426 }, { "epoch": 0.00427, "grad_norm": 0.8568355441093445, "learning_rate": 0.001281, "loss": 5.5072, "step": 427 }, { "epoch": 0.00428, "grad_norm": 0.9392327070236206, "learning_rate": 0.001284, "loss": 5.488, "step": 428 }, { "epoch": 0.00429, "grad_norm": 0.8972091674804688, "learning_rate": 0.001287, "loss": 5.5041, "step": 429 }, { "epoch": 0.0043, "grad_norm": 0.8478754162788391, "learning_rate": 0.00129, "loss": 5.4993, "step": 430 }, { "epoch": 0.00431, "grad_norm": 1.0406945943832397, "learning_rate": 0.001293, "loss": 5.4959, "step": 431 }, { "epoch": 0.00432, "grad_norm": 0.8937470316886902, "learning_rate": 0.001296, "loss": 5.5023, "step": 432 }, { "epoch": 0.00433, "grad_norm": 0.7159745693206787, "learning_rate": 0.001299, "loss": 5.4892, "step": 433 }, { "epoch": 0.00434, "grad_norm": 0.6872638463973999, "learning_rate": 0.001302, "loss": 5.4897, "step": 434 }, { "epoch": 0.00435, "grad_norm": 0.7336323857307434, "learning_rate": 0.001305, "loss": 5.4832, "step": 435 }, { "epoch": 0.00436, "grad_norm": 0.727497935295105, "learning_rate": 0.001308, "loss": 5.4615, "step": 436 }, { "epoch": 0.00437, "grad_norm": 0.8729015588760376, "learning_rate": 0.001311, "loss": 5.4821, "step": 437 }, { "epoch": 0.00438, "grad_norm": 1.0639538764953613, "learning_rate": 0.001314, "loss": 5.4733, "step": 438 }, { "epoch": 0.00439, "grad_norm": 0.9613514542579651, "learning_rate": 0.001317, "loss": 5.5005, "step": 439 }, { "epoch": 0.0044, "grad_norm": 0.7993902564048767, "learning_rate": 0.00132, "loss": 5.4683, "step": 440 }, { "epoch": 0.00441, "grad_norm": 0.6981948018074036, "learning_rate": 0.001323, "loss": 5.4617, "step": 441 }, { "epoch": 0.00442, "grad_norm": 0.5610657334327698, "learning_rate": 0.0013260000000000001, "loss": 5.4493, "step": 442 }, { "epoch": 0.00443, "grad_norm": 0.6162020564079285, "learning_rate": 0.001329, "loss": 5.4545, "step": 443 }, { "epoch": 0.00444, "grad_norm": 0.5797529816627502, "learning_rate": 0.001332, "loss": 5.4538, "step": 444 }, { "epoch": 0.00445, "grad_norm": 0.551798939704895, "learning_rate": 0.001335, "loss": 5.4358, "step": 445 }, { "epoch": 0.00446, "grad_norm": 0.48300743103027344, "learning_rate": 0.001338, "loss": 5.4369, "step": 446 }, { "epoch": 0.00447, "grad_norm": 0.5713039040565491, "learning_rate": 0.001341, "loss": 5.4366, "step": 447 }, { "epoch": 0.00448, "grad_norm": 0.7566826939582825, "learning_rate": 0.0013440000000000001, "loss": 5.4234, "step": 448 }, { "epoch": 0.00449, "grad_norm": 1.1563501358032227, "learning_rate": 0.001347, "loss": 5.4418, "step": 449 }, { "epoch": 0.0045, "grad_norm": 1.132352590560913, "learning_rate": 0.00135, "loss": 5.4433, "step": 450 }, { "epoch": 0.00451, "grad_norm": 1.0986182689666748, "learning_rate": 0.001353, "loss": 5.4341, "step": 451 }, { "epoch": 0.00452, "grad_norm": 1.066072702407837, "learning_rate": 0.001356, "loss": 5.4212, "step": 452 }, { "epoch": 0.00453, "grad_norm": 0.9297358989715576, "learning_rate": 0.001359, "loss": 5.4103, "step": 453 }, { "epoch": 0.00454, "grad_norm": 0.9204379320144653, "learning_rate": 0.0013620000000000001, "loss": 5.4294, "step": 454 }, { "epoch": 0.00455, "grad_norm": 1.0156644582748413, "learning_rate": 0.0013650000000000001, "loss": 5.4283, "step": 455 }, { "epoch": 0.00456, "grad_norm": 1.070080041885376, "learning_rate": 0.001368, "loss": 5.4307, "step": 456 }, { "epoch": 0.00457, "grad_norm": 0.7985509634017944, "learning_rate": 0.001371, "loss": 5.4131, "step": 457 }, { "epoch": 0.00458, "grad_norm": 0.8899184465408325, "learning_rate": 0.001374, "loss": 5.4253, "step": 458 }, { "epoch": 0.00459, "grad_norm": 1.0247424840927124, "learning_rate": 0.0013770000000000002, "loss": 5.4196, "step": 459 }, { "epoch": 0.0046, "grad_norm": 0.8818691968917847, "learning_rate": 0.0013800000000000002, "loss": 5.4023, "step": 460 }, { "epoch": 0.00461, "grad_norm": 0.9398977160453796, "learning_rate": 0.0013830000000000001, "loss": 5.4028, "step": 461 }, { "epoch": 0.00462, "grad_norm": 0.9095609784126282, "learning_rate": 0.001386, "loss": 5.4141, "step": 462 }, { "epoch": 0.00463, "grad_norm": 0.7167875170707703, "learning_rate": 0.001389, "loss": 5.3816, "step": 463 }, { "epoch": 0.00464, "grad_norm": 0.8864797353744507, "learning_rate": 0.001392, "loss": 5.4115, "step": 464 }, { "epoch": 0.00465, "grad_norm": 0.8739372491836548, "learning_rate": 0.0013950000000000002, "loss": 5.3819, "step": 465 }, { "epoch": 0.00466, "grad_norm": 0.808113157749176, "learning_rate": 0.0013980000000000002, "loss": 5.3947, "step": 466 }, { "epoch": 0.00467, "grad_norm": 0.878140389919281, "learning_rate": 0.0014010000000000001, "loss": 5.3861, "step": 467 }, { "epoch": 0.00468, "grad_norm": 1.0618577003479004, "learning_rate": 0.001404, "loss": 5.3874, "step": 468 }, { "epoch": 0.00469, "grad_norm": 0.8678603172302246, "learning_rate": 0.001407, "loss": 5.3793, "step": 469 }, { "epoch": 0.0047, "grad_norm": 1.0598393678665161, "learning_rate": 0.00141, "loss": 5.397, "step": 470 }, { "epoch": 0.00471, "grad_norm": 0.9507164359092712, "learning_rate": 0.001413, "loss": 5.3744, "step": 471 }, { "epoch": 0.00472, "grad_norm": 0.891309916973114, "learning_rate": 0.001416, "loss": 5.3876, "step": 472 }, { "epoch": 0.00473, "grad_norm": 0.9032427072525024, "learning_rate": 0.001419, "loss": 5.3933, "step": 473 }, { "epoch": 0.00474, "grad_norm": 1.2588310241699219, "learning_rate": 0.0014219999999999999, "loss": 5.3882, "step": 474 }, { "epoch": 0.00475, "grad_norm": 0.8014562129974365, "learning_rate": 0.001425, "loss": 5.367, "step": 475 }, { "epoch": 0.00476, "grad_norm": 0.7612058520317078, "learning_rate": 0.001428, "loss": 5.3988, "step": 476 }, { "epoch": 0.00477, "grad_norm": 0.6699860095977783, "learning_rate": 0.001431, "loss": 5.3462, "step": 477 }, { "epoch": 0.00478, "grad_norm": 0.7476372718811035, "learning_rate": 0.001434, "loss": 5.3761, "step": 478 }, { "epoch": 0.00479, "grad_norm": 0.7134982347488403, "learning_rate": 0.001437, "loss": 5.3522, "step": 479 }, { "epoch": 0.0048, "grad_norm": 0.5948371887207031, "learning_rate": 0.0014399999999999999, "loss": 5.343, "step": 480 }, { "epoch": 0.00481, "grad_norm": 0.4980184733867645, "learning_rate": 0.001443, "loss": 5.3421, "step": 481 }, { "epoch": 0.00482, "grad_norm": 0.49235713481903076, "learning_rate": 0.001446, "loss": 5.3164, "step": 482 }, { "epoch": 0.00483, "grad_norm": 0.5064442157745361, "learning_rate": 0.001449, "loss": 5.327, "step": 483 }, { "epoch": 0.00484, "grad_norm": 0.506096601486206, "learning_rate": 0.001452, "loss": 5.3265, "step": 484 }, { "epoch": 0.00485, "grad_norm": 0.5634677410125732, "learning_rate": 0.001455, "loss": 5.3337, "step": 485 }, { "epoch": 0.00486, "grad_norm": 0.5971659421920776, "learning_rate": 0.001458, "loss": 5.306, "step": 486 }, { "epoch": 0.00487, "grad_norm": 0.5582761168479919, "learning_rate": 0.001461, "loss": 5.3211, "step": 487 }, { "epoch": 0.00488, "grad_norm": 0.5640081763267517, "learning_rate": 0.001464, "loss": 5.2971, "step": 488 }, { "epoch": 0.00489, "grad_norm": 0.6793756484985352, "learning_rate": 0.001467, "loss": 5.2905, "step": 489 }, { "epoch": 0.0049, "grad_norm": 0.7497550249099731, "learning_rate": 0.00147, "loss": 5.3082, "step": 490 }, { "epoch": 0.00491, "grad_norm": 0.7264507412910461, "learning_rate": 0.001473, "loss": 5.3062, "step": 491 }, { "epoch": 0.00492, "grad_norm": 0.5965330004692078, "learning_rate": 0.001476, "loss": 5.3011, "step": 492 }, { "epoch": 0.00493, "grad_norm": 0.8212659358978271, "learning_rate": 0.001479, "loss": 5.3006, "step": 493 }, { "epoch": 0.00494, "grad_norm": 1.2706849575042725, "learning_rate": 0.001482, "loss": 5.3221, "step": 494 }, { "epoch": 0.00495, "grad_norm": 0.9726585149765015, "learning_rate": 0.001485, "loss": 5.3189, "step": 495 }, { "epoch": 0.00496, "grad_norm": 1.0229647159576416, "learning_rate": 0.001488, "loss": 5.3095, "step": 496 }, { "epoch": 0.00497, "grad_norm": 0.9751450419425964, "learning_rate": 0.001491, "loss": 5.294, "step": 497 }, { "epoch": 0.00498, "grad_norm": 0.9788212776184082, "learning_rate": 0.001494, "loss": 5.3219, "step": 498 }, { "epoch": 0.00499, "grad_norm": 0.897365391254425, "learning_rate": 0.001497, "loss": 5.2943, "step": 499 }, { "epoch": 0.005, "grad_norm": 0.8972038626670837, "learning_rate": 0.0015, "loss": 5.2998, "step": 500 }, { "epoch": 0.00501, "grad_norm": 1.123322606086731, "learning_rate": 0.001503, "loss": 5.3107, "step": 501 }, { "epoch": 0.00502, "grad_norm": 1.085119366645813, "learning_rate": 0.001506, "loss": 5.3147, "step": 502 }, { "epoch": 0.00503, "grad_norm": 0.9612423181533813, "learning_rate": 0.0015090000000000001, "loss": 5.3162, "step": 503 }, { "epoch": 0.00504, "grad_norm": 1.0992624759674072, "learning_rate": 0.001512, "loss": 5.3083, "step": 504 }, { "epoch": 0.00505, "grad_norm": 0.9857581257820129, "learning_rate": 0.001515, "loss": 5.2946, "step": 505 }, { "epoch": 0.00506, "grad_norm": 1.1049542427062988, "learning_rate": 0.001518, "loss": 5.3003, "step": 506 }, { "epoch": 0.00507, "grad_norm": 0.8998914957046509, "learning_rate": 0.001521, "loss": 5.298, "step": 507 }, { "epoch": 0.00508, "grad_norm": 0.6991334557533264, "learning_rate": 0.001524, "loss": 5.2862, "step": 508 }, { "epoch": 0.00509, "grad_norm": 0.7653549313545227, "learning_rate": 0.0015270000000000001, "loss": 5.2723, "step": 509 }, { "epoch": 0.0051, "grad_norm": 0.7315691113471985, "learning_rate": 0.0015300000000000001, "loss": 5.288, "step": 510 }, { "epoch": 0.00511, "grad_norm": 0.7975103855133057, "learning_rate": 0.001533, "loss": 5.2648, "step": 511 }, { "epoch": 0.00512, "grad_norm": 0.9781049489974976, "learning_rate": 0.001536, "loss": 5.2672, "step": 512 }, { "epoch": 0.00513, "grad_norm": 1.084666132926941, "learning_rate": 0.001539, "loss": 5.2896, "step": 513 }, { "epoch": 0.00514, "grad_norm": 0.9010921120643616, "learning_rate": 0.001542, "loss": 5.2706, "step": 514 }, { "epoch": 0.00515, "grad_norm": 0.96586012840271, "learning_rate": 0.0015450000000000001, "loss": 5.2764, "step": 515 }, { "epoch": 0.00516, "grad_norm": 0.9655681848526001, "learning_rate": 0.0015480000000000001, "loss": 5.2769, "step": 516 }, { "epoch": 0.00517, "grad_norm": 0.8448560833930969, "learning_rate": 0.001551, "loss": 5.2634, "step": 517 }, { "epoch": 0.00518, "grad_norm": 0.7809770703315735, "learning_rate": 0.001554, "loss": 5.2515, "step": 518 }, { "epoch": 0.00519, "grad_norm": 0.913107693195343, "learning_rate": 0.001557, "loss": 5.2572, "step": 519 }, { "epoch": 0.0052, "grad_norm": 0.9221185445785522, "learning_rate": 0.0015600000000000002, "loss": 5.2658, "step": 520 }, { "epoch": 0.00521, "grad_norm": 0.8511962294578552, "learning_rate": 0.0015630000000000002, "loss": 5.2423, "step": 521 }, { "epoch": 0.00522, "grad_norm": 0.7266805171966553, "learning_rate": 0.0015660000000000001, "loss": 5.2383, "step": 522 }, { "epoch": 0.00523, "grad_norm": 0.8215247392654419, "learning_rate": 0.001569, "loss": 5.2547, "step": 523 }, { "epoch": 0.00524, "grad_norm": 0.8742693066596985, "learning_rate": 0.001572, "loss": 5.2653, "step": 524 }, { "epoch": 0.00525, "grad_norm": 0.6882407069206238, "learning_rate": 0.001575, "loss": 5.2501, "step": 525 }, { "epoch": 0.00526, "grad_norm": 0.7224147319793701, "learning_rate": 0.0015780000000000002, "loss": 5.2254, "step": 526 }, { "epoch": 0.00527, "grad_norm": 0.6562958359718323, "learning_rate": 0.0015810000000000002, "loss": 5.2385, "step": 527 }, { "epoch": 0.00528, "grad_norm": 0.6051112413406372, "learning_rate": 0.0015840000000000001, "loss": 5.2165, "step": 528 }, { "epoch": 0.00529, "grad_norm": 0.647803008556366, "learning_rate": 0.001587, "loss": 5.2244, "step": 529 }, { "epoch": 0.0053, "grad_norm": 0.6608071327209473, "learning_rate": 0.00159, "loss": 5.2339, "step": 530 }, { "epoch": 0.00531, "grad_norm": 0.6765715479850769, "learning_rate": 0.001593, "loss": 5.2181, "step": 531 }, { "epoch": 0.00532, "grad_norm": 0.7380223870277405, "learning_rate": 0.0015960000000000002, "loss": 5.2256, "step": 532 }, { "epoch": 0.00533, "grad_norm": 0.7506837248802185, "learning_rate": 0.0015990000000000002, "loss": 5.2236, "step": 533 }, { "epoch": 0.00534, "grad_norm": 0.577497661113739, "learning_rate": 0.0016020000000000001, "loss": 5.2066, "step": 534 }, { "epoch": 0.00535, "grad_norm": 0.4974853992462158, "learning_rate": 0.001605, "loss": 5.2096, "step": 535 }, { "epoch": 0.00536, "grad_norm": 0.5754765272140503, "learning_rate": 0.001608, "loss": 5.2121, "step": 536 }, { "epoch": 0.00537, "grad_norm": 0.6681102514266968, "learning_rate": 0.0016110000000000002, "loss": 5.2067, "step": 537 }, { "epoch": 0.00538, "grad_norm": 0.8286970257759094, "learning_rate": 0.0016140000000000002, "loss": 5.1882, "step": 538 }, { "epoch": 0.00539, "grad_norm": 1.0212045907974243, "learning_rate": 0.0016170000000000002, "loss": 5.193, "step": 539 }, { "epoch": 0.0054, "grad_norm": 1.0495171546936035, "learning_rate": 0.0016200000000000001, "loss": 5.2061, "step": 540 }, { "epoch": 0.00541, "grad_norm": 0.9756328463554382, "learning_rate": 0.001623, "loss": 5.1953, "step": 541 }, { "epoch": 0.00542, "grad_norm": 1.024538278579712, "learning_rate": 0.001626, "loss": 5.2247, "step": 542 }, { "epoch": 0.00543, "grad_norm": 1.005081057548523, "learning_rate": 0.0016290000000000002, "loss": 5.1883, "step": 543 }, { "epoch": 0.00544, "grad_norm": 1.1540062427520752, "learning_rate": 0.0016320000000000002, "loss": 5.2159, "step": 544 }, { "epoch": 0.00545, "grad_norm": 1.0779460668563843, "learning_rate": 0.0016350000000000002, "loss": 5.2271, "step": 545 }, { "epoch": 0.00546, "grad_norm": 0.9231882691383362, "learning_rate": 0.0016380000000000001, "loss": 5.1917, "step": 546 }, { "epoch": 0.00547, "grad_norm": 0.9463688731193542, "learning_rate": 0.001641, "loss": 5.1943, "step": 547 }, { "epoch": 0.00548, "grad_norm": 0.8151195645332336, "learning_rate": 0.001644, "loss": 5.1898, "step": 548 }, { "epoch": 0.00549, "grad_norm": 0.6324855089187622, "learning_rate": 0.0016470000000000002, "loss": 5.1802, "step": 549 }, { "epoch": 0.0055, "grad_norm": 0.7012510299682617, "learning_rate": 0.0016500000000000002, "loss": 5.1963, "step": 550 }, { "epoch": 0.00551, "grad_norm": 0.6119561195373535, "learning_rate": 0.0016530000000000002, "loss": 5.1867, "step": 551 }, { "epoch": 0.00552, "grad_norm": 0.6565516591072083, "learning_rate": 0.0016560000000000001, "loss": 5.1814, "step": 552 }, { "epoch": 0.00553, "grad_norm": 0.60141921043396, "learning_rate": 0.001659, "loss": 5.1716, "step": 553 }, { "epoch": 0.00554, "grad_norm": 0.5632038116455078, "learning_rate": 0.0016620000000000003, "loss": 5.1664, "step": 554 }, { "epoch": 0.00555, "grad_norm": 0.7644810080528259, "learning_rate": 0.0016650000000000002, "loss": 5.1782, "step": 555 }, { "epoch": 0.00556, "grad_norm": 0.8742493987083435, "learning_rate": 0.0016680000000000002, "loss": 5.1659, "step": 556 }, { "epoch": 0.00557, "grad_norm": 0.8544741272926331, "learning_rate": 0.0016710000000000002, "loss": 5.1502, "step": 557 }, { "epoch": 0.00558, "grad_norm": 0.9127110838890076, "learning_rate": 0.0016740000000000001, "loss": 5.182, "step": 558 }, { "epoch": 0.00559, "grad_norm": 0.9648676514625549, "learning_rate": 0.001677, "loss": 5.1678, "step": 559 }, { "epoch": 0.0056, "grad_norm": 0.8821786642074585, "learning_rate": 0.0016800000000000003, "loss": 5.1427, "step": 560 }, { "epoch": 0.00561, "grad_norm": 0.891691267490387, "learning_rate": 0.0016830000000000003, "loss": 5.1733, "step": 561 }, { "epoch": 0.00562, "grad_norm": 0.9146907925605774, "learning_rate": 0.0016860000000000002, "loss": 5.1636, "step": 562 }, { "epoch": 0.00563, "grad_norm": 0.9244825839996338, "learning_rate": 0.001689, "loss": 5.1722, "step": 563 }, { "epoch": 0.00564, "grad_norm": 0.7839826345443726, "learning_rate": 0.001692, "loss": 5.1431, "step": 564 }, { "epoch": 0.00565, "grad_norm": 1.062433123588562, "learning_rate": 0.001695, "loss": 5.1564, "step": 565 }, { "epoch": 0.00566, "grad_norm": 0.8471325635910034, "learning_rate": 0.0016979999999999999, "loss": 5.1512, "step": 566 }, { "epoch": 0.00567, "grad_norm": 0.8534058928489685, "learning_rate": 0.0017009999999999998, "loss": 5.1473, "step": 567 }, { "epoch": 0.00568, "grad_norm": 0.8670461177825928, "learning_rate": 0.0017039999999999998, "loss": 5.1567, "step": 568 }, { "epoch": 0.00569, "grad_norm": 0.9165869951248169, "learning_rate": 0.001707, "loss": 5.1375, "step": 569 }, { "epoch": 0.0057, "grad_norm": 1.1377770900726318, "learning_rate": 0.00171, "loss": 5.1664, "step": 570 }, { "epoch": 0.00571, "grad_norm": 0.8107508420944214, "learning_rate": 0.001713, "loss": 5.1589, "step": 571 }, { "epoch": 0.00572, "grad_norm": 0.7913006544113159, "learning_rate": 0.0017159999999999999, "loss": 5.1345, "step": 572 }, { "epoch": 0.00573, "grad_norm": 0.7625595927238464, "learning_rate": 0.0017189999999999998, "loss": 5.1388, "step": 573 }, { "epoch": 0.00574, "grad_norm": 0.8708691596984863, "learning_rate": 0.001722, "loss": 5.1477, "step": 574 }, { "epoch": 0.00575, "grad_norm": 0.8358116149902344, "learning_rate": 0.001725, "loss": 5.126, "step": 575 }, { "epoch": 0.00576, "grad_norm": 0.7503964900970459, "learning_rate": 0.001728, "loss": 5.1165, "step": 576 }, { "epoch": 0.00577, "grad_norm": 0.743698000907898, "learning_rate": 0.001731, "loss": 5.1434, "step": 577 }, { "epoch": 0.00578, "grad_norm": 0.7937496900558472, "learning_rate": 0.0017339999999999999, "loss": 5.1222, "step": 578 }, { "epoch": 0.00579, "grad_norm": 0.7887423038482666, "learning_rate": 0.0017369999999999998, "loss": 5.1084, "step": 579 }, { "epoch": 0.0058, "grad_norm": 0.7772188186645508, "learning_rate": 0.00174, "loss": 5.1272, "step": 580 }, { "epoch": 0.00581, "grad_norm": 0.8483501076698303, "learning_rate": 0.001743, "loss": 5.1251, "step": 581 }, { "epoch": 0.00582, "grad_norm": 1.0748672485351562, "learning_rate": 0.001746, "loss": 5.1133, "step": 582 }, { "epoch": 0.00583, "grad_norm": 0.9845912456512451, "learning_rate": 0.001749, "loss": 5.1338, "step": 583 }, { "epoch": 0.00584, "grad_norm": 1.0171496868133545, "learning_rate": 0.0017519999999999999, "loss": 5.1328, "step": 584 }, { "epoch": 0.00585, "grad_norm": 0.932063102722168, "learning_rate": 0.0017549999999999998, "loss": 5.1125, "step": 585 }, { "epoch": 0.00586, "grad_norm": 1.0053131580352783, "learning_rate": 0.001758, "loss": 5.1277, "step": 586 }, { "epoch": 0.00587, "grad_norm": 1.0553542375564575, "learning_rate": 0.001761, "loss": 5.1113, "step": 587 }, { "epoch": 0.00588, "grad_norm": 0.9641870260238647, "learning_rate": 0.001764, "loss": 5.1209, "step": 588 }, { "epoch": 0.00589, "grad_norm": 0.7223602533340454, "learning_rate": 0.001767, "loss": 5.1139, "step": 589 }, { "epoch": 0.0059, "grad_norm": 0.7580032348632812, "learning_rate": 0.0017699999999999999, "loss": 5.1016, "step": 590 }, { "epoch": 0.00591, "grad_norm": 0.7553709149360657, "learning_rate": 0.001773, "loss": 5.097, "step": 591 }, { "epoch": 0.00592, "grad_norm": 0.7395292520523071, "learning_rate": 0.001776, "loss": 5.122, "step": 592 }, { "epoch": 0.00593, "grad_norm": 0.6007040739059448, "learning_rate": 0.001779, "loss": 5.1118, "step": 593 }, { "epoch": 0.00594, "grad_norm": 0.6126047372817993, "learning_rate": 0.001782, "loss": 5.1081, "step": 594 }, { "epoch": 0.00595, "grad_norm": 0.607721745967865, "learning_rate": 0.001785, "loss": 5.0957, "step": 595 }, { "epoch": 0.00596, "grad_norm": 0.6556451916694641, "learning_rate": 0.0017879999999999999, "loss": 5.0948, "step": 596 }, { "epoch": 0.00597, "grad_norm": 0.7172878384590149, "learning_rate": 0.001791, "loss": 5.0729, "step": 597 }, { "epoch": 0.00598, "grad_norm": 0.6043835878372192, "learning_rate": 0.001794, "loss": 5.0805, "step": 598 }, { "epoch": 0.00599, "grad_norm": 0.5666232109069824, "learning_rate": 0.001797, "loss": 5.0796, "step": 599 }, { "epoch": 0.006, "grad_norm": 0.5673431754112244, "learning_rate": 0.0018, "loss": 5.0764, "step": 600 }, { "epoch": 0.00601, "grad_norm": 0.5798671841621399, "learning_rate": 0.001803, "loss": 5.0708, "step": 601 }, { "epoch": 0.00602, "grad_norm": 0.520574152469635, "learning_rate": 0.0018059999999999999, "loss": 5.0544, "step": 602 }, { "epoch": 0.00603, "grad_norm": 0.5210540294647217, "learning_rate": 0.001809, "loss": 5.0732, "step": 603 }, { "epoch": 0.00604, "grad_norm": 0.6759857535362244, "learning_rate": 0.001812, "loss": 5.0576, "step": 604 }, { "epoch": 0.00605, "grad_norm": 0.7568657994270325, "learning_rate": 0.001815, "loss": 5.0701, "step": 605 }, { "epoch": 0.00606, "grad_norm": 0.7632762789726257, "learning_rate": 0.001818, "loss": 5.0711, "step": 606 }, { "epoch": 0.00607, "grad_norm": 0.788451611995697, "learning_rate": 0.001821, "loss": 5.073, "step": 607 }, { "epoch": 0.00608, "grad_norm": 0.763842761516571, "learning_rate": 0.001824, "loss": 5.0575, "step": 608 }, { "epoch": 0.00609, "grad_norm": 0.825861930847168, "learning_rate": 0.001827, "loss": 5.0665, "step": 609 }, { "epoch": 0.0061, "grad_norm": 1.0143935680389404, "learning_rate": 0.00183, "loss": 5.0488, "step": 610 }, { "epoch": 0.00611, "grad_norm": 1.1116124391555786, "learning_rate": 0.001833, "loss": 5.0562, "step": 611 }, { "epoch": 0.00612, "grad_norm": 1.0579830408096313, "learning_rate": 0.001836, "loss": 5.0621, "step": 612 }, { "epoch": 0.00613, "grad_norm": 1.2180272340774536, "learning_rate": 0.001839, "loss": 5.069, "step": 613 }, { "epoch": 0.00614, "grad_norm": 0.7525346875190735, "learning_rate": 0.001842, "loss": 5.0757, "step": 614 }, { "epoch": 0.00615, "grad_norm": 0.833108127117157, "learning_rate": 0.001845, "loss": 5.0659, "step": 615 }, { "epoch": 0.00616, "grad_norm": 0.7170072793960571, "learning_rate": 0.001848, "loss": 5.0473, "step": 616 }, { "epoch": 0.00617, "grad_norm": 0.718910276889801, "learning_rate": 0.001851, "loss": 5.0608, "step": 617 }, { "epoch": 0.00618, "grad_norm": 0.6572015881538391, "learning_rate": 0.001854, "loss": 5.0425, "step": 618 }, { "epoch": 0.00619, "grad_norm": 0.7236103415489197, "learning_rate": 0.001857, "loss": 5.0504, "step": 619 }, { "epoch": 0.0062, "grad_norm": 0.8058017492294312, "learning_rate": 0.00186, "loss": 5.0386, "step": 620 }, { "epoch": 0.00621, "grad_norm": 0.861880898475647, "learning_rate": 0.001863, "loss": 5.0409, "step": 621 }, { "epoch": 0.00622, "grad_norm": 1.0328248739242554, "learning_rate": 0.001866, "loss": 5.0489, "step": 622 }, { "epoch": 0.00623, "grad_norm": 0.8494102358818054, "learning_rate": 0.001869, "loss": 5.0318, "step": 623 }, { "epoch": 0.00624, "grad_norm": 0.9827755093574524, "learning_rate": 0.001872, "loss": 5.0476, "step": 624 }, { "epoch": 0.00625, "grad_norm": 1.0741342306137085, "learning_rate": 0.001875, "loss": 5.0686, "step": 625 }, { "epoch": 0.00626, "grad_norm": 0.7305473685264587, "learning_rate": 0.0018780000000000001, "loss": 5.0304, "step": 626 }, { "epoch": 0.00627, "grad_norm": 0.8084409832954407, "learning_rate": 0.001881, "loss": 5.0255, "step": 627 }, { "epoch": 0.00628, "grad_norm": 1.0902513265609741, "learning_rate": 0.001884, "loss": 5.0666, "step": 628 }, { "epoch": 0.00629, "grad_norm": 1.036152958869934, "learning_rate": 0.001887, "loss": 5.0439, "step": 629 }, { "epoch": 0.0063, "grad_norm": 1.0308260917663574, "learning_rate": 0.00189, "loss": 5.0356, "step": 630 }, { "epoch": 0.00631, "grad_norm": 0.984308123588562, "learning_rate": 0.0018930000000000002, "loss": 5.0356, "step": 631 }, { "epoch": 0.00632, "grad_norm": 0.8649469017982483, "learning_rate": 0.0018960000000000001, "loss": 5.0479, "step": 632 }, { "epoch": 0.00633, "grad_norm": 0.8776420950889587, "learning_rate": 0.001899, "loss": 5.0459, "step": 633 }, { "epoch": 0.00634, "grad_norm": 0.8304409980773926, "learning_rate": 0.001902, "loss": 5.037, "step": 634 }, { "epoch": 0.00635, "grad_norm": 0.7966147065162659, "learning_rate": 0.001905, "loss": 5.0227, "step": 635 }, { "epoch": 0.00636, "grad_norm": 0.9172542095184326, "learning_rate": 0.001908, "loss": 5.0337, "step": 636 }, { "epoch": 0.00637, "grad_norm": 1.3219475746154785, "learning_rate": 0.0019110000000000002, "loss": 5.0715, "step": 637 }, { "epoch": 0.00638, "grad_norm": 0.7924789190292358, "learning_rate": 0.0019140000000000001, "loss": 5.0347, "step": 638 }, { "epoch": 0.00639, "grad_norm": 0.8469759225845337, "learning_rate": 0.001917, "loss": 5.0139, "step": 639 }, { "epoch": 0.0064, "grad_norm": 0.9544380307197571, "learning_rate": 0.00192, "loss": 5.0191, "step": 640 }, { "epoch": 0.00641, "grad_norm": 1.0876184701919556, "learning_rate": 0.001923, "loss": 5.0379, "step": 641 }, { "epoch": 0.00642, "grad_norm": 0.8299298286437988, "learning_rate": 0.001926, "loss": 5.0146, "step": 642 }, { "epoch": 0.00643, "grad_norm": 0.9603999257087708, "learning_rate": 0.0019290000000000002, "loss": 5.0399, "step": 643 }, { "epoch": 0.00644, "grad_norm": 0.7977001070976257, "learning_rate": 0.0019320000000000001, "loss": 5.0094, "step": 644 }, { "epoch": 0.00645, "grad_norm": 0.7244200706481934, "learning_rate": 0.001935, "loss": 5.0161, "step": 645 }, { "epoch": 0.00646, "grad_norm": 0.7832402586936951, "learning_rate": 0.001938, "loss": 5.0193, "step": 646 }, { "epoch": 0.00647, "grad_norm": 0.8586620092391968, "learning_rate": 0.001941, "loss": 5.0299, "step": 647 }, { "epoch": 0.00648, "grad_norm": 0.8153418302536011, "learning_rate": 0.0019440000000000002, "loss": 5.0163, "step": 648 }, { "epoch": 0.00649, "grad_norm": 0.766000509262085, "learning_rate": 0.0019470000000000002, "loss": 4.9982, "step": 649 }, { "epoch": 0.0065, "grad_norm": 0.7875446677207947, "learning_rate": 0.0019500000000000001, "loss": 5.0238, "step": 650 }, { "epoch": 0.00651, "grad_norm": 0.7245673537254333, "learning_rate": 0.001953, "loss": 5.0037, "step": 651 }, { "epoch": 0.00652, "grad_norm": 0.7840576767921448, "learning_rate": 0.0019560000000000003, "loss": 4.9987, "step": 652 }, { "epoch": 0.00653, "grad_norm": 0.8112754225730896, "learning_rate": 0.0019590000000000002, "loss": 4.9969, "step": 653 }, { "epoch": 0.00654, "grad_norm": 0.6785946488380432, "learning_rate": 0.001962, "loss": 4.9786, "step": 654 }, { "epoch": 0.00655, "grad_norm": 0.6141355633735657, "learning_rate": 0.001965, "loss": 4.9897, "step": 655 }, { "epoch": 0.00656, "grad_norm": 0.647098958492279, "learning_rate": 0.001968, "loss": 4.982, "step": 656 }, { "epoch": 0.00657, "grad_norm": 0.6601396203041077, "learning_rate": 0.001971, "loss": 4.9729, "step": 657 }, { "epoch": 0.00658, "grad_norm": 0.5589995384216309, "learning_rate": 0.001974, "loss": 4.9927, "step": 658 }, { "epoch": 0.00659, "grad_norm": 0.551213264465332, "learning_rate": 0.001977, "loss": 4.972, "step": 659 }, { "epoch": 0.0066, "grad_norm": 0.5106261968612671, "learning_rate": 0.00198, "loss": 4.963, "step": 660 }, { "epoch": 0.00661, "grad_norm": 0.49552980065345764, "learning_rate": 0.001983, "loss": 4.9449, "step": 661 }, { "epoch": 0.00662, "grad_norm": 0.4502437114715576, "learning_rate": 0.0019860000000000004, "loss": 4.9438, "step": 662 }, { "epoch": 0.00663, "grad_norm": 0.4717095196247101, "learning_rate": 0.0019890000000000003, "loss": 4.9599, "step": 663 }, { "epoch": 0.00664, "grad_norm": 0.5179165005683899, "learning_rate": 0.0019920000000000003, "loss": 4.9431, "step": 664 }, { "epoch": 0.00665, "grad_norm": 0.6156288385391235, "learning_rate": 0.0019950000000000002, "loss": 4.9222, "step": 665 }, { "epoch": 0.00666, "grad_norm": 0.6403276920318604, "learning_rate": 0.001998, "loss": 4.9456, "step": 666 }, { "epoch": 0.00667, "grad_norm": 0.6612551808357239, "learning_rate": 0.002001, "loss": 4.9436, "step": 667 }, { "epoch": 0.00668, "grad_norm": 0.6370317339897156, "learning_rate": 0.002004, "loss": 4.9618, "step": 668 }, { "epoch": 0.00669, "grad_norm": 0.6632084250450134, "learning_rate": 0.002007, "loss": 4.9618, "step": 669 }, { "epoch": 0.0067, "grad_norm": 0.6647160649299622, "learning_rate": 0.00201, "loss": 4.9534, "step": 670 }, { "epoch": 0.00671, "grad_norm": 0.6171524524688721, "learning_rate": 0.002013, "loss": 4.9269, "step": 671 }, { "epoch": 0.00672, "grad_norm": 0.6804357171058655, "learning_rate": 0.002016, "loss": 4.9362, "step": 672 }, { "epoch": 0.00673, "grad_norm": 0.7436751127243042, "learning_rate": 0.002019, "loss": 4.9351, "step": 673 }, { "epoch": 0.00674, "grad_norm": 0.8610292077064514, "learning_rate": 0.0020220000000000004, "loss": 4.9433, "step": 674 }, { "epoch": 0.00675, "grad_norm": 1.046964168548584, "learning_rate": 0.0020250000000000003, "loss": 4.9648, "step": 675 }, { "epoch": 0.00676, "grad_norm": 0.9578864574432373, "learning_rate": 0.0020280000000000003, "loss": 4.9492, "step": 676 }, { "epoch": 0.00677, "grad_norm": 1.1337146759033203, "learning_rate": 0.0020310000000000003, "loss": 4.9695, "step": 677 }, { "epoch": 0.00678, "grad_norm": 1.0358091592788696, "learning_rate": 0.0020340000000000002, "loss": 4.9763, "step": 678 }, { "epoch": 0.00679, "grad_norm": 1.230987787246704, "learning_rate": 0.002037, "loss": 4.9735, "step": 679 }, { "epoch": 0.0068, "grad_norm": 0.9104715585708618, "learning_rate": 0.00204, "loss": 4.9643, "step": 680 }, { "epoch": 0.00681, "grad_norm": 1.1940183639526367, "learning_rate": 0.002043, "loss": 4.9659, "step": 681 }, { "epoch": 0.00682, "grad_norm": 1.0067143440246582, "learning_rate": 0.002046, "loss": 4.9735, "step": 682 }, { "epoch": 0.00683, "grad_norm": 1.224305510520935, "learning_rate": 0.002049, "loss": 4.9612, "step": 683 }, { "epoch": 0.00684, "grad_norm": 0.9917755126953125, "learning_rate": 0.002052, "loss": 4.9694, "step": 684 }, { "epoch": 0.00685, "grad_norm": 0.9187195897102356, "learning_rate": 0.0020550000000000004, "loss": 4.964, "step": 685 }, { "epoch": 0.00686, "grad_norm": 1.035937786102295, "learning_rate": 0.0020580000000000004, "loss": 4.9577, "step": 686 }, { "epoch": 0.00687, "grad_norm": 1.0923206806182861, "learning_rate": 0.0020610000000000003, "loss": 4.9579, "step": 687 }, { "epoch": 0.00688, "grad_norm": 0.8355166912078857, "learning_rate": 0.002064, "loss": 4.9524, "step": 688 }, { "epoch": 0.00689, "grad_norm": 0.7577447891235352, "learning_rate": 0.002067, "loss": 4.9417, "step": 689 }, { "epoch": 0.0069, "grad_norm": 0.7621678709983826, "learning_rate": 0.00207, "loss": 4.9421, "step": 690 }, { "epoch": 0.00691, "grad_norm": 0.6061983108520508, "learning_rate": 0.0020729999999999998, "loss": 4.9082, "step": 691 }, { "epoch": 0.00692, "grad_norm": 0.591027021408081, "learning_rate": 0.0020759999999999997, "loss": 4.9292, "step": 692 }, { "epoch": 0.00693, "grad_norm": 0.5834758281707764, "learning_rate": 0.0020789999999999997, "loss": 4.9222, "step": 693 }, { "epoch": 0.00694, "grad_norm": 0.7208871841430664, "learning_rate": 0.002082, "loss": 4.9071, "step": 694 }, { "epoch": 0.00695, "grad_norm": 0.8771729469299316, "learning_rate": 0.002085, "loss": 4.9171, "step": 695 }, { "epoch": 0.00696, "grad_norm": 0.9051836133003235, "learning_rate": 0.002088, "loss": 4.9182, "step": 696 }, { "epoch": 0.00697, "grad_norm": 1.1665294170379639, "learning_rate": 0.002091, "loss": 4.9339, "step": 697 }, { "epoch": 0.00698, "grad_norm": 1.092050313949585, "learning_rate": 0.002094, "loss": 4.9261, "step": 698 }, { "epoch": 0.00699, "grad_norm": 0.8412545323371887, "learning_rate": 0.002097, "loss": 4.9128, "step": 699 }, { "epoch": 0.007, "grad_norm": 0.6720849871635437, "learning_rate": 0.0021, "loss": 4.9176, "step": 700 }, { "epoch": 0.00701, "grad_norm": 0.563408613204956, "learning_rate": 0.002103, "loss": 4.8944, "step": 701 }, { "epoch": 0.00702, "grad_norm": 0.5357261300086975, "learning_rate": 0.002106, "loss": 4.893, "step": 702 }, { "epoch": 0.00703, "grad_norm": 0.5667074918746948, "learning_rate": 0.0021089999999999998, "loss": 4.8766, "step": 703 }, { "epoch": 0.00704, "grad_norm": 0.597253143787384, "learning_rate": 0.0021119999999999997, "loss": 4.8825, "step": 704 }, { "epoch": 0.00705, "grad_norm": 0.6211616396903992, "learning_rate": 0.002115, "loss": 4.8877, "step": 705 }, { "epoch": 0.00706, "grad_norm": 0.687544047832489, "learning_rate": 0.002118, "loss": 4.8929, "step": 706 }, { "epoch": 0.00707, "grad_norm": 0.7621776461601257, "learning_rate": 0.002121, "loss": 4.8688, "step": 707 }, { "epoch": 0.00708, "grad_norm": 0.9027195572853088, "learning_rate": 0.002124, "loss": 4.8567, "step": 708 }, { "epoch": 0.00709, "grad_norm": 0.9446965456008911, "learning_rate": 0.002127, "loss": 4.8935, "step": 709 }, { "epoch": 0.0071, "grad_norm": 0.8206554055213928, "learning_rate": 0.00213, "loss": 4.8706, "step": 710 }, { "epoch": 0.00711, "grad_norm": 0.8629757165908813, "learning_rate": 0.002133, "loss": 4.8565, "step": 711 }, { "epoch": 0.00712, "grad_norm": 0.7438434362411499, "learning_rate": 0.002136, "loss": 4.869, "step": 712 }, { "epoch": 0.00713, "grad_norm": 0.7951372861862183, "learning_rate": 0.002139, "loss": 4.8846, "step": 713 }, { "epoch": 0.00714, "grad_norm": 0.9020676016807556, "learning_rate": 0.002142, "loss": 4.8773, "step": 714 }, { "epoch": 0.00715, "grad_norm": 1.064352035522461, "learning_rate": 0.0021449999999999998, "loss": 4.8624, "step": 715 }, { "epoch": 0.00716, "grad_norm": 0.7318432927131653, "learning_rate": 0.002148, "loss": 4.8409, "step": 716 }, { "epoch": 0.00717, "grad_norm": 0.886417031288147, "learning_rate": 0.002151, "loss": 4.8705, "step": 717 }, { "epoch": 0.00718, "grad_norm": 0.8509985208511353, "learning_rate": 0.002154, "loss": 4.8473, "step": 718 }, { "epoch": 0.00719, "grad_norm": 0.8979188203811646, "learning_rate": 0.002157, "loss": 4.8656, "step": 719 }, { "epoch": 0.0072, "grad_norm": 1.0766488313674927, "learning_rate": 0.00216, "loss": 4.8622, "step": 720 }, { "epoch": 0.00721, "grad_norm": 1.0330792665481567, "learning_rate": 0.002163, "loss": 4.8716, "step": 721 }, { "epoch": 0.00722, "grad_norm": 1.097432255744934, "learning_rate": 0.002166, "loss": 4.8541, "step": 722 }, { "epoch": 0.00723, "grad_norm": 1.0046334266662598, "learning_rate": 0.002169, "loss": 4.8596, "step": 723 }, { "epoch": 0.00724, "grad_norm": 1.0316451787948608, "learning_rate": 0.002172, "loss": 4.8595, "step": 724 }, { "epoch": 0.00725, "grad_norm": 0.7518482804298401, "learning_rate": 0.002175, "loss": 4.8464, "step": 725 }, { "epoch": 0.00726, "grad_norm": 0.8229779005050659, "learning_rate": 0.002178, "loss": 4.8512, "step": 726 }, { "epoch": 0.00727, "grad_norm": 0.9577723741531372, "learning_rate": 0.0021809999999999998, "loss": 4.841, "step": 727 }, { "epoch": 0.00728, "grad_norm": 0.9698926210403442, "learning_rate": 0.002184, "loss": 4.8594, "step": 728 }, { "epoch": 0.00729, "grad_norm": 1.1789532899856567, "learning_rate": 0.002187, "loss": 4.8599, "step": 729 }, { "epoch": 0.0073, "grad_norm": 1.0923309326171875, "learning_rate": 0.00219, "loss": 4.8656, "step": 730 }, { "epoch": 0.00731, "grad_norm": 1.0404242277145386, "learning_rate": 0.002193, "loss": 4.8544, "step": 731 }, { "epoch": 0.00732, "grad_norm": 0.8924814462661743, "learning_rate": 0.002196, "loss": 4.8537, "step": 732 }, { "epoch": 0.00733, "grad_norm": 0.7918877005577087, "learning_rate": 0.002199, "loss": 4.8326, "step": 733 }, { "epoch": 0.00734, "grad_norm": 0.8231402635574341, "learning_rate": 0.002202, "loss": 4.8485, "step": 734 }, { "epoch": 0.00735, "grad_norm": 1.1126084327697754, "learning_rate": 0.002205, "loss": 4.8661, "step": 735 }, { "epoch": 0.00736, "grad_norm": 1.1928813457489014, "learning_rate": 0.002208, "loss": 4.857, "step": 736 }, { "epoch": 0.00737, "grad_norm": 0.9305274486541748, "learning_rate": 0.002211, "loss": 4.8183, "step": 737 }, { "epoch": 0.00738, "grad_norm": 1.2922184467315674, "learning_rate": 0.002214, "loss": 4.8313, "step": 738 }, { "epoch": 0.00739, "grad_norm": 1.0224477052688599, "learning_rate": 0.0022170000000000002, "loss": 4.8302, "step": 739 }, { "epoch": 0.0074, "grad_norm": 0.8254541158676147, "learning_rate": 0.00222, "loss": 4.8183, "step": 740 }, { "epoch": 0.00741, "grad_norm": 0.8494399785995483, "learning_rate": 0.002223, "loss": 4.7936, "step": 741 }, { "epoch": 0.00742, "grad_norm": 0.8097528219223022, "learning_rate": 0.002226, "loss": 4.8203, "step": 742 }, { "epoch": 0.00743, "grad_norm": 0.7318201065063477, "learning_rate": 0.002229, "loss": 4.782, "step": 743 }, { "epoch": 0.00744, "grad_norm": 0.770041286945343, "learning_rate": 0.002232, "loss": 4.7897, "step": 744 }, { "epoch": 0.00745, "grad_norm": 0.8400176167488098, "learning_rate": 0.002235, "loss": 4.793, "step": 745 }, { "epoch": 0.00746, "grad_norm": 0.8187500834465027, "learning_rate": 0.002238, "loss": 4.7927, "step": 746 }, { "epoch": 0.00747, "grad_norm": 0.8396742343902588, "learning_rate": 0.002241, "loss": 4.7802, "step": 747 }, { "epoch": 0.00748, "grad_norm": 0.8425725698471069, "learning_rate": 0.002244, "loss": 4.7863, "step": 748 }, { "epoch": 0.00749, "grad_norm": 0.9793819189071655, "learning_rate": 0.002247, "loss": 4.771, "step": 749 }, { "epoch": 0.0075, "grad_norm": 0.9097406268119812, "learning_rate": 0.0022500000000000003, "loss": 4.8036, "step": 750 }, { "epoch": 0.00751, "grad_norm": 1.1564972400665283, "learning_rate": 0.0022530000000000002, "loss": 4.8008, "step": 751 }, { "epoch": 0.00752, "grad_norm": 1.1269639730453491, "learning_rate": 0.002256, "loss": 4.7666, "step": 752 }, { "epoch": 0.00753, "grad_norm": 0.7554891705513, "learning_rate": 0.002259, "loss": 4.7895, "step": 753 }, { "epoch": 0.00754, "grad_norm": 0.6434245705604553, "learning_rate": 0.002262, "loss": 4.7936, "step": 754 }, { "epoch": 0.00755, "grad_norm": 0.6667845249176025, "learning_rate": 0.002265, "loss": 4.7629, "step": 755 }, { "epoch": 0.00756, "grad_norm": 0.6454316973686218, "learning_rate": 0.002268, "loss": 4.7431, "step": 756 }, { "epoch": 0.00757, "grad_norm": 0.5992982387542725, "learning_rate": 0.002271, "loss": 4.7549, "step": 757 }, { "epoch": 0.00758, "grad_norm": 0.5403345227241516, "learning_rate": 0.002274, "loss": 4.7325, "step": 758 }, { "epoch": 0.00759, "grad_norm": 0.6695131063461304, "learning_rate": 0.002277, "loss": 4.754, "step": 759 }, { "epoch": 0.0076, "grad_norm": 0.8114665150642395, "learning_rate": 0.00228, "loss": 4.7449, "step": 760 }, { "epoch": 0.00761, "grad_norm": 0.8682258129119873, "learning_rate": 0.002283, "loss": 4.7184, "step": 761 }, { "epoch": 0.00762, "grad_norm": 0.9619300961494446, "learning_rate": 0.0022860000000000003, "loss": 4.7088, "step": 762 }, { "epoch": 0.00763, "grad_norm": 1.0561281442642212, "learning_rate": 0.0022890000000000002, "loss": 4.7527, "step": 763 }, { "epoch": 0.00764, "grad_norm": 1.0605075359344482, "learning_rate": 0.002292, "loss": 4.7609, "step": 764 }, { "epoch": 0.00765, "grad_norm": 0.8591569066047668, "learning_rate": 0.002295, "loss": 4.7468, "step": 765 }, { "epoch": 0.00766, "grad_norm": 0.8702619075775146, "learning_rate": 0.002298, "loss": 4.7378, "step": 766 }, { "epoch": 0.00767, "grad_norm": 0.9817199110984802, "learning_rate": 0.002301, "loss": 4.7481, "step": 767 }, { "epoch": 0.00768, "grad_norm": 1.0741162300109863, "learning_rate": 0.002304, "loss": 4.7612, "step": 768 }, { "epoch": 0.00769, "grad_norm": 0.8188871145248413, "learning_rate": 0.002307, "loss": 4.7592, "step": 769 }, { "epoch": 0.0077, "grad_norm": 0.7658451795578003, "learning_rate": 0.00231, "loss": 4.7148, "step": 770 }, { "epoch": 0.00771, "grad_norm": 0.8762615323066711, "learning_rate": 0.002313, "loss": 4.754, "step": 771 }, { "epoch": 0.00772, "grad_norm": 0.9279188513755798, "learning_rate": 0.002316, "loss": 4.7389, "step": 772 }, { "epoch": 0.00773, "grad_norm": 0.8405673503875732, "learning_rate": 0.0023190000000000003, "loss": 4.737, "step": 773 }, { "epoch": 0.00774, "grad_norm": 0.777439296245575, "learning_rate": 0.0023220000000000003, "loss": 4.7452, "step": 774 }, { "epoch": 0.00775, "grad_norm": 0.8980410099029541, "learning_rate": 0.0023250000000000002, "loss": 4.7436, "step": 775 }, { "epoch": 0.00776, "grad_norm": 1.0412962436676025, "learning_rate": 0.002328, "loss": 4.7539, "step": 776 }, { "epoch": 0.00777, "grad_norm": 0.9281516075134277, "learning_rate": 0.002331, "loss": 4.7268, "step": 777 }, { "epoch": 0.00778, "grad_norm": 0.7459467649459839, "learning_rate": 0.002334, "loss": 4.7271, "step": 778 }, { "epoch": 0.00779, "grad_norm": 0.7607200145721436, "learning_rate": 0.002337, "loss": 4.689, "step": 779 }, { "epoch": 0.0078, "grad_norm": 0.7529038190841675, "learning_rate": 0.00234, "loss": 4.706, "step": 780 }, { "epoch": 0.00781, "grad_norm": 0.7776694297790527, "learning_rate": 0.002343, "loss": 4.7072, "step": 781 }, { "epoch": 0.00782, "grad_norm": 0.7648219466209412, "learning_rate": 0.002346, "loss": 4.6998, "step": 782 }, { "epoch": 0.00783, "grad_norm": 0.8795627355575562, "learning_rate": 0.002349, "loss": 4.6893, "step": 783 }, { "epoch": 0.00784, "grad_norm": 0.9756646156311035, "learning_rate": 0.002352, "loss": 4.706, "step": 784 }, { "epoch": 0.00785, "grad_norm": 1.03944993019104, "learning_rate": 0.0023550000000000003, "loss": 4.7094, "step": 785 }, { "epoch": 0.00786, "grad_norm": 0.9842208027839661, "learning_rate": 0.0023580000000000003, "loss": 4.7094, "step": 786 }, { "epoch": 0.00787, "grad_norm": 0.9913274049758911, "learning_rate": 0.0023610000000000003, "loss": 4.7124, "step": 787 }, { "epoch": 0.00788, "grad_norm": 1.0203496217727661, "learning_rate": 0.002364, "loss": 4.711, "step": 788 }, { "epoch": 0.00789, "grad_norm": 1.0524691343307495, "learning_rate": 0.002367, "loss": 4.7436, "step": 789 }, { "epoch": 0.0079, "grad_norm": 0.8448042273521423, "learning_rate": 0.00237, "loss": 4.7139, "step": 790 }, { "epoch": 0.00791, "grad_norm": 0.737777054309845, "learning_rate": 0.002373, "loss": 4.6607, "step": 791 }, { "epoch": 0.00792, "grad_norm": 0.8730551600456238, "learning_rate": 0.002376, "loss": 4.6988, "step": 792 }, { "epoch": 0.00793, "grad_norm": 1.0680596828460693, "learning_rate": 0.002379, "loss": 4.6938, "step": 793 }, { "epoch": 0.00794, "grad_norm": 0.876390814781189, "learning_rate": 0.002382, "loss": 4.7016, "step": 794 }, { "epoch": 0.00795, "grad_norm": 0.8010908961296082, "learning_rate": 0.002385, "loss": 4.6743, "step": 795 }, { "epoch": 0.00796, "grad_norm": 0.8452677130699158, "learning_rate": 0.0023880000000000004, "loss": 4.6712, "step": 796 }, { "epoch": 0.00797, "grad_norm": 0.7743445038795471, "learning_rate": 0.0023910000000000003, "loss": 4.7081, "step": 797 }, { "epoch": 0.00798, "grad_norm": 0.7820720076560974, "learning_rate": 0.0023940000000000003, "loss": 4.6744, "step": 798 }, { "epoch": 0.00799, "grad_norm": 0.8602663278579712, "learning_rate": 0.0023970000000000003, "loss": 4.703, "step": 799 }, { "epoch": 0.008, "grad_norm": 0.83580082654953, "learning_rate": 0.0024000000000000002, "loss": 4.6615, "step": 800 }, { "epoch": 0.00801, "grad_norm": 0.8171262145042419, "learning_rate": 0.002403, "loss": 4.6877, "step": 801 }, { "epoch": 0.00802, "grad_norm": 0.8578851819038391, "learning_rate": 0.002406, "loss": 4.6772, "step": 802 }, { "epoch": 0.00803, "grad_norm": 0.9747350215911865, "learning_rate": 0.002409, "loss": 4.662, "step": 803 }, { "epoch": 0.00804, "grad_norm": 0.9363577961921692, "learning_rate": 0.002412, "loss": 4.6595, "step": 804 }, { "epoch": 0.00805, "grad_norm": 0.9166631102561951, "learning_rate": 0.002415, "loss": 4.6601, "step": 805 }, { "epoch": 0.00806, "grad_norm": 0.860556960105896, "learning_rate": 0.002418, "loss": 4.681, "step": 806 }, { "epoch": 0.00807, "grad_norm": 0.8737242817878723, "learning_rate": 0.0024210000000000004, "loss": 4.6779, "step": 807 }, { "epoch": 0.00808, "grad_norm": 0.8125926852226257, "learning_rate": 0.0024240000000000004, "loss": 4.6555, "step": 808 }, { "epoch": 0.00809, "grad_norm": 0.9335473775863647, "learning_rate": 0.0024270000000000003, "loss": 4.6575, "step": 809 }, { "epoch": 0.0081, "grad_norm": 0.7656557559967041, "learning_rate": 0.0024300000000000003, "loss": 4.6752, "step": 810 }, { "epoch": 0.00811, "grad_norm": 0.8129197359085083, "learning_rate": 0.0024330000000000003, "loss": 4.6545, "step": 811 }, { "epoch": 0.00812, "grad_norm": 0.7713532447814941, "learning_rate": 0.0024360000000000002, "loss": 4.6279, "step": 812 }, { "epoch": 0.00813, "grad_norm": 0.8149270415306091, "learning_rate": 0.0024389999999999998, "loss": 4.6613, "step": 813 }, { "epoch": 0.00814, "grad_norm": 0.866010308265686, "learning_rate": 0.0024419999999999997, "loss": 4.6696, "step": 814 }, { "epoch": 0.00815, "grad_norm": 0.7402296662330627, "learning_rate": 0.0024449999999999997, "loss": 4.6746, "step": 815 }, { "epoch": 0.00816, "grad_norm": 0.6451212167739868, "learning_rate": 0.002448, "loss": 4.6403, "step": 816 }, { "epoch": 0.00817, "grad_norm": 0.7935466170310974, "learning_rate": 0.002451, "loss": 4.6538, "step": 817 }, { "epoch": 0.00818, "grad_norm": 1.0368677377700806, "learning_rate": 0.002454, "loss": 4.6444, "step": 818 }, { "epoch": 0.00819, "grad_norm": 1.1921635866165161, "learning_rate": 0.002457, "loss": 4.6695, "step": 819 }, { "epoch": 0.0082, "grad_norm": 0.9146779179573059, "learning_rate": 0.00246, "loss": 4.6473, "step": 820 }, { "epoch": 0.00821, "grad_norm": 0.8097479939460754, "learning_rate": 0.002463, "loss": 4.6216, "step": 821 }, { "epoch": 0.00822, "grad_norm": 0.8611756563186646, "learning_rate": 0.002466, "loss": 4.6241, "step": 822 }, { "epoch": 0.00823, "grad_norm": 0.9131811857223511, "learning_rate": 0.002469, "loss": 4.6307, "step": 823 }, { "epoch": 0.00824, "grad_norm": 0.9472024440765381, "learning_rate": 0.002472, "loss": 4.6551, "step": 824 }, { "epoch": 0.00825, "grad_norm": 1.1296700239181519, "learning_rate": 0.0024749999999999998, "loss": 4.6738, "step": 825 }, { "epoch": 0.00826, "grad_norm": 0.9912372827529907, "learning_rate": 0.0024779999999999997, "loss": 4.6298, "step": 826 }, { "epoch": 0.00827, "grad_norm": 0.9669742584228516, "learning_rate": 0.002481, "loss": 4.6558, "step": 827 }, { "epoch": 0.00828, "grad_norm": 0.7501344680786133, "learning_rate": 0.002484, "loss": 4.6425, "step": 828 }, { "epoch": 0.00829, "grad_norm": 0.6985933184623718, "learning_rate": 0.002487, "loss": 4.6351, "step": 829 }, { "epoch": 0.0083, "grad_norm": 0.6681656837463379, "learning_rate": 0.00249, "loss": 4.6406, "step": 830 }, { "epoch": 0.00831, "grad_norm": 0.7142512202262878, "learning_rate": 0.002493, "loss": 4.6218, "step": 831 }, { "epoch": 0.00832, "grad_norm": 0.7265256643295288, "learning_rate": 0.002496, "loss": 4.6212, "step": 832 }, { "epoch": 0.00833, "grad_norm": 0.7133427262306213, "learning_rate": 0.002499, "loss": 4.6125, "step": 833 }, { "epoch": 0.00834, "grad_norm": 0.7037473917007446, "learning_rate": 0.002502, "loss": 4.6151, "step": 834 }, { "epoch": 0.00835, "grad_norm": 0.9004167914390564, "learning_rate": 0.002505, "loss": 4.6154, "step": 835 }, { "epoch": 0.00836, "grad_norm": 0.9459953904151917, "learning_rate": 0.002508, "loss": 4.6497, "step": 836 }, { "epoch": 0.00837, "grad_norm": 0.9802148342132568, "learning_rate": 0.0025109999999999998, "loss": 4.5975, "step": 837 }, { "epoch": 0.00838, "grad_norm": 0.8809778690338135, "learning_rate": 0.0025139999999999997, "loss": 4.6166, "step": 838 }, { "epoch": 0.00839, "grad_norm": 0.792102038860321, "learning_rate": 0.002517, "loss": 4.6155, "step": 839 }, { "epoch": 0.0084, "grad_norm": 0.7744638323783875, "learning_rate": 0.00252, "loss": 4.5999, "step": 840 }, { "epoch": 0.00841, "grad_norm": 0.791641116142273, "learning_rate": 0.002523, "loss": 4.5943, "step": 841 }, { "epoch": 0.00842, "grad_norm": 0.8285142183303833, "learning_rate": 0.002526, "loss": 4.5937, "step": 842 }, { "epoch": 0.00843, "grad_norm": 0.9900093078613281, "learning_rate": 0.002529, "loss": 4.6057, "step": 843 }, { "epoch": 0.00844, "grad_norm": 0.984833300113678, "learning_rate": 0.002532, "loss": 4.6269, "step": 844 }, { "epoch": 0.00845, "grad_norm": 0.8697680234909058, "learning_rate": 0.002535, "loss": 4.5878, "step": 845 }, { "epoch": 0.00846, "grad_norm": 0.8182123899459839, "learning_rate": 0.002538, "loss": 4.6097, "step": 846 }, { "epoch": 0.00847, "grad_norm": 0.7307525277137756, "learning_rate": 0.002541, "loss": 4.5888, "step": 847 }, { "epoch": 0.00848, "grad_norm": 0.7021674513816833, "learning_rate": 0.002544, "loss": 4.5844, "step": 848 }, { "epoch": 0.00849, "grad_norm": 0.6287952661514282, "learning_rate": 0.002547, "loss": 4.5759, "step": 849 }, { "epoch": 0.0085, "grad_norm": 0.5557106733322144, "learning_rate": 0.00255, "loss": 4.5834, "step": 850 }, { "epoch": 0.00851, "grad_norm": 0.5636370778083801, "learning_rate": 0.002553, "loss": 4.5977, "step": 851 }, { "epoch": 0.00852, "grad_norm": 0.6509172320365906, "learning_rate": 0.002556, "loss": 4.5892, "step": 852 }, { "epoch": 0.00853, "grad_norm": 0.6835383176803589, "learning_rate": 0.002559, "loss": 4.5743, "step": 853 }, { "epoch": 0.00854, "grad_norm": 0.6499077081680298, "learning_rate": 0.002562, "loss": 4.5629, "step": 854 }, { "epoch": 0.00855, "grad_norm": 0.5777466297149658, "learning_rate": 0.002565, "loss": 4.5388, "step": 855 }, { "epoch": 0.00856, "grad_norm": 0.6060221195220947, "learning_rate": 0.002568, "loss": 4.5724, "step": 856 }, { "epoch": 0.00857, "grad_norm": 0.6906097531318665, "learning_rate": 0.002571, "loss": 4.5615, "step": 857 }, { "epoch": 0.00858, "grad_norm": 0.7318242788314819, "learning_rate": 0.002574, "loss": 4.5575, "step": 858 }, { "epoch": 0.00859, "grad_norm": 0.6517086029052734, "learning_rate": 0.002577, "loss": 4.5498, "step": 859 }, { "epoch": 0.0086, "grad_norm": 0.6514325737953186, "learning_rate": 0.00258, "loss": 4.5756, "step": 860 }, { "epoch": 0.00861, "grad_norm": 0.7899180054664612, "learning_rate": 0.0025830000000000002, "loss": 4.594, "step": 861 }, { "epoch": 0.00862, "grad_norm": 0.874433696269989, "learning_rate": 0.002586, "loss": 4.5519, "step": 862 }, { "epoch": 0.00863, "grad_norm": 0.8605120182037354, "learning_rate": 0.002589, "loss": 4.5772, "step": 863 }, { "epoch": 0.00864, "grad_norm": 0.9631415009498596, "learning_rate": 0.002592, "loss": 4.5403, "step": 864 }, { "epoch": 0.00865, "grad_norm": 1.1293022632598877, "learning_rate": 0.002595, "loss": 4.5658, "step": 865 }, { "epoch": 0.00866, "grad_norm": 1.0691903829574585, "learning_rate": 0.002598, "loss": 4.5962, "step": 866 }, { "epoch": 0.00867, "grad_norm": 0.9885998368263245, "learning_rate": 0.002601, "loss": 4.5818, "step": 867 }, { "epoch": 0.00868, "grad_norm": 1.3688061237335205, "learning_rate": 0.002604, "loss": 4.5938, "step": 868 }, { "epoch": 0.00869, "grad_norm": 0.8156008124351501, "learning_rate": 0.002607, "loss": 4.5787, "step": 869 }, { "epoch": 0.0087, "grad_norm": 0.8631764054298401, "learning_rate": 0.00261, "loss": 4.5676, "step": 870 }, { "epoch": 0.00871, "grad_norm": 0.9996145963668823, "learning_rate": 0.002613, "loss": 4.5932, "step": 871 }, { "epoch": 0.00872, "grad_norm": 0.972501814365387, "learning_rate": 0.002616, "loss": 4.5764, "step": 872 }, { "epoch": 0.00873, "grad_norm": 1.0140340328216553, "learning_rate": 0.0026190000000000002, "loss": 4.5878, "step": 873 }, { "epoch": 0.00874, "grad_norm": 0.9380632042884827, "learning_rate": 0.002622, "loss": 4.6105, "step": 874 }, { "epoch": 0.00875, "grad_norm": 0.9407688975334167, "learning_rate": 0.002625, "loss": 4.5588, "step": 875 }, { "epoch": 0.00876, "grad_norm": 1.0034210681915283, "learning_rate": 0.002628, "loss": 4.5949, "step": 876 }, { "epoch": 0.00877, "grad_norm": 1.2963709831237793, "learning_rate": 0.002631, "loss": 4.6268, "step": 877 }, { "epoch": 0.00878, "grad_norm": 0.7438150644302368, "learning_rate": 0.002634, "loss": 4.537, "step": 878 }, { "epoch": 0.00879, "grad_norm": 0.8438600301742554, "learning_rate": 0.002637, "loss": 4.5729, "step": 879 }, { "epoch": 0.0088, "grad_norm": 0.7277801036834717, "learning_rate": 0.00264, "loss": 4.5704, "step": 880 }, { "epoch": 0.00881, "grad_norm": 0.9043455123901367, "learning_rate": 0.002643, "loss": 4.5794, "step": 881 }, { "epoch": 0.00882, "grad_norm": 0.9441054463386536, "learning_rate": 0.002646, "loss": 4.5807, "step": 882 }, { "epoch": 0.00883, "grad_norm": 0.8490539193153381, "learning_rate": 0.002649, "loss": 4.5704, "step": 883 }, { "epoch": 0.00884, "grad_norm": 0.8639208078384399, "learning_rate": 0.0026520000000000003, "loss": 4.555, "step": 884 }, { "epoch": 0.00885, "grad_norm": 0.8000319600105286, "learning_rate": 0.0026550000000000002, "loss": 4.5531, "step": 885 }, { "epoch": 0.00886, "grad_norm": 0.8000409007072449, "learning_rate": 0.002658, "loss": 4.5702, "step": 886 }, { "epoch": 0.00887, "grad_norm": 0.5944011807441711, "learning_rate": 0.002661, "loss": 4.5499, "step": 887 }, { "epoch": 0.00888, "grad_norm": 0.5328640341758728, "learning_rate": 0.002664, "loss": 4.5494, "step": 888 }, { "epoch": 0.00889, "grad_norm": 0.5338924527168274, "learning_rate": 0.002667, "loss": 4.5018, "step": 889 }, { "epoch": 0.0089, "grad_norm": 0.4938536286354065, "learning_rate": 0.00267, "loss": 4.545, "step": 890 }, { "epoch": 0.00891, "grad_norm": 0.4882456064224243, "learning_rate": 0.002673, "loss": 4.5301, "step": 891 }, { "epoch": 0.00892, "grad_norm": 0.4257451891899109, "learning_rate": 0.002676, "loss": 4.5393, "step": 892 }, { "epoch": 0.00893, "grad_norm": 0.5505130290985107, "learning_rate": 0.002679, "loss": 4.5171, "step": 893 }, { "epoch": 0.00894, "grad_norm": 0.6718395352363586, "learning_rate": 0.002682, "loss": 4.5132, "step": 894 }, { "epoch": 0.00895, "grad_norm": 0.7553327679634094, "learning_rate": 0.0026850000000000003, "loss": 4.5223, "step": 895 }, { "epoch": 0.00896, "grad_norm": 0.7149863243103027, "learning_rate": 0.0026880000000000003, "loss": 4.5119, "step": 896 }, { "epoch": 0.00897, "grad_norm": 0.8281179070472717, "learning_rate": 0.0026910000000000002, "loss": 4.5328, "step": 897 }, { "epoch": 0.00898, "grad_norm": 0.9519450068473816, "learning_rate": 0.002694, "loss": 4.5342, "step": 898 }, { "epoch": 0.00899, "grad_norm": 0.9227859377861023, "learning_rate": 0.002697, "loss": 4.5445, "step": 899 }, { "epoch": 0.009, "grad_norm": 1.040459156036377, "learning_rate": 0.0027, "loss": 4.5756, "step": 900 }, { "epoch": 0.00901, "grad_norm": 0.8912736773490906, "learning_rate": 0.002703, "loss": 4.5234, "step": 901 }, { "epoch": 0.00902, "grad_norm": 0.768046498298645, "learning_rate": 0.002706, "loss": 4.5317, "step": 902 }, { "epoch": 0.00903, "grad_norm": 0.698384702205658, "learning_rate": 0.002709, "loss": 4.5227, "step": 903 }, { "epoch": 0.00904, "grad_norm": 0.7553470134735107, "learning_rate": 0.002712, "loss": 4.5046, "step": 904 }, { "epoch": 0.00905, "grad_norm": 0.9005333185195923, "learning_rate": 0.002715, "loss": 4.5273, "step": 905 }, { "epoch": 0.00906, "grad_norm": 0.9150082468986511, "learning_rate": 0.002718, "loss": 4.54, "step": 906 }, { "epoch": 0.00907, "grad_norm": 0.8349012732505798, "learning_rate": 0.0027210000000000003, "loss": 4.5543, "step": 907 }, { "epoch": 0.00908, "grad_norm": 0.7856695055961609, "learning_rate": 0.0027240000000000003, "loss": 4.5219, "step": 908 }, { "epoch": 0.00909, "grad_norm": 0.8433918952941895, "learning_rate": 0.0027270000000000003, "loss": 4.5423, "step": 909 }, { "epoch": 0.0091, "grad_norm": 0.9650413393974304, "learning_rate": 0.0027300000000000002, "loss": 4.5577, "step": 910 }, { "epoch": 0.00911, "grad_norm": 0.9894043803215027, "learning_rate": 0.002733, "loss": 4.514, "step": 911 }, { "epoch": 0.00912, "grad_norm": 0.9117268919944763, "learning_rate": 0.002736, "loss": 4.5245, "step": 912 }, { "epoch": 0.00913, "grad_norm": 1.020936369895935, "learning_rate": 0.002739, "loss": 4.5244, "step": 913 }, { "epoch": 0.00914, "grad_norm": 1.1810083389282227, "learning_rate": 0.002742, "loss": 4.5723, "step": 914 }, { "epoch": 0.00915, "grad_norm": 0.8899980187416077, "learning_rate": 0.002745, "loss": 4.5252, "step": 915 }, { "epoch": 0.00916, "grad_norm": 0.9896888136863708, "learning_rate": 0.002748, "loss": 4.5028, "step": 916 }, { "epoch": 0.00917, "grad_norm": 1.3935941457748413, "learning_rate": 0.002751, "loss": 4.5471, "step": 917 }, { "epoch": 0.00918, "grad_norm": 1.1309375762939453, "learning_rate": 0.0027540000000000004, "loss": 4.562, "step": 918 }, { "epoch": 0.00919, "grad_norm": 0.8358169198036194, "learning_rate": 0.0027570000000000003, "loss": 4.5255, "step": 919 }, { "epoch": 0.0092, "grad_norm": 0.8860310316085815, "learning_rate": 0.0027600000000000003, "loss": 4.5423, "step": 920 }, { "epoch": 0.00921, "grad_norm": 1.0510568618774414, "learning_rate": 0.0027630000000000003, "loss": 4.5336, "step": 921 }, { "epoch": 0.00922, "grad_norm": 1.1311016082763672, "learning_rate": 0.0027660000000000002, "loss": 4.5483, "step": 922 }, { "epoch": 0.00923, "grad_norm": 1.1111584901809692, "learning_rate": 0.002769, "loss": 4.5339, "step": 923 }, { "epoch": 0.00924, "grad_norm": 0.9117261171340942, "learning_rate": 0.002772, "loss": 4.5412, "step": 924 }, { "epoch": 0.00925, "grad_norm": 0.9757253527641296, "learning_rate": 0.002775, "loss": 4.5263, "step": 925 }, { "epoch": 0.00926, "grad_norm": 0.907768189907074, "learning_rate": 0.002778, "loss": 4.5422, "step": 926 }, { "epoch": 0.00927, "grad_norm": 0.8181371092796326, "learning_rate": 0.002781, "loss": 4.4999, "step": 927 }, { "epoch": 0.00928, "grad_norm": 0.7728373408317566, "learning_rate": 0.002784, "loss": 4.5461, "step": 928 }, { "epoch": 0.00929, "grad_norm": 0.624686598777771, "learning_rate": 0.0027870000000000004, "loss": 4.5192, "step": 929 }, { "epoch": 0.0093, "grad_norm": 0.6143611073493958, "learning_rate": 0.0027900000000000004, "loss": 4.4968, "step": 930 }, { "epoch": 0.00931, "grad_norm": 0.5353983044624329, "learning_rate": 0.0027930000000000003, "loss": 4.5041, "step": 931 }, { "epoch": 0.00932, "grad_norm": 0.5034843683242798, "learning_rate": 0.0027960000000000003, "loss": 4.5081, "step": 932 }, { "epoch": 0.00933, "grad_norm": 0.43557336926460266, "learning_rate": 0.0027990000000000003, "loss": 4.47, "step": 933 }, { "epoch": 0.00934, "grad_norm": 0.42429375648498535, "learning_rate": 0.0028020000000000002, "loss": 4.4927, "step": 934 }, { "epoch": 0.00935, "grad_norm": 0.4439206123352051, "learning_rate": 0.002805, "loss": 4.5015, "step": 935 }, { "epoch": 0.00936, "grad_norm": 0.4363570511341095, "learning_rate": 0.002808, "loss": 4.4989, "step": 936 }, { "epoch": 0.00937, "grad_norm": 0.4996969401836395, "learning_rate": 0.002811, "loss": 4.5097, "step": 937 }, { "epoch": 0.00938, "grad_norm": 0.5577415227890015, "learning_rate": 0.002814, "loss": 4.4882, "step": 938 }, { "epoch": 0.00939, "grad_norm": 0.6116971373558044, "learning_rate": 0.002817, "loss": 4.4823, "step": 939 }, { "epoch": 0.0094, "grad_norm": 0.675699770450592, "learning_rate": 0.00282, "loss": 4.4838, "step": 940 }, { "epoch": 0.00941, "grad_norm": 0.8027246594429016, "learning_rate": 0.002823, "loss": 4.4671, "step": 941 }, { "epoch": 0.00942, "grad_norm": 0.9442744851112366, "learning_rate": 0.002826, "loss": 4.5129, "step": 942 }, { "epoch": 0.00943, "grad_norm": 0.963239312171936, "learning_rate": 0.002829, "loss": 4.4975, "step": 943 }, { "epoch": 0.00944, "grad_norm": 0.8721352815628052, "learning_rate": 0.002832, "loss": 4.493, "step": 944 }, { "epoch": 0.00945, "grad_norm": 1.0316184759140015, "learning_rate": 0.002835, "loss": 4.5084, "step": 945 }, { "epoch": 0.00946, "grad_norm": 0.7907041907310486, "learning_rate": 0.002838, "loss": 4.5093, "step": 946 }, { "epoch": 0.00947, "grad_norm": 0.8508433699607849, "learning_rate": 0.0028409999999999998, "loss": 4.4859, "step": 947 }, { "epoch": 0.00948, "grad_norm": 0.8942288756370544, "learning_rate": 0.0028439999999999997, "loss": 4.4805, "step": 948 }, { "epoch": 0.00949, "grad_norm": 0.8804354071617126, "learning_rate": 0.002847, "loss": 4.5178, "step": 949 }, { "epoch": 0.0095, "grad_norm": 0.9102524518966675, "learning_rate": 0.00285, "loss": 4.5104, "step": 950 }, { "epoch": 0.00951, "grad_norm": 0.9140040278434753, "learning_rate": 0.002853, "loss": 4.4913, "step": 951 }, { "epoch": 0.00952, "grad_norm": 0.996699869632721, "learning_rate": 0.002856, "loss": 4.5192, "step": 952 }, { "epoch": 0.00953, "grad_norm": 1.0743249654769897, "learning_rate": 0.002859, "loss": 4.4742, "step": 953 }, { "epoch": 0.00954, "grad_norm": 0.9619385004043579, "learning_rate": 0.002862, "loss": 4.532, "step": 954 }, { "epoch": 0.00955, "grad_norm": 0.9820901155471802, "learning_rate": 0.002865, "loss": 4.5178, "step": 955 }, { "epoch": 0.00956, "grad_norm": 1.0036413669586182, "learning_rate": 0.002868, "loss": 4.501, "step": 956 }, { "epoch": 0.00957, "grad_norm": 0.8994410634040833, "learning_rate": 0.002871, "loss": 4.5266, "step": 957 }, { "epoch": 0.00958, "grad_norm": 0.910679817199707, "learning_rate": 0.002874, "loss": 4.5103, "step": 958 }, { "epoch": 0.00959, "grad_norm": 0.7567980885505676, "learning_rate": 0.002877, "loss": 4.4958, "step": 959 }, { "epoch": 0.0096, "grad_norm": 0.7841255068778992, "learning_rate": 0.0028799999999999997, "loss": 4.4967, "step": 960 }, { "epoch": 0.00961, "grad_norm": 0.6947323083877563, "learning_rate": 0.002883, "loss": 4.4665, "step": 961 }, { "epoch": 0.00962, "grad_norm": 0.5831562280654907, "learning_rate": 0.002886, "loss": 4.4887, "step": 962 }, { "epoch": 0.00963, "grad_norm": 0.5601492524147034, "learning_rate": 0.002889, "loss": 4.4846, "step": 963 }, { "epoch": 0.00964, "grad_norm": 0.474755197763443, "learning_rate": 0.002892, "loss": 4.47, "step": 964 }, { "epoch": 0.00965, "grad_norm": 0.5203716158866882, "learning_rate": 0.002895, "loss": 4.4451, "step": 965 }, { "epoch": 0.00966, "grad_norm": 0.4714201092720032, "learning_rate": 0.002898, "loss": 4.4572, "step": 966 }, { "epoch": 0.00967, "grad_norm": 0.4852457344532013, "learning_rate": 0.002901, "loss": 4.4924, "step": 967 }, { "epoch": 0.00968, "grad_norm": 0.5026896595954895, "learning_rate": 0.002904, "loss": 4.4559, "step": 968 }, { "epoch": 0.00969, "grad_norm": 0.5138400793075562, "learning_rate": 0.002907, "loss": 4.4843, "step": 969 }, { "epoch": 0.0097, "grad_norm": 0.46154963970184326, "learning_rate": 0.00291, "loss": 4.4402, "step": 970 }, { "epoch": 0.00971, "grad_norm": 0.4250973165035248, "learning_rate": 0.002913, "loss": 4.4437, "step": 971 }, { "epoch": 0.00972, "grad_norm": 0.46983814239501953, "learning_rate": 0.002916, "loss": 4.45, "step": 972 }, { "epoch": 0.00973, "grad_norm": 0.578143835067749, "learning_rate": 0.002919, "loss": 4.4478, "step": 973 }, { "epoch": 0.00974, "grad_norm": 0.7184126377105713, "learning_rate": 0.002922, "loss": 4.4366, "step": 974 }, { "epoch": 0.00975, "grad_norm": 0.8322773575782776, "learning_rate": 0.002925, "loss": 4.4683, "step": 975 }, { "epoch": 0.00976, "grad_norm": 0.7910879850387573, "learning_rate": 0.002928, "loss": 4.4497, "step": 976 }, { "epoch": 0.00977, "grad_norm": 0.7662490606307983, "learning_rate": 0.002931, "loss": 4.4868, "step": 977 }, { "epoch": 0.00978, "grad_norm": 0.6879754066467285, "learning_rate": 0.002934, "loss": 4.4568, "step": 978 }, { "epoch": 0.00979, "grad_norm": 0.7417387366294861, "learning_rate": 0.002937, "loss": 4.4491, "step": 979 }, { "epoch": 0.0098, "grad_norm": 0.6087706089019775, "learning_rate": 0.00294, "loss": 4.4405, "step": 980 }, { "epoch": 0.00981, "grad_norm": 0.5661059617996216, "learning_rate": 0.002943, "loss": 4.4516, "step": 981 }, { "epoch": 0.00982, "grad_norm": 0.5970472097396851, "learning_rate": 0.002946, "loss": 4.4435, "step": 982 }, { "epoch": 0.00983, "grad_norm": 0.6391454935073853, "learning_rate": 0.0029490000000000002, "loss": 4.4444, "step": 983 }, { "epoch": 0.00984, "grad_norm": 0.5948253870010376, "learning_rate": 0.002952, "loss": 4.4303, "step": 984 }, { "epoch": 0.00985, "grad_norm": 0.5908463597297668, "learning_rate": 0.002955, "loss": 4.4509, "step": 985 }, { "epoch": 0.00986, "grad_norm": 0.6963019371032715, "learning_rate": 0.002958, "loss": 4.4177, "step": 986 }, { "epoch": 0.00987, "grad_norm": 0.7601437568664551, "learning_rate": 0.002961, "loss": 4.4332, "step": 987 }, { "epoch": 0.00988, "grad_norm": 0.8287732005119324, "learning_rate": 0.002964, "loss": 4.4707, "step": 988 }, { "epoch": 0.00989, "grad_norm": 0.9172109961509705, "learning_rate": 0.002967, "loss": 4.4168, "step": 989 }, { "epoch": 0.0099, "grad_norm": 0.9710732102394104, "learning_rate": 0.00297, "loss": 4.4601, "step": 990 }, { "epoch": 0.00991, "grad_norm": 0.9308454990386963, "learning_rate": 0.002973, "loss": 4.4596, "step": 991 }, { "epoch": 0.00992, "grad_norm": 0.8575865030288696, "learning_rate": 0.002976, "loss": 4.4567, "step": 992 }, { "epoch": 0.00993, "grad_norm": 0.8846513032913208, "learning_rate": 0.002979, "loss": 4.4578, "step": 993 }, { "epoch": 0.00994, "grad_norm": 1.0507467985153198, "learning_rate": 0.002982, "loss": 4.4695, "step": 994 }, { "epoch": 0.00995, "grad_norm": 0.9225064516067505, "learning_rate": 0.0029850000000000002, "loss": 4.472, "step": 995 }, { "epoch": 0.00996, "grad_norm": 1.0122137069702148, "learning_rate": 0.002988, "loss": 4.4787, "step": 996 }, { "epoch": 0.00997, "grad_norm": 1.1055110692977905, "learning_rate": 0.002991, "loss": 4.4822, "step": 997 }, { "epoch": 0.00998, "grad_norm": 1.0749611854553223, "learning_rate": 0.002994, "loss": 4.4866, "step": 998 }, { "epoch": 0.00999, "grad_norm": 1.0997530221939087, "learning_rate": 0.002997, "loss": 4.469, "step": 999 }, { "epoch": 0.01, "grad_norm": 0.853782057762146, "learning_rate": 0.003, "loss": 4.4844, "step": 1000 }, { "epoch": 0.01001, "grad_norm": 0.7132817506790161, "learning_rate": 0.003, "loss": 4.4776, "step": 1001 }, { "epoch": 0.01002, "grad_norm": 0.8123744130134583, "learning_rate": 0.003, "loss": 4.4523, "step": 1002 }, { "epoch": 0.01003, "grad_norm": 0.7343231439590454, "learning_rate": 0.003, "loss": 4.4688, "step": 1003 }, { "epoch": 0.01004, "grad_norm": 0.7312922477722168, "learning_rate": 0.003, "loss": 4.4567, "step": 1004 }, { "epoch": 0.01005, "grad_norm": 0.7205970287322998, "learning_rate": 0.003, "loss": 4.4254, "step": 1005 }, { "epoch": 0.01006, "grad_norm": 0.8032294511795044, "learning_rate": 0.003, "loss": 4.4272, "step": 1006 }, { "epoch": 0.01007, "grad_norm": 0.8357327580451965, "learning_rate": 0.003, "loss": 4.4775, "step": 1007 }, { "epoch": 0.01008, "grad_norm": 0.8112847208976746, "learning_rate": 0.003, "loss": 4.4864, "step": 1008 }, { "epoch": 0.01009, "grad_norm": 0.9153414964675903, "learning_rate": 0.003, "loss": 4.46, "step": 1009 }, { "epoch": 0.0101, "grad_norm": 0.9501859545707703, "learning_rate": 0.003, "loss": 4.4762, "step": 1010 }, { "epoch": 0.01011, "grad_norm": 0.9173474907875061, "learning_rate": 0.003, "loss": 4.4819, "step": 1011 }, { "epoch": 0.01012, "grad_norm": 0.8518911600112915, "learning_rate": 0.003, "loss": 4.4635, "step": 1012 }, { "epoch": 0.01013, "grad_norm": 0.8621218204498291, "learning_rate": 0.003, "loss": 4.4943, "step": 1013 }, { "epoch": 0.01014, "grad_norm": 0.6589127779006958, "learning_rate": 0.003, "loss": 4.419, "step": 1014 }, { "epoch": 0.01015, "grad_norm": 0.7594727277755737, "learning_rate": 0.003, "loss": 4.4653, "step": 1015 }, { "epoch": 0.01016, "grad_norm": 0.800542414188385, "learning_rate": 0.003, "loss": 4.4749, "step": 1016 }, { "epoch": 0.01017, "grad_norm": 0.7398179769515991, "learning_rate": 0.003, "loss": 4.4124, "step": 1017 }, { "epoch": 0.01018, "grad_norm": 0.6518093347549438, "learning_rate": 0.003, "loss": 4.4645, "step": 1018 }, { "epoch": 0.01019, "grad_norm": 0.6830618381500244, "learning_rate": 0.003, "loss": 4.4391, "step": 1019 }, { "epoch": 0.0102, "grad_norm": 0.8045121431350708, "learning_rate": 0.003, "loss": 4.4426, "step": 1020 }, { "epoch": 0.01021, "grad_norm": 0.6937596201896667, "learning_rate": 0.003, "loss": 4.4245, "step": 1021 }, { "epoch": 0.01022, "grad_norm": 0.5872735977172852, "learning_rate": 0.003, "loss": 4.4273, "step": 1022 }, { "epoch": 0.01023, "grad_norm": 0.63226717710495, "learning_rate": 0.003, "loss": 4.4308, "step": 1023 }, { "epoch": 0.01024, "grad_norm": 0.7018114924430847, "learning_rate": 0.003, "loss": 4.3934, "step": 1024 }, { "epoch": 0.01025, "grad_norm": 0.5812709927558899, "learning_rate": 0.003, "loss": 4.4216, "step": 1025 }, { "epoch": 0.01026, "grad_norm": 0.4411616921424866, "learning_rate": 0.003, "loss": 4.4039, "step": 1026 }, { "epoch": 0.01027, "grad_norm": 0.6123212575912476, "learning_rate": 0.003, "loss": 4.4333, "step": 1027 }, { "epoch": 0.01028, "grad_norm": 0.6239144802093506, "learning_rate": 0.003, "loss": 4.4176, "step": 1028 }, { "epoch": 0.01029, "grad_norm": 0.46944838762283325, "learning_rate": 0.003, "loss": 4.386, "step": 1029 }, { "epoch": 0.0103, "grad_norm": 0.5337734818458557, "learning_rate": 0.003, "loss": 4.4157, "step": 1030 }, { "epoch": 0.01031, "grad_norm": 0.6015392541885376, "learning_rate": 0.003, "loss": 4.4386, "step": 1031 }, { "epoch": 0.01032, "grad_norm": 0.49139612913131714, "learning_rate": 0.003, "loss": 4.3989, "step": 1032 }, { "epoch": 0.01033, "grad_norm": 0.6846477389335632, "learning_rate": 0.003, "loss": 4.444, "step": 1033 }, { "epoch": 0.01034, "grad_norm": 0.619234025478363, "learning_rate": 0.003, "loss": 4.4088, "step": 1034 }, { "epoch": 0.01035, "grad_norm": 0.567406952381134, "learning_rate": 0.003, "loss": 4.4003, "step": 1035 }, { "epoch": 0.01036, "grad_norm": 0.7047313451766968, "learning_rate": 0.003, "loss": 4.4001, "step": 1036 }, { "epoch": 0.01037, "grad_norm": 0.6020769476890564, "learning_rate": 0.003, "loss": 4.4235, "step": 1037 }, { "epoch": 0.01038, "grad_norm": 0.498977929353714, "learning_rate": 0.003, "loss": 4.4148, "step": 1038 }, { "epoch": 0.01039, "grad_norm": 0.5447039008140564, "learning_rate": 0.003, "loss": 4.4085, "step": 1039 }, { "epoch": 0.0104, "grad_norm": 0.5632253289222717, "learning_rate": 0.003, "loss": 4.4204, "step": 1040 }, { "epoch": 0.01041, "grad_norm": 0.5558527708053589, "learning_rate": 0.003, "loss": 4.4009, "step": 1041 }, { "epoch": 0.01042, "grad_norm": 0.5142215490341187, "learning_rate": 0.003, "loss": 4.4233, "step": 1042 }, { "epoch": 0.01043, "grad_norm": 0.4451257884502411, "learning_rate": 0.003, "loss": 4.3651, "step": 1043 }, { "epoch": 0.01044, "grad_norm": 0.41156554222106934, "learning_rate": 0.003, "loss": 4.4008, "step": 1044 }, { "epoch": 0.01045, "grad_norm": 0.42055779695510864, "learning_rate": 0.003, "loss": 4.4203, "step": 1045 }, { "epoch": 0.01046, "grad_norm": 0.5353807806968689, "learning_rate": 0.003, "loss": 4.3928, "step": 1046 }, { "epoch": 0.01047, "grad_norm": 0.6298825144767761, "learning_rate": 0.003, "loss": 4.4015, "step": 1047 }, { "epoch": 0.01048, "grad_norm": 0.7191411256790161, "learning_rate": 0.003, "loss": 4.3988, "step": 1048 }, { "epoch": 0.01049, "grad_norm": 0.7613799571990967, "learning_rate": 0.003, "loss": 4.3808, "step": 1049 }, { "epoch": 0.0105, "grad_norm": 0.7375987768173218, "learning_rate": 0.003, "loss": 4.4222, "step": 1050 }, { "epoch": 0.01051, "grad_norm": 0.6425654888153076, "learning_rate": 0.003, "loss": 4.3959, "step": 1051 }, { "epoch": 0.01052, "grad_norm": 0.7071349620819092, "learning_rate": 0.003, "loss": 4.4073, "step": 1052 }, { "epoch": 0.01053, "grad_norm": 0.838892936706543, "learning_rate": 0.003, "loss": 4.4246, "step": 1053 }, { "epoch": 0.01054, "grad_norm": 0.8081632852554321, "learning_rate": 0.003, "loss": 4.4168, "step": 1054 }, { "epoch": 0.01055, "grad_norm": 1.0391567945480347, "learning_rate": 0.003, "loss": 4.4056, "step": 1055 }, { "epoch": 0.01056, "grad_norm": 0.9966610074043274, "learning_rate": 0.003, "loss": 4.4166, "step": 1056 }, { "epoch": 0.01057, "grad_norm": 0.8638164401054382, "learning_rate": 0.003, "loss": 4.4191, "step": 1057 }, { "epoch": 0.01058, "grad_norm": 0.8240249156951904, "learning_rate": 0.003, "loss": 4.4115, "step": 1058 }, { "epoch": 0.01059, "grad_norm": 0.8680912256240845, "learning_rate": 0.003, "loss": 4.4127, "step": 1059 }, { "epoch": 0.0106, "grad_norm": 0.8156179785728455, "learning_rate": 0.003, "loss": 4.4084, "step": 1060 }, { "epoch": 0.01061, "grad_norm": 0.7857629656791687, "learning_rate": 0.003, "loss": 4.4047, "step": 1061 }, { "epoch": 0.01062, "grad_norm": 0.8633149862289429, "learning_rate": 0.003, "loss": 4.398, "step": 1062 }, { "epoch": 0.01063, "grad_norm": 0.9093345403671265, "learning_rate": 0.003, "loss": 4.4015, "step": 1063 }, { "epoch": 0.01064, "grad_norm": 0.8469107151031494, "learning_rate": 0.003, "loss": 4.4193, "step": 1064 }, { "epoch": 0.01065, "grad_norm": 0.8231899738311768, "learning_rate": 0.003, "loss": 4.4189, "step": 1065 }, { "epoch": 0.01066, "grad_norm": 0.8297374248504639, "learning_rate": 0.003, "loss": 4.423, "step": 1066 }, { "epoch": 0.01067, "grad_norm": 0.678811252117157, "learning_rate": 0.003, "loss": 4.4286, "step": 1067 }, { "epoch": 0.01068, "grad_norm": 0.7318910360336304, "learning_rate": 0.003, "loss": 4.4099, "step": 1068 }, { "epoch": 0.01069, "grad_norm": 0.7908743023872375, "learning_rate": 0.003, "loss": 4.4229, "step": 1069 }, { "epoch": 0.0107, "grad_norm": 0.8041807413101196, "learning_rate": 0.003, "loss": 4.4771, "step": 1070 }, { "epoch": 0.01071, "grad_norm": 0.6532490849494934, "learning_rate": 0.003, "loss": 4.3932, "step": 1071 }, { "epoch": 0.01072, "grad_norm": 0.6556645035743713, "learning_rate": 0.003, "loss": 4.426, "step": 1072 }, { "epoch": 0.01073, "grad_norm": 0.8516754508018494, "learning_rate": 0.003, "loss": 4.4268, "step": 1073 }, { "epoch": 0.01074, "grad_norm": 1.0397266149520874, "learning_rate": 0.003, "loss": 4.4077, "step": 1074 }, { "epoch": 0.01075, "grad_norm": 0.8689488172531128, "learning_rate": 0.003, "loss": 4.4416, "step": 1075 }, { "epoch": 0.01076, "grad_norm": 0.9471074342727661, "learning_rate": 0.003, "loss": 4.4417, "step": 1076 }, { "epoch": 0.01077, "grad_norm": 0.9231577515602112, "learning_rate": 0.003, "loss": 4.4357, "step": 1077 }, { "epoch": 0.01078, "grad_norm": 0.8509425520896912, "learning_rate": 0.003, "loss": 4.3906, "step": 1078 }, { "epoch": 0.01079, "grad_norm": 0.7320951819419861, "learning_rate": 0.003, "loss": 4.4158, "step": 1079 }, { "epoch": 0.0108, "grad_norm": 0.6985662579536438, "learning_rate": 0.003, "loss": 4.3942, "step": 1080 }, { "epoch": 0.01081, "grad_norm": 0.5917143225669861, "learning_rate": 0.003, "loss": 4.396, "step": 1081 }, { "epoch": 0.01082, "grad_norm": 0.567608654499054, "learning_rate": 0.003, "loss": 4.4109, "step": 1082 }, { "epoch": 0.01083, "grad_norm": 0.5785332322120667, "learning_rate": 0.003, "loss": 4.3961, "step": 1083 }, { "epoch": 0.01084, "grad_norm": 0.5440964102745056, "learning_rate": 0.003, "loss": 4.3992, "step": 1084 }, { "epoch": 0.01085, "grad_norm": 0.49031350016593933, "learning_rate": 0.003, "loss": 4.3852, "step": 1085 }, { "epoch": 0.01086, "grad_norm": 0.5103932619094849, "learning_rate": 0.003, "loss": 4.3604, "step": 1086 }, { "epoch": 0.01087, "grad_norm": 0.5229532718658447, "learning_rate": 0.003, "loss": 4.3777, "step": 1087 }, { "epoch": 0.01088, "grad_norm": 0.6494709253311157, "learning_rate": 0.003, "loss": 4.4131, "step": 1088 }, { "epoch": 0.01089, "grad_norm": 0.6600701808929443, "learning_rate": 0.003, "loss": 4.3871, "step": 1089 }, { "epoch": 0.0109, "grad_norm": 0.4794403612613678, "learning_rate": 0.003, "loss": 4.4042, "step": 1090 }, { "epoch": 0.01091, "grad_norm": 0.4167538285255432, "learning_rate": 0.003, "loss": 4.369, "step": 1091 }, { "epoch": 0.01092, "grad_norm": 0.4487409293651581, "learning_rate": 0.003, "loss": 4.3678, "step": 1092 }, { "epoch": 0.01093, "grad_norm": 0.39760273694992065, "learning_rate": 0.003, "loss": 4.3829, "step": 1093 }, { "epoch": 0.01094, "grad_norm": 0.3819560706615448, "learning_rate": 0.003, "loss": 4.3691, "step": 1094 }, { "epoch": 0.01095, "grad_norm": 0.4009959101676941, "learning_rate": 0.003, "loss": 4.3966, "step": 1095 }, { "epoch": 0.01096, "grad_norm": 0.4385271668434143, "learning_rate": 0.003, "loss": 4.3677, "step": 1096 }, { "epoch": 0.01097, "grad_norm": 0.5154523253440857, "learning_rate": 0.003, "loss": 4.3585, "step": 1097 }, { "epoch": 0.01098, "grad_norm": 0.6741756200790405, "learning_rate": 0.003, "loss": 4.3816, "step": 1098 }, { "epoch": 0.01099, "grad_norm": 0.782984733581543, "learning_rate": 0.003, "loss": 4.3628, "step": 1099 }, { "epoch": 0.011, "grad_norm": 0.7672849297523499, "learning_rate": 0.003, "loss": 4.3633, "step": 1100 }, { "epoch": 0.01101, "grad_norm": 0.7932085394859314, "learning_rate": 0.003, "loss": 4.377, "step": 1101 }, { "epoch": 0.01102, "grad_norm": 0.8302851915359497, "learning_rate": 0.003, "loss": 4.3891, "step": 1102 }, { "epoch": 0.01103, "grad_norm": 0.747067391872406, "learning_rate": 0.003, "loss": 4.3737, "step": 1103 }, { "epoch": 0.01104, "grad_norm": 0.789983332157135, "learning_rate": 0.003, "loss": 4.3715, "step": 1104 }, { "epoch": 0.01105, "grad_norm": 1.0034455060958862, "learning_rate": 0.003, "loss": 4.4029, "step": 1105 }, { "epoch": 0.01106, "grad_norm": 1.1279704570770264, "learning_rate": 0.003, "loss": 4.3962, "step": 1106 }, { "epoch": 0.01107, "grad_norm": 0.916431725025177, "learning_rate": 0.003, "loss": 4.3914, "step": 1107 }, { "epoch": 0.01108, "grad_norm": 0.9773505330085754, "learning_rate": 0.003, "loss": 4.3731, "step": 1108 }, { "epoch": 0.01109, "grad_norm": 0.8878449201583862, "learning_rate": 0.003, "loss": 4.3883, "step": 1109 }, { "epoch": 0.0111, "grad_norm": 0.7733376622200012, "learning_rate": 0.003, "loss": 4.4082, "step": 1110 }, { "epoch": 0.01111, "grad_norm": 0.7688936591148376, "learning_rate": 0.003, "loss": 4.3824, "step": 1111 }, { "epoch": 0.01112, "grad_norm": 0.8010363578796387, "learning_rate": 0.003, "loss": 4.4064, "step": 1112 }, { "epoch": 0.01113, "grad_norm": 0.7642234563827515, "learning_rate": 0.003, "loss": 4.3689, "step": 1113 }, { "epoch": 0.01114, "grad_norm": 0.6956503987312317, "learning_rate": 0.003, "loss": 4.386, "step": 1114 }, { "epoch": 0.01115, "grad_norm": 0.5809863805770874, "learning_rate": 0.003, "loss": 4.3985, "step": 1115 }, { "epoch": 0.01116, "grad_norm": 0.5753639936447144, "learning_rate": 0.003, "loss": 4.3898, "step": 1116 }, { "epoch": 0.01117, "grad_norm": 0.5193636417388916, "learning_rate": 0.003, "loss": 4.3734, "step": 1117 }, { "epoch": 0.01118, "grad_norm": 0.5058582425117493, "learning_rate": 0.003, "loss": 4.3742, "step": 1118 }, { "epoch": 0.01119, "grad_norm": 0.4714462161064148, "learning_rate": 0.003, "loss": 4.3802, "step": 1119 }, { "epoch": 0.0112, "grad_norm": 0.5877813100814819, "learning_rate": 0.003, "loss": 4.3643, "step": 1120 }, { "epoch": 0.01121, "grad_norm": 0.6811574697494507, "learning_rate": 0.003, "loss": 4.3858, "step": 1121 }, { "epoch": 0.01122, "grad_norm": 0.7246198058128357, "learning_rate": 0.003, "loss": 4.3577, "step": 1122 }, { "epoch": 0.01123, "grad_norm": 0.6985503435134888, "learning_rate": 0.003, "loss": 4.3459, "step": 1123 }, { "epoch": 0.01124, "grad_norm": 0.6198428273200989, "learning_rate": 0.003, "loss": 4.3921, "step": 1124 }, { "epoch": 0.01125, "grad_norm": 0.6425508856773376, "learning_rate": 0.003, "loss": 4.3601, "step": 1125 }, { "epoch": 0.01126, "grad_norm": 0.7215856313705444, "learning_rate": 0.003, "loss": 4.3688, "step": 1126 }, { "epoch": 0.01127, "grad_norm": 0.614211916923523, "learning_rate": 0.003, "loss": 4.3955, "step": 1127 }, { "epoch": 0.01128, "grad_norm": 0.5611268877983093, "learning_rate": 0.003, "loss": 4.3725, "step": 1128 }, { "epoch": 0.01129, "grad_norm": 0.5580106973648071, "learning_rate": 0.003, "loss": 4.3639, "step": 1129 }, { "epoch": 0.0113, "grad_norm": 0.5985743403434753, "learning_rate": 0.003, "loss": 4.3456, "step": 1130 }, { "epoch": 0.01131, "grad_norm": 0.6220662593841553, "learning_rate": 0.003, "loss": 4.3584, "step": 1131 }, { "epoch": 0.01132, "grad_norm": 0.6651334166526794, "learning_rate": 0.003, "loss": 4.3835, "step": 1132 }, { "epoch": 0.01133, "grad_norm": 0.6686156392097473, "learning_rate": 0.003, "loss": 4.3408, "step": 1133 }, { "epoch": 0.01134, "grad_norm": 0.6207416653633118, "learning_rate": 0.003, "loss": 4.3549, "step": 1134 }, { "epoch": 0.01135, "grad_norm": 0.4418866038322449, "learning_rate": 0.003, "loss": 4.3765, "step": 1135 }, { "epoch": 0.01136, "grad_norm": 0.496660441160202, "learning_rate": 0.003, "loss": 4.3534, "step": 1136 }, { "epoch": 0.01137, "grad_norm": 0.516294002532959, "learning_rate": 0.003, "loss": 4.3812, "step": 1137 }, { "epoch": 0.01138, "grad_norm": 0.7070192098617554, "learning_rate": 0.003, "loss": 4.3409, "step": 1138 }, { "epoch": 0.01139, "grad_norm": 0.9681088924407959, "learning_rate": 0.003, "loss": 4.3765, "step": 1139 }, { "epoch": 0.0114, "grad_norm": 1.0279847383499146, "learning_rate": 0.003, "loss": 4.361, "step": 1140 }, { "epoch": 0.01141, "grad_norm": 0.8630406260490417, "learning_rate": 0.003, "loss": 4.3777, "step": 1141 }, { "epoch": 0.01142, "grad_norm": 0.8118616938591003, "learning_rate": 0.003, "loss": 4.3623, "step": 1142 }, { "epoch": 0.01143, "grad_norm": 0.6431841254234314, "learning_rate": 0.003, "loss": 4.3605, "step": 1143 }, { "epoch": 0.01144, "grad_norm": 0.8104698061943054, "learning_rate": 0.003, "loss": 4.3374, "step": 1144 }, { "epoch": 0.01145, "grad_norm": 0.8287402987480164, "learning_rate": 0.003, "loss": 4.3806, "step": 1145 }, { "epoch": 0.01146, "grad_norm": 0.6933982968330383, "learning_rate": 0.003, "loss": 4.358, "step": 1146 }, { "epoch": 0.01147, "grad_norm": 0.7871519327163696, "learning_rate": 0.003, "loss": 4.3521, "step": 1147 }, { "epoch": 0.01148, "grad_norm": 0.879456639289856, "learning_rate": 0.003, "loss": 4.3655, "step": 1148 }, { "epoch": 0.01149, "grad_norm": 0.973317563533783, "learning_rate": 0.003, "loss": 4.3825, "step": 1149 }, { "epoch": 0.0115, "grad_norm": 0.9118475914001465, "learning_rate": 0.003, "loss": 4.3787, "step": 1150 }, { "epoch": 0.01151, "grad_norm": 0.8108662962913513, "learning_rate": 0.003, "loss": 4.3725, "step": 1151 }, { "epoch": 0.01152, "grad_norm": 0.6577885150909424, "learning_rate": 0.003, "loss": 4.3738, "step": 1152 }, { "epoch": 0.01153, "grad_norm": 0.7330801486968994, "learning_rate": 0.003, "loss": 4.3918, "step": 1153 }, { "epoch": 0.01154, "grad_norm": 0.6542471051216125, "learning_rate": 0.003, "loss": 4.3669, "step": 1154 }, { "epoch": 0.01155, "grad_norm": 0.6441750526428223, "learning_rate": 0.003, "loss": 4.3518, "step": 1155 }, { "epoch": 0.01156, "grad_norm": 0.6419921517372131, "learning_rate": 0.003, "loss": 4.3556, "step": 1156 }, { "epoch": 0.01157, "grad_norm": 0.6812348365783691, "learning_rate": 0.003, "loss": 4.3858, "step": 1157 }, { "epoch": 0.01158, "grad_norm": 0.6114307045936584, "learning_rate": 0.003, "loss": 4.3497, "step": 1158 }, { "epoch": 0.01159, "grad_norm": 0.5222328901290894, "learning_rate": 0.003, "loss": 4.3411, "step": 1159 }, { "epoch": 0.0116, "grad_norm": 0.5184376239776611, "learning_rate": 0.003, "loss": 4.3556, "step": 1160 }, { "epoch": 0.01161, "grad_norm": 0.48943445086479187, "learning_rate": 0.003, "loss": 4.3274, "step": 1161 }, { "epoch": 0.01162, "grad_norm": 0.44027477502822876, "learning_rate": 0.003, "loss": 4.3376, "step": 1162 }, { "epoch": 0.01163, "grad_norm": 0.40750086307525635, "learning_rate": 0.003, "loss": 4.3549, "step": 1163 }, { "epoch": 0.01164, "grad_norm": 0.3570636808872223, "learning_rate": 0.003, "loss": 4.3421, "step": 1164 }, { "epoch": 0.01165, "grad_norm": 0.37404096126556396, "learning_rate": 0.003, "loss": 4.3343, "step": 1165 }, { "epoch": 0.01166, "grad_norm": 0.3607056736946106, "learning_rate": 0.003, "loss": 4.3358, "step": 1166 }, { "epoch": 0.01167, "grad_norm": 0.4395747482776642, "learning_rate": 0.003, "loss": 4.3038, "step": 1167 }, { "epoch": 0.01168, "grad_norm": 0.5243505239486694, "learning_rate": 0.003, "loss": 4.3153, "step": 1168 }, { "epoch": 0.01169, "grad_norm": 0.8133231997489929, "learning_rate": 0.003, "loss": 4.3628, "step": 1169 }, { "epoch": 0.0117, "grad_norm": 0.8480315208435059, "learning_rate": 0.003, "loss": 4.3697, "step": 1170 }, { "epoch": 0.01171, "grad_norm": 0.6388808488845825, "learning_rate": 0.003, "loss": 4.3357, "step": 1171 }, { "epoch": 0.01172, "grad_norm": 0.6913546919822693, "learning_rate": 0.003, "loss": 4.3289, "step": 1172 }, { "epoch": 0.01173, "grad_norm": 0.656560480594635, "learning_rate": 0.003, "loss": 4.3667, "step": 1173 }, { "epoch": 0.01174, "grad_norm": 0.5804395079612732, "learning_rate": 0.003, "loss": 4.3352, "step": 1174 }, { "epoch": 0.01175, "grad_norm": 0.7265805602073669, "learning_rate": 0.003, "loss": 4.3269, "step": 1175 }, { "epoch": 0.01176, "grad_norm": 0.753591001033783, "learning_rate": 0.003, "loss": 4.3628, "step": 1176 }, { "epoch": 0.01177, "grad_norm": 0.7016688585281372, "learning_rate": 0.003, "loss": 4.3288, "step": 1177 }, { "epoch": 0.01178, "grad_norm": 0.7714430689811707, "learning_rate": 0.003, "loss": 4.3516, "step": 1178 }, { "epoch": 0.01179, "grad_norm": 0.7303088903427124, "learning_rate": 0.003, "loss": 4.3309, "step": 1179 }, { "epoch": 0.0118, "grad_norm": 0.6078006029129028, "learning_rate": 0.003, "loss": 4.3881, "step": 1180 }, { "epoch": 0.01181, "grad_norm": 0.6842349767684937, "learning_rate": 0.003, "loss": 4.3776, "step": 1181 }, { "epoch": 0.01182, "grad_norm": 0.6943366527557373, "learning_rate": 0.003, "loss": 4.3406, "step": 1182 }, { "epoch": 0.01183, "grad_norm": 0.8091237545013428, "learning_rate": 0.003, "loss": 4.3348, "step": 1183 }, { "epoch": 0.01184, "grad_norm": 1.061568021774292, "learning_rate": 0.003, "loss": 4.3914, "step": 1184 }, { "epoch": 0.01185, "grad_norm": 0.8616968989372253, "learning_rate": 0.003, "loss": 4.3565, "step": 1185 }, { "epoch": 0.01186, "grad_norm": 0.9476875066757202, "learning_rate": 0.003, "loss": 4.3829, "step": 1186 }, { "epoch": 0.01187, "grad_norm": 0.8803635239601135, "learning_rate": 0.003, "loss": 4.3482, "step": 1187 }, { "epoch": 0.01188, "grad_norm": 0.8929046988487244, "learning_rate": 0.003, "loss": 4.3663, "step": 1188 }, { "epoch": 0.01189, "grad_norm": 1.0426416397094727, "learning_rate": 0.003, "loss": 4.3669, "step": 1189 }, { "epoch": 0.0119, "grad_norm": 0.8442360758781433, "learning_rate": 0.003, "loss": 4.3513, "step": 1190 }, { "epoch": 0.01191, "grad_norm": 0.6745409965515137, "learning_rate": 0.003, "loss": 4.3234, "step": 1191 }, { "epoch": 0.01192, "grad_norm": 0.6114994883537292, "learning_rate": 0.003, "loss": 4.3525, "step": 1192 }, { "epoch": 0.01193, "grad_norm": 0.545036792755127, "learning_rate": 0.003, "loss": 4.3526, "step": 1193 }, { "epoch": 0.01194, "grad_norm": 0.556999921798706, "learning_rate": 0.003, "loss": 4.3504, "step": 1194 }, { "epoch": 0.01195, "grad_norm": 0.5564072728157043, "learning_rate": 0.003, "loss": 4.3299, "step": 1195 }, { "epoch": 0.01196, "grad_norm": 0.5982546210289001, "learning_rate": 0.003, "loss": 4.337, "step": 1196 }, { "epoch": 0.01197, "grad_norm": 0.6563606858253479, "learning_rate": 0.003, "loss": 4.3029, "step": 1197 }, { "epoch": 0.01198, "grad_norm": 0.7455101609230042, "learning_rate": 0.003, "loss": 4.379, "step": 1198 }, { "epoch": 0.01199, "grad_norm": 0.7808868885040283, "learning_rate": 0.003, "loss": 4.3568, "step": 1199 }, { "epoch": 0.012, "grad_norm": 0.6572416424751282, "learning_rate": 0.003, "loss": 4.3439, "step": 1200 }, { "epoch": 0.01201, "grad_norm": 0.588565468788147, "learning_rate": 0.003, "loss": 4.3678, "step": 1201 }, { "epoch": 0.01202, "grad_norm": 0.6117038726806641, "learning_rate": 0.003, "loss": 4.3358, "step": 1202 }, { "epoch": 0.01203, "grad_norm": 0.5589160323143005, "learning_rate": 0.003, "loss": 4.343, "step": 1203 }, { "epoch": 0.01204, "grad_norm": 0.537390410900116, "learning_rate": 0.003, "loss": 4.3383, "step": 1204 }, { "epoch": 0.01205, "grad_norm": 0.6048082709312439, "learning_rate": 0.003, "loss": 4.3289, "step": 1205 }, { "epoch": 0.01206, "grad_norm": 0.5843051671981812, "learning_rate": 0.003, "loss": 4.3719, "step": 1206 }, { "epoch": 0.01207, "grad_norm": 0.5032393932342529, "learning_rate": 0.003, "loss": 4.2929, "step": 1207 }, { "epoch": 0.01208, "grad_norm": 0.4879417419433594, "learning_rate": 0.003, "loss": 4.351, "step": 1208 }, { "epoch": 0.01209, "grad_norm": 0.5636802911758423, "learning_rate": 0.003, "loss": 4.3346, "step": 1209 }, { "epoch": 0.0121, "grad_norm": 0.6080469489097595, "learning_rate": 0.003, "loss": 4.3373, "step": 1210 }, { "epoch": 0.01211, "grad_norm": 0.5971558094024658, "learning_rate": 0.003, "loss": 4.3219, "step": 1211 }, { "epoch": 0.01212, "grad_norm": 0.5976467132568359, "learning_rate": 0.003, "loss": 4.3467, "step": 1212 }, { "epoch": 0.01213, "grad_norm": 0.5944254994392395, "learning_rate": 0.003, "loss": 4.3244, "step": 1213 }, { "epoch": 0.01214, "grad_norm": 0.5843701958656311, "learning_rate": 0.003, "loss": 4.306, "step": 1214 }, { "epoch": 0.01215, "grad_norm": 0.6481133103370667, "learning_rate": 0.003, "loss": 4.3176, "step": 1215 }, { "epoch": 0.01216, "grad_norm": 0.819490909576416, "learning_rate": 0.003, "loss": 4.3212, "step": 1216 }, { "epoch": 0.01217, "grad_norm": 0.8658471703529358, "learning_rate": 0.003, "loss": 4.3388, "step": 1217 }, { "epoch": 0.01218, "grad_norm": 1.0888832807540894, "learning_rate": 0.003, "loss": 4.3466, "step": 1218 }, { "epoch": 0.01219, "grad_norm": 1.1137385368347168, "learning_rate": 0.003, "loss": 4.359, "step": 1219 }, { "epoch": 0.0122, "grad_norm": 0.8392791152000427, "learning_rate": 0.003, "loss": 4.341, "step": 1220 }, { "epoch": 0.01221, "grad_norm": 0.842366635799408, "learning_rate": 0.003, "loss": 4.355, "step": 1221 }, { "epoch": 0.01222, "grad_norm": 0.9087112545967102, "learning_rate": 0.003, "loss": 4.3425, "step": 1222 }, { "epoch": 0.01223, "grad_norm": 0.8507757186889648, "learning_rate": 0.003, "loss": 4.3381, "step": 1223 }, { "epoch": 0.01224, "grad_norm": 0.745599091053009, "learning_rate": 0.003, "loss": 4.3236, "step": 1224 }, { "epoch": 0.01225, "grad_norm": 0.6612358093261719, "learning_rate": 0.003, "loss": 4.3414, "step": 1225 }, { "epoch": 0.01226, "grad_norm": 0.7157174348831177, "learning_rate": 0.003, "loss": 4.3359, "step": 1226 }, { "epoch": 0.01227, "grad_norm": 0.7675558924674988, "learning_rate": 0.003, "loss": 4.3648, "step": 1227 }, { "epoch": 0.01228, "grad_norm": 0.7858232855796814, "learning_rate": 0.003, "loss": 4.3493, "step": 1228 }, { "epoch": 0.01229, "grad_norm": 0.750853419303894, "learning_rate": 0.003, "loss": 4.3338, "step": 1229 }, { "epoch": 0.0123, "grad_norm": 0.7338488101959229, "learning_rate": 0.003, "loss": 4.3678, "step": 1230 }, { "epoch": 0.01231, "grad_norm": 0.7138859033584595, "learning_rate": 0.003, "loss": 4.3349, "step": 1231 }, { "epoch": 0.01232, "grad_norm": 0.6423472762107849, "learning_rate": 0.003, "loss": 4.3101, "step": 1232 }, { "epoch": 0.01233, "grad_norm": 0.6176342964172363, "learning_rate": 0.003, "loss": 4.3143, "step": 1233 }, { "epoch": 0.01234, "grad_norm": 0.5909737348556519, "learning_rate": 0.003, "loss": 4.3319, "step": 1234 }, { "epoch": 0.01235, "grad_norm": 0.5610330700874329, "learning_rate": 0.003, "loss": 4.3196, "step": 1235 }, { "epoch": 0.01236, "grad_norm": 0.5141494870185852, "learning_rate": 0.003, "loss": 4.3229, "step": 1236 }, { "epoch": 0.01237, "grad_norm": 0.5010937452316284, "learning_rate": 0.003, "loss": 4.3305, "step": 1237 }, { "epoch": 0.01238, "grad_norm": 0.5260263085365295, "learning_rate": 0.003, "loss": 4.3262, "step": 1238 }, { "epoch": 0.01239, "grad_norm": 0.6261786222457886, "learning_rate": 0.003, "loss": 4.3109, "step": 1239 }, { "epoch": 0.0124, "grad_norm": 0.6283076405525208, "learning_rate": 0.003, "loss": 4.294, "step": 1240 }, { "epoch": 0.01241, "grad_norm": 0.520620584487915, "learning_rate": 0.003, "loss": 4.3044, "step": 1241 }, { "epoch": 0.01242, "grad_norm": 0.45525336265563965, "learning_rate": 0.003, "loss": 4.3077, "step": 1242 }, { "epoch": 0.01243, "grad_norm": 0.5128642320632935, "learning_rate": 0.003, "loss": 4.2904, "step": 1243 }, { "epoch": 0.01244, "grad_norm": 0.6018971800804138, "learning_rate": 0.003, "loss": 4.3061, "step": 1244 }, { "epoch": 0.01245, "grad_norm": 0.8249465227127075, "learning_rate": 0.003, "loss": 4.3067, "step": 1245 }, { "epoch": 0.01246, "grad_norm": 0.983553946018219, "learning_rate": 0.003, "loss": 4.3098, "step": 1246 }, { "epoch": 0.01247, "grad_norm": 0.8006371259689331, "learning_rate": 0.003, "loss": 4.3508, "step": 1247 }, { "epoch": 0.01248, "grad_norm": 0.7486335635185242, "learning_rate": 0.003, "loss": 4.3351, "step": 1248 }, { "epoch": 0.01249, "grad_norm": 0.731898307800293, "learning_rate": 0.003, "loss": 4.3269, "step": 1249 }, { "epoch": 0.0125, "grad_norm": 0.7205458283424377, "learning_rate": 0.003, "loss": 4.3015, "step": 1250 }, { "epoch": 0.01251, "grad_norm": 0.9905809760093689, "learning_rate": 0.003, "loss": 4.3425, "step": 1251 }, { "epoch": 0.01252, "grad_norm": 0.7830891013145447, "learning_rate": 0.003, "loss": 4.319, "step": 1252 }, { "epoch": 0.01253, "grad_norm": 0.6952295303344727, "learning_rate": 0.003, "loss": 4.2965, "step": 1253 }, { "epoch": 0.01254, "grad_norm": 0.7467588782310486, "learning_rate": 0.003, "loss": 4.3427, "step": 1254 }, { "epoch": 0.01255, "grad_norm": 0.7031508684158325, "learning_rate": 0.003, "loss": 4.2927, "step": 1255 }, { "epoch": 0.01256, "grad_norm": 0.6603983640670776, "learning_rate": 0.003, "loss": 4.3194, "step": 1256 }, { "epoch": 0.01257, "grad_norm": 0.6309519410133362, "learning_rate": 0.003, "loss": 4.3008, "step": 1257 }, { "epoch": 0.01258, "grad_norm": 0.6506064534187317, "learning_rate": 0.003, "loss": 4.3107, "step": 1258 }, { "epoch": 0.01259, "grad_norm": 0.5962826013565063, "learning_rate": 0.003, "loss": 4.3105, "step": 1259 }, { "epoch": 0.0126, "grad_norm": 0.5265083909034729, "learning_rate": 0.003, "loss": 4.3316, "step": 1260 }, { "epoch": 0.01261, "grad_norm": 0.5279543995857239, "learning_rate": 0.003, "loss": 4.3275, "step": 1261 }, { "epoch": 0.01262, "grad_norm": 0.5171265006065369, "learning_rate": 0.003, "loss": 4.3175, "step": 1262 }, { "epoch": 0.01263, "grad_norm": 0.5120646953582764, "learning_rate": 0.003, "loss": 4.3111, "step": 1263 }, { "epoch": 0.01264, "grad_norm": 0.5407173037528992, "learning_rate": 0.003, "loss": 4.3, "step": 1264 }, { "epoch": 0.01265, "grad_norm": 0.5656223893165588, "learning_rate": 0.003, "loss": 4.3113, "step": 1265 }, { "epoch": 0.01266, "grad_norm": 0.4160173237323761, "learning_rate": 0.003, "loss": 4.3033, "step": 1266 }, { "epoch": 0.01267, "grad_norm": 0.4453507661819458, "learning_rate": 0.003, "loss": 4.3244, "step": 1267 }, { "epoch": 0.01268, "grad_norm": 0.4457267224788666, "learning_rate": 0.003, "loss": 4.3093, "step": 1268 }, { "epoch": 0.01269, "grad_norm": 0.38706403970718384, "learning_rate": 0.003, "loss": 4.2734, "step": 1269 }, { "epoch": 0.0127, "grad_norm": 0.38581886887550354, "learning_rate": 0.003, "loss": 4.3082, "step": 1270 }, { "epoch": 0.01271, "grad_norm": 0.40513476729393005, "learning_rate": 0.003, "loss": 4.2869, "step": 1271 }, { "epoch": 0.01272, "grad_norm": 0.522612988948822, "learning_rate": 0.003, "loss": 4.2863, "step": 1272 }, { "epoch": 0.01273, "grad_norm": 0.8000910878181458, "learning_rate": 0.003, "loss": 4.321, "step": 1273 }, { "epoch": 0.01274, "grad_norm": 1.0203778743743896, "learning_rate": 0.003, "loss": 4.3505, "step": 1274 }, { "epoch": 0.01275, "grad_norm": 0.8743080496788025, "learning_rate": 0.003, "loss": 4.3003, "step": 1275 }, { "epoch": 0.01276, "grad_norm": 0.9019266963005066, "learning_rate": 0.003, "loss": 4.3144, "step": 1276 }, { "epoch": 0.01277, "grad_norm": 0.822109580039978, "learning_rate": 0.003, "loss": 4.3254, "step": 1277 }, { "epoch": 0.01278, "grad_norm": 0.6606391668319702, "learning_rate": 0.003, "loss": 4.325, "step": 1278 }, { "epoch": 0.01279, "grad_norm": 0.7756208777427673, "learning_rate": 0.003, "loss": 4.3104, "step": 1279 }, { "epoch": 0.0128, "grad_norm": 0.7292629480361938, "learning_rate": 0.003, "loss": 4.311, "step": 1280 }, { "epoch": 0.01281, "grad_norm": 0.6916735768318176, "learning_rate": 0.003, "loss": 4.2953, "step": 1281 }, { "epoch": 0.01282, "grad_norm": 0.6080433130264282, "learning_rate": 0.003, "loss": 4.3294, "step": 1282 }, { "epoch": 0.01283, "grad_norm": 0.6106524467468262, "learning_rate": 0.003, "loss": 4.301, "step": 1283 }, { "epoch": 0.01284, "grad_norm": 0.5725691318511963, "learning_rate": 0.003, "loss": 4.2991, "step": 1284 }, { "epoch": 0.01285, "grad_norm": 0.6401168704032898, "learning_rate": 0.003, "loss": 4.3022, "step": 1285 }, { "epoch": 0.01286, "grad_norm": 0.620028555393219, "learning_rate": 0.003, "loss": 4.3246, "step": 1286 }, { "epoch": 0.01287, "grad_norm": 0.662833034992218, "learning_rate": 0.003, "loss": 4.2902, "step": 1287 }, { "epoch": 0.01288, "grad_norm": 0.6968905925750732, "learning_rate": 0.003, "loss": 4.307, "step": 1288 }, { "epoch": 0.01289, "grad_norm": 0.5818215608596802, "learning_rate": 0.003, "loss": 4.2943, "step": 1289 }, { "epoch": 0.0129, "grad_norm": 0.5997996926307678, "learning_rate": 0.003, "loss": 4.3178, "step": 1290 }, { "epoch": 0.01291, "grad_norm": 0.6353629231452942, "learning_rate": 0.003, "loss": 4.2994, "step": 1291 }, { "epoch": 0.01292, "grad_norm": 0.7124042510986328, "learning_rate": 0.003, "loss": 4.3335, "step": 1292 }, { "epoch": 0.01293, "grad_norm": 0.8257373571395874, "learning_rate": 0.003, "loss": 4.3265, "step": 1293 }, { "epoch": 0.01294, "grad_norm": 0.7939269542694092, "learning_rate": 0.003, "loss": 4.2974, "step": 1294 }, { "epoch": 0.01295, "grad_norm": 0.703215479850769, "learning_rate": 0.003, "loss": 4.3191, "step": 1295 }, { "epoch": 0.01296, "grad_norm": 0.851357102394104, "learning_rate": 0.003, "loss": 4.2949, "step": 1296 }, { "epoch": 0.01297, "grad_norm": 0.8115617036819458, "learning_rate": 0.003, "loss": 4.309, "step": 1297 }, { "epoch": 0.01298, "grad_norm": 0.8384767174720764, "learning_rate": 0.003, "loss": 4.2888, "step": 1298 }, { "epoch": 0.01299, "grad_norm": 0.8158665895462036, "learning_rate": 0.003, "loss": 4.305, "step": 1299 }, { "epoch": 0.013, "grad_norm": 0.7459951639175415, "learning_rate": 0.003, "loss": 4.29, "step": 1300 }, { "epoch": 0.01301, "grad_norm": 0.7418009042739868, "learning_rate": 0.003, "loss": 4.3054, "step": 1301 }, { "epoch": 0.01302, "grad_norm": 0.8234617710113525, "learning_rate": 0.003, "loss": 4.358, "step": 1302 }, { "epoch": 0.01303, "grad_norm": 0.7982010841369629, "learning_rate": 0.003, "loss": 4.3068, "step": 1303 }, { "epoch": 0.01304, "grad_norm": 0.812175452709198, "learning_rate": 0.003, "loss": 4.3282, "step": 1304 }, { "epoch": 0.01305, "grad_norm": 0.8789085745811462, "learning_rate": 0.003, "loss": 4.322, "step": 1305 }, { "epoch": 0.01306, "grad_norm": 0.828743577003479, "learning_rate": 0.003, "loss": 4.3122, "step": 1306 }, { "epoch": 0.01307, "grad_norm": 0.7142960429191589, "learning_rate": 0.003, "loss": 4.3298, "step": 1307 }, { "epoch": 0.01308, "grad_norm": 0.7313523888587952, "learning_rate": 0.003, "loss": 4.3053, "step": 1308 }, { "epoch": 0.01309, "grad_norm": 0.7293263077735901, "learning_rate": 0.003, "loss": 4.3473, "step": 1309 }, { "epoch": 0.0131, "grad_norm": 0.7924617528915405, "learning_rate": 0.003, "loss": 4.2932, "step": 1310 }, { "epoch": 0.01311, "grad_norm": 0.8199212551116943, "learning_rate": 0.003, "loss": 4.3417, "step": 1311 }, { "epoch": 0.01312, "grad_norm": 0.8725687861442566, "learning_rate": 0.003, "loss": 4.3076, "step": 1312 }, { "epoch": 0.01313, "grad_norm": 0.8202559351921082, "learning_rate": 0.003, "loss": 4.2971, "step": 1313 }, { "epoch": 0.01314, "grad_norm": 0.7738103866577148, "learning_rate": 0.003, "loss": 4.2944, "step": 1314 }, { "epoch": 0.01315, "grad_norm": 0.6587685346603394, "learning_rate": 0.003, "loss": 4.3124, "step": 1315 }, { "epoch": 0.01316, "grad_norm": 0.7233019471168518, "learning_rate": 0.003, "loss": 4.3037, "step": 1316 }, { "epoch": 0.01317, "grad_norm": 0.6440667510032654, "learning_rate": 0.003, "loss": 4.3256, "step": 1317 }, { "epoch": 0.01318, "grad_norm": 0.5893415212631226, "learning_rate": 0.003, "loss": 4.301, "step": 1318 }, { "epoch": 0.01319, "grad_norm": 0.7347000241279602, "learning_rate": 0.003, "loss": 4.3133, "step": 1319 }, { "epoch": 0.0132, "grad_norm": 0.8620561957359314, "learning_rate": 0.003, "loss": 4.3153, "step": 1320 }, { "epoch": 0.01321, "grad_norm": 0.9148700833320618, "learning_rate": 0.003, "loss": 4.2941, "step": 1321 }, { "epoch": 0.01322, "grad_norm": 0.8122279047966003, "learning_rate": 0.003, "loss": 4.3273, "step": 1322 }, { "epoch": 0.01323, "grad_norm": 0.7629367113113403, "learning_rate": 0.003, "loss": 4.3161, "step": 1323 }, { "epoch": 0.01324, "grad_norm": 0.6652523875236511, "learning_rate": 0.003, "loss": 4.2981, "step": 1324 }, { "epoch": 0.01325, "grad_norm": 0.6290608048439026, "learning_rate": 0.003, "loss": 4.3001, "step": 1325 }, { "epoch": 0.01326, "grad_norm": 0.6334658861160278, "learning_rate": 0.003, "loss": 4.2848, "step": 1326 }, { "epoch": 0.01327, "grad_norm": 0.5682603716850281, "learning_rate": 0.003, "loss": 4.3018, "step": 1327 }, { "epoch": 0.01328, "grad_norm": 0.4706239700317383, "learning_rate": 0.003, "loss": 4.2884, "step": 1328 }, { "epoch": 0.01329, "grad_norm": 0.46228259801864624, "learning_rate": 0.003, "loss": 4.2518, "step": 1329 }, { "epoch": 0.0133, "grad_norm": 0.4335189163684845, "learning_rate": 0.003, "loss": 4.293, "step": 1330 }, { "epoch": 0.01331, "grad_norm": 0.383666068315506, "learning_rate": 0.003, "loss": 4.2772, "step": 1331 }, { "epoch": 0.01332, "grad_norm": 0.39924779534339905, "learning_rate": 0.003, "loss": 4.2955, "step": 1332 }, { "epoch": 0.01333, "grad_norm": 0.4323978126049042, "learning_rate": 0.003, "loss": 4.2692, "step": 1333 }, { "epoch": 0.01334, "grad_norm": 0.43190306425094604, "learning_rate": 0.003, "loss": 4.2574, "step": 1334 }, { "epoch": 0.01335, "grad_norm": 0.49212101101875305, "learning_rate": 0.003, "loss": 4.2803, "step": 1335 }, { "epoch": 0.01336, "grad_norm": 0.6307611465454102, "learning_rate": 0.003, "loss": 4.2833, "step": 1336 }, { "epoch": 0.01337, "grad_norm": 0.9523991942405701, "learning_rate": 0.003, "loss": 4.31, "step": 1337 }, { "epoch": 0.01338, "grad_norm": 1.0491547584533691, "learning_rate": 0.003, "loss": 4.3099, "step": 1338 }, { "epoch": 0.01339, "grad_norm": 0.7694852948188782, "learning_rate": 0.003, "loss": 4.307, "step": 1339 }, { "epoch": 0.0134, "grad_norm": 0.7887849807739258, "learning_rate": 0.003, "loss": 4.3001, "step": 1340 }, { "epoch": 0.01341, "grad_norm": 0.7728468179702759, "learning_rate": 0.003, "loss": 4.3115, "step": 1341 }, { "epoch": 0.01342, "grad_norm": 0.7008172273635864, "learning_rate": 0.003, "loss": 4.2807, "step": 1342 }, { "epoch": 0.01343, "grad_norm": 0.7046615481376648, "learning_rate": 0.003, "loss": 4.2868, "step": 1343 }, { "epoch": 0.01344, "grad_norm": 0.6420544385910034, "learning_rate": 0.003, "loss": 4.2829, "step": 1344 }, { "epoch": 0.01345, "grad_norm": 0.5914574861526489, "learning_rate": 0.003, "loss": 4.3212, "step": 1345 }, { "epoch": 0.01346, "grad_norm": 0.5600994229316711, "learning_rate": 0.003, "loss": 4.2911, "step": 1346 }, { "epoch": 0.01347, "grad_norm": 0.46645763516426086, "learning_rate": 0.003, "loss": 4.2774, "step": 1347 }, { "epoch": 0.01348, "grad_norm": 0.49815306067466736, "learning_rate": 0.003, "loss": 4.2778, "step": 1348 }, { "epoch": 0.01349, "grad_norm": 0.525874674320221, "learning_rate": 0.003, "loss": 4.2987, "step": 1349 }, { "epoch": 0.0135, "grad_norm": 0.487930566072464, "learning_rate": 0.003, "loss": 4.2878, "step": 1350 }, { "epoch": 0.01351, "grad_norm": 0.5213081240653992, "learning_rate": 0.003, "loss": 4.3174, "step": 1351 }, { "epoch": 0.01352, "grad_norm": 0.6742727160453796, "learning_rate": 0.003, "loss": 4.2961, "step": 1352 }, { "epoch": 0.01353, "grad_norm": 0.7189781069755554, "learning_rate": 0.003, "loss": 4.2871, "step": 1353 }, { "epoch": 0.01354, "grad_norm": 0.59047532081604, "learning_rate": 0.003, "loss": 4.2676, "step": 1354 }, { "epoch": 0.01355, "grad_norm": 0.5686421394348145, "learning_rate": 0.003, "loss": 4.2819, "step": 1355 }, { "epoch": 0.01356, "grad_norm": 0.5455272793769836, "learning_rate": 0.003, "loss": 4.2746, "step": 1356 }, { "epoch": 0.01357, "grad_norm": 0.6083900332450867, "learning_rate": 0.003, "loss": 4.2628, "step": 1357 }, { "epoch": 0.01358, "grad_norm": 0.5685535073280334, "learning_rate": 0.003, "loss": 4.258, "step": 1358 }, { "epoch": 0.01359, "grad_norm": 0.5254443883895874, "learning_rate": 0.003, "loss": 4.2971, "step": 1359 }, { "epoch": 0.0136, "grad_norm": 0.6204063892364502, "learning_rate": 0.003, "loss": 4.2647, "step": 1360 }, { "epoch": 0.01361, "grad_norm": 0.7256515026092529, "learning_rate": 0.003, "loss": 4.2884, "step": 1361 }, { "epoch": 0.01362, "grad_norm": 0.8139166831970215, "learning_rate": 0.003, "loss": 4.3042, "step": 1362 }, { "epoch": 0.01363, "grad_norm": 0.7577769756317139, "learning_rate": 0.003, "loss": 4.3089, "step": 1363 }, { "epoch": 0.01364, "grad_norm": 0.6858768463134766, "learning_rate": 0.003, "loss": 4.2972, "step": 1364 }, { "epoch": 0.01365, "grad_norm": 0.7445408701896667, "learning_rate": 0.003, "loss": 4.315, "step": 1365 }, { "epoch": 0.01366, "grad_norm": 0.9198596477508545, "learning_rate": 0.003, "loss": 4.2795, "step": 1366 }, { "epoch": 0.01367, "grad_norm": 0.8477755188941956, "learning_rate": 0.003, "loss": 4.3279, "step": 1367 }, { "epoch": 0.01368, "grad_norm": 0.8082318305969238, "learning_rate": 0.003, "loss": 4.302, "step": 1368 }, { "epoch": 0.01369, "grad_norm": 0.8816357851028442, "learning_rate": 0.003, "loss": 4.3, "step": 1369 }, { "epoch": 0.0137, "grad_norm": 0.8340180516242981, "learning_rate": 0.003, "loss": 4.326, "step": 1370 }, { "epoch": 0.01371, "grad_norm": 0.7977674007415771, "learning_rate": 0.003, "loss": 4.3179, "step": 1371 }, { "epoch": 0.01372, "grad_norm": 0.7062758803367615, "learning_rate": 0.003, "loss": 4.2633, "step": 1372 }, { "epoch": 0.01373, "grad_norm": 0.6788503527641296, "learning_rate": 0.003, "loss": 4.3, "step": 1373 }, { "epoch": 0.01374, "grad_norm": 0.6201223731040955, "learning_rate": 0.003, "loss": 4.3053, "step": 1374 }, { "epoch": 0.01375, "grad_norm": 0.6361509561538696, "learning_rate": 0.003, "loss": 4.264, "step": 1375 }, { "epoch": 0.01376, "grad_norm": 0.5589747428894043, "learning_rate": 0.003, "loss": 4.2871, "step": 1376 }, { "epoch": 0.01377, "grad_norm": 0.6119049787521362, "learning_rate": 0.003, "loss": 4.2875, "step": 1377 }, { "epoch": 0.01378, "grad_norm": 0.5476299524307251, "learning_rate": 0.003, "loss": 4.3133, "step": 1378 }, { "epoch": 0.01379, "grad_norm": 0.5394819974899292, "learning_rate": 0.003, "loss": 4.2841, "step": 1379 }, { "epoch": 0.0138, "grad_norm": 0.49164652824401855, "learning_rate": 0.003, "loss": 4.2791, "step": 1380 }, { "epoch": 0.01381, "grad_norm": 0.4505774974822998, "learning_rate": 0.003, "loss": 4.2845, "step": 1381 }, { "epoch": 0.01382, "grad_norm": 0.4613495469093323, "learning_rate": 0.003, "loss": 4.279, "step": 1382 }, { "epoch": 0.01383, "grad_norm": 0.6032357215881348, "learning_rate": 0.003, "loss": 4.2785, "step": 1383 }, { "epoch": 0.01384, "grad_norm": 0.8419365882873535, "learning_rate": 0.003, "loss": 4.3193, "step": 1384 }, { "epoch": 0.01385, "grad_norm": 1.1751699447631836, "learning_rate": 0.003, "loss": 4.3253, "step": 1385 }, { "epoch": 0.01386, "grad_norm": 0.6834710240364075, "learning_rate": 0.003, "loss": 4.2903, "step": 1386 }, { "epoch": 0.01387, "grad_norm": 0.5957779288291931, "learning_rate": 0.003, "loss": 4.2761, "step": 1387 }, { "epoch": 0.01388, "grad_norm": 0.6788092851638794, "learning_rate": 0.003, "loss": 4.3125, "step": 1388 }, { "epoch": 0.01389, "grad_norm": 0.6753062605857849, "learning_rate": 0.003, "loss": 4.2893, "step": 1389 }, { "epoch": 0.0139, "grad_norm": 0.6624464392662048, "learning_rate": 0.003, "loss": 4.2919, "step": 1390 }, { "epoch": 0.01391, "grad_norm": 0.5935611724853516, "learning_rate": 0.003, "loss": 4.2895, "step": 1391 }, { "epoch": 0.01392, "grad_norm": 0.5433003902435303, "learning_rate": 0.003, "loss": 4.2973, "step": 1392 }, { "epoch": 0.01393, "grad_norm": 0.5145422220230103, "learning_rate": 0.003, "loss": 4.2666, "step": 1393 }, { "epoch": 0.01394, "grad_norm": 0.49970656633377075, "learning_rate": 0.003, "loss": 4.2526, "step": 1394 }, { "epoch": 0.01395, "grad_norm": 0.5094105005264282, "learning_rate": 0.003, "loss": 4.283, "step": 1395 }, { "epoch": 0.01396, "grad_norm": 0.5121984481811523, "learning_rate": 0.003, "loss": 4.274, "step": 1396 }, { "epoch": 0.01397, "grad_norm": 0.5195156931877136, "learning_rate": 0.003, "loss": 4.286, "step": 1397 }, { "epoch": 0.01398, "grad_norm": 0.47007647156715393, "learning_rate": 0.003, "loss": 4.2651, "step": 1398 }, { "epoch": 0.01399, "grad_norm": 0.4805718660354614, "learning_rate": 0.003, "loss": 4.287, "step": 1399 }, { "epoch": 0.014, "grad_norm": 0.6213575005531311, "learning_rate": 0.003, "loss": 4.2485, "step": 1400 }, { "epoch": 0.01401, "grad_norm": 0.7338332533836365, "learning_rate": 0.003, "loss": 4.2762, "step": 1401 }, { "epoch": 0.01402, "grad_norm": 0.7789960503578186, "learning_rate": 0.003, "loss": 4.2883, "step": 1402 }, { "epoch": 0.01403, "grad_norm": 0.7047394514083862, "learning_rate": 0.003, "loss": 4.2622, "step": 1403 }, { "epoch": 0.01404, "grad_norm": 0.5580564737319946, "learning_rate": 0.003, "loss": 4.279, "step": 1404 }, { "epoch": 0.01405, "grad_norm": 0.6150562763214111, "learning_rate": 0.003, "loss": 4.2978, "step": 1405 }, { "epoch": 0.01406, "grad_norm": 0.6412274241447449, "learning_rate": 0.003, "loss": 4.2817, "step": 1406 }, { "epoch": 0.01407, "grad_norm": 0.6951134204864502, "learning_rate": 0.003, "loss": 4.2975, "step": 1407 }, { "epoch": 0.01408, "grad_norm": 0.7907094955444336, "learning_rate": 0.003, "loss": 4.2629, "step": 1408 }, { "epoch": 0.01409, "grad_norm": 0.6941542625427246, "learning_rate": 0.003, "loss": 4.2885, "step": 1409 }, { "epoch": 0.0141, "grad_norm": 0.6482591032981873, "learning_rate": 0.003, "loss": 4.2693, "step": 1410 }, { "epoch": 0.01411, "grad_norm": 0.6562476754188538, "learning_rate": 0.003, "loss": 4.2437, "step": 1411 }, { "epoch": 0.01412, "grad_norm": 0.7608663439750671, "learning_rate": 0.003, "loss": 4.2874, "step": 1412 }, { "epoch": 0.01413, "grad_norm": 0.763931930065155, "learning_rate": 0.003, "loss": 4.2842, "step": 1413 }, { "epoch": 0.01414, "grad_norm": 0.7897149324417114, "learning_rate": 0.003, "loss": 4.2662, "step": 1414 }, { "epoch": 0.01415, "grad_norm": 0.7817095518112183, "learning_rate": 0.003, "loss": 4.2634, "step": 1415 }, { "epoch": 0.01416, "grad_norm": 0.7488892674446106, "learning_rate": 0.003, "loss": 4.2852, "step": 1416 }, { "epoch": 0.01417, "grad_norm": 0.8105233907699585, "learning_rate": 0.003, "loss": 4.2802, "step": 1417 }, { "epoch": 0.01418, "grad_norm": 0.7555150389671326, "learning_rate": 0.003, "loss": 4.2974, "step": 1418 }, { "epoch": 0.01419, "grad_norm": 0.7873964309692383, "learning_rate": 0.003, "loss": 4.2928, "step": 1419 }, { "epoch": 0.0142, "grad_norm": 0.942128598690033, "learning_rate": 0.003, "loss": 4.3049, "step": 1420 }, { "epoch": 0.01421, "grad_norm": 1.1309813261032104, "learning_rate": 0.003, "loss": 4.2967, "step": 1421 }, { "epoch": 0.01422, "grad_norm": 0.8632763028144836, "learning_rate": 0.003, "loss": 4.2868, "step": 1422 }, { "epoch": 0.01423, "grad_norm": 0.8987135887145996, "learning_rate": 0.003, "loss": 4.2941, "step": 1423 }, { "epoch": 0.01424, "grad_norm": 0.7247830629348755, "learning_rate": 0.003, "loss": 4.3046, "step": 1424 }, { "epoch": 0.01425, "grad_norm": 0.6278427839279175, "learning_rate": 0.003, "loss": 4.3054, "step": 1425 }, { "epoch": 0.01426, "grad_norm": 0.6600518226623535, "learning_rate": 0.003, "loss": 4.2819, "step": 1426 }, { "epoch": 0.01427, "grad_norm": 0.744967520236969, "learning_rate": 0.003, "loss": 4.2813, "step": 1427 }, { "epoch": 0.01428, "grad_norm": 0.6531183123588562, "learning_rate": 0.003, "loss": 4.2709, "step": 1428 }, { "epoch": 0.01429, "grad_norm": 0.6728975176811218, "learning_rate": 0.003, "loss": 4.2816, "step": 1429 }, { "epoch": 0.0143, "grad_norm": 0.6716436147689819, "learning_rate": 0.003, "loss": 4.2945, "step": 1430 }, { "epoch": 0.01431, "grad_norm": 0.674777090549469, "learning_rate": 0.003, "loss": 4.2804, "step": 1431 }, { "epoch": 0.01432, "grad_norm": 0.6107741594314575, "learning_rate": 0.003, "loss": 4.2694, "step": 1432 }, { "epoch": 0.01433, "grad_norm": 0.5782887935638428, "learning_rate": 0.003, "loss": 4.2651, "step": 1433 }, { "epoch": 0.01434, "grad_norm": 0.5728036761283875, "learning_rate": 0.003, "loss": 4.2497, "step": 1434 }, { "epoch": 0.01435, "grad_norm": 0.5406824350357056, "learning_rate": 0.003, "loss": 4.2562, "step": 1435 }, { "epoch": 0.01436, "grad_norm": 0.5029864311218262, "learning_rate": 0.003, "loss": 4.2703, "step": 1436 }, { "epoch": 0.01437, "grad_norm": 0.5657121539115906, "learning_rate": 0.003, "loss": 4.2852, "step": 1437 }, { "epoch": 0.01438, "grad_norm": 0.5845190286636353, "learning_rate": 0.003, "loss": 4.2793, "step": 1438 }, { "epoch": 0.01439, "grad_norm": 0.591201901435852, "learning_rate": 0.003, "loss": 4.2448, "step": 1439 }, { "epoch": 0.0144, "grad_norm": 0.8200764656066895, "learning_rate": 0.003, "loss": 4.2641, "step": 1440 }, { "epoch": 0.01441, "grad_norm": 0.888174831867218, "learning_rate": 0.003, "loss": 4.2921, "step": 1441 }, { "epoch": 0.01442, "grad_norm": 0.79758220911026, "learning_rate": 0.003, "loss": 4.2638, "step": 1442 }, { "epoch": 0.01443, "grad_norm": 0.745823085308075, "learning_rate": 0.003, "loss": 4.3231, "step": 1443 }, { "epoch": 0.01444, "grad_norm": 0.8156546354293823, "learning_rate": 0.003, "loss": 4.2923, "step": 1444 }, { "epoch": 0.01445, "grad_norm": 0.7982717752456665, "learning_rate": 0.003, "loss": 4.2557, "step": 1445 }, { "epoch": 0.01446, "grad_norm": 0.6928796172142029, "learning_rate": 0.003, "loss": 4.264, "step": 1446 }, { "epoch": 0.01447, "grad_norm": 0.6646214723587036, "learning_rate": 0.003, "loss": 4.2746, "step": 1447 }, { "epoch": 0.01448, "grad_norm": 0.6782436966896057, "learning_rate": 0.003, "loss": 4.2822, "step": 1448 }, { "epoch": 0.01449, "grad_norm": 0.6758451461791992, "learning_rate": 0.003, "loss": 4.2758, "step": 1449 }, { "epoch": 0.0145, "grad_norm": 0.6399548649787903, "learning_rate": 0.003, "loss": 4.291, "step": 1450 }, { "epoch": 0.01451, "grad_norm": 0.6663489937782288, "learning_rate": 0.003, "loss": 4.2675, "step": 1451 }, { "epoch": 0.01452, "grad_norm": 0.676326334476471, "learning_rate": 0.003, "loss": 4.2687, "step": 1452 }, { "epoch": 0.01453, "grad_norm": 0.5554569959640503, "learning_rate": 0.003, "loss": 4.2765, "step": 1453 }, { "epoch": 0.01454, "grad_norm": 0.5639092922210693, "learning_rate": 0.003, "loss": 4.2445, "step": 1454 }, { "epoch": 0.01455, "grad_norm": 0.49674221873283386, "learning_rate": 0.003, "loss": 4.2722, "step": 1455 }, { "epoch": 0.01456, "grad_norm": 0.5553675293922424, "learning_rate": 0.003, "loss": 4.2521, "step": 1456 }, { "epoch": 0.01457, "grad_norm": 0.6361852288246155, "learning_rate": 0.003, "loss": 4.2449, "step": 1457 }, { "epoch": 0.01458, "grad_norm": 0.7916824817657471, "learning_rate": 0.003, "loss": 4.2458, "step": 1458 }, { "epoch": 0.01459, "grad_norm": 0.8565419316291809, "learning_rate": 0.003, "loss": 4.2834, "step": 1459 }, { "epoch": 0.0146, "grad_norm": 0.7326977252960205, "learning_rate": 0.003, "loss": 4.28, "step": 1460 }, { "epoch": 0.01461, "grad_norm": 0.7401906251907349, "learning_rate": 0.003, "loss": 4.2599, "step": 1461 }, { "epoch": 0.01462, "grad_norm": 0.7613895535469055, "learning_rate": 0.003, "loss": 4.2676, "step": 1462 }, { "epoch": 0.01463, "grad_norm": 0.642987072467804, "learning_rate": 0.003, "loss": 4.2603, "step": 1463 }, { "epoch": 0.01464, "grad_norm": 0.705771803855896, "learning_rate": 0.003, "loss": 4.2361, "step": 1464 }, { "epoch": 0.01465, "grad_norm": 0.6884810328483582, "learning_rate": 0.003, "loss": 4.284, "step": 1465 }, { "epoch": 0.01466, "grad_norm": 0.5892930626869202, "learning_rate": 0.003, "loss": 4.2489, "step": 1466 }, { "epoch": 0.01467, "grad_norm": 0.6196565628051758, "learning_rate": 0.003, "loss": 4.2848, "step": 1467 }, { "epoch": 0.01468, "grad_norm": 0.7562909126281738, "learning_rate": 0.003, "loss": 4.2639, "step": 1468 }, { "epoch": 0.01469, "grad_norm": 0.7464540004730225, "learning_rate": 0.003, "loss": 4.2693, "step": 1469 }, { "epoch": 0.0147, "grad_norm": 0.6821353435516357, "learning_rate": 0.003, "loss": 4.2569, "step": 1470 }, { "epoch": 0.01471, "grad_norm": 0.5921043753623962, "learning_rate": 0.003, "loss": 4.2689, "step": 1471 }, { "epoch": 0.01472, "grad_norm": 0.531998336315155, "learning_rate": 0.003, "loss": 4.252, "step": 1472 }, { "epoch": 0.01473, "grad_norm": 0.5447686910629272, "learning_rate": 0.003, "loss": 4.2658, "step": 1473 }, { "epoch": 0.01474, "grad_norm": 0.6622011661529541, "learning_rate": 0.003, "loss": 4.2515, "step": 1474 }, { "epoch": 0.01475, "grad_norm": 0.7866929173469543, "learning_rate": 0.003, "loss": 4.2783, "step": 1475 }, { "epoch": 0.01476, "grad_norm": 0.965038537979126, "learning_rate": 0.003, "loss": 4.2726, "step": 1476 }, { "epoch": 0.01477, "grad_norm": 0.9310076236724854, "learning_rate": 0.003, "loss": 4.2926, "step": 1477 }, { "epoch": 0.01478, "grad_norm": 0.7643823027610779, "learning_rate": 0.003, "loss": 4.2629, "step": 1478 }, { "epoch": 0.01479, "grad_norm": 0.867812991142273, "learning_rate": 0.003, "loss": 4.3023, "step": 1479 }, { "epoch": 0.0148, "grad_norm": 0.7325505018234253, "learning_rate": 0.003, "loss": 4.2689, "step": 1480 }, { "epoch": 0.01481, "grad_norm": 0.673205554485321, "learning_rate": 0.003, "loss": 4.2668, "step": 1481 }, { "epoch": 0.01482, "grad_norm": 0.7312605381011963, "learning_rate": 0.003, "loss": 4.2525, "step": 1482 }, { "epoch": 0.01483, "grad_norm": 0.7356554269790649, "learning_rate": 0.003, "loss": 4.256, "step": 1483 }, { "epoch": 0.01484, "grad_norm": 0.6896143555641174, "learning_rate": 0.003, "loss": 4.3028, "step": 1484 }, { "epoch": 0.01485, "grad_norm": 0.6053803563117981, "learning_rate": 0.003, "loss": 4.2617, "step": 1485 }, { "epoch": 0.01486, "grad_norm": 0.5422018766403198, "learning_rate": 0.003, "loss": 4.2576, "step": 1486 }, { "epoch": 0.01487, "grad_norm": 0.584571897983551, "learning_rate": 0.003, "loss": 4.2611, "step": 1487 }, { "epoch": 0.01488, "grad_norm": 0.58668452501297, "learning_rate": 0.003, "loss": 4.2703, "step": 1488 }, { "epoch": 0.01489, "grad_norm": 0.5606329441070557, "learning_rate": 0.003, "loss": 4.2644, "step": 1489 }, { "epoch": 0.0149, "grad_norm": 0.5718882083892822, "learning_rate": 0.003, "loss": 4.2562, "step": 1490 }, { "epoch": 0.01491, "grad_norm": 0.609704852104187, "learning_rate": 0.003, "loss": 4.2714, "step": 1491 }, { "epoch": 0.01492, "grad_norm": 0.6093578338623047, "learning_rate": 0.003, "loss": 4.2341, "step": 1492 }, { "epoch": 0.01493, "grad_norm": 0.6492394208908081, "learning_rate": 0.003, "loss": 4.2659, "step": 1493 }, { "epoch": 0.01494, "grad_norm": 0.7214344143867493, "learning_rate": 0.003, "loss": 4.2478, "step": 1494 }, { "epoch": 0.01495, "grad_norm": 0.6858140826225281, "learning_rate": 0.003, "loss": 4.2587, "step": 1495 }, { "epoch": 0.01496, "grad_norm": 0.6935936808586121, "learning_rate": 0.003, "loss": 4.2367, "step": 1496 }, { "epoch": 0.01497, "grad_norm": 0.6599563360214233, "learning_rate": 0.003, "loss": 4.2117, "step": 1497 }, { "epoch": 0.01498, "grad_norm": 0.6300835609436035, "learning_rate": 0.003, "loss": 4.2634, "step": 1498 }, { "epoch": 0.01499, "grad_norm": 0.721996009349823, "learning_rate": 0.003, "loss": 4.2933, "step": 1499 }, { "epoch": 0.015, "grad_norm": 0.7603088021278381, "learning_rate": 0.003, "loss": 4.2446, "step": 1500 }, { "epoch": 0.01501, "grad_norm": 0.7853468060493469, "learning_rate": 0.003, "loss": 4.2524, "step": 1501 }, { "epoch": 0.01502, "grad_norm": 0.6559277772903442, "learning_rate": 0.003, "loss": 4.2862, "step": 1502 }, { "epoch": 0.01503, "grad_norm": 0.6914763450622559, "learning_rate": 0.003, "loss": 4.2669, "step": 1503 }, { "epoch": 0.01504, "grad_norm": 0.8051386475563049, "learning_rate": 0.003, "loss": 4.2746, "step": 1504 }, { "epoch": 0.01505, "grad_norm": 0.7562403082847595, "learning_rate": 0.003, "loss": 4.2564, "step": 1505 }, { "epoch": 0.01506, "grad_norm": 0.6991645693778992, "learning_rate": 0.003, "loss": 4.2715, "step": 1506 }, { "epoch": 0.01507, "grad_norm": 0.6562188267707825, "learning_rate": 0.003, "loss": 4.2831, "step": 1507 }, { "epoch": 0.01508, "grad_norm": 0.6328547596931458, "learning_rate": 0.003, "loss": 4.2611, "step": 1508 }, { "epoch": 0.01509, "grad_norm": 0.6792221665382385, "learning_rate": 0.003, "loss": 4.2777, "step": 1509 }, { "epoch": 0.0151, "grad_norm": 0.7432823777198792, "learning_rate": 0.003, "loss": 4.2532, "step": 1510 }, { "epoch": 0.01511, "grad_norm": 0.8595705628395081, "learning_rate": 0.003, "loss": 4.2519, "step": 1511 }, { "epoch": 0.01512, "grad_norm": 0.9392295479774475, "learning_rate": 0.003, "loss": 4.2664, "step": 1512 }, { "epoch": 0.01513, "grad_norm": 0.706947386264801, "learning_rate": 0.003, "loss": 4.2625, "step": 1513 }, { "epoch": 0.01514, "grad_norm": 0.6648169159889221, "learning_rate": 0.003, "loss": 4.2697, "step": 1514 }, { "epoch": 0.01515, "grad_norm": 0.7984362244606018, "learning_rate": 0.003, "loss": 4.2642, "step": 1515 }, { "epoch": 0.01516, "grad_norm": 0.8379718065261841, "learning_rate": 0.003, "loss": 4.2755, "step": 1516 }, { "epoch": 0.01517, "grad_norm": 0.7840726375579834, "learning_rate": 0.003, "loss": 4.2252, "step": 1517 }, { "epoch": 0.01518, "grad_norm": 0.7350525856018066, "learning_rate": 0.003, "loss": 4.2609, "step": 1518 }, { "epoch": 0.01519, "grad_norm": 0.637050449848175, "learning_rate": 0.003, "loss": 4.2905, "step": 1519 }, { "epoch": 0.0152, "grad_norm": 0.6365798711776733, "learning_rate": 0.003, "loss": 4.2441, "step": 1520 }, { "epoch": 0.01521, "grad_norm": 0.5890719890594482, "learning_rate": 0.003, "loss": 4.2552, "step": 1521 }, { "epoch": 0.01522, "grad_norm": 0.5176184773445129, "learning_rate": 0.003, "loss": 4.2486, "step": 1522 }, { "epoch": 0.01523, "grad_norm": 0.5967531800270081, "learning_rate": 0.003, "loss": 4.2464, "step": 1523 }, { "epoch": 0.01524, "grad_norm": 0.57130366563797, "learning_rate": 0.003, "loss": 4.2759, "step": 1524 }, { "epoch": 0.01525, "grad_norm": 0.68913733959198, "learning_rate": 0.003, "loss": 4.2789, "step": 1525 }, { "epoch": 0.01526, "grad_norm": 0.6771203279495239, "learning_rate": 0.003, "loss": 4.2725, "step": 1526 }, { "epoch": 0.01527, "grad_norm": 0.7048394083976746, "learning_rate": 0.003, "loss": 4.2371, "step": 1527 }, { "epoch": 0.01528, "grad_norm": 0.7917721271514893, "learning_rate": 0.003, "loss": 4.2607, "step": 1528 }, { "epoch": 0.01529, "grad_norm": 0.7461357116699219, "learning_rate": 0.003, "loss": 4.2575, "step": 1529 }, { "epoch": 0.0153, "grad_norm": 0.6422540545463562, "learning_rate": 0.003, "loss": 4.2576, "step": 1530 }, { "epoch": 0.01531, "grad_norm": 0.6448599696159363, "learning_rate": 0.003, "loss": 4.2541, "step": 1531 }, { "epoch": 0.01532, "grad_norm": 0.556128203868866, "learning_rate": 0.003, "loss": 4.2286, "step": 1532 }, { "epoch": 0.01533, "grad_norm": 0.511016845703125, "learning_rate": 0.003, "loss": 4.2532, "step": 1533 }, { "epoch": 0.01534, "grad_norm": 0.4958668351173401, "learning_rate": 0.003, "loss": 4.2504, "step": 1534 }, { "epoch": 0.01535, "grad_norm": 0.472829669713974, "learning_rate": 0.003, "loss": 4.2611, "step": 1535 }, { "epoch": 0.01536, "grad_norm": 0.4443574547767639, "learning_rate": 0.003, "loss": 4.2397, "step": 1536 }, { "epoch": 0.01537, "grad_norm": 0.5031164884567261, "learning_rate": 0.003, "loss": 4.2348, "step": 1537 }, { "epoch": 0.01538, "grad_norm": 0.6832772493362427, "learning_rate": 0.003, "loss": 4.2195, "step": 1538 }, { "epoch": 0.01539, "grad_norm": 0.875408947467804, "learning_rate": 0.003, "loss": 4.2638, "step": 1539 }, { "epoch": 0.0154, "grad_norm": 0.9963869452476501, "learning_rate": 0.003, "loss": 4.2726, "step": 1540 }, { "epoch": 0.01541, "grad_norm": 0.8254619836807251, "learning_rate": 0.003, "loss": 4.249, "step": 1541 }, { "epoch": 0.01542, "grad_norm": 0.8195037245750427, "learning_rate": 0.003, "loss": 4.2801, "step": 1542 }, { "epoch": 0.01543, "grad_norm": 0.9007482528686523, "learning_rate": 0.003, "loss": 4.268, "step": 1543 }, { "epoch": 0.01544, "grad_norm": 0.8777545690536499, "learning_rate": 0.003, "loss": 4.2622, "step": 1544 }, { "epoch": 0.01545, "grad_norm": 0.7702150344848633, "learning_rate": 0.003, "loss": 4.3234, "step": 1545 }, { "epoch": 0.01546, "grad_norm": 0.903275191783905, "learning_rate": 0.003, "loss": 4.2965, "step": 1546 }, { "epoch": 0.01547, "grad_norm": 1.0503301620483398, "learning_rate": 0.003, "loss": 4.307, "step": 1547 }, { "epoch": 0.01548, "grad_norm": 0.868294894695282, "learning_rate": 0.003, "loss": 4.2647, "step": 1548 }, { "epoch": 0.01549, "grad_norm": 0.8206138610839844, "learning_rate": 0.003, "loss": 4.2515, "step": 1549 }, { "epoch": 0.0155, "grad_norm": 0.9134683012962341, "learning_rate": 0.003, "loss": 4.2824, "step": 1550 }, { "epoch": 0.01551, "grad_norm": 0.9439716935157776, "learning_rate": 0.003, "loss": 4.2675, "step": 1551 }, { "epoch": 0.01552, "grad_norm": 0.7804630994796753, "learning_rate": 0.003, "loss": 4.28, "step": 1552 }, { "epoch": 0.01553, "grad_norm": 0.6432939171791077, "learning_rate": 0.003, "loss": 4.2993, "step": 1553 }, { "epoch": 0.01554, "grad_norm": 0.6834948062896729, "learning_rate": 0.003, "loss": 4.2822, "step": 1554 }, { "epoch": 0.01555, "grad_norm": 0.7107034921646118, "learning_rate": 0.003, "loss": 4.2553, "step": 1555 }, { "epoch": 0.01556, "grad_norm": 0.6976543664932251, "learning_rate": 0.003, "loss": 4.2901, "step": 1556 }, { "epoch": 0.01557, "grad_norm": 0.6529300808906555, "learning_rate": 0.003, "loss": 4.2623, "step": 1557 }, { "epoch": 0.01558, "grad_norm": 0.6456514000892639, "learning_rate": 0.003, "loss": 4.2503, "step": 1558 }, { "epoch": 0.01559, "grad_norm": 0.5664896368980408, "learning_rate": 0.003, "loss": 4.2838, "step": 1559 }, { "epoch": 0.0156, "grad_norm": 0.5338277816772461, "learning_rate": 0.003, "loss": 4.2719, "step": 1560 }, { "epoch": 0.01561, "grad_norm": 0.45519402623176575, "learning_rate": 0.003, "loss": 4.2494, "step": 1561 }, { "epoch": 0.01562, "grad_norm": 0.4456147849559784, "learning_rate": 0.003, "loss": 4.2411, "step": 1562 }, { "epoch": 0.01563, "grad_norm": 0.3998737335205078, "learning_rate": 0.003, "loss": 4.2455, "step": 1563 }, { "epoch": 0.01564, "grad_norm": 0.4089747965335846, "learning_rate": 0.003, "loss": 4.2731, "step": 1564 }, { "epoch": 0.01565, "grad_norm": 0.41704586148262024, "learning_rate": 0.003, "loss": 4.2464, "step": 1565 }, { "epoch": 0.01566, "grad_norm": 0.4549218416213989, "learning_rate": 0.003, "loss": 4.265, "step": 1566 }, { "epoch": 0.01567, "grad_norm": 0.45544859766960144, "learning_rate": 0.003, "loss": 4.2568, "step": 1567 }, { "epoch": 0.01568, "grad_norm": 0.496439129114151, "learning_rate": 0.003, "loss": 4.2394, "step": 1568 }, { "epoch": 0.01569, "grad_norm": 0.5869415998458862, "learning_rate": 0.003, "loss": 4.2472, "step": 1569 }, { "epoch": 0.0157, "grad_norm": 0.7246452569961548, "learning_rate": 0.003, "loss": 4.2683, "step": 1570 }, { "epoch": 0.01571, "grad_norm": 0.807225227355957, "learning_rate": 0.003, "loss": 4.268, "step": 1571 }, { "epoch": 0.01572, "grad_norm": 0.7089520692825317, "learning_rate": 0.003, "loss": 4.2658, "step": 1572 }, { "epoch": 0.01573, "grad_norm": 0.7833078503608704, "learning_rate": 0.003, "loss": 4.2544, "step": 1573 }, { "epoch": 0.01574, "grad_norm": 0.8009607791900635, "learning_rate": 0.003, "loss": 4.2723, "step": 1574 }, { "epoch": 0.01575, "grad_norm": 0.7152323126792908, "learning_rate": 0.003, "loss": 4.2303, "step": 1575 }, { "epoch": 0.01576, "grad_norm": 0.8425666093826294, "learning_rate": 0.003, "loss": 4.258, "step": 1576 }, { "epoch": 0.01577, "grad_norm": 0.9029209613800049, "learning_rate": 0.003, "loss": 4.2227, "step": 1577 }, { "epoch": 0.01578, "grad_norm": 0.8221943378448486, "learning_rate": 0.003, "loss": 4.2532, "step": 1578 }, { "epoch": 0.01579, "grad_norm": 0.6562032699584961, "learning_rate": 0.003, "loss": 4.2422, "step": 1579 }, { "epoch": 0.0158, "grad_norm": 0.6161283254623413, "learning_rate": 0.003, "loss": 4.2534, "step": 1580 }, { "epoch": 0.01581, "grad_norm": 0.5998492240905762, "learning_rate": 0.003, "loss": 4.2595, "step": 1581 }, { "epoch": 0.01582, "grad_norm": 0.6605640053749084, "learning_rate": 0.003, "loss": 4.2418, "step": 1582 }, { "epoch": 0.01583, "grad_norm": 0.6276175379753113, "learning_rate": 0.003, "loss": 4.2293, "step": 1583 }, { "epoch": 0.01584, "grad_norm": 0.5631775856018066, "learning_rate": 0.003, "loss": 4.2575, "step": 1584 }, { "epoch": 0.01585, "grad_norm": 0.5975695252418518, "learning_rate": 0.003, "loss": 4.2521, "step": 1585 }, { "epoch": 0.01586, "grad_norm": 0.7194235324859619, "learning_rate": 0.003, "loss": 4.229, "step": 1586 }, { "epoch": 0.01587, "grad_norm": 0.7284054160118103, "learning_rate": 0.003, "loss": 4.246, "step": 1587 }, { "epoch": 0.01588, "grad_norm": 0.6996057629585266, "learning_rate": 0.003, "loss": 4.2728, "step": 1588 }, { "epoch": 0.01589, "grad_norm": 0.5657826662063599, "learning_rate": 0.003, "loss": 4.2616, "step": 1589 }, { "epoch": 0.0159, "grad_norm": 0.6453202366828918, "learning_rate": 0.003, "loss": 4.2584, "step": 1590 }, { "epoch": 0.01591, "grad_norm": 0.5882779955863953, "learning_rate": 0.003, "loss": 4.2627, "step": 1591 }, { "epoch": 0.01592, "grad_norm": 0.5240422487258911, "learning_rate": 0.003, "loss": 4.2485, "step": 1592 }, { "epoch": 0.01593, "grad_norm": 0.5395859479904175, "learning_rate": 0.003, "loss": 4.2507, "step": 1593 }, { "epoch": 0.01594, "grad_norm": 0.5181586742401123, "learning_rate": 0.003, "loss": 4.2388, "step": 1594 }, { "epoch": 0.01595, "grad_norm": 0.5144811272621155, "learning_rate": 0.003, "loss": 4.2661, "step": 1595 }, { "epoch": 0.01596, "grad_norm": 0.5639116764068604, "learning_rate": 0.003, "loss": 4.2583, "step": 1596 }, { "epoch": 0.01597, "grad_norm": 0.616133987903595, "learning_rate": 0.003, "loss": 4.2207, "step": 1597 }, { "epoch": 0.01598, "grad_norm": 0.7612146735191345, "learning_rate": 0.003, "loss": 4.2233, "step": 1598 }, { "epoch": 0.01599, "grad_norm": 0.8201577663421631, "learning_rate": 0.003, "loss": 4.2494, "step": 1599 }, { "epoch": 0.016, "grad_norm": 0.7031198143959045, "learning_rate": 0.003, "loss": 4.2122, "step": 1600 }, { "epoch": 0.01601, "grad_norm": 0.7635224461555481, "learning_rate": 0.003, "loss": 4.2634, "step": 1601 }, { "epoch": 0.01602, "grad_norm": 0.8013474941253662, "learning_rate": 0.003, "loss": 4.2394, "step": 1602 }, { "epoch": 0.01603, "grad_norm": 0.7904717326164246, "learning_rate": 0.003, "loss": 4.2591, "step": 1603 }, { "epoch": 0.01604, "grad_norm": 0.6602920293807983, "learning_rate": 0.003, "loss": 4.2474, "step": 1604 }, { "epoch": 0.01605, "grad_norm": 0.7017102241516113, "learning_rate": 0.003, "loss": 4.2323, "step": 1605 }, { "epoch": 0.01606, "grad_norm": 0.9051344394683838, "learning_rate": 0.003, "loss": 4.2411, "step": 1606 }, { "epoch": 0.01607, "grad_norm": 0.8984688520431519, "learning_rate": 0.003, "loss": 4.2807, "step": 1607 }, { "epoch": 0.01608, "grad_norm": 0.7925190329551697, "learning_rate": 0.003, "loss": 4.2482, "step": 1608 }, { "epoch": 0.01609, "grad_norm": 0.8031415939331055, "learning_rate": 0.003, "loss": 4.2542, "step": 1609 }, { "epoch": 0.0161, "grad_norm": 0.8899487257003784, "learning_rate": 0.003, "loss": 4.2282, "step": 1610 }, { "epoch": 0.01611, "grad_norm": 0.7361058592796326, "learning_rate": 0.003, "loss": 4.2408, "step": 1611 }, { "epoch": 0.01612, "grad_norm": 0.6385438442230225, "learning_rate": 0.003, "loss": 4.2281, "step": 1612 }, { "epoch": 0.01613, "grad_norm": 0.5635811686515808, "learning_rate": 0.003, "loss": 4.2693, "step": 1613 }, { "epoch": 0.01614, "grad_norm": 0.49793457984924316, "learning_rate": 0.003, "loss": 4.2898, "step": 1614 }, { "epoch": 0.01615, "grad_norm": 0.5850211381912231, "learning_rate": 0.003, "loss": 4.2593, "step": 1615 }, { "epoch": 0.01616, "grad_norm": 0.557797908782959, "learning_rate": 0.003, "loss": 4.2378, "step": 1616 }, { "epoch": 0.01617, "grad_norm": 0.5186588764190674, "learning_rate": 0.003, "loss": 4.226, "step": 1617 }, { "epoch": 0.01618, "grad_norm": 0.5103740692138672, "learning_rate": 0.003, "loss": 4.2319, "step": 1618 }, { "epoch": 0.01619, "grad_norm": 0.4262300729751587, "learning_rate": 0.003, "loss": 4.2167, "step": 1619 }, { "epoch": 0.0162, "grad_norm": 0.43414008617401123, "learning_rate": 0.003, "loss": 4.2243, "step": 1620 }, { "epoch": 0.01621, "grad_norm": 0.47554972767829895, "learning_rate": 0.003, "loss": 4.2198, "step": 1621 }, { "epoch": 0.01622, "grad_norm": 0.49989646673202515, "learning_rate": 0.003, "loss": 4.2531, "step": 1622 }, { "epoch": 0.01623, "grad_norm": 0.514892578125, "learning_rate": 0.003, "loss": 4.2306, "step": 1623 }, { "epoch": 0.01624, "grad_norm": 0.606673538684845, "learning_rate": 0.003, "loss": 4.2463, "step": 1624 }, { "epoch": 0.01625, "grad_norm": 0.7891315817832947, "learning_rate": 0.003, "loss": 4.2309, "step": 1625 }, { "epoch": 0.01626, "grad_norm": 0.9178404808044434, "learning_rate": 0.003, "loss": 4.2593, "step": 1626 }, { "epoch": 0.01627, "grad_norm": 1.0693062543869019, "learning_rate": 0.003, "loss": 4.2387, "step": 1627 }, { "epoch": 0.01628, "grad_norm": 0.7644784450531006, "learning_rate": 0.003, "loss": 4.2438, "step": 1628 }, { "epoch": 0.01629, "grad_norm": 0.7061607241630554, "learning_rate": 0.003, "loss": 4.2389, "step": 1629 }, { "epoch": 0.0163, "grad_norm": 0.8324339985847473, "learning_rate": 0.003, "loss": 4.2364, "step": 1630 }, { "epoch": 0.01631, "grad_norm": 0.7869639992713928, "learning_rate": 0.003, "loss": 4.2544, "step": 1631 }, { "epoch": 0.01632, "grad_norm": 0.7570949792861938, "learning_rate": 0.003, "loss": 4.264, "step": 1632 }, { "epoch": 0.01633, "grad_norm": 0.8038742542266846, "learning_rate": 0.003, "loss": 4.2404, "step": 1633 }, { "epoch": 0.01634, "grad_norm": 0.7716854810714722, "learning_rate": 0.003, "loss": 4.2566, "step": 1634 }, { "epoch": 0.01635, "grad_norm": 0.6350013613700867, "learning_rate": 0.003, "loss": 4.2343, "step": 1635 }, { "epoch": 0.01636, "grad_norm": 0.5491396188735962, "learning_rate": 0.003, "loss": 4.247, "step": 1636 }, { "epoch": 0.01637, "grad_norm": 0.5923768281936646, "learning_rate": 0.003, "loss": 4.2317, "step": 1637 }, { "epoch": 0.01638, "grad_norm": 0.5720195174217224, "learning_rate": 0.003, "loss": 4.2361, "step": 1638 }, { "epoch": 0.01639, "grad_norm": 0.5404289364814758, "learning_rate": 0.003, "loss": 4.2596, "step": 1639 }, { "epoch": 0.0164, "grad_norm": 0.5601854920387268, "learning_rate": 0.003, "loss": 4.2389, "step": 1640 }, { "epoch": 0.01641, "grad_norm": 0.5230163335800171, "learning_rate": 0.003, "loss": 4.212, "step": 1641 }, { "epoch": 0.01642, "grad_norm": 0.5209521651268005, "learning_rate": 0.003, "loss": 4.2547, "step": 1642 }, { "epoch": 0.01643, "grad_norm": 0.6072290539741516, "learning_rate": 0.003, "loss": 4.2421, "step": 1643 }, { "epoch": 0.01644, "grad_norm": 0.6505742073059082, "learning_rate": 0.003, "loss": 4.2612, "step": 1644 }, { "epoch": 0.01645, "grad_norm": 0.6426966786384583, "learning_rate": 0.003, "loss": 4.2068, "step": 1645 }, { "epoch": 0.01646, "grad_norm": 0.6992736458778381, "learning_rate": 0.003, "loss": 4.2429, "step": 1646 }, { "epoch": 0.01647, "grad_norm": 0.6382941603660583, "learning_rate": 0.003, "loss": 4.2378, "step": 1647 }, { "epoch": 0.01648, "grad_norm": 0.6025996804237366, "learning_rate": 0.003, "loss": 4.2157, "step": 1648 }, { "epoch": 0.01649, "grad_norm": 0.5261057615280151, "learning_rate": 0.003, "loss": 4.2469, "step": 1649 }, { "epoch": 0.0165, "grad_norm": 0.6138892769813538, "learning_rate": 0.003, "loss": 4.2268, "step": 1650 }, { "epoch": 0.01651, "grad_norm": 0.5947723984718323, "learning_rate": 0.003, "loss": 4.2358, "step": 1651 }, { "epoch": 0.01652, "grad_norm": 0.6178387403488159, "learning_rate": 0.003, "loss": 4.1929, "step": 1652 }, { "epoch": 0.01653, "grad_norm": 0.6459247469902039, "learning_rate": 0.003, "loss": 4.2379, "step": 1653 }, { "epoch": 0.01654, "grad_norm": 0.6552532911300659, "learning_rate": 0.003, "loss": 4.2241, "step": 1654 }, { "epoch": 0.01655, "grad_norm": 0.780878484249115, "learning_rate": 0.003, "loss": 4.2406, "step": 1655 }, { "epoch": 0.01656, "grad_norm": 0.9652771949768066, "learning_rate": 0.003, "loss": 4.2288, "step": 1656 }, { "epoch": 0.01657, "grad_norm": 1.1051970720291138, "learning_rate": 0.003, "loss": 4.2586, "step": 1657 }, { "epoch": 0.01658, "grad_norm": 0.7399364709854126, "learning_rate": 0.003, "loss": 4.2318, "step": 1658 }, { "epoch": 0.01659, "grad_norm": 0.6876490116119385, "learning_rate": 0.003, "loss": 4.2377, "step": 1659 }, { "epoch": 0.0166, "grad_norm": 0.8514830470085144, "learning_rate": 0.003, "loss": 4.2247, "step": 1660 }, { "epoch": 0.01661, "grad_norm": 0.7782738208770752, "learning_rate": 0.003, "loss": 4.2268, "step": 1661 }, { "epoch": 0.01662, "grad_norm": 0.6898196339607239, "learning_rate": 0.003, "loss": 4.2305, "step": 1662 }, { "epoch": 0.01663, "grad_norm": 0.6155083179473877, "learning_rate": 0.003, "loss": 4.2454, "step": 1663 }, { "epoch": 0.01664, "grad_norm": 0.5973721742630005, "learning_rate": 0.003, "loss": 4.2626, "step": 1664 }, { "epoch": 0.01665, "grad_norm": 0.6198952794075012, "learning_rate": 0.003, "loss": 4.2273, "step": 1665 }, { "epoch": 0.01666, "grad_norm": 0.5744661092758179, "learning_rate": 0.003, "loss": 4.2297, "step": 1666 }, { "epoch": 0.01667, "grad_norm": 0.5517554879188538, "learning_rate": 0.003, "loss": 4.241, "step": 1667 }, { "epoch": 0.01668, "grad_norm": 0.6101126670837402, "learning_rate": 0.003, "loss": 4.2451, "step": 1668 }, { "epoch": 0.01669, "grad_norm": 0.6228256821632385, "learning_rate": 0.003, "loss": 4.2315, "step": 1669 }, { "epoch": 0.0167, "grad_norm": 0.5298703908920288, "learning_rate": 0.003, "loss": 4.2179, "step": 1670 }, { "epoch": 0.01671, "grad_norm": 0.5599346160888672, "learning_rate": 0.003, "loss": 4.2066, "step": 1671 }, { "epoch": 0.01672, "grad_norm": 0.6326868534088135, "learning_rate": 0.003, "loss": 4.2204, "step": 1672 }, { "epoch": 0.01673, "grad_norm": 0.566566526889801, "learning_rate": 0.003, "loss": 4.2135, "step": 1673 }, { "epoch": 0.01674, "grad_norm": 0.5534862875938416, "learning_rate": 0.003, "loss": 4.2339, "step": 1674 }, { "epoch": 0.01675, "grad_norm": 0.6360026597976685, "learning_rate": 0.003, "loss": 4.2196, "step": 1675 }, { "epoch": 0.01676, "grad_norm": 0.754280686378479, "learning_rate": 0.003, "loss": 4.2342, "step": 1676 }, { "epoch": 0.01677, "grad_norm": 0.9305175542831421, "learning_rate": 0.003, "loss": 4.2304, "step": 1677 }, { "epoch": 0.01678, "grad_norm": 0.9641466736793518, "learning_rate": 0.003, "loss": 4.2519, "step": 1678 }, { "epoch": 0.01679, "grad_norm": 0.9234570264816284, "learning_rate": 0.003, "loss": 4.2348, "step": 1679 }, { "epoch": 0.0168, "grad_norm": 0.8640843629837036, "learning_rate": 0.003, "loss": 4.2518, "step": 1680 }, { "epoch": 0.01681, "grad_norm": 0.802136242389679, "learning_rate": 0.003, "loss": 4.2519, "step": 1681 }, { "epoch": 0.01682, "grad_norm": 0.8513818383216858, "learning_rate": 0.003, "loss": 4.2466, "step": 1682 }, { "epoch": 0.01683, "grad_norm": 0.8904326558113098, "learning_rate": 0.003, "loss": 4.265, "step": 1683 }, { "epoch": 0.01684, "grad_norm": 0.8687060475349426, "learning_rate": 0.003, "loss": 4.2429, "step": 1684 }, { "epoch": 0.01685, "grad_norm": 0.8002411127090454, "learning_rate": 0.003, "loss": 4.2528, "step": 1685 }, { "epoch": 0.01686, "grad_norm": 0.8148955702781677, "learning_rate": 0.003, "loss": 4.2527, "step": 1686 }, { "epoch": 0.01687, "grad_norm": 0.8790806531906128, "learning_rate": 0.003, "loss": 4.251, "step": 1687 }, { "epoch": 0.01688, "grad_norm": 0.9963980317115784, "learning_rate": 0.003, "loss": 4.2442, "step": 1688 }, { "epoch": 0.01689, "grad_norm": 0.9348352551460266, "learning_rate": 0.003, "loss": 4.2605, "step": 1689 }, { "epoch": 0.0169, "grad_norm": 0.9107605814933777, "learning_rate": 0.003, "loss": 4.2483, "step": 1690 }, { "epoch": 0.01691, "grad_norm": 0.8572477698326111, "learning_rate": 0.003, "loss": 4.2603, "step": 1691 }, { "epoch": 0.01692, "grad_norm": 0.8438946008682251, "learning_rate": 0.003, "loss": 4.2653, "step": 1692 }, { "epoch": 0.01693, "grad_norm": 0.880574107170105, "learning_rate": 0.003, "loss": 4.2867, "step": 1693 }, { "epoch": 0.01694, "grad_norm": 0.8747463822364807, "learning_rate": 0.003, "loss": 4.2402, "step": 1694 }, { "epoch": 0.01695, "grad_norm": 0.9774526953697205, "learning_rate": 0.003, "loss": 4.2739, "step": 1695 }, { "epoch": 0.01696, "grad_norm": 1.0337826013565063, "learning_rate": 0.003, "loss": 4.2798, "step": 1696 }, { "epoch": 0.01697, "grad_norm": 0.852648138999939, "learning_rate": 0.003, "loss": 4.2511, "step": 1697 }, { "epoch": 0.01698, "grad_norm": 0.7840266227722168, "learning_rate": 0.003, "loss": 4.2388, "step": 1698 }, { "epoch": 0.01699, "grad_norm": 0.7520343065261841, "learning_rate": 0.003, "loss": 4.2503, "step": 1699 }, { "epoch": 0.017, "grad_norm": 0.686873197555542, "learning_rate": 0.003, "loss": 4.2629, "step": 1700 }, { "epoch": 0.01701, "grad_norm": 0.6256718039512634, "learning_rate": 0.003, "loss": 4.2424, "step": 1701 }, { "epoch": 0.01702, "grad_norm": 0.5732917785644531, "learning_rate": 0.003, "loss": 4.2618, "step": 1702 }, { "epoch": 0.01703, "grad_norm": 0.5769745707511902, "learning_rate": 0.003, "loss": 4.2478, "step": 1703 }, { "epoch": 0.01704, "grad_norm": 0.5204162001609802, "learning_rate": 0.003, "loss": 4.23, "step": 1704 }, { "epoch": 0.01705, "grad_norm": 0.4713651239871979, "learning_rate": 0.003, "loss": 4.2307, "step": 1705 }, { "epoch": 0.01706, "grad_norm": 0.3806135058403015, "learning_rate": 0.003, "loss": 4.2216, "step": 1706 }, { "epoch": 0.01707, "grad_norm": 0.4176587164402008, "learning_rate": 0.003, "loss": 4.2444, "step": 1707 }, { "epoch": 0.01708, "grad_norm": 0.36861827969551086, "learning_rate": 0.003, "loss": 4.2182, "step": 1708 }, { "epoch": 0.01709, "grad_norm": 0.35236138105392456, "learning_rate": 0.003, "loss": 4.239, "step": 1709 }, { "epoch": 0.0171, "grad_norm": 0.311535120010376, "learning_rate": 0.003, "loss": 4.1837, "step": 1710 }, { "epoch": 0.01711, "grad_norm": 0.3165230453014374, "learning_rate": 0.003, "loss": 4.2135, "step": 1711 }, { "epoch": 0.01712, "grad_norm": 0.3468313217163086, "learning_rate": 0.003, "loss": 4.2236, "step": 1712 }, { "epoch": 0.01713, "grad_norm": 0.36829516291618347, "learning_rate": 0.003, "loss": 4.1963, "step": 1713 }, { "epoch": 0.01714, "grad_norm": 0.37751680612564087, "learning_rate": 0.003, "loss": 4.2329, "step": 1714 }, { "epoch": 0.01715, "grad_norm": 0.40578410029411316, "learning_rate": 0.003, "loss": 4.2399, "step": 1715 }, { "epoch": 0.01716, "grad_norm": 0.46360400319099426, "learning_rate": 0.003, "loss": 4.218, "step": 1716 }, { "epoch": 0.01717, "grad_norm": 0.628455638885498, "learning_rate": 0.003, "loss": 4.2545, "step": 1717 }, { "epoch": 0.01718, "grad_norm": 0.8051602244377136, "learning_rate": 0.003, "loss": 4.2358, "step": 1718 }, { "epoch": 0.01719, "grad_norm": 0.8738026022911072, "learning_rate": 0.003, "loss": 4.2571, "step": 1719 }, { "epoch": 0.0172, "grad_norm": 0.7727829217910767, "learning_rate": 0.003, "loss": 4.2155, "step": 1720 }, { "epoch": 0.01721, "grad_norm": 0.7172210812568665, "learning_rate": 0.003, "loss": 4.2297, "step": 1721 }, { "epoch": 0.01722, "grad_norm": 0.7522033452987671, "learning_rate": 0.003, "loss": 4.2262, "step": 1722 }, { "epoch": 0.01723, "grad_norm": 0.6683304905891418, "learning_rate": 0.003, "loss": 4.2285, "step": 1723 }, { "epoch": 0.01724, "grad_norm": 0.5735771656036377, "learning_rate": 0.003, "loss": 4.2286, "step": 1724 }, { "epoch": 0.01725, "grad_norm": 0.6008380651473999, "learning_rate": 0.003, "loss": 4.2128, "step": 1725 }, { "epoch": 0.01726, "grad_norm": 0.57844078540802, "learning_rate": 0.003, "loss": 4.2055, "step": 1726 }, { "epoch": 0.01727, "grad_norm": 0.5139830112457275, "learning_rate": 0.003, "loss": 4.2082, "step": 1727 }, { "epoch": 0.01728, "grad_norm": 0.4524785876274109, "learning_rate": 0.003, "loss": 4.2098, "step": 1728 }, { "epoch": 0.01729, "grad_norm": 0.40762364864349365, "learning_rate": 0.003, "loss": 4.2149, "step": 1729 }, { "epoch": 0.0173, "grad_norm": 0.42627114057540894, "learning_rate": 0.003, "loss": 4.1889, "step": 1730 }, { "epoch": 0.01731, "grad_norm": 0.41734832525253296, "learning_rate": 0.003, "loss": 4.208, "step": 1731 }, { "epoch": 0.01732, "grad_norm": 0.46209970116615295, "learning_rate": 0.003, "loss": 4.2163, "step": 1732 }, { "epoch": 0.01733, "grad_norm": 0.5026668906211853, "learning_rate": 0.003, "loss": 4.1988, "step": 1733 }, { "epoch": 0.01734, "grad_norm": 0.4639485478401184, "learning_rate": 0.003, "loss": 4.2076, "step": 1734 }, { "epoch": 0.01735, "grad_norm": 0.5392582416534424, "learning_rate": 0.003, "loss": 4.1815, "step": 1735 }, { "epoch": 0.01736, "grad_norm": 0.6344774961471558, "learning_rate": 0.003, "loss": 4.186, "step": 1736 }, { "epoch": 0.01737, "grad_norm": 0.6952452063560486, "learning_rate": 0.003, "loss": 4.2203, "step": 1737 }, { "epoch": 0.01738, "grad_norm": 0.7636409997940063, "learning_rate": 0.003, "loss": 4.2103, "step": 1738 }, { "epoch": 0.01739, "grad_norm": 0.8100059628486633, "learning_rate": 0.003, "loss": 4.2313, "step": 1739 }, { "epoch": 0.0174, "grad_norm": 0.7574781775474548, "learning_rate": 0.003, "loss": 4.2287, "step": 1740 }, { "epoch": 0.01741, "grad_norm": 0.8603562116622925, "learning_rate": 0.003, "loss": 4.2408, "step": 1741 }, { "epoch": 0.01742, "grad_norm": 1.0610615015029907, "learning_rate": 0.003, "loss": 4.2403, "step": 1742 }, { "epoch": 0.01743, "grad_norm": 0.9020041823387146, "learning_rate": 0.003, "loss": 4.2526, "step": 1743 }, { "epoch": 0.01744, "grad_norm": 0.7726603746414185, "learning_rate": 0.003, "loss": 4.2302, "step": 1744 }, { "epoch": 0.01745, "grad_norm": 0.842292070388794, "learning_rate": 0.003, "loss": 4.1998, "step": 1745 }, { "epoch": 0.01746, "grad_norm": 0.8620842099189758, "learning_rate": 0.003, "loss": 4.2285, "step": 1746 }, { "epoch": 0.01747, "grad_norm": 0.8984124660491943, "learning_rate": 0.003, "loss": 4.2309, "step": 1747 }, { "epoch": 0.01748, "grad_norm": 0.7890809178352356, "learning_rate": 0.003, "loss": 4.2184, "step": 1748 }, { "epoch": 0.01749, "grad_norm": 0.9051846861839294, "learning_rate": 0.003, "loss": 4.2199, "step": 1749 }, { "epoch": 0.0175, "grad_norm": 0.8282995820045471, "learning_rate": 0.003, "loss": 4.2356, "step": 1750 }, { "epoch": 0.01751, "grad_norm": 0.7147263884544373, "learning_rate": 0.003, "loss": 4.2187, "step": 1751 }, { "epoch": 0.01752, "grad_norm": 0.8123989701271057, "learning_rate": 0.003, "loss": 4.269, "step": 1752 }, { "epoch": 0.01753, "grad_norm": 0.8011901378631592, "learning_rate": 0.003, "loss": 4.2178, "step": 1753 }, { "epoch": 0.01754, "grad_norm": 0.6684347987174988, "learning_rate": 0.003, "loss": 4.2455, "step": 1754 }, { "epoch": 0.01755, "grad_norm": 0.5351077318191528, "learning_rate": 0.003, "loss": 4.1816, "step": 1755 }, { "epoch": 0.01756, "grad_norm": 0.5229023098945618, "learning_rate": 0.003, "loss": 4.2561, "step": 1756 }, { "epoch": 0.01757, "grad_norm": 0.6237030029296875, "learning_rate": 0.003, "loss": 4.2403, "step": 1757 }, { "epoch": 0.01758, "grad_norm": 0.5928184986114502, "learning_rate": 0.003, "loss": 4.2396, "step": 1758 }, { "epoch": 0.01759, "grad_norm": 0.5645509362220764, "learning_rate": 0.003, "loss": 4.2107, "step": 1759 }, { "epoch": 0.0176, "grad_norm": 0.6238541603088379, "learning_rate": 0.003, "loss": 4.2329, "step": 1760 }, { "epoch": 0.01761, "grad_norm": 0.8237802982330322, "learning_rate": 0.003, "loss": 4.2266, "step": 1761 }, { "epoch": 0.01762, "grad_norm": 1.0283610820770264, "learning_rate": 0.003, "loss": 4.2546, "step": 1762 }, { "epoch": 0.01763, "grad_norm": 0.9172603487968445, "learning_rate": 0.003, "loss": 4.2466, "step": 1763 }, { "epoch": 0.01764, "grad_norm": 0.7957273721694946, "learning_rate": 0.003, "loss": 4.2112, "step": 1764 }, { "epoch": 0.01765, "grad_norm": 0.7934739589691162, "learning_rate": 0.003, "loss": 4.2362, "step": 1765 }, { "epoch": 0.01766, "grad_norm": 0.8055094480514526, "learning_rate": 0.003, "loss": 4.2499, "step": 1766 }, { "epoch": 0.01767, "grad_norm": 0.7152431011199951, "learning_rate": 0.003, "loss": 4.21, "step": 1767 }, { "epoch": 0.01768, "grad_norm": 0.6750852465629578, "learning_rate": 0.003, "loss": 4.2147, "step": 1768 }, { "epoch": 0.01769, "grad_norm": 0.62043297290802, "learning_rate": 0.003, "loss": 4.2126, "step": 1769 }, { "epoch": 0.0177, "grad_norm": 0.7558618783950806, "learning_rate": 0.003, "loss": 4.2018, "step": 1770 }, { "epoch": 0.01771, "grad_norm": 0.7306205034255981, "learning_rate": 0.003, "loss": 4.235, "step": 1771 }, { "epoch": 0.01772, "grad_norm": 0.5451275706291199, "learning_rate": 0.003, "loss": 4.1989, "step": 1772 }, { "epoch": 0.01773, "grad_norm": 0.5318989157676697, "learning_rate": 0.003, "loss": 4.1925, "step": 1773 }, { "epoch": 0.01774, "grad_norm": 0.5252489447593689, "learning_rate": 0.003, "loss": 4.2149, "step": 1774 }, { "epoch": 0.01775, "grad_norm": 0.4652414321899414, "learning_rate": 0.003, "loss": 4.1972, "step": 1775 }, { "epoch": 0.01776, "grad_norm": 0.4579496681690216, "learning_rate": 0.003, "loss": 4.1905, "step": 1776 }, { "epoch": 0.01777, "grad_norm": 0.46492844820022583, "learning_rate": 0.003, "loss": 4.2099, "step": 1777 }, { "epoch": 0.01778, "grad_norm": 0.46276724338531494, "learning_rate": 0.003, "loss": 4.2021, "step": 1778 }, { "epoch": 0.01779, "grad_norm": 0.6184820532798767, "learning_rate": 0.003, "loss": 4.2227, "step": 1779 }, { "epoch": 0.0178, "grad_norm": 0.7079681158065796, "learning_rate": 0.003, "loss": 4.1963, "step": 1780 }, { "epoch": 0.01781, "grad_norm": 0.6819452047348022, "learning_rate": 0.003, "loss": 4.2105, "step": 1781 }, { "epoch": 0.01782, "grad_norm": 0.6064656376838684, "learning_rate": 0.003, "loss": 4.2325, "step": 1782 }, { "epoch": 0.01783, "grad_norm": 0.5331481695175171, "learning_rate": 0.003, "loss": 4.2083, "step": 1783 }, { "epoch": 0.01784, "grad_norm": 0.5265206098556519, "learning_rate": 0.003, "loss": 4.2093, "step": 1784 }, { "epoch": 0.01785, "grad_norm": 0.5389939546585083, "learning_rate": 0.003, "loss": 4.1711, "step": 1785 }, { "epoch": 0.01786, "grad_norm": 0.7006595134735107, "learning_rate": 0.003, "loss": 4.2034, "step": 1786 }, { "epoch": 0.01787, "grad_norm": 0.7858084440231323, "learning_rate": 0.003, "loss": 4.2233, "step": 1787 }, { "epoch": 0.01788, "grad_norm": 0.7923018336296082, "learning_rate": 0.003, "loss": 4.228, "step": 1788 }, { "epoch": 0.01789, "grad_norm": 0.593018651008606, "learning_rate": 0.003, "loss": 4.1934, "step": 1789 }, { "epoch": 0.0179, "grad_norm": 0.6494895815849304, "learning_rate": 0.003, "loss": 4.2138, "step": 1790 }, { "epoch": 0.01791, "grad_norm": 0.7846472859382629, "learning_rate": 0.003, "loss": 4.2301, "step": 1791 }, { "epoch": 0.01792, "grad_norm": 0.8576614260673523, "learning_rate": 0.003, "loss": 4.239, "step": 1792 }, { "epoch": 0.01793, "grad_norm": 0.8833402991294861, "learning_rate": 0.003, "loss": 4.2068, "step": 1793 }, { "epoch": 0.01794, "grad_norm": 1.0114383697509766, "learning_rate": 0.003, "loss": 4.2273, "step": 1794 }, { "epoch": 0.01795, "grad_norm": 0.879726231098175, "learning_rate": 0.003, "loss": 4.2331, "step": 1795 }, { "epoch": 0.01796, "grad_norm": 0.6577330827713013, "learning_rate": 0.003, "loss": 4.2394, "step": 1796 }, { "epoch": 0.01797, "grad_norm": 0.7508055567741394, "learning_rate": 0.003, "loss": 4.2031, "step": 1797 }, { "epoch": 0.01798, "grad_norm": 0.7287778854370117, "learning_rate": 0.003, "loss": 4.2178, "step": 1798 }, { "epoch": 0.01799, "grad_norm": 0.7758158445358276, "learning_rate": 0.003, "loss": 4.2381, "step": 1799 }, { "epoch": 0.018, "grad_norm": 0.652176558971405, "learning_rate": 0.003, "loss": 4.2204, "step": 1800 }, { "epoch": 0.01801, "grad_norm": 0.6343629360198975, "learning_rate": 0.003, "loss": 4.2139, "step": 1801 }, { "epoch": 0.01802, "grad_norm": 0.6776105165481567, "learning_rate": 0.003, "loss": 4.2317, "step": 1802 }, { "epoch": 0.01803, "grad_norm": 0.7177024483680725, "learning_rate": 0.003, "loss": 4.2423, "step": 1803 }, { "epoch": 0.01804, "grad_norm": 0.8185440301895142, "learning_rate": 0.003, "loss": 4.2159, "step": 1804 }, { "epoch": 0.01805, "grad_norm": 0.8301059603691101, "learning_rate": 0.003, "loss": 4.2394, "step": 1805 }, { "epoch": 0.01806, "grad_norm": 0.7407498359680176, "learning_rate": 0.003, "loss": 4.191, "step": 1806 }, { "epoch": 0.01807, "grad_norm": 0.7113837003707886, "learning_rate": 0.003, "loss": 4.2277, "step": 1807 }, { "epoch": 0.01808, "grad_norm": 0.5918108224868774, "learning_rate": 0.003, "loss": 4.2149, "step": 1808 }, { "epoch": 0.01809, "grad_norm": 0.5566163659095764, "learning_rate": 0.003, "loss": 4.2318, "step": 1809 }, { "epoch": 0.0181, "grad_norm": 0.4694720506668091, "learning_rate": 0.003, "loss": 4.224, "step": 1810 }, { "epoch": 0.01811, "grad_norm": 0.45558327436447144, "learning_rate": 0.003, "loss": 4.2076, "step": 1811 }, { "epoch": 0.01812, "grad_norm": 0.4792705476284027, "learning_rate": 0.003, "loss": 4.1941, "step": 1812 }, { "epoch": 0.01813, "grad_norm": 0.43152645230293274, "learning_rate": 0.003, "loss": 4.2329, "step": 1813 }, { "epoch": 0.01814, "grad_norm": 0.4722409248352051, "learning_rate": 0.003, "loss": 4.2001, "step": 1814 }, { "epoch": 0.01815, "grad_norm": 0.516764760017395, "learning_rate": 0.003, "loss": 4.1882, "step": 1815 }, { "epoch": 0.01816, "grad_norm": 0.6370428800582886, "learning_rate": 0.003, "loss": 4.2183, "step": 1816 }, { "epoch": 0.01817, "grad_norm": 0.7442272305488586, "learning_rate": 0.003, "loss": 4.2141, "step": 1817 }, { "epoch": 0.01818, "grad_norm": 0.7647615671157837, "learning_rate": 0.003, "loss": 4.2145, "step": 1818 }, { "epoch": 0.01819, "grad_norm": 0.683918833732605, "learning_rate": 0.003, "loss": 4.1944, "step": 1819 }, { "epoch": 0.0182, "grad_norm": 0.6988905072212219, "learning_rate": 0.003, "loss": 4.2155, "step": 1820 }, { "epoch": 0.01821, "grad_norm": 0.632123589515686, "learning_rate": 0.003, "loss": 4.1997, "step": 1821 }, { "epoch": 0.01822, "grad_norm": 0.5962942838668823, "learning_rate": 0.003, "loss": 4.2102, "step": 1822 }, { "epoch": 0.01823, "grad_norm": 0.5034074783325195, "learning_rate": 0.003, "loss": 4.2224, "step": 1823 }, { "epoch": 0.01824, "grad_norm": 0.4946107268333435, "learning_rate": 0.003, "loss": 4.2025, "step": 1824 }, { "epoch": 0.01825, "grad_norm": 0.5331458449363708, "learning_rate": 0.003, "loss": 4.2091, "step": 1825 }, { "epoch": 0.01826, "grad_norm": 0.4815158247947693, "learning_rate": 0.003, "loss": 4.2164, "step": 1826 }, { "epoch": 0.01827, "grad_norm": 0.5388825535774231, "learning_rate": 0.003, "loss": 4.1869, "step": 1827 }, { "epoch": 0.01828, "grad_norm": 0.6578684449195862, "learning_rate": 0.003, "loss": 4.2037, "step": 1828 }, { "epoch": 0.01829, "grad_norm": 0.7422965168952942, "learning_rate": 0.003, "loss": 4.2232, "step": 1829 }, { "epoch": 0.0183, "grad_norm": 0.7160323262214661, "learning_rate": 0.003, "loss": 4.2196, "step": 1830 }, { "epoch": 0.01831, "grad_norm": 0.7727608680725098, "learning_rate": 0.003, "loss": 4.2097, "step": 1831 }, { "epoch": 0.01832, "grad_norm": 0.9163352847099304, "learning_rate": 0.003, "loss": 4.2196, "step": 1832 }, { "epoch": 0.01833, "grad_norm": 0.7811264395713806, "learning_rate": 0.003, "loss": 4.2469, "step": 1833 }, { "epoch": 0.01834, "grad_norm": 0.7502668499946594, "learning_rate": 0.003, "loss": 4.214, "step": 1834 }, { "epoch": 0.01835, "grad_norm": 0.7814200520515442, "learning_rate": 0.003, "loss": 4.2163, "step": 1835 }, { "epoch": 0.01836, "grad_norm": 0.7723317742347717, "learning_rate": 0.003, "loss": 4.2127, "step": 1836 }, { "epoch": 0.01837, "grad_norm": 0.7659316062927246, "learning_rate": 0.003, "loss": 4.2181, "step": 1837 }, { "epoch": 0.01838, "grad_norm": 0.789987325668335, "learning_rate": 0.003, "loss": 4.2248, "step": 1838 }, { "epoch": 0.01839, "grad_norm": 0.8428659439086914, "learning_rate": 0.003, "loss": 4.2435, "step": 1839 }, { "epoch": 0.0184, "grad_norm": 0.9034631252288818, "learning_rate": 0.003, "loss": 4.2468, "step": 1840 }, { "epoch": 0.01841, "grad_norm": 0.8657104969024658, "learning_rate": 0.003, "loss": 4.1845, "step": 1841 }, { "epoch": 0.01842, "grad_norm": 0.8507790565490723, "learning_rate": 0.003, "loss": 4.2207, "step": 1842 }, { "epoch": 0.01843, "grad_norm": 0.942220151424408, "learning_rate": 0.003, "loss": 4.2062, "step": 1843 }, { "epoch": 0.01844, "grad_norm": 1.0908007621765137, "learning_rate": 0.003, "loss": 4.2172, "step": 1844 }, { "epoch": 0.01845, "grad_norm": 1.0858932733535767, "learning_rate": 0.003, "loss": 4.2239, "step": 1845 }, { "epoch": 0.01846, "grad_norm": 0.8852041959762573, "learning_rate": 0.003, "loss": 4.2518, "step": 1846 }, { "epoch": 0.01847, "grad_norm": 0.818362295627594, "learning_rate": 0.003, "loss": 4.243, "step": 1847 }, { "epoch": 0.01848, "grad_norm": 0.8533189296722412, "learning_rate": 0.003, "loss": 4.2588, "step": 1848 }, { "epoch": 0.01849, "grad_norm": 0.9039400219917297, "learning_rate": 0.003, "loss": 4.2557, "step": 1849 }, { "epoch": 0.0185, "grad_norm": 0.769622802734375, "learning_rate": 0.003, "loss": 4.2424, "step": 1850 }, { "epoch": 0.01851, "grad_norm": 0.6524956226348877, "learning_rate": 0.003, "loss": 4.2169, "step": 1851 }, { "epoch": 0.01852, "grad_norm": 0.6401670575141907, "learning_rate": 0.003, "loss": 4.2199, "step": 1852 }, { "epoch": 0.01853, "grad_norm": 0.593191921710968, "learning_rate": 0.003, "loss": 4.2181, "step": 1853 }, { "epoch": 0.01854, "grad_norm": 0.47404181957244873, "learning_rate": 0.003, "loss": 4.2422, "step": 1854 }, { "epoch": 0.01855, "grad_norm": 0.3873991072177887, "learning_rate": 0.003, "loss": 4.2245, "step": 1855 }, { "epoch": 0.01856, "grad_norm": 0.3870847821235657, "learning_rate": 0.003, "loss": 4.2074, "step": 1856 }, { "epoch": 0.01857, "grad_norm": 0.41644003987312317, "learning_rate": 0.003, "loss": 4.19, "step": 1857 }, { "epoch": 0.01858, "grad_norm": 0.380154550075531, "learning_rate": 0.003, "loss": 4.196, "step": 1858 }, { "epoch": 0.01859, "grad_norm": 0.39841675758361816, "learning_rate": 0.003, "loss": 4.1921, "step": 1859 }, { "epoch": 0.0186, "grad_norm": 0.3521161377429962, "learning_rate": 0.003, "loss": 4.1722, "step": 1860 }, { "epoch": 0.01861, "grad_norm": 0.41872426867485046, "learning_rate": 0.003, "loss": 4.2174, "step": 1861 }, { "epoch": 0.01862, "grad_norm": 0.47171056270599365, "learning_rate": 0.003, "loss": 4.1831, "step": 1862 }, { "epoch": 0.01863, "grad_norm": 0.5940234661102295, "learning_rate": 0.003, "loss": 4.1933, "step": 1863 }, { "epoch": 0.01864, "grad_norm": 0.659092366695404, "learning_rate": 0.003, "loss": 4.2305, "step": 1864 }, { "epoch": 0.01865, "grad_norm": 0.6307891607284546, "learning_rate": 0.003, "loss": 4.2044, "step": 1865 }, { "epoch": 0.01866, "grad_norm": 0.46219319105148315, "learning_rate": 0.003, "loss": 4.1993, "step": 1866 }, { "epoch": 0.01867, "grad_norm": 0.5417385697364807, "learning_rate": 0.003, "loss": 4.1771, "step": 1867 }, { "epoch": 0.01868, "grad_norm": 0.6495358943939209, "learning_rate": 0.003, "loss": 4.197, "step": 1868 }, { "epoch": 0.01869, "grad_norm": 0.6564030647277832, "learning_rate": 0.003, "loss": 4.2127, "step": 1869 }, { "epoch": 0.0187, "grad_norm": 0.6381605267524719, "learning_rate": 0.003, "loss": 4.2029, "step": 1870 }, { "epoch": 0.01871, "grad_norm": 0.6345733404159546, "learning_rate": 0.003, "loss": 4.1787, "step": 1871 }, { "epoch": 0.01872, "grad_norm": 0.6442919969558716, "learning_rate": 0.003, "loss": 4.2256, "step": 1872 }, { "epoch": 0.01873, "grad_norm": 0.6332488656044006, "learning_rate": 0.003, "loss": 4.1849, "step": 1873 }, { "epoch": 0.01874, "grad_norm": 0.6798062324523926, "learning_rate": 0.003, "loss": 4.191, "step": 1874 }, { "epoch": 0.01875, "grad_norm": 0.6715080738067627, "learning_rate": 0.003, "loss": 4.2097, "step": 1875 }, { "epoch": 0.01876, "grad_norm": 0.7180529832839966, "learning_rate": 0.003, "loss": 4.2178, "step": 1876 }, { "epoch": 0.01877, "grad_norm": 0.663026750087738, "learning_rate": 0.003, "loss": 4.2178, "step": 1877 }, { "epoch": 0.01878, "grad_norm": 0.7030567526817322, "learning_rate": 0.003, "loss": 4.2041, "step": 1878 }, { "epoch": 0.01879, "grad_norm": 0.5851782560348511, "learning_rate": 0.003, "loss": 4.1815, "step": 1879 }, { "epoch": 0.0188, "grad_norm": 0.6897128820419312, "learning_rate": 0.003, "loss": 4.1941, "step": 1880 }, { "epoch": 0.01881, "grad_norm": 0.7664402723312378, "learning_rate": 0.003, "loss": 4.1954, "step": 1881 }, { "epoch": 0.01882, "grad_norm": 0.9874089360237122, "learning_rate": 0.003, "loss": 4.2309, "step": 1882 }, { "epoch": 0.01883, "grad_norm": 1.0450137853622437, "learning_rate": 0.003, "loss": 4.2229, "step": 1883 }, { "epoch": 0.01884, "grad_norm": 0.8142443895339966, "learning_rate": 0.003, "loss": 4.2104, "step": 1884 }, { "epoch": 0.01885, "grad_norm": 0.6901335120201111, "learning_rate": 0.003, "loss": 4.1818, "step": 1885 }, { "epoch": 0.01886, "grad_norm": 0.7235998511314392, "learning_rate": 0.003, "loss": 4.224, "step": 1886 }, { "epoch": 0.01887, "grad_norm": 0.7312121987342834, "learning_rate": 0.003, "loss": 4.2148, "step": 1887 }, { "epoch": 0.01888, "grad_norm": 0.772678017616272, "learning_rate": 0.003, "loss": 4.2048, "step": 1888 }, { "epoch": 0.01889, "grad_norm": 0.7756654024124146, "learning_rate": 0.003, "loss": 4.2176, "step": 1889 }, { "epoch": 0.0189, "grad_norm": 0.7287212610244751, "learning_rate": 0.003, "loss": 4.1971, "step": 1890 }, { "epoch": 0.01891, "grad_norm": 0.7076724767684937, "learning_rate": 0.003, "loss": 4.2462, "step": 1891 }, { "epoch": 0.01892, "grad_norm": 0.6671635508537292, "learning_rate": 0.003, "loss": 4.215, "step": 1892 }, { "epoch": 0.01893, "grad_norm": 0.6953762769699097, "learning_rate": 0.003, "loss": 4.201, "step": 1893 }, { "epoch": 0.01894, "grad_norm": 0.5940777063369751, "learning_rate": 0.003, "loss": 4.2018, "step": 1894 }, { "epoch": 0.01895, "grad_norm": 0.5938246250152588, "learning_rate": 0.003, "loss": 4.1879, "step": 1895 }, { "epoch": 0.01896, "grad_norm": 0.6224337816238403, "learning_rate": 0.003, "loss": 4.2109, "step": 1896 }, { "epoch": 0.01897, "grad_norm": 0.5562435388565063, "learning_rate": 0.003, "loss": 4.2071, "step": 1897 }, { "epoch": 0.01898, "grad_norm": 0.5165086984634399, "learning_rate": 0.003, "loss": 4.1748, "step": 1898 }, { "epoch": 0.01899, "grad_norm": 0.46164852380752563, "learning_rate": 0.003, "loss": 4.1881, "step": 1899 }, { "epoch": 0.019, "grad_norm": 0.42392870783805847, "learning_rate": 0.003, "loss": 4.179, "step": 1900 }, { "epoch": 0.01901, "grad_norm": 0.41818806529045105, "learning_rate": 0.003, "loss": 4.2264, "step": 1901 }, { "epoch": 0.01902, "grad_norm": 0.5019712448120117, "learning_rate": 0.003, "loss": 4.2029, "step": 1902 }, { "epoch": 0.01903, "grad_norm": 0.6581493020057678, "learning_rate": 0.003, "loss": 4.1638, "step": 1903 }, { "epoch": 0.01904, "grad_norm": 0.9066566228866577, "learning_rate": 0.003, "loss": 4.2291, "step": 1904 }, { "epoch": 0.01905, "grad_norm": 0.9273806214332581, "learning_rate": 0.003, "loss": 4.2144, "step": 1905 }, { "epoch": 0.01906, "grad_norm": 0.6842753291130066, "learning_rate": 0.003, "loss": 4.2198, "step": 1906 }, { "epoch": 0.01907, "grad_norm": 0.7864841818809509, "learning_rate": 0.003, "loss": 4.217, "step": 1907 }, { "epoch": 0.01908, "grad_norm": 0.9169145822525024, "learning_rate": 0.003, "loss": 4.2438, "step": 1908 }, { "epoch": 0.01909, "grad_norm": 0.7539937496185303, "learning_rate": 0.003, "loss": 4.1979, "step": 1909 }, { "epoch": 0.0191, "grad_norm": 1.0344467163085938, "learning_rate": 0.003, "loss": 4.2215, "step": 1910 }, { "epoch": 0.01911, "grad_norm": 0.8631539940834045, "learning_rate": 0.003, "loss": 4.1974, "step": 1911 }, { "epoch": 0.01912, "grad_norm": 0.691514790058136, "learning_rate": 0.003, "loss": 4.1992, "step": 1912 }, { "epoch": 0.01913, "grad_norm": 0.6569962501525879, "learning_rate": 0.003, "loss": 4.2167, "step": 1913 }, { "epoch": 0.01914, "grad_norm": 0.7453441619873047, "learning_rate": 0.003, "loss": 4.2353, "step": 1914 }, { "epoch": 0.01915, "grad_norm": 0.6612529754638672, "learning_rate": 0.003, "loss": 4.2154, "step": 1915 }, { "epoch": 0.01916, "grad_norm": 0.5441082119941711, "learning_rate": 0.003, "loss": 4.1923, "step": 1916 }, { "epoch": 0.01917, "grad_norm": 0.5467451214790344, "learning_rate": 0.003, "loss": 4.2117, "step": 1917 }, { "epoch": 0.01918, "grad_norm": 0.6295916438102722, "learning_rate": 0.003, "loss": 4.1987, "step": 1918 }, { "epoch": 0.01919, "grad_norm": 0.6276860237121582, "learning_rate": 0.003, "loss": 4.2016, "step": 1919 }, { "epoch": 0.0192, "grad_norm": 0.6265702247619629, "learning_rate": 0.003, "loss": 4.2023, "step": 1920 }, { "epoch": 0.01921, "grad_norm": 0.6072244644165039, "learning_rate": 0.003, "loss": 4.1924, "step": 1921 }, { "epoch": 0.01922, "grad_norm": 0.577560305595398, "learning_rate": 0.003, "loss": 4.2062, "step": 1922 }, { "epoch": 0.01923, "grad_norm": 0.528586208820343, "learning_rate": 0.003, "loss": 4.2098, "step": 1923 }, { "epoch": 0.01924, "grad_norm": 0.437764436006546, "learning_rate": 0.003, "loss": 4.2065, "step": 1924 }, { "epoch": 0.01925, "grad_norm": 0.3857106566429138, "learning_rate": 0.003, "loss": 4.1935, "step": 1925 }, { "epoch": 0.01926, "grad_norm": 0.4072262942790985, "learning_rate": 0.003, "loss": 4.2001, "step": 1926 }, { "epoch": 0.01927, "grad_norm": 0.5231996774673462, "learning_rate": 0.003, "loss": 4.1922, "step": 1927 }, { "epoch": 0.01928, "grad_norm": 0.6032196283340454, "learning_rate": 0.003, "loss": 4.1716, "step": 1928 }, { "epoch": 0.01929, "grad_norm": 0.7323647737503052, "learning_rate": 0.003, "loss": 4.1937, "step": 1929 }, { "epoch": 0.0193, "grad_norm": 0.7953292727470398, "learning_rate": 0.003, "loss": 4.1919, "step": 1930 }, { "epoch": 0.01931, "grad_norm": 0.6991308927536011, "learning_rate": 0.003, "loss": 4.1873, "step": 1931 }, { "epoch": 0.01932, "grad_norm": 0.6141433715820312, "learning_rate": 0.003, "loss": 4.1945, "step": 1932 }, { "epoch": 0.01933, "grad_norm": 0.6607575416564941, "learning_rate": 0.003, "loss": 4.1875, "step": 1933 }, { "epoch": 0.01934, "grad_norm": 0.665708065032959, "learning_rate": 0.003, "loss": 4.1913, "step": 1934 }, { "epoch": 0.01935, "grad_norm": 0.6778123378753662, "learning_rate": 0.003, "loss": 4.2028, "step": 1935 }, { "epoch": 0.01936, "grad_norm": 0.660306453704834, "learning_rate": 0.003, "loss": 4.188, "step": 1936 }, { "epoch": 0.01937, "grad_norm": 0.6556859612464905, "learning_rate": 0.003, "loss": 4.201, "step": 1937 }, { "epoch": 0.01938, "grad_norm": 0.6403252482414246, "learning_rate": 0.003, "loss": 4.1998, "step": 1938 }, { "epoch": 0.01939, "grad_norm": 0.6520726084709167, "learning_rate": 0.003, "loss": 4.2214, "step": 1939 }, { "epoch": 0.0194, "grad_norm": 0.6492891907691956, "learning_rate": 0.003, "loss": 4.1979, "step": 1940 }, { "epoch": 0.01941, "grad_norm": 0.6015598773956299, "learning_rate": 0.003, "loss": 4.18, "step": 1941 }, { "epoch": 0.01942, "grad_norm": 0.5949015021324158, "learning_rate": 0.003, "loss": 4.2027, "step": 1942 }, { "epoch": 0.01943, "grad_norm": 0.5752372145652771, "learning_rate": 0.003, "loss": 4.1757, "step": 1943 }, { "epoch": 0.01944, "grad_norm": 0.5773752331733704, "learning_rate": 0.003, "loss": 4.2158, "step": 1944 }, { "epoch": 0.01945, "grad_norm": 0.6772210597991943, "learning_rate": 0.003, "loss": 4.1897, "step": 1945 }, { "epoch": 0.01946, "grad_norm": 0.817459225654602, "learning_rate": 0.003, "loss": 4.1898, "step": 1946 }, { "epoch": 0.01947, "grad_norm": 0.9603843092918396, "learning_rate": 0.003, "loss": 4.2053, "step": 1947 }, { "epoch": 0.01948, "grad_norm": 0.9629886746406555, "learning_rate": 0.003, "loss": 4.2304, "step": 1948 }, { "epoch": 0.01949, "grad_norm": 1.0253958702087402, "learning_rate": 0.003, "loss": 4.2301, "step": 1949 }, { "epoch": 0.0195, "grad_norm": 1.0213903188705444, "learning_rate": 0.003, "loss": 4.2249, "step": 1950 }, { "epoch": 0.01951, "grad_norm": 0.7116432189941406, "learning_rate": 0.003, "loss": 4.2054, "step": 1951 }, { "epoch": 0.01952, "grad_norm": 0.6784964203834534, "learning_rate": 0.003, "loss": 4.2146, "step": 1952 }, { "epoch": 0.01953, "grad_norm": 0.755315899848938, "learning_rate": 0.003, "loss": 4.2206, "step": 1953 }, { "epoch": 0.01954, "grad_norm": 0.8843002319335938, "learning_rate": 0.003, "loss": 4.2306, "step": 1954 }, { "epoch": 0.01955, "grad_norm": 0.9285549521446228, "learning_rate": 0.003, "loss": 4.2258, "step": 1955 }, { "epoch": 0.01956, "grad_norm": 0.7131898999214172, "learning_rate": 0.003, "loss": 4.2135, "step": 1956 }, { "epoch": 0.01957, "grad_norm": 0.7296762466430664, "learning_rate": 0.003, "loss": 4.1819, "step": 1957 }, { "epoch": 0.01958, "grad_norm": 0.76239013671875, "learning_rate": 0.003, "loss": 4.1787, "step": 1958 }, { "epoch": 0.01959, "grad_norm": 0.7252810001373291, "learning_rate": 0.003, "loss": 4.2032, "step": 1959 }, { "epoch": 0.0196, "grad_norm": 0.7949764132499695, "learning_rate": 0.003, "loss": 4.1992, "step": 1960 }, { "epoch": 0.01961, "grad_norm": 0.7223983407020569, "learning_rate": 0.003, "loss": 4.197, "step": 1961 }, { "epoch": 0.01962, "grad_norm": 0.683202862739563, "learning_rate": 0.003, "loss": 4.1965, "step": 1962 }, { "epoch": 0.01963, "grad_norm": 0.5978757739067078, "learning_rate": 0.003, "loss": 4.1801, "step": 1963 }, { "epoch": 0.01964, "grad_norm": 0.6187034249305725, "learning_rate": 0.003, "loss": 4.2107, "step": 1964 }, { "epoch": 0.01965, "grad_norm": 0.5564098358154297, "learning_rate": 0.003, "loss": 4.1988, "step": 1965 }, { "epoch": 0.01966, "grad_norm": 0.6056374311447144, "learning_rate": 0.003, "loss": 4.2015, "step": 1966 }, { "epoch": 0.01967, "grad_norm": 0.5678548812866211, "learning_rate": 0.003, "loss": 4.174, "step": 1967 }, { "epoch": 0.01968, "grad_norm": 0.5833330750465393, "learning_rate": 0.003, "loss": 4.1938, "step": 1968 }, { "epoch": 0.01969, "grad_norm": 0.681247889995575, "learning_rate": 0.003, "loss": 4.1923, "step": 1969 }, { "epoch": 0.0197, "grad_norm": 0.6020233035087585, "learning_rate": 0.003, "loss": 4.1923, "step": 1970 }, { "epoch": 0.01971, "grad_norm": 0.586289644241333, "learning_rate": 0.003, "loss": 4.1784, "step": 1971 }, { "epoch": 0.01972, "grad_norm": 0.6138716340065002, "learning_rate": 0.003, "loss": 4.1822, "step": 1972 }, { "epoch": 0.01973, "grad_norm": 0.6565843820571899, "learning_rate": 0.003, "loss": 4.1649, "step": 1973 }, { "epoch": 0.01974, "grad_norm": 0.7141625881195068, "learning_rate": 0.003, "loss": 4.2062, "step": 1974 }, { "epoch": 0.01975, "grad_norm": 0.9425281882286072, "learning_rate": 0.003, "loss": 4.2065, "step": 1975 }, { "epoch": 0.01976, "grad_norm": 1.02041757106781, "learning_rate": 0.003, "loss": 4.2127, "step": 1976 }, { "epoch": 0.01977, "grad_norm": 0.7708873748779297, "learning_rate": 0.003, "loss": 4.23, "step": 1977 }, { "epoch": 0.01978, "grad_norm": 0.6835176348686218, "learning_rate": 0.003, "loss": 4.2285, "step": 1978 }, { "epoch": 0.01979, "grad_norm": 0.6171081066131592, "learning_rate": 0.003, "loss": 4.2144, "step": 1979 }, { "epoch": 0.0198, "grad_norm": 0.5269233584403992, "learning_rate": 0.003, "loss": 4.1914, "step": 1980 }, { "epoch": 0.01981, "grad_norm": 0.5263850092887878, "learning_rate": 0.003, "loss": 4.175, "step": 1981 }, { "epoch": 0.01982, "grad_norm": 0.50020831823349, "learning_rate": 0.003, "loss": 4.1986, "step": 1982 }, { "epoch": 0.01983, "grad_norm": 0.5846083760261536, "learning_rate": 0.003, "loss": 4.194, "step": 1983 }, { "epoch": 0.01984, "grad_norm": 0.6941750645637512, "learning_rate": 0.003, "loss": 4.2073, "step": 1984 }, { "epoch": 0.01985, "grad_norm": 0.6278181672096252, "learning_rate": 0.003, "loss": 4.1955, "step": 1985 }, { "epoch": 0.01986, "grad_norm": 0.5164998769760132, "learning_rate": 0.003, "loss": 4.2189, "step": 1986 }, { "epoch": 0.01987, "grad_norm": 0.6220480799674988, "learning_rate": 0.003, "loss": 4.1777, "step": 1987 }, { "epoch": 0.01988, "grad_norm": 0.668171226978302, "learning_rate": 0.003, "loss": 4.1924, "step": 1988 }, { "epoch": 0.01989, "grad_norm": 0.7721487879753113, "learning_rate": 0.003, "loss": 4.1845, "step": 1989 }, { "epoch": 0.0199, "grad_norm": 0.9032546281814575, "learning_rate": 0.003, "loss": 4.2066, "step": 1990 }, { "epoch": 0.01991, "grad_norm": 0.9958043098449707, "learning_rate": 0.003, "loss": 4.1939, "step": 1991 }, { "epoch": 0.01992, "grad_norm": 0.9264521598815918, "learning_rate": 0.003, "loss": 4.2269, "step": 1992 }, { "epoch": 0.01993, "grad_norm": 0.8177193999290466, "learning_rate": 0.003, "loss": 4.2101, "step": 1993 }, { "epoch": 0.01994, "grad_norm": 0.6321484446525574, "learning_rate": 0.003, "loss": 4.2233, "step": 1994 }, { "epoch": 0.01995, "grad_norm": 0.6378874182701111, "learning_rate": 0.003, "loss": 4.216, "step": 1995 }, { "epoch": 0.01996, "grad_norm": 0.5592439770698547, "learning_rate": 0.003, "loss": 4.1789, "step": 1996 }, { "epoch": 0.01997, "grad_norm": 0.5063359141349792, "learning_rate": 0.003, "loss": 4.2089, "step": 1997 }, { "epoch": 0.01998, "grad_norm": 0.498551607131958, "learning_rate": 0.003, "loss": 4.1917, "step": 1998 }, { "epoch": 0.01999, "grad_norm": 0.5395573377609253, "learning_rate": 0.003, "loss": 4.1836, "step": 1999 }, { "epoch": 0.02, "grad_norm": 0.5799160003662109, "learning_rate": 0.003, "loss": 4.1718, "step": 2000 }, { "epoch": 0.02001, "grad_norm": 0.6742599606513977, "learning_rate": 0.003, "loss": 4.2094, "step": 2001 }, { "epoch": 0.02002, "grad_norm": 0.616278350353241, "learning_rate": 0.003, "loss": 4.2036, "step": 2002 }, { "epoch": 0.02003, "grad_norm": 0.3921935260295868, "learning_rate": 0.003, "loss": 4.1952, "step": 2003 }, { "epoch": 0.02004, "grad_norm": 0.3965532183647156, "learning_rate": 0.003, "loss": 4.1801, "step": 2004 }, { "epoch": 0.02005, "grad_norm": 0.4591590166091919, "learning_rate": 0.003, "loss": 4.1874, "step": 2005 }, { "epoch": 0.02006, "grad_norm": 0.43843862414360046, "learning_rate": 0.003, "loss": 4.1833, "step": 2006 }, { "epoch": 0.02007, "grad_norm": 0.5198654532432556, "learning_rate": 0.003, "loss": 4.2, "step": 2007 }, { "epoch": 0.02008, "grad_norm": 0.5515919327735901, "learning_rate": 0.003, "loss": 4.1918, "step": 2008 }, { "epoch": 0.02009, "grad_norm": 0.5351531505584717, "learning_rate": 0.003, "loss": 4.1889, "step": 2009 }, { "epoch": 0.0201, "grad_norm": 0.5307760834693909, "learning_rate": 0.003, "loss": 4.1996, "step": 2010 }, { "epoch": 0.02011, "grad_norm": 0.46869173645973206, "learning_rate": 0.003, "loss": 4.183, "step": 2011 }, { "epoch": 0.02012, "grad_norm": 0.45753300189971924, "learning_rate": 0.003, "loss": 4.1822, "step": 2012 }, { "epoch": 0.02013, "grad_norm": 0.5114355087280273, "learning_rate": 0.003, "loss": 4.1902, "step": 2013 }, { "epoch": 0.02014, "grad_norm": 0.62235426902771, "learning_rate": 0.003, "loss": 4.1784, "step": 2014 }, { "epoch": 0.02015, "grad_norm": 0.8762567043304443, "learning_rate": 0.003, "loss": 4.1818, "step": 2015 }, { "epoch": 0.02016, "grad_norm": 1.1758772134780884, "learning_rate": 0.003, "loss": 4.2184, "step": 2016 }, { "epoch": 0.02017, "grad_norm": 0.8140804171562195, "learning_rate": 0.003, "loss": 4.196, "step": 2017 }, { "epoch": 0.02018, "grad_norm": 0.8363609910011292, "learning_rate": 0.003, "loss": 4.201, "step": 2018 }, { "epoch": 0.02019, "grad_norm": 0.8395445346832275, "learning_rate": 0.003, "loss": 4.2077, "step": 2019 }, { "epoch": 0.0202, "grad_norm": 0.7397228479385376, "learning_rate": 0.003, "loss": 4.2085, "step": 2020 }, { "epoch": 0.02021, "grad_norm": 0.8446124792098999, "learning_rate": 0.003, "loss": 4.2309, "step": 2021 }, { "epoch": 0.02022, "grad_norm": 0.790037989616394, "learning_rate": 0.003, "loss": 4.1867, "step": 2022 }, { "epoch": 0.02023, "grad_norm": 0.8900567889213562, "learning_rate": 0.003, "loss": 4.1644, "step": 2023 }, { "epoch": 0.02024, "grad_norm": 0.8839752078056335, "learning_rate": 0.003, "loss": 4.1963, "step": 2024 }, { "epoch": 0.02025, "grad_norm": 0.9088333249092102, "learning_rate": 0.003, "loss": 4.2446, "step": 2025 }, { "epoch": 0.02026, "grad_norm": 0.8618097901344299, "learning_rate": 0.003, "loss": 4.226, "step": 2026 }, { "epoch": 0.02027, "grad_norm": 0.6961638927459717, "learning_rate": 0.003, "loss": 4.2132, "step": 2027 }, { "epoch": 0.02028, "grad_norm": 0.7510376572608948, "learning_rate": 0.003, "loss": 4.2002, "step": 2028 }, { "epoch": 0.02029, "grad_norm": 0.6982995271682739, "learning_rate": 0.003, "loss": 4.2032, "step": 2029 }, { "epoch": 0.0203, "grad_norm": 0.8095003366470337, "learning_rate": 0.003, "loss": 4.1988, "step": 2030 }, { "epoch": 0.02031, "grad_norm": 0.8760609030723572, "learning_rate": 0.003, "loss": 4.2121, "step": 2031 }, { "epoch": 0.02032, "grad_norm": 0.8105490207672119, "learning_rate": 0.003, "loss": 4.1833, "step": 2032 }, { "epoch": 0.02033, "grad_norm": 0.7400681376457214, "learning_rate": 0.003, "loss": 4.2221, "step": 2033 }, { "epoch": 0.02034, "grad_norm": 0.8544787168502808, "learning_rate": 0.003, "loss": 4.2195, "step": 2034 }, { "epoch": 0.02035, "grad_norm": 0.8666634559631348, "learning_rate": 0.003, "loss": 4.2081, "step": 2035 }, { "epoch": 0.02036, "grad_norm": 0.7516500949859619, "learning_rate": 0.003, "loss": 4.2028, "step": 2036 }, { "epoch": 0.02037, "grad_norm": 0.6464591026306152, "learning_rate": 0.003, "loss": 4.2133, "step": 2037 }, { "epoch": 0.02038, "grad_norm": 0.5785326957702637, "learning_rate": 0.003, "loss": 4.1922, "step": 2038 }, { "epoch": 0.02039, "grad_norm": 0.5690309405326843, "learning_rate": 0.003, "loss": 4.2006, "step": 2039 }, { "epoch": 0.0204, "grad_norm": 0.47884753346443176, "learning_rate": 0.003, "loss": 4.2009, "step": 2040 }, { "epoch": 0.02041, "grad_norm": 0.43236783146858215, "learning_rate": 0.003, "loss": 4.1612, "step": 2041 }, { "epoch": 0.02042, "grad_norm": 0.3718264102935791, "learning_rate": 0.003, "loss": 4.16, "step": 2042 }, { "epoch": 0.02043, "grad_norm": 0.38901960849761963, "learning_rate": 0.003, "loss": 4.2028, "step": 2043 }, { "epoch": 0.02044, "grad_norm": 0.391589492559433, "learning_rate": 0.003, "loss": 4.1754, "step": 2044 }, { "epoch": 0.02045, "grad_norm": 0.35344189405441284, "learning_rate": 0.003, "loss": 4.1881, "step": 2045 }, { "epoch": 0.02046, "grad_norm": 0.36751314997673035, "learning_rate": 0.003, "loss": 4.1643, "step": 2046 }, { "epoch": 0.02047, "grad_norm": 0.3489588499069214, "learning_rate": 0.003, "loss": 4.1999, "step": 2047 }, { "epoch": 0.02048, "grad_norm": 0.46102413535118103, "learning_rate": 0.003, "loss": 4.1826, "step": 2048 }, { "epoch": 0.02049, "grad_norm": 0.5971166491508484, "learning_rate": 0.003, "loss": 4.2081, "step": 2049 }, { "epoch": 0.0205, "grad_norm": 0.8196846842765808, "learning_rate": 0.003, "loss": 4.1627, "step": 2050 }, { "epoch": 0.02051, "grad_norm": 1.046412467956543, "learning_rate": 0.003, "loss": 4.179, "step": 2051 }, { "epoch": 0.02052, "grad_norm": 0.8385469317436218, "learning_rate": 0.003, "loss": 4.1822, "step": 2052 }, { "epoch": 0.02053, "grad_norm": 0.7001754641532898, "learning_rate": 0.003, "loss": 4.1758, "step": 2053 }, { "epoch": 0.02054, "grad_norm": 0.7512269020080566, "learning_rate": 0.003, "loss": 4.2184, "step": 2054 }, { "epoch": 0.02055, "grad_norm": 0.7145476937294006, "learning_rate": 0.003, "loss": 4.1785, "step": 2055 }, { "epoch": 0.02056, "grad_norm": 0.5185825824737549, "learning_rate": 0.003, "loss": 4.1667, "step": 2056 }, { "epoch": 0.02057, "grad_norm": 0.7188176512718201, "learning_rate": 0.003, "loss": 4.2106, "step": 2057 }, { "epoch": 0.02058, "grad_norm": 0.6570911407470703, "learning_rate": 0.003, "loss": 4.1847, "step": 2058 }, { "epoch": 0.02059, "grad_norm": 0.6945111155509949, "learning_rate": 0.003, "loss": 4.1692, "step": 2059 }, { "epoch": 0.0206, "grad_norm": 0.6718025207519531, "learning_rate": 0.003, "loss": 4.2144, "step": 2060 }, { "epoch": 0.02061, "grad_norm": 0.7464396953582764, "learning_rate": 0.003, "loss": 4.1868, "step": 2061 }, { "epoch": 0.02062, "grad_norm": 0.8595008850097656, "learning_rate": 0.003, "loss": 4.1982, "step": 2062 }, { "epoch": 0.02063, "grad_norm": 0.7188435196876526, "learning_rate": 0.003, "loss": 4.186, "step": 2063 }, { "epoch": 0.02064, "grad_norm": 0.7377521991729736, "learning_rate": 0.003, "loss": 4.1964, "step": 2064 }, { "epoch": 0.02065, "grad_norm": 0.8290489315986633, "learning_rate": 0.003, "loss": 4.1817, "step": 2065 }, { "epoch": 0.02066, "grad_norm": 0.8223605155944824, "learning_rate": 0.003, "loss": 4.1986, "step": 2066 }, { "epoch": 0.02067, "grad_norm": 0.6963474154472351, "learning_rate": 0.003, "loss": 4.1927, "step": 2067 }, { "epoch": 0.02068, "grad_norm": 0.6809067130088806, "learning_rate": 0.003, "loss": 4.1722, "step": 2068 }, { "epoch": 0.02069, "grad_norm": 0.6437063813209534, "learning_rate": 0.003, "loss": 4.1953, "step": 2069 }, { "epoch": 0.0207, "grad_norm": 0.6656236052513123, "learning_rate": 0.003, "loss": 4.1692, "step": 2070 }, { "epoch": 0.02071, "grad_norm": 0.5361942052841187, "learning_rate": 0.003, "loss": 4.1939, "step": 2071 }, { "epoch": 0.02072, "grad_norm": 0.5347724556922913, "learning_rate": 0.003, "loss": 4.199, "step": 2072 }, { "epoch": 0.02073, "grad_norm": 0.5662330985069275, "learning_rate": 0.003, "loss": 4.1792, "step": 2073 }, { "epoch": 0.02074, "grad_norm": 0.7545875310897827, "learning_rate": 0.003, "loss": 4.1916, "step": 2074 }, { "epoch": 0.02075, "grad_norm": 0.7776836156845093, "learning_rate": 0.003, "loss": 4.2211, "step": 2075 }, { "epoch": 0.02076, "grad_norm": 0.665153443813324, "learning_rate": 0.003, "loss": 4.2109, "step": 2076 }, { "epoch": 0.02077, "grad_norm": 0.6429176926612854, "learning_rate": 0.003, "loss": 4.1758, "step": 2077 }, { "epoch": 0.02078, "grad_norm": 0.6930047273635864, "learning_rate": 0.003, "loss": 4.1756, "step": 2078 }, { "epoch": 0.02079, "grad_norm": 0.885644793510437, "learning_rate": 0.003, "loss": 4.1694, "step": 2079 }, { "epoch": 0.0208, "grad_norm": 0.8631641864776611, "learning_rate": 0.003, "loss": 4.1873, "step": 2080 }, { "epoch": 0.02081, "grad_norm": 0.7464285492897034, "learning_rate": 0.003, "loss": 4.2136, "step": 2081 }, { "epoch": 0.02082, "grad_norm": 0.6033406257629395, "learning_rate": 0.003, "loss": 4.2032, "step": 2082 }, { "epoch": 0.02083, "grad_norm": 0.5763504505157471, "learning_rate": 0.003, "loss": 4.1974, "step": 2083 }, { "epoch": 0.02084, "grad_norm": 0.5885117650032043, "learning_rate": 0.003, "loss": 4.1894, "step": 2084 }, { "epoch": 0.02085, "grad_norm": 0.6575955152511597, "learning_rate": 0.003, "loss": 4.1685, "step": 2085 }, { "epoch": 0.02086, "grad_norm": 0.7422707080841064, "learning_rate": 0.003, "loss": 4.1768, "step": 2086 }, { "epoch": 0.02087, "grad_norm": 0.8073996901512146, "learning_rate": 0.003, "loss": 4.183, "step": 2087 }, { "epoch": 0.02088, "grad_norm": 0.7600259184837341, "learning_rate": 0.003, "loss": 4.2126, "step": 2088 }, { "epoch": 0.02089, "grad_norm": 0.7548388242721558, "learning_rate": 0.003, "loss": 4.1805, "step": 2089 }, { "epoch": 0.0209, "grad_norm": 0.6929354667663574, "learning_rate": 0.003, "loss": 4.1946, "step": 2090 }, { "epoch": 0.02091, "grad_norm": 0.6573588252067566, "learning_rate": 0.003, "loss": 4.1839, "step": 2091 }, { "epoch": 0.02092, "grad_norm": 0.5934063792228699, "learning_rate": 0.003, "loss": 4.1787, "step": 2092 }, { "epoch": 0.02093, "grad_norm": 0.5749050974845886, "learning_rate": 0.003, "loss": 4.1452, "step": 2093 }, { "epoch": 0.02094, "grad_norm": 0.6522166728973389, "learning_rate": 0.003, "loss": 4.1551, "step": 2094 }, { "epoch": 0.02095, "grad_norm": 0.6819972991943359, "learning_rate": 0.003, "loss": 4.1987, "step": 2095 }, { "epoch": 0.02096, "grad_norm": 0.8537513613700867, "learning_rate": 0.003, "loss": 4.1954, "step": 2096 }, { "epoch": 0.02097, "grad_norm": 0.8873555660247803, "learning_rate": 0.003, "loss": 4.1812, "step": 2097 }, { "epoch": 0.02098, "grad_norm": 0.8346237540245056, "learning_rate": 0.003, "loss": 4.1983, "step": 2098 }, { "epoch": 0.02099, "grad_norm": 0.7050203084945679, "learning_rate": 0.003, "loss": 4.2048, "step": 2099 }, { "epoch": 0.021, "grad_norm": 0.7459020614624023, "learning_rate": 0.003, "loss": 4.219, "step": 2100 }, { "epoch": 0.02101, "grad_norm": 0.6101992726325989, "learning_rate": 0.003, "loss": 4.205, "step": 2101 }, { "epoch": 0.02102, "grad_norm": 0.5766392350196838, "learning_rate": 0.003, "loss": 4.208, "step": 2102 }, { "epoch": 0.02103, "grad_norm": 0.4940197765827179, "learning_rate": 0.003, "loss": 4.1645, "step": 2103 }, { "epoch": 0.02104, "grad_norm": 0.5101218819618225, "learning_rate": 0.003, "loss": 4.1715, "step": 2104 }, { "epoch": 0.02105, "grad_norm": 0.4358375072479248, "learning_rate": 0.003, "loss": 4.1703, "step": 2105 }, { "epoch": 0.02106, "grad_norm": 0.4337053596973419, "learning_rate": 0.003, "loss": 4.1631, "step": 2106 }, { "epoch": 0.02107, "grad_norm": 0.42299988865852356, "learning_rate": 0.003, "loss": 4.1743, "step": 2107 }, { "epoch": 0.02108, "grad_norm": 0.408107191324234, "learning_rate": 0.003, "loss": 4.1546, "step": 2108 }, { "epoch": 0.02109, "grad_norm": 0.36309558153152466, "learning_rate": 0.003, "loss": 4.1448, "step": 2109 }, { "epoch": 0.0211, "grad_norm": 0.44200608134269714, "learning_rate": 0.003, "loss": 4.1607, "step": 2110 }, { "epoch": 0.02111, "grad_norm": 0.5945785641670227, "learning_rate": 0.003, "loss": 4.1726, "step": 2111 }, { "epoch": 0.02112, "grad_norm": 0.8337014317512512, "learning_rate": 0.003, "loss": 4.1799, "step": 2112 }, { "epoch": 0.02113, "grad_norm": 0.9096565246582031, "learning_rate": 0.003, "loss": 4.1882, "step": 2113 }, { "epoch": 0.02114, "grad_norm": 0.8218700289726257, "learning_rate": 0.003, "loss": 4.1728, "step": 2114 }, { "epoch": 0.02115, "grad_norm": 0.7097974419593811, "learning_rate": 0.003, "loss": 4.1612, "step": 2115 }, { "epoch": 0.02116, "grad_norm": 0.7614986300468445, "learning_rate": 0.003, "loss": 4.1951, "step": 2116 }, { "epoch": 0.02117, "grad_norm": 0.7237206697463989, "learning_rate": 0.003, "loss": 4.1853, "step": 2117 }, { "epoch": 0.02118, "grad_norm": 0.6130908727645874, "learning_rate": 0.003, "loss": 4.1776, "step": 2118 }, { "epoch": 0.02119, "grad_norm": 0.5855314135551453, "learning_rate": 0.003, "loss": 4.1889, "step": 2119 }, { "epoch": 0.0212, "grad_norm": 0.6109283566474915, "learning_rate": 0.003, "loss": 4.1619, "step": 2120 }, { "epoch": 0.02121, "grad_norm": 0.610881507396698, "learning_rate": 0.003, "loss": 4.158, "step": 2121 }, { "epoch": 0.02122, "grad_norm": 0.5126116275787354, "learning_rate": 0.003, "loss": 4.1642, "step": 2122 }, { "epoch": 0.02123, "grad_norm": 0.50606369972229, "learning_rate": 0.003, "loss": 4.1448, "step": 2123 }, { "epoch": 0.02124, "grad_norm": 0.5823047161102295, "learning_rate": 0.003, "loss": 4.1677, "step": 2124 }, { "epoch": 0.02125, "grad_norm": 0.6061305403709412, "learning_rate": 0.003, "loss": 4.1848, "step": 2125 }, { "epoch": 0.02126, "grad_norm": 0.6525632739067078, "learning_rate": 0.003, "loss": 4.1807, "step": 2126 }, { "epoch": 0.02127, "grad_norm": 0.6677929759025574, "learning_rate": 0.003, "loss": 4.1921, "step": 2127 }, { "epoch": 0.02128, "grad_norm": 0.6721459031105042, "learning_rate": 0.003, "loss": 4.1793, "step": 2128 }, { "epoch": 0.02129, "grad_norm": 0.5789133310317993, "learning_rate": 0.003, "loss": 4.1784, "step": 2129 }, { "epoch": 0.0213, "grad_norm": 0.48718854784965515, "learning_rate": 0.003, "loss": 4.1491, "step": 2130 }, { "epoch": 0.02131, "grad_norm": 0.5584967136383057, "learning_rate": 0.003, "loss": 4.1797, "step": 2131 }, { "epoch": 0.02132, "grad_norm": 0.6503726840019226, "learning_rate": 0.003, "loss": 4.1942, "step": 2132 }, { "epoch": 0.02133, "grad_norm": 0.8396037817001343, "learning_rate": 0.003, "loss": 4.1927, "step": 2133 }, { "epoch": 0.02134, "grad_norm": 0.9249106645584106, "learning_rate": 0.003, "loss": 4.2046, "step": 2134 }, { "epoch": 0.02135, "grad_norm": 0.9640594124794006, "learning_rate": 0.003, "loss": 4.1994, "step": 2135 }, { "epoch": 0.02136, "grad_norm": 0.8782548904418945, "learning_rate": 0.003, "loss": 4.177, "step": 2136 }, { "epoch": 0.02137, "grad_norm": 0.8113679885864258, "learning_rate": 0.003, "loss": 4.2115, "step": 2137 }, { "epoch": 0.02138, "grad_norm": 0.8208100199699402, "learning_rate": 0.003, "loss": 4.2211, "step": 2138 }, { "epoch": 0.02139, "grad_norm": 0.8544081449508667, "learning_rate": 0.003, "loss": 4.2178, "step": 2139 }, { "epoch": 0.0214, "grad_norm": 0.9318257570266724, "learning_rate": 0.003, "loss": 4.2125, "step": 2140 }, { "epoch": 0.02141, "grad_norm": 0.9511024951934814, "learning_rate": 0.003, "loss": 4.1925, "step": 2141 }, { "epoch": 0.02142, "grad_norm": 0.8391996026039124, "learning_rate": 0.003, "loss": 4.1851, "step": 2142 }, { "epoch": 0.02143, "grad_norm": 0.8300689458847046, "learning_rate": 0.003, "loss": 4.2047, "step": 2143 }, { "epoch": 0.02144, "grad_norm": 0.7621930837631226, "learning_rate": 0.003, "loss": 4.2055, "step": 2144 }, { "epoch": 0.02145, "grad_norm": 0.7281203269958496, "learning_rate": 0.003, "loss": 4.1893, "step": 2145 }, { "epoch": 0.02146, "grad_norm": 0.6785754561424255, "learning_rate": 0.003, "loss": 4.1995, "step": 2146 }, { "epoch": 0.02147, "grad_norm": 0.6476329565048218, "learning_rate": 0.003, "loss": 4.2144, "step": 2147 }, { "epoch": 0.02148, "grad_norm": 0.5711135268211365, "learning_rate": 0.003, "loss": 4.2235, "step": 2148 }, { "epoch": 0.02149, "grad_norm": 0.5608777403831482, "learning_rate": 0.003, "loss": 4.2064, "step": 2149 }, { "epoch": 0.0215, "grad_norm": 0.5449571013450623, "learning_rate": 0.003, "loss": 4.1918, "step": 2150 }, { "epoch": 0.02151, "grad_norm": 0.5975837707519531, "learning_rate": 0.003, "loss": 4.1981, "step": 2151 }, { "epoch": 0.02152, "grad_norm": 0.6778717041015625, "learning_rate": 0.003, "loss": 4.1751, "step": 2152 }, { "epoch": 0.02153, "grad_norm": 0.6393144726753235, "learning_rate": 0.003, "loss": 4.1858, "step": 2153 }, { "epoch": 0.02154, "grad_norm": 0.6100119352340698, "learning_rate": 0.003, "loss": 4.1743, "step": 2154 }, { "epoch": 0.02155, "grad_norm": 0.5750409364700317, "learning_rate": 0.003, "loss": 4.2008, "step": 2155 }, { "epoch": 0.02156, "grad_norm": 0.6067488789558411, "learning_rate": 0.003, "loss": 4.1875, "step": 2156 }, { "epoch": 0.02157, "grad_norm": 0.6373497247695923, "learning_rate": 0.003, "loss": 4.1553, "step": 2157 }, { "epoch": 0.02158, "grad_norm": 0.6033819317817688, "learning_rate": 0.003, "loss": 4.1755, "step": 2158 }, { "epoch": 0.02159, "grad_norm": 0.5368500351905823, "learning_rate": 0.003, "loss": 4.1852, "step": 2159 }, { "epoch": 0.0216, "grad_norm": 0.5477511882781982, "learning_rate": 0.003, "loss": 4.1876, "step": 2160 }, { "epoch": 0.02161, "grad_norm": 0.6277697086334229, "learning_rate": 0.003, "loss": 4.1773, "step": 2161 }, { "epoch": 0.02162, "grad_norm": 0.6784037351608276, "learning_rate": 0.003, "loss": 4.1846, "step": 2162 }, { "epoch": 0.02163, "grad_norm": 0.6868694424629211, "learning_rate": 0.003, "loss": 4.1997, "step": 2163 }, { "epoch": 0.02164, "grad_norm": 0.7093749642372131, "learning_rate": 0.003, "loss": 4.1944, "step": 2164 }, { "epoch": 0.02165, "grad_norm": 0.7334204912185669, "learning_rate": 0.003, "loss": 4.1627, "step": 2165 }, { "epoch": 0.02166, "grad_norm": 0.8240237236022949, "learning_rate": 0.003, "loss": 4.2062, "step": 2166 }, { "epoch": 0.02167, "grad_norm": 0.8271484375, "learning_rate": 0.003, "loss": 4.1857, "step": 2167 }, { "epoch": 0.02168, "grad_norm": 0.9684041142463684, "learning_rate": 0.003, "loss": 4.1878, "step": 2168 }, { "epoch": 0.02169, "grad_norm": 0.9603534936904907, "learning_rate": 0.003, "loss": 4.1956, "step": 2169 }, { "epoch": 0.0217, "grad_norm": 0.9623264074325562, "learning_rate": 0.003, "loss": 4.2029, "step": 2170 }, { "epoch": 0.02171, "grad_norm": 0.8587528467178345, "learning_rate": 0.003, "loss": 4.2163, "step": 2171 }, { "epoch": 0.02172, "grad_norm": 0.7502791285514832, "learning_rate": 0.003, "loss": 4.1819, "step": 2172 }, { "epoch": 0.02173, "grad_norm": 0.6895846724510193, "learning_rate": 0.003, "loss": 4.1826, "step": 2173 }, { "epoch": 0.02174, "grad_norm": 0.6692303419113159, "learning_rate": 0.003, "loss": 4.1793, "step": 2174 }, { "epoch": 0.02175, "grad_norm": 0.6576927900314331, "learning_rate": 0.003, "loss": 4.2023, "step": 2175 }, { "epoch": 0.02176, "grad_norm": 0.6110131740570068, "learning_rate": 0.003, "loss": 4.1815, "step": 2176 }, { "epoch": 0.02177, "grad_norm": 0.5583458542823792, "learning_rate": 0.003, "loss": 4.1855, "step": 2177 }, { "epoch": 0.02178, "grad_norm": 0.5579090118408203, "learning_rate": 0.003, "loss": 4.1655, "step": 2178 }, { "epoch": 0.02179, "grad_norm": 0.54999840259552, "learning_rate": 0.003, "loss": 4.1765, "step": 2179 }, { "epoch": 0.0218, "grad_norm": 0.5893214344978333, "learning_rate": 0.003, "loss": 4.193, "step": 2180 }, { "epoch": 0.02181, "grad_norm": 0.5285120606422424, "learning_rate": 0.003, "loss": 4.19, "step": 2181 }, { "epoch": 0.02182, "grad_norm": 0.537371814250946, "learning_rate": 0.003, "loss": 4.1755, "step": 2182 }, { "epoch": 0.02183, "grad_norm": 0.7419714331626892, "learning_rate": 0.003, "loss": 4.1927, "step": 2183 }, { "epoch": 0.02184, "grad_norm": 0.8745968341827393, "learning_rate": 0.003, "loss": 4.1753, "step": 2184 }, { "epoch": 0.02185, "grad_norm": 0.8310950398445129, "learning_rate": 0.003, "loss": 4.1708, "step": 2185 }, { "epoch": 0.02186, "grad_norm": 0.6509782075881958, "learning_rate": 0.003, "loss": 4.1821, "step": 2186 }, { "epoch": 0.02187, "grad_norm": 0.6584124565124512, "learning_rate": 0.003, "loss": 4.1801, "step": 2187 }, { "epoch": 0.02188, "grad_norm": 0.6546313762664795, "learning_rate": 0.003, "loss": 4.1586, "step": 2188 }, { "epoch": 0.02189, "grad_norm": 0.6025269031524658, "learning_rate": 0.003, "loss": 4.1873, "step": 2189 }, { "epoch": 0.0219, "grad_norm": 0.5339919924736023, "learning_rate": 0.003, "loss": 4.1761, "step": 2190 }, { "epoch": 0.02191, "grad_norm": 0.5982394814491272, "learning_rate": 0.003, "loss": 4.1693, "step": 2191 }, { "epoch": 0.02192, "grad_norm": 0.6466907262802124, "learning_rate": 0.003, "loss": 4.1981, "step": 2192 }, { "epoch": 0.02193, "grad_norm": 0.6886336207389832, "learning_rate": 0.003, "loss": 4.1664, "step": 2193 }, { "epoch": 0.02194, "grad_norm": 0.6955612897872925, "learning_rate": 0.003, "loss": 4.2136, "step": 2194 }, { "epoch": 0.02195, "grad_norm": 0.634350061416626, "learning_rate": 0.003, "loss": 4.1789, "step": 2195 }, { "epoch": 0.02196, "grad_norm": 0.6464403867721558, "learning_rate": 0.003, "loss": 4.1877, "step": 2196 }, { "epoch": 0.02197, "grad_norm": 0.7355444431304932, "learning_rate": 0.003, "loss": 4.1843, "step": 2197 }, { "epoch": 0.02198, "grad_norm": 0.6861805319786072, "learning_rate": 0.003, "loss": 4.1732, "step": 2198 }, { "epoch": 0.02199, "grad_norm": 0.5390181541442871, "learning_rate": 0.003, "loss": 4.1372, "step": 2199 }, { "epoch": 0.022, "grad_norm": 0.5449665188789368, "learning_rate": 0.003, "loss": 4.1658, "step": 2200 }, { "epoch": 0.02201, "grad_norm": 0.5550421476364136, "learning_rate": 0.003, "loss": 4.1819, "step": 2201 }, { "epoch": 0.02202, "grad_norm": 0.7112802863121033, "learning_rate": 0.003, "loss": 4.1711, "step": 2202 }, { "epoch": 0.02203, "grad_norm": 0.806663990020752, "learning_rate": 0.003, "loss": 4.2082, "step": 2203 }, { "epoch": 0.02204, "grad_norm": 0.7858209013938904, "learning_rate": 0.003, "loss": 4.1998, "step": 2204 }, { "epoch": 0.02205, "grad_norm": 0.782494306564331, "learning_rate": 0.003, "loss": 4.2034, "step": 2205 }, { "epoch": 0.02206, "grad_norm": 0.6905097365379333, "learning_rate": 0.003, "loss": 4.1663, "step": 2206 }, { "epoch": 0.02207, "grad_norm": 0.7511074542999268, "learning_rate": 0.003, "loss": 4.1796, "step": 2207 }, { "epoch": 0.02208, "grad_norm": 0.801884651184082, "learning_rate": 0.003, "loss": 4.1654, "step": 2208 }, { "epoch": 0.02209, "grad_norm": 0.7684618234634399, "learning_rate": 0.003, "loss": 4.1791, "step": 2209 }, { "epoch": 0.0221, "grad_norm": 0.7751032114028931, "learning_rate": 0.003, "loss": 4.1653, "step": 2210 }, { "epoch": 0.02211, "grad_norm": 0.7961976528167725, "learning_rate": 0.003, "loss": 4.1777, "step": 2211 }, { "epoch": 0.02212, "grad_norm": 0.694131076335907, "learning_rate": 0.003, "loss": 4.174, "step": 2212 }, { "epoch": 0.02213, "grad_norm": 0.731791079044342, "learning_rate": 0.003, "loss": 4.1834, "step": 2213 }, { "epoch": 0.02214, "grad_norm": 0.5831606388092041, "learning_rate": 0.003, "loss": 4.171, "step": 2214 }, { "epoch": 0.02215, "grad_norm": 0.6400359869003296, "learning_rate": 0.003, "loss": 4.1778, "step": 2215 }, { "epoch": 0.02216, "grad_norm": 0.6402711868286133, "learning_rate": 0.003, "loss": 4.156, "step": 2216 }, { "epoch": 0.02217, "grad_norm": 0.5657291412353516, "learning_rate": 0.003, "loss": 4.153, "step": 2217 }, { "epoch": 0.02218, "grad_norm": 0.5510069727897644, "learning_rate": 0.003, "loss": 4.1828, "step": 2218 }, { "epoch": 0.02219, "grad_norm": 0.567196786403656, "learning_rate": 0.003, "loss": 4.173, "step": 2219 }, { "epoch": 0.0222, "grad_norm": 0.5382395386695862, "learning_rate": 0.003, "loss": 4.1691, "step": 2220 }, { "epoch": 0.02221, "grad_norm": 0.5838103890419006, "learning_rate": 0.003, "loss": 4.1595, "step": 2221 }, { "epoch": 0.02222, "grad_norm": 0.6669290661811829, "learning_rate": 0.003, "loss": 4.1608, "step": 2222 }, { "epoch": 0.02223, "grad_norm": 0.8421687483787537, "learning_rate": 0.003, "loss": 4.1833, "step": 2223 }, { "epoch": 0.02224, "grad_norm": 1.0844712257385254, "learning_rate": 0.003, "loss": 4.1789, "step": 2224 }, { "epoch": 0.02225, "grad_norm": 0.7447740435600281, "learning_rate": 0.003, "loss": 4.1706, "step": 2225 }, { "epoch": 0.02226, "grad_norm": 0.6103411912918091, "learning_rate": 0.003, "loss": 4.184, "step": 2226 }, { "epoch": 0.02227, "grad_norm": 0.6984069347381592, "learning_rate": 0.003, "loss": 4.1806, "step": 2227 }, { "epoch": 0.02228, "grad_norm": 0.6870824694633484, "learning_rate": 0.003, "loss": 4.1674, "step": 2228 }, { "epoch": 0.02229, "grad_norm": 0.6159563064575195, "learning_rate": 0.003, "loss": 4.1446, "step": 2229 }, { "epoch": 0.0223, "grad_norm": 0.6422266364097595, "learning_rate": 0.003, "loss": 4.1847, "step": 2230 }, { "epoch": 0.02231, "grad_norm": 0.6304430961608887, "learning_rate": 0.003, "loss": 4.1891, "step": 2231 }, { "epoch": 0.02232, "grad_norm": 0.6334310173988342, "learning_rate": 0.003, "loss": 4.1933, "step": 2232 }, { "epoch": 0.02233, "grad_norm": 0.729522705078125, "learning_rate": 0.003, "loss": 4.1662, "step": 2233 }, { "epoch": 0.02234, "grad_norm": 0.839550256729126, "learning_rate": 0.003, "loss": 4.1911, "step": 2234 }, { "epoch": 0.02235, "grad_norm": 0.8429378271102905, "learning_rate": 0.003, "loss": 4.2154, "step": 2235 }, { "epoch": 0.02236, "grad_norm": 0.7225827574729919, "learning_rate": 0.003, "loss": 4.1659, "step": 2236 }, { "epoch": 0.02237, "grad_norm": 0.6989867091178894, "learning_rate": 0.003, "loss": 4.1975, "step": 2237 }, { "epoch": 0.02238, "grad_norm": 0.772613525390625, "learning_rate": 0.003, "loss": 4.1837, "step": 2238 }, { "epoch": 0.02239, "grad_norm": 0.7962609529495239, "learning_rate": 0.003, "loss": 4.1688, "step": 2239 }, { "epoch": 0.0224, "grad_norm": 0.7535995244979858, "learning_rate": 0.003, "loss": 4.1562, "step": 2240 }, { "epoch": 0.02241, "grad_norm": 0.8866783976554871, "learning_rate": 0.003, "loss": 4.1902, "step": 2241 }, { "epoch": 0.02242, "grad_norm": 1.0102195739746094, "learning_rate": 0.003, "loss": 4.1875, "step": 2242 }, { "epoch": 0.02243, "grad_norm": 1.013528823852539, "learning_rate": 0.003, "loss": 4.2245, "step": 2243 }, { "epoch": 0.02244, "grad_norm": 0.8773812055587769, "learning_rate": 0.003, "loss": 4.1919, "step": 2244 }, { "epoch": 0.02245, "grad_norm": 0.8694789409637451, "learning_rate": 0.003, "loss": 4.182, "step": 2245 }, { "epoch": 0.02246, "grad_norm": 0.8814519047737122, "learning_rate": 0.003, "loss": 4.2131, "step": 2246 }, { "epoch": 0.02247, "grad_norm": 0.7133143544197083, "learning_rate": 0.003, "loss": 4.1799, "step": 2247 }, { "epoch": 0.02248, "grad_norm": 0.6402725577354431, "learning_rate": 0.003, "loss": 4.1839, "step": 2248 }, { "epoch": 0.02249, "grad_norm": 0.5918011665344238, "learning_rate": 0.003, "loss": 4.1724, "step": 2249 }, { "epoch": 0.0225, "grad_norm": 0.5062255859375, "learning_rate": 0.003, "loss": 4.1824, "step": 2250 }, { "epoch": 0.02251, "grad_norm": 0.48150739073753357, "learning_rate": 0.003, "loss": 4.2037, "step": 2251 }, { "epoch": 0.02252, "grad_norm": 0.4211646616458893, "learning_rate": 0.003, "loss": 4.1917, "step": 2252 }, { "epoch": 0.02253, "grad_norm": 0.4155518114566803, "learning_rate": 0.003, "loss": 4.1719, "step": 2253 }, { "epoch": 0.02254, "grad_norm": 0.3938300311565399, "learning_rate": 0.003, "loss": 4.1747, "step": 2254 }, { "epoch": 0.02255, "grad_norm": 0.3917562961578369, "learning_rate": 0.003, "loss": 4.1579, "step": 2255 }, { "epoch": 0.02256, "grad_norm": 0.34900254011154175, "learning_rate": 0.003, "loss": 4.1446, "step": 2256 }, { "epoch": 0.02257, "grad_norm": 0.46977952122688293, "learning_rate": 0.003, "loss": 4.1641, "step": 2257 }, { "epoch": 0.02258, "grad_norm": 0.6035822629928589, "learning_rate": 0.003, "loss": 4.1756, "step": 2258 }, { "epoch": 0.02259, "grad_norm": 0.8085506558418274, "learning_rate": 0.003, "loss": 4.1444, "step": 2259 }, { "epoch": 0.0226, "grad_norm": 1.0156970024108887, "learning_rate": 0.003, "loss": 4.2107, "step": 2260 }, { "epoch": 0.02261, "grad_norm": 0.8568210601806641, "learning_rate": 0.003, "loss": 4.1861, "step": 2261 }, { "epoch": 0.02262, "grad_norm": 0.7774007320404053, "learning_rate": 0.003, "loss": 4.1822, "step": 2262 }, { "epoch": 0.02263, "grad_norm": 0.9870522022247314, "learning_rate": 0.003, "loss": 4.2046, "step": 2263 }, { "epoch": 0.02264, "grad_norm": 0.7220656871795654, "learning_rate": 0.003, "loss": 4.1915, "step": 2264 }, { "epoch": 0.02265, "grad_norm": 0.5900349020957947, "learning_rate": 0.003, "loss": 4.1438, "step": 2265 }, { "epoch": 0.02266, "grad_norm": 0.6756983995437622, "learning_rate": 0.003, "loss": 4.1642, "step": 2266 }, { "epoch": 0.02267, "grad_norm": 0.6196969747543335, "learning_rate": 0.003, "loss": 4.1867, "step": 2267 }, { "epoch": 0.02268, "grad_norm": 0.4943087697029114, "learning_rate": 0.003, "loss": 4.1589, "step": 2268 }, { "epoch": 0.02269, "grad_norm": 0.45949792861938477, "learning_rate": 0.003, "loss": 4.1791, "step": 2269 }, { "epoch": 0.0227, "grad_norm": 0.5057196617126465, "learning_rate": 0.003, "loss": 4.176, "step": 2270 }, { "epoch": 0.02271, "grad_norm": 0.587051510810852, "learning_rate": 0.003, "loss": 4.1796, "step": 2271 }, { "epoch": 0.02272, "grad_norm": 0.6179952621459961, "learning_rate": 0.003, "loss": 4.1812, "step": 2272 }, { "epoch": 0.02273, "grad_norm": 0.549956738948822, "learning_rate": 0.003, "loss": 4.1786, "step": 2273 }, { "epoch": 0.02274, "grad_norm": 0.5753885507583618, "learning_rate": 0.003, "loss": 4.1651, "step": 2274 }, { "epoch": 0.02275, "grad_norm": 0.5730156898498535, "learning_rate": 0.003, "loss": 4.1582, "step": 2275 }, { "epoch": 0.02276, "grad_norm": 0.633211076259613, "learning_rate": 0.003, "loss": 4.1545, "step": 2276 }, { "epoch": 0.02277, "grad_norm": 0.6606276631355286, "learning_rate": 0.003, "loss": 4.1668, "step": 2277 }, { "epoch": 0.02278, "grad_norm": 0.6212934255599976, "learning_rate": 0.003, "loss": 4.1699, "step": 2278 }, { "epoch": 0.02279, "grad_norm": 0.5762414932250977, "learning_rate": 0.003, "loss": 4.1803, "step": 2279 }, { "epoch": 0.0228, "grad_norm": 0.6208809018135071, "learning_rate": 0.003, "loss": 4.1721, "step": 2280 }, { "epoch": 0.02281, "grad_norm": 0.5790427327156067, "learning_rate": 0.003, "loss": 4.1659, "step": 2281 }, { "epoch": 0.02282, "grad_norm": 0.5124678015708923, "learning_rate": 0.003, "loss": 4.1737, "step": 2282 }, { "epoch": 0.02283, "grad_norm": 0.5517794489860535, "learning_rate": 0.003, "loss": 4.1895, "step": 2283 }, { "epoch": 0.02284, "grad_norm": 0.5746618509292603, "learning_rate": 0.003, "loss": 4.1861, "step": 2284 }, { "epoch": 0.02285, "grad_norm": 0.6934999823570251, "learning_rate": 0.003, "loss": 4.1284, "step": 2285 }, { "epoch": 0.02286, "grad_norm": 0.8188515305519104, "learning_rate": 0.003, "loss": 4.1994, "step": 2286 }, { "epoch": 0.02287, "grad_norm": 0.7512263655662537, "learning_rate": 0.003, "loss": 4.1749, "step": 2287 }, { "epoch": 0.02288, "grad_norm": 0.7637566328048706, "learning_rate": 0.003, "loss": 4.2039, "step": 2288 }, { "epoch": 0.02289, "grad_norm": 0.700842559337616, "learning_rate": 0.003, "loss": 4.1993, "step": 2289 }, { "epoch": 0.0229, "grad_norm": 0.6298280358314514, "learning_rate": 0.003, "loss": 4.1719, "step": 2290 }, { "epoch": 0.02291, "grad_norm": 0.6435158848762512, "learning_rate": 0.003, "loss": 4.1525, "step": 2291 }, { "epoch": 0.02292, "grad_norm": 0.5874335169792175, "learning_rate": 0.003, "loss": 4.1459, "step": 2292 }, { "epoch": 0.02293, "grad_norm": 0.6026025414466858, "learning_rate": 0.003, "loss": 4.1504, "step": 2293 }, { "epoch": 0.02294, "grad_norm": 0.7460681200027466, "learning_rate": 0.003, "loss": 4.1566, "step": 2294 }, { "epoch": 0.02295, "grad_norm": 0.7140766978263855, "learning_rate": 0.003, "loss": 4.1531, "step": 2295 }, { "epoch": 0.02296, "grad_norm": 0.6355901956558228, "learning_rate": 0.003, "loss": 4.1821, "step": 2296 }, { "epoch": 0.02297, "grad_norm": 0.673784077167511, "learning_rate": 0.003, "loss": 4.1554, "step": 2297 }, { "epoch": 0.02298, "grad_norm": 0.6826474666595459, "learning_rate": 0.003, "loss": 4.1671, "step": 2298 }, { "epoch": 0.02299, "grad_norm": 0.6509355306625366, "learning_rate": 0.003, "loss": 4.1693, "step": 2299 }, { "epoch": 0.023, "grad_norm": 0.6510788202285767, "learning_rate": 0.003, "loss": 4.1909, "step": 2300 }, { "epoch": 0.02301, "grad_norm": 0.6393023133277893, "learning_rate": 0.003, "loss": 4.1552, "step": 2301 }, { "epoch": 0.02302, "grad_norm": 0.7208176255226135, "learning_rate": 0.003, "loss": 4.1649, "step": 2302 }, { "epoch": 0.02303, "grad_norm": 0.8771877884864807, "learning_rate": 0.003, "loss": 4.1613, "step": 2303 }, { "epoch": 0.02304, "grad_norm": 1.1240605115890503, "learning_rate": 0.003, "loss": 4.2129, "step": 2304 }, { "epoch": 0.02305, "grad_norm": 1.0586861371994019, "learning_rate": 0.003, "loss": 4.1816, "step": 2305 }, { "epoch": 0.02306, "grad_norm": 0.7870573401451111, "learning_rate": 0.003, "loss": 4.1572, "step": 2306 }, { "epoch": 0.02307, "grad_norm": 0.7953494191169739, "learning_rate": 0.003, "loss": 4.1864, "step": 2307 }, { "epoch": 0.02308, "grad_norm": 0.800268292427063, "learning_rate": 0.003, "loss": 4.1782, "step": 2308 }, { "epoch": 0.02309, "grad_norm": 0.7769814133644104, "learning_rate": 0.003, "loss": 4.2068, "step": 2309 }, { "epoch": 0.0231, "grad_norm": 0.8404530882835388, "learning_rate": 0.003, "loss": 4.1894, "step": 2310 }, { "epoch": 0.02311, "grad_norm": 0.8363377451896667, "learning_rate": 0.003, "loss": 4.1952, "step": 2311 }, { "epoch": 0.02312, "grad_norm": 0.770296037197113, "learning_rate": 0.003, "loss": 4.1643, "step": 2312 }, { "epoch": 0.02313, "grad_norm": 0.7528126835823059, "learning_rate": 0.003, "loss": 4.1622, "step": 2313 }, { "epoch": 0.02314, "grad_norm": 0.74263995885849, "learning_rate": 0.003, "loss": 4.1779, "step": 2314 }, { "epoch": 0.02315, "grad_norm": 0.6081147789955139, "learning_rate": 0.003, "loss": 4.1766, "step": 2315 }, { "epoch": 0.02316, "grad_norm": 0.5721566677093506, "learning_rate": 0.003, "loss": 4.1882, "step": 2316 }, { "epoch": 0.02317, "grad_norm": 0.5732864141464233, "learning_rate": 0.003, "loss": 4.1684, "step": 2317 }, { "epoch": 0.02318, "grad_norm": 0.5256015062332153, "learning_rate": 0.003, "loss": 4.1551, "step": 2318 }, { "epoch": 0.02319, "grad_norm": 0.49379223585128784, "learning_rate": 0.003, "loss": 4.1772, "step": 2319 }, { "epoch": 0.0232, "grad_norm": 0.5656861662864685, "learning_rate": 0.003, "loss": 4.1857, "step": 2320 }, { "epoch": 0.02321, "grad_norm": 0.5173032879829407, "learning_rate": 0.003, "loss": 4.1628, "step": 2321 }, { "epoch": 0.02322, "grad_norm": 0.4566805362701416, "learning_rate": 0.003, "loss": 4.1702, "step": 2322 }, { "epoch": 0.02323, "grad_norm": 0.42646524310112, "learning_rate": 0.003, "loss": 4.1427, "step": 2323 }, { "epoch": 0.02324, "grad_norm": 0.46551966667175293, "learning_rate": 0.003, "loss": 4.1415, "step": 2324 }, { "epoch": 0.02325, "grad_norm": 0.4420718252658844, "learning_rate": 0.003, "loss": 4.1646, "step": 2325 }, { "epoch": 0.02326, "grad_norm": 0.42437517642974854, "learning_rate": 0.003, "loss": 4.1773, "step": 2326 }, { "epoch": 0.02327, "grad_norm": 0.457309365272522, "learning_rate": 0.003, "loss": 4.164, "step": 2327 }, { "epoch": 0.02328, "grad_norm": 0.515424370765686, "learning_rate": 0.003, "loss": 4.1652, "step": 2328 }, { "epoch": 0.02329, "grad_norm": 0.5420231223106384, "learning_rate": 0.003, "loss": 4.1544, "step": 2329 }, { "epoch": 0.0233, "grad_norm": 0.5254723429679871, "learning_rate": 0.003, "loss": 4.1502, "step": 2330 }, { "epoch": 0.02331, "grad_norm": 0.47864091396331787, "learning_rate": 0.003, "loss": 4.1851, "step": 2331 }, { "epoch": 0.02332, "grad_norm": 0.6108183264732361, "learning_rate": 0.003, "loss": 4.1511, "step": 2332 }, { "epoch": 0.02333, "grad_norm": 0.7649471759796143, "learning_rate": 0.003, "loss": 4.1613, "step": 2333 }, { "epoch": 0.02334, "grad_norm": 0.9433549642562866, "learning_rate": 0.003, "loss": 4.1411, "step": 2334 }, { "epoch": 0.02335, "grad_norm": 1.05660080909729, "learning_rate": 0.003, "loss": 4.1889, "step": 2335 }, { "epoch": 0.02336, "grad_norm": 0.7674508690834045, "learning_rate": 0.003, "loss": 4.1752, "step": 2336 }, { "epoch": 0.02337, "grad_norm": 0.6039113402366638, "learning_rate": 0.003, "loss": 4.1744, "step": 2337 }, { "epoch": 0.02338, "grad_norm": 0.678247332572937, "learning_rate": 0.003, "loss": 4.1635, "step": 2338 }, { "epoch": 0.02339, "grad_norm": 0.6683263182640076, "learning_rate": 0.003, "loss": 4.1573, "step": 2339 }, { "epoch": 0.0234, "grad_norm": 0.6628241539001465, "learning_rate": 0.003, "loss": 4.1711, "step": 2340 }, { "epoch": 0.02341, "grad_norm": 0.6385108232498169, "learning_rate": 0.003, "loss": 4.145, "step": 2341 }, { "epoch": 0.02342, "grad_norm": 0.7277436852455139, "learning_rate": 0.003, "loss": 4.1673, "step": 2342 }, { "epoch": 0.02343, "grad_norm": 0.7746118903160095, "learning_rate": 0.003, "loss": 4.1654, "step": 2343 }, { "epoch": 0.02344, "grad_norm": 0.8185151815414429, "learning_rate": 0.003, "loss": 4.1668, "step": 2344 }, { "epoch": 0.02345, "grad_norm": 0.8307933211326599, "learning_rate": 0.003, "loss": 4.163, "step": 2345 }, { "epoch": 0.02346, "grad_norm": 0.7920862436294556, "learning_rate": 0.003, "loss": 4.1608, "step": 2346 }, { "epoch": 0.02347, "grad_norm": 0.7341541051864624, "learning_rate": 0.003, "loss": 4.1955, "step": 2347 }, { "epoch": 0.02348, "grad_norm": 0.7185869216918945, "learning_rate": 0.003, "loss": 4.1581, "step": 2348 }, { "epoch": 0.02349, "grad_norm": 0.7363864183425903, "learning_rate": 0.003, "loss": 4.1781, "step": 2349 }, { "epoch": 0.0235, "grad_norm": 0.8115167021751404, "learning_rate": 0.003, "loss": 4.1777, "step": 2350 }, { "epoch": 0.02351, "grad_norm": 0.8712372779846191, "learning_rate": 0.003, "loss": 4.1958, "step": 2351 }, { "epoch": 0.02352, "grad_norm": 0.7720403075218201, "learning_rate": 0.003, "loss": 4.1512, "step": 2352 }, { "epoch": 0.02353, "grad_norm": 0.7881413102149963, "learning_rate": 0.003, "loss": 4.1809, "step": 2353 }, { "epoch": 0.02354, "grad_norm": 0.9158604741096497, "learning_rate": 0.003, "loss": 4.1959, "step": 2354 }, { "epoch": 0.02355, "grad_norm": 0.8635379076004028, "learning_rate": 0.003, "loss": 4.1914, "step": 2355 }, { "epoch": 0.02356, "grad_norm": 0.9214644432067871, "learning_rate": 0.003, "loss": 4.182, "step": 2356 }, { "epoch": 0.02357, "grad_norm": 1.004699945449829, "learning_rate": 0.003, "loss": 4.1873, "step": 2357 }, { "epoch": 0.02358, "grad_norm": 0.8610268235206604, "learning_rate": 0.003, "loss": 4.1999, "step": 2358 }, { "epoch": 0.02359, "grad_norm": 0.7828254699707031, "learning_rate": 0.003, "loss": 4.1779, "step": 2359 }, { "epoch": 0.0236, "grad_norm": 0.6673213839530945, "learning_rate": 0.003, "loss": 4.2078, "step": 2360 }, { "epoch": 0.02361, "grad_norm": 0.6459137201309204, "learning_rate": 0.003, "loss": 4.1883, "step": 2361 }, { "epoch": 0.02362, "grad_norm": 0.5431698560714722, "learning_rate": 0.003, "loss": 4.1762, "step": 2362 }, { "epoch": 0.02363, "grad_norm": 0.551616907119751, "learning_rate": 0.003, "loss": 4.151, "step": 2363 }, { "epoch": 0.02364, "grad_norm": 0.5682998299598694, "learning_rate": 0.003, "loss": 4.1714, "step": 2364 }, { "epoch": 0.02365, "grad_norm": 0.651782751083374, "learning_rate": 0.003, "loss": 4.1916, "step": 2365 }, { "epoch": 0.02366, "grad_norm": 0.653306782245636, "learning_rate": 0.003, "loss": 4.164, "step": 2366 }, { "epoch": 0.02367, "grad_norm": 0.6368426084518433, "learning_rate": 0.003, "loss": 4.1718, "step": 2367 }, { "epoch": 0.02368, "grad_norm": 0.713473379611969, "learning_rate": 0.003, "loss": 4.1462, "step": 2368 }, { "epoch": 0.02369, "grad_norm": 0.8085301518440247, "learning_rate": 0.003, "loss": 4.1615, "step": 2369 }, { "epoch": 0.0237, "grad_norm": 0.7926542162895203, "learning_rate": 0.003, "loss": 4.1888, "step": 2370 }, { "epoch": 0.02371, "grad_norm": 0.828406572341919, "learning_rate": 0.003, "loss": 4.1795, "step": 2371 }, { "epoch": 0.02372, "grad_norm": 0.7606370449066162, "learning_rate": 0.003, "loss": 4.1649, "step": 2372 }, { "epoch": 0.02373, "grad_norm": 0.6590718626976013, "learning_rate": 0.003, "loss": 4.1489, "step": 2373 }, { "epoch": 0.02374, "grad_norm": 0.5918402075767517, "learning_rate": 0.003, "loss": 4.1536, "step": 2374 }, { "epoch": 0.02375, "grad_norm": 0.6583318710327148, "learning_rate": 0.003, "loss": 4.1578, "step": 2375 }, { "epoch": 0.02376, "grad_norm": 0.6908389329910278, "learning_rate": 0.003, "loss": 4.1806, "step": 2376 }, { "epoch": 0.02377, "grad_norm": 0.6131519675254822, "learning_rate": 0.003, "loss": 4.1471, "step": 2377 }, { "epoch": 0.02378, "grad_norm": 0.4756273031234741, "learning_rate": 0.003, "loss": 4.1538, "step": 2378 }, { "epoch": 0.02379, "grad_norm": 0.5182189345359802, "learning_rate": 0.003, "loss": 4.1425, "step": 2379 }, { "epoch": 0.0238, "grad_norm": 0.4939170181751251, "learning_rate": 0.003, "loss": 4.1141, "step": 2380 }, { "epoch": 0.02381, "grad_norm": 0.4250797629356384, "learning_rate": 0.003, "loss": 4.1259, "step": 2381 }, { "epoch": 0.02382, "grad_norm": 0.44852757453918457, "learning_rate": 0.003, "loss": 4.1818, "step": 2382 }, { "epoch": 0.02383, "grad_norm": 0.4787205457687378, "learning_rate": 0.003, "loss": 4.1449, "step": 2383 }, { "epoch": 0.02384, "grad_norm": 0.5154638290405273, "learning_rate": 0.003, "loss": 4.1313, "step": 2384 }, { "epoch": 0.02385, "grad_norm": 0.5779361128807068, "learning_rate": 0.003, "loss": 4.1594, "step": 2385 }, { "epoch": 0.02386, "grad_norm": 0.6726837158203125, "learning_rate": 0.003, "loss": 4.1392, "step": 2386 }, { "epoch": 0.02387, "grad_norm": 0.6705544590950012, "learning_rate": 0.003, "loss": 4.138, "step": 2387 }, { "epoch": 0.02388, "grad_norm": 0.520875096321106, "learning_rate": 0.003, "loss": 4.1869, "step": 2388 }, { "epoch": 0.02389, "grad_norm": 0.4918033182621002, "learning_rate": 0.003, "loss": 4.1476, "step": 2389 }, { "epoch": 0.0239, "grad_norm": 0.5085783004760742, "learning_rate": 0.003, "loss": 4.1404, "step": 2390 }, { "epoch": 0.02391, "grad_norm": 0.5585421323776245, "learning_rate": 0.003, "loss": 4.1645, "step": 2391 }, { "epoch": 0.02392, "grad_norm": 0.6892051100730896, "learning_rate": 0.003, "loss": 4.173, "step": 2392 }, { "epoch": 0.02393, "grad_norm": 0.7382843494415283, "learning_rate": 0.003, "loss": 4.1352, "step": 2393 }, { "epoch": 0.02394, "grad_norm": 0.6831554770469666, "learning_rate": 0.003, "loss": 4.1493, "step": 2394 }, { "epoch": 0.02395, "grad_norm": 0.5855069756507874, "learning_rate": 0.003, "loss": 4.1618, "step": 2395 }, { "epoch": 0.02396, "grad_norm": 0.618294894695282, "learning_rate": 0.003, "loss": 4.1749, "step": 2396 }, { "epoch": 0.02397, "grad_norm": 0.6775050163269043, "learning_rate": 0.003, "loss": 4.1604, "step": 2397 }, { "epoch": 0.02398, "grad_norm": 0.6511969566345215, "learning_rate": 0.003, "loss": 4.1663, "step": 2398 }, { "epoch": 0.02399, "grad_norm": 0.538463294506073, "learning_rate": 0.003, "loss": 4.142, "step": 2399 }, { "epoch": 0.024, "grad_norm": 0.6128776669502258, "learning_rate": 0.003, "loss": 4.1581, "step": 2400 }, { "epoch": 0.02401, "grad_norm": 0.7419440150260925, "learning_rate": 0.003, "loss": 4.1379, "step": 2401 }, { "epoch": 0.02402, "grad_norm": 0.8263295888900757, "learning_rate": 0.003, "loss": 4.1752, "step": 2402 }, { "epoch": 0.02403, "grad_norm": 0.7187276482582092, "learning_rate": 0.003, "loss": 4.143, "step": 2403 }, { "epoch": 0.02404, "grad_norm": 0.6155825257301331, "learning_rate": 0.003, "loss": 4.1775, "step": 2404 }, { "epoch": 0.02405, "grad_norm": 0.584798276424408, "learning_rate": 0.003, "loss": 4.1805, "step": 2405 }, { "epoch": 0.02406, "grad_norm": 0.7314527034759521, "learning_rate": 0.003, "loss": 4.169, "step": 2406 }, { "epoch": 0.02407, "grad_norm": 0.9434296488761902, "learning_rate": 0.003, "loss": 4.1562, "step": 2407 }, { "epoch": 0.02408, "grad_norm": 1.003302812576294, "learning_rate": 0.003, "loss": 4.1727, "step": 2408 }, { "epoch": 0.02409, "grad_norm": 0.9447243809700012, "learning_rate": 0.003, "loss": 4.1773, "step": 2409 }, { "epoch": 0.0241, "grad_norm": 0.7472692131996155, "learning_rate": 0.003, "loss": 4.1837, "step": 2410 }, { "epoch": 0.02411, "grad_norm": 0.8666222095489502, "learning_rate": 0.003, "loss": 4.1576, "step": 2411 }, { "epoch": 0.02412, "grad_norm": 0.9741396307945251, "learning_rate": 0.003, "loss": 4.1698, "step": 2412 }, { "epoch": 0.02413, "grad_norm": 1.0521008968353271, "learning_rate": 0.003, "loss": 4.1812, "step": 2413 }, { "epoch": 0.02414, "grad_norm": 1.098198652267456, "learning_rate": 0.003, "loss": 4.198, "step": 2414 }, { "epoch": 0.02415, "grad_norm": 1.0509668588638306, "learning_rate": 0.003, "loss": 4.1933, "step": 2415 }, { "epoch": 0.02416, "grad_norm": 0.9923770427703857, "learning_rate": 0.003, "loss": 4.1791, "step": 2416 }, { "epoch": 0.02417, "grad_norm": 0.9507772922515869, "learning_rate": 0.003, "loss": 4.2103, "step": 2417 }, { "epoch": 0.02418, "grad_norm": 0.7804825305938721, "learning_rate": 0.003, "loss": 4.1914, "step": 2418 }, { "epoch": 0.02419, "grad_norm": 0.6353334188461304, "learning_rate": 0.003, "loss": 4.1871, "step": 2419 }, { "epoch": 0.0242, "grad_norm": 0.7049343585968018, "learning_rate": 0.003, "loss": 4.1989, "step": 2420 }, { "epoch": 0.02421, "grad_norm": 0.8157212734222412, "learning_rate": 0.003, "loss": 4.1685, "step": 2421 }, { "epoch": 0.02422, "grad_norm": 0.9302619099617004, "learning_rate": 0.003, "loss": 4.2265, "step": 2422 }, { "epoch": 0.02423, "grad_norm": 1.0272700786590576, "learning_rate": 0.003, "loss": 4.2247, "step": 2423 }, { "epoch": 0.02424, "grad_norm": 1.0545587539672852, "learning_rate": 0.003, "loss": 4.2182, "step": 2424 }, { "epoch": 0.02425, "grad_norm": 0.8540631532669067, "learning_rate": 0.003, "loss": 4.2022, "step": 2425 }, { "epoch": 0.02426, "grad_norm": 0.7790244817733765, "learning_rate": 0.003, "loss": 4.2053, "step": 2426 }, { "epoch": 0.02427, "grad_norm": 0.6194785237312317, "learning_rate": 0.003, "loss": 4.2245, "step": 2427 }, { "epoch": 0.02428, "grad_norm": 0.723381519317627, "learning_rate": 0.003, "loss": 4.1967, "step": 2428 }, { "epoch": 0.02429, "grad_norm": 0.8042016625404358, "learning_rate": 0.003, "loss": 4.1857, "step": 2429 }, { "epoch": 0.0243, "grad_norm": 0.8838525414466858, "learning_rate": 0.003, "loss": 4.211, "step": 2430 }, { "epoch": 0.02431, "grad_norm": 0.9250835180282593, "learning_rate": 0.003, "loss": 4.1912, "step": 2431 }, { "epoch": 0.02432, "grad_norm": 0.7146485447883606, "learning_rate": 0.003, "loss": 4.2038, "step": 2432 }, { "epoch": 0.02433, "grad_norm": 0.689758837223053, "learning_rate": 0.003, "loss": 4.2175, "step": 2433 }, { "epoch": 0.02434, "grad_norm": 0.7109194993972778, "learning_rate": 0.003, "loss": 4.1797, "step": 2434 }, { "epoch": 0.02435, "grad_norm": 0.6944730281829834, "learning_rate": 0.003, "loss": 4.1786, "step": 2435 }, { "epoch": 0.02436, "grad_norm": 0.5854509472846985, "learning_rate": 0.003, "loss": 4.1804, "step": 2436 }, { "epoch": 0.02437, "grad_norm": 0.5410053730010986, "learning_rate": 0.003, "loss": 4.1621, "step": 2437 }, { "epoch": 0.02438, "grad_norm": 0.4632667303085327, "learning_rate": 0.003, "loss": 4.1763, "step": 2438 }, { "epoch": 0.02439, "grad_norm": 0.47085174918174744, "learning_rate": 0.003, "loss": 4.143, "step": 2439 }, { "epoch": 0.0244, "grad_norm": 0.4175461232662201, "learning_rate": 0.003, "loss": 4.1908, "step": 2440 }, { "epoch": 0.02441, "grad_norm": 0.4487816095352173, "learning_rate": 0.003, "loss": 4.1559, "step": 2441 }, { "epoch": 0.02442, "grad_norm": 0.3787577450275421, "learning_rate": 0.003, "loss": 4.1845, "step": 2442 }, { "epoch": 0.02443, "grad_norm": 0.4035150706768036, "learning_rate": 0.003, "loss": 4.1219, "step": 2443 }, { "epoch": 0.02444, "grad_norm": 0.399797648191452, "learning_rate": 0.003, "loss": 4.1495, "step": 2444 }, { "epoch": 0.02445, "grad_norm": 0.4260754883289337, "learning_rate": 0.003, "loss": 4.1538, "step": 2445 }, { "epoch": 0.02446, "grad_norm": 0.5048103928565979, "learning_rate": 0.003, "loss": 4.1567, "step": 2446 }, { "epoch": 0.02447, "grad_norm": 0.6289964914321899, "learning_rate": 0.003, "loss": 4.1667, "step": 2447 }, { "epoch": 0.02448, "grad_norm": 0.8102890253067017, "learning_rate": 0.003, "loss": 4.1719, "step": 2448 }, { "epoch": 0.02449, "grad_norm": 0.8461275100708008, "learning_rate": 0.003, "loss": 4.1705, "step": 2449 }, { "epoch": 0.0245, "grad_norm": 0.7120311260223389, "learning_rate": 0.003, "loss": 4.1692, "step": 2450 }, { "epoch": 0.02451, "grad_norm": 0.4928838908672333, "learning_rate": 0.003, "loss": 4.1398, "step": 2451 }, { "epoch": 0.02452, "grad_norm": 0.517375648021698, "learning_rate": 0.003, "loss": 4.1492, "step": 2452 }, { "epoch": 0.02453, "grad_norm": 0.5950134992599487, "learning_rate": 0.003, "loss": 4.1282, "step": 2453 }, { "epoch": 0.02454, "grad_norm": 0.6498395800590515, "learning_rate": 0.003, "loss": 4.1374, "step": 2454 }, { "epoch": 0.02455, "grad_norm": 0.666580319404602, "learning_rate": 0.003, "loss": 4.1482, "step": 2455 }, { "epoch": 0.02456, "grad_norm": 0.6841810941696167, "learning_rate": 0.003, "loss": 4.1183, "step": 2456 }, { "epoch": 0.02457, "grad_norm": 0.6185785531997681, "learning_rate": 0.003, "loss": 4.1655, "step": 2457 }, { "epoch": 0.02458, "grad_norm": 0.6619728207588196, "learning_rate": 0.003, "loss": 4.1399, "step": 2458 }, { "epoch": 0.02459, "grad_norm": 0.721472978591919, "learning_rate": 0.003, "loss": 4.1746, "step": 2459 }, { "epoch": 0.0246, "grad_norm": 0.7174915075302124, "learning_rate": 0.003, "loss": 4.1756, "step": 2460 }, { "epoch": 0.02461, "grad_norm": 0.7339833378791809, "learning_rate": 0.003, "loss": 4.1527, "step": 2461 }, { "epoch": 0.02462, "grad_norm": 0.738292932510376, "learning_rate": 0.003, "loss": 4.1817, "step": 2462 }, { "epoch": 0.02463, "grad_norm": 0.6757227778434753, "learning_rate": 0.003, "loss": 4.1783, "step": 2463 }, { "epoch": 0.02464, "grad_norm": 0.5629785656929016, "learning_rate": 0.003, "loss": 4.1522, "step": 2464 }, { "epoch": 0.02465, "grad_norm": 0.619929850101471, "learning_rate": 0.003, "loss": 4.1723, "step": 2465 }, { "epoch": 0.02466, "grad_norm": 0.5323424339294434, "learning_rate": 0.003, "loss": 4.1498, "step": 2466 }, { "epoch": 0.02467, "grad_norm": 0.5933912396430969, "learning_rate": 0.003, "loss": 4.1304, "step": 2467 }, { "epoch": 0.02468, "grad_norm": 0.5487152338027954, "learning_rate": 0.003, "loss": 4.1707, "step": 2468 }, { "epoch": 0.02469, "grad_norm": 0.5824567675590515, "learning_rate": 0.003, "loss": 4.1523, "step": 2469 }, { "epoch": 0.0247, "grad_norm": 0.674126148223877, "learning_rate": 0.003, "loss": 4.1498, "step": 2470 }, { "epoch": 0.02471, "grad_norm": 0.6640561819076538, "learning_rate": 0.003, "loss": 4.1387, "step": 2471 }, { "epoch": 0.02472, "grad_norm": 0.6775668859481812, "learning_rate": 0.003, "loss": 4.1435, "step": 2472 }, { "epoch": 0.02473, "grad_norm": 0.668128252029419, "learning_rate": 0.003, "loss": 4.1273, "step": 2473 }, { "epoch": 0.02474, "grad_norm": 0.6847355961799622, "learning_rate": 0.003, "loss": 4.1625, "step": 2474 }, { "epoch": 0.02475, "grad_norm": 0.8420220017433167, "learning_rate": 0.003, "loss": 4.1414, "step": 2475 }, { "epoch": 0.02476, "grad_norm": 0.8548656702041626, "learning_rate": 0.003, "loss": 4.1541, "step": 2476 }, { "epoch": 0.02477, "grad_norm": 0.6734539866447449, "learning_rate": 0.003, "loss": 4.1705, "step": 2477 }, { "epoch": 0.02478, "grad_norm": 0.6893376111984253, "learning_rate": 0.003, "loss": 4.1474, "step": 2478 }, { "epoch": 0.02479, "grad_norm": 0.5793731808662415, "learning_rate": 0.003, "loss": 4.1333, "step": 2479 }, { "epoch": 0.0248, "grad_norm": 0.5962329506874084, "learning_rate": 0.003, "loss": 4.1754, "step": 2480 }, { "epoch": 0.02481, "grad_norm": 0.5665598511695862, "learning_rate": 0.003, "loss": 4.1561, "step": 2481 }, { "epoch": 0.02482, "grad_norm": 0.6422098875045776, "learning_rate": 0.003, "loss": 4.1658, "step": 2482 }, { "epoch": 0.02483, "grad_norm": 0.6842206716537476, "learning_rate": 0.003, "loss": 4.1214, "step": 2483 }, { "epoch": 0.02484, "grad_norm": 0.6730716824531555, "learning_rate": 0.003, "loss": 4.1517, "step": 2484 }, { "epoch": 0.02485, "grad_norm": 0.7416301369667053, "learning_rate": 0.003, "loss": 4.1761, "step": 2485 }, { "epoch": 0.02486, "grad_norm": 0.8094339370727539, "learning_rate": 0.003, "loss": 4.2033, "step": 2486 }, { "epoch": 0.02487, "grad_norm": 0.7401948571205139, "learning_rate": 0.003, "loss": 4.1575, "step": 2487 }, { "epoch": 0.02488, "grad_norm": 0.7254106402397156, "learning_rate": 0.003, "loss": 4.1347, "step": 2488 }, { "epoch": 0.02489, "grad_norm": 0.7380350232124329, "learning_rate": 0.003, "loss": 4.1254, "step": 2489 }, { "epoch": 0.0249, "grad_norm": 0.8402035236358643, "learning_rate": 0.003, "loss": 4.1326, "step": 2490 }, { "epoch": 0.02491, "grad_norm": 0.9696373343467712, "learning_rate": 0.003, "loss": 4.1667, "step": 2491 }, { "epoch": 0.02492, "grad_norm": 0.9300618171691895, "learning_rate": 0.003, "loss": 4.1853, "step": 2492 }, { "epoch": 0.02493, "grad_norm": 0.7866774201393127, "learning_rate": 0.003, "loss": 4.1475, "step": 2493 }, { "epoch": 0.02494, "grad_norm": 0.7883460521697998, "learning_rate": 0.003, "loss": 4.1631, "step": 2494 }, { "epoch": 0.02495, "grad_norm": 0.7333797216415405, "learning_rate": 0.003, "loss": 4.1705, "step": 2495 }, { "epoch": 0.02496, "grad_norm": 0.7359468936920166, "learning_rate": 0.003, "loss": 4.1637, "step": 2496 }, { "epoch": 0.02497, "grad_norm": 0.7477230429649353, "learning_rate": 0.003, "loss": 4.1701, "step": 2497 }, { "epoch": 0.02498, "grad_norm": 0.6632980704307556, "learning_rate": 0.003, "loss": 4.1549, "step": 2498 }, { "epoch": 0.02499, "grad_norm": 0.5524125099182129, "learning_rate": 0.003, "loss": 4.1622, "step": 2499 }, { "epoch": 0.025, "grad_norm": 0.5341687202453613, "learning_rate": 0.003, "loss": 4.1471, "step": 2500 }, { "epoch": 0.02501, "grad_norm": 0.5111349821090698, "learning_rate": 0.003, "loss": 4.1766, "step": 2501 }, { "epoch": 0.02502, "grad_norm": 0.5449102520942688, "learning_rate": 0.003, "loss": 4.159, "step": 2502 }, { "epoch": 0.02503, "grad_norm": 0.5349955558776855, "learning_rate": 0.003, "loss": 4.1642, "step": 2503 }, { "epoch": 0.02504, "grad_norm": 0.48626482486724854, "learning_rate": 0.003, "loss": 4.1409, "step": 2504 }, { "epoch": 0.02505, "grad_norm": 0.527682363986969, "learning_rate": 0.003, "loss": 4.1423, "step": 2505 }, { "epoch": 0.02506, "grad_norm": 0.6103768348693848, "learning_rate": 0.003, "loss": 4.1565, "step": 2506 }, { "epoch": 0.02507, "grad_norm": 0.6964853405952454, "learning_rate": 0.003, "loss": 4.1472, "step": 2507 }, { "epoch": 0.02508, "grad_norm": 0.8648868203163147, "learning_rate": 0.003, "loss": 4.1494, "step": 2508 }, { "epoch": 0.02509, "grad_norm": 0.9269753694534302, "learning_rate": 0.003, "loss": 4.1639, "step": 2509 }, { "epoch": 0.0251, "grad_norm": 1.022930383682251, "learning_rate": 0.003, "loss": 4.1832, "step": 2510 }, { "epoch": 0.02511, "grad_norm": 0.7745629549026489, "learning_rate": 0.003, "loss": 4.1837, "step": 2511 }, { "epoch": 0.02512, "grad_norm": 0.6751075983047485, "learning_rate": 0.003, "loss": 4.1635, "step": 2512 }, { "epoch": 0.02513, "grad_norm": 0.7629876732826233, "learning_rate": 0.003, "loss": 4.1626, "step": 2513 }, { "epoch": 0.02514, "grad_norm": 0.7923519015312195, "learning_rate": 0.003, "loss": 4.1521, "step": 2514 }, { "epoch": 0.02515, "grad_norm": 0.7363923192024231, "learning_rate": 0.003, "loss": 4.1379, "step": 2515 }, { "epoch": 0.02516, "grad_norm": 0.6523436903953552, "learning_rate": 0.003, "loss": 4.1588, "step": 2516 }, { "epoch": 0.02517, "grad_norm": 0.669247031211853, "learning_rate": 0.003, "loss": 4.1618, "step": 2517 }, { "epoch": 0.02518, "grad_norm": 0.5932192802429199, "learning_rate": 0.003, "loss": 4.1656, "step": 2518 }, { "epoch": 0.02519, "grad_norm": 0.5920613408088684, "learning_rate": 0.003, "loss": 4.1449, "step": 2519 }, { "epoch": 0.0252, "grad_norm": 0.5889076590538025, "learning_rate": 0.003, "loss": 4.1566, "step": 2520 }, { "epoch": 0.02521, "grad_norm": 0.650814414024353, "learning_rate": 0.003, "loss": 4.1509, "step": 2521 }, { "epoch": 0.02522, "grad_norm": 0.5603437423706055, "learning_rate": 0.003, "loss": 4.1446, "step": 2522 }, { "epoch": 0.02523, "grad_norm": 0.4928959906101227, "learning_rate": 0.003, "loss": 4.146, "step": 2523 }, { "epoch": 0.02524, "grad_norm": 0.4817637801170349, "learning_rate": 0.003, "loss": 4.1571, "step": 2524 }, { "epoch": 0.02525, "grad_norm": 0.4819410741329193, "learning_rate": 0.003, "loss": 4.169, "step": 2525 }, { "epoch": 0.02526, "grad_norm": 0.5578210353851318, "learning_rate": 0.003, "loss": 4.1154, "step": 2526 }, { "epoch": 0.02527, "grad_norm": 0.7309604287147522, "learning_rate": 0.003, "loss": 4.157, "step": 2527 }, { "epoch": 0.02528, "grad_norm": 0.9689713716506958, "learning_rate": 0.003, "loss": 4.1882, "step": 2528 }, { "epoch": 0.02529, "grad_norm": 1.002740502357483, "learning_rate": 0.003, "loss": 4.1697, "step": 2529 }, { "epoch": 0.0253, "grad_norm": 0.845522940158844, "learning_rate": 0.003, "loss": 4.1791, "step": 2530 }, { "epoch": 0.02531, "grad_norm": 0.8186164498329163, "learning_rate": 0.003, "loss": 4.181, "step": 2531 }, { "epoch": 0.02532, "grad_norm": 0.7776713371276855, "learning_rate": 0.003, "loss": 4.15, "step": 2532 }, { "epoch": 0.02533, "grad_norm": 0.6073544025421143, "learning_rate": 0.003, "loss": 4.1843, "step": 2533 }, { "epoch": 0.02534, "grad_norm": 0.7059157490730286, "learning_rate": 0.003, "loss": 4.1674, "step": 2534 }, { "epoch": 0.02535, "grad_norm": 0.6908336281776428, "learning_rate": 0.003, "loss": 4.1601, "step": 2535 }, { "epoch": 0.02536, "grad_norm": 0.6330790519714355, "learning_rate": 0.003, "loss": 4.1547, "step": 2536 }, { "epoch": 0.02537, "grad_norm": 0.5009950995445251, "learning_rate": 0.003, "loss": 4.1424, "step": 2537 }, { "epoch": 0.02538, "grad_norm": 0.5122132897377014, "learning_rate": 0.003, "loss": 4.1182, "step": 2538 }, { "epoch": 0.02539, "grad_norm": 0.5557868480682373, "learning_rate": 0.003, "loss": 4.1698, "step": 2539 }, { "epoch": 0.0254, "grad_norm": 0.5659569501876831, "learning_rate": 0.003, "loss": 4.1601, "step": 2540 }, { "epoch": 0.02541, "grad_norm": 0.4916652739048004, "learning_rate": 0.003, "loss": 4.1476, "step": 2541 }, { "epoch": 0.02542, "grad_norm": 0.5319804549217224, "learning_rate": 0.003, "loss": 4.1273, "step": 2542 }, { "epoch": 0.02543, "grad_norm": 0.4849517047405243, "learning_rate": 0.003, "loss": 4.1525, "step": 2543 }, { "epoch": 0.02544, "grad_norm": 0.46546679735183716, "learning_rate": 0.003, "loss": 4.1407, "step": 2544 }, { "epoch": 0.02545, "grad_norm": 0.4903736710548401, "learning_rate": 0.003, "loss": 4.1226, "step": 2545 }, { "epoch": 0.02546, "grad_norm": 0.5320796370506287, "learning_rate": 0.003, "loss": 4.1328, "step": 2546 }, { "epoch": 0.02547, "grad_norm": 0.6876251101493835, "learning_rate": 0.003, "loss": 4.1736, "step": 2547 }, { "epoch": 0.02548, "grad_norm": 0.8988205194473267, "learning_rate": 0.003, "loss": 4.1808, "step": 2548 }, { "epoch": 0.02549, "grad_norm": 0.9427040815353394, "learning_rate": 0.003, "loss": 4.1752, "step": 2549 }, { "epoch": 0.0255, "grad_norm": 0.8955228924751282, "learning_rate": 0.003, "loss": 4.1429, "step": 2550 }, { "epoch": 0.02551, "grad_norm": 0.6933260560035706, "learning_rate": 0.003, "loss": 4.1649, "step": 2551 }, { "epoch": 0.02552, "grad_norm": 0.6283506751060486, "learning_rate": 0.003, "loss": 4.1318, "step": 2552 }, { "epoch": 0.02553, "grad_norm": 0.6629008054733276, "learning_rate": 0.003, "loss": 4.1984, "step": 2553 }, { "epoch": 0.02554, "grad_norm": 0.6356085538864136, "learning_rate": 0.003, "loss": 4.1468, "step": 2554 }, { "epoch": 0.02555, "grad_norm": 0.6418399214744568, "learning_rate": 0.003, "loss": 4.1375, "step": 2555 }, { "epoch": 0.02556, "grad_norm": 0.689191997051239, "learning_rate": 0.003, "loss": 4.1505, "step": 2556 }, { "epoch": 0.02557, "grad_norm": 0.6599099636077881, "learning_rate": 0.003, "loss": 4.1536, "step": 2557 }, { "epoch": 0.02558, "grad_norm": 0.6565999984741211, "learning_rate": 0.003, "loss": 4.1668, "step": 2558 }, { "epoch": 0.02559, "grad_norm": 0.6738297939300537, "learning_rate": 0.003, "loss": 4.1674, "step": 2559 }, { "epoch": 0.0256, "grad_norm": 0.5509478449821472, "learning_rate": 0.003, "loss": 4.1382, "step": 2560 }, { "epoch": 0.02561, "grad_norm": 0.5239382982254028, "learning_rate": 0.003, "loss": 4.1361, "step": 2561 }, { "epoch": 0.02562, "grad_norm": 0.5102459788322449, "learning_rate": 0.003, "loss": 4.1531, "step": 2562 }, { "epoch": 0.02563, "grad_norm": 0.5850344896316528, "learning_rate": 0.003, "loss": 4.1384, "step": 2563 }, { "epoch": 0.02564, "grad_norm": 0.6901075839996338, "learning_rate": 0.003, "loss": 4.1621, "step": 2564 }, { "epoch": 0.02565, "grad_norm": 0.8146087527275085, "learning_rate": 0.003, "loss": 4.1471, "step": 2565 }, { "epoch": 0.02566, "grad_norm": 0.8885420560836792, "learning_rate": 0.003, "loss": 4.1712, "step": 2566 }, { "epoch": 0.02567, "grad_norm": 0.9885812997817993, "learning_rate": 0.003, "loss": 4.1808, "step": 2567 }, { "epoch": 0.02568, "grad_norm": 1.0153335332870483, "learning_rate": 0.003, "loss": 4.1801, "step": 2568 }, { "epoch": 0.02569, "grad_norm": 0.9595790505409241, "learning_rate": 0.003, "loss": 4.1596, "step": 2569 }, { "epoch": 0.0257, "grad_norm": 1.0054306983947754, "learning_rate": 0.003, "loss": 4.1689, "step": 2570 }, { "epoch": 0.02571, "grad_norm": 0.997405469417572, "learning_rate": 0.003, "loss": 4.1908, "step": 2571 }, { "epoch": 0.02572, "grad_norm": 0.9328084588050842, "learning_rate": 0.003, "loss": 4.1687, "step": 2572 }, { "epoch": 0.02573, "grad_norm": 0.8243227005004883, "learning_rate": 0.003, "loss": 4.1707, "step": 2573 }, { "epoch": 0.02574, "grad_norm": 0.7659904360771179, "learning_rate": 0.003, "loss": 4.1984, "step": 2574 }, { "epoch": 0.02575, "grad_norm": 0.7300676107406616, "learning_rate": 0.003, "loss": 4.1334, "step": 2575 }, { "epoch": 0.02576, "grad_norm": 0.6190217733383179, "learning_rate": 0.003, "loss": 4.1369, "step": 2576 }, { "epoch": 0.02577, "grad_norm": 0.6222746968269348, "learning_rate": 0.003, "loss": 4.138, "step": 2577 }, { "epoch": 0.02578, "grad_norm": 0.698691725730896, "learning_rate": 0.003, "loss": 4.1629, "step": 2578 }, { "epoch": 0.02579, "grad_norm": 0.6583962440490723, "learning_rate": 0.003, "loss": 4.1089, "step": 2579 }, { "epoch": 0.0258, "grad_norm": 0.6283578872680664, "learning_rate": 0.003, "loss": 4.1658, "step": 2580 }, { "epoch": 0.02581, "grad_norm": 0.48666515946388245, "learning_rate": 0.003, "loss": 4.1808, "step": 2581 }, { "epoch": 0.02582, "grad_norm": 0.5155734419822693, "learning_rate": 0.003, "loss": 4.1757, "step": 2582 }, { "epoch": 0.02583, "grad_norm": 0.46086037158966064, "learning_rate": 0.003, "loss": 4.1498, "step": 2583 }, { "epoch": 0.02584, "grad_norm": 0.4467635154724121, "learning_rate": 0.003, "loss": 4.1578, "step": 2584 }, { "epoch": 0.02585, "grad_norm": 0.4732924997806549, "learning_rate": 0.003, "loss": 4.1407, "step": 2585 }, { "epoch": 0.02586, "grad_norm": 0.40596628189086914, "learning_rate": 0.003, "loss": 4.159, "step": 2586 }, { "epoch": 0.02587, "grad_norm": 0.41176727414131165, "learning_rate": 0.003, "loss": 4.1467, "step": 2587 }, { "epoch": 0.02588, "grad_norm": 0.4998745620250702, "learning_rate": 0.003, "loss": 4.1476, "step": 2588 }, { "epoch": 0.02589, "grad_norm": 0.6378880143165588, "learning_rate": 0.003, "loss": 4.1526, "step": 2589 }, { "epoch": 0.0259, "grad_norm": 0.9064441919326782, "learning_rate": 0.003, "loss": 4.1707, "step": 2590 }, { "epoch": 0.02591, "grad_norm": 1.16274094581604, "learning_rate": 0.003, "loss": 4.1263, "step": 2591 }, { "epoch": 0.02592, "grad_norm": 0.8324390053749084, "learning_rate": 0.003, "loss": 4.1564, "step": 2592 }, { "epoch": 0.02593, "grad_norm": 0.8290135860443115, "learning_rate": 0.003, "loss": 4.1637, "step": 2593 }, { "epoch": 0.02594, "grad_norm": 0.8181279897689819, "learning_rate": 0.003, "loss": 4.1811, "step": 2594 }, { "epoch": 0.02595, "grad_norm": 0.7736456394195557, "learning_rate": 0.003, "loss": 4.1375, "step": 2595 }, { "epoch": 0.02596, "grad_norm": 0.7409265041351318, "learning_rate": 0.003, "loss": 4.1736, "step": 2596 }, { "epoch": 0.02597, "grad_norm": 0.7836101651191711, "learning_rate": 0.003, "loss": 4.1438, "step": 2597 }, { "epoch": 0.02598, "grad_norm": 0.6943440437316895, "learning_rate": 0.003, "loss": 4.1827, "step": 2598 }, { "epoch": 0.02599, "grad_norm": 0.6729668378829956, "learning_rate": 0.003, "loss": 4.1732, "step": 2599 }, { "epoch": 0.026, "grad_norm": 0.7131978869438171, "learning_rate": 0.003, "loss": 4.1756, "step": 2600 }, { "epoch": 0.02601, "grad_norm": 0.7535547018051147, "learning_rate": 0.003, "loss": 4.1785, "step": 2601 }, { "epoch": 0.02602, "grad_norm": 0.8210467100143433, "learning_rate": 0.003, "loss": 4.1562, "step": 2602 }, { "epoch": 0.02603, "grad_norm": 0.7161123752593994, "learning_rate": 0.003, "loss": 4.1505, "step": 2603 }, { "epoch": 0.02604, "grad_norm": 0.6834149956703186, "learning_rate": 0.003, "loss": 4.1371, "step": 2604 }, { "epoch": 0.02605, "grad_norm": 0.5697470307350159, "learning_rate": 0.003, "loss": 4.1503, "step": 2605 }, { "epoch": 0.02606, "grad_norm": 0.5395636558532715, "learning_rate": 0.003, "loss": 4.1673, "step": 2606 }, { "epoch": 0.02607, "grad_norm": 0.47388994693756104, "learning_rate": 0.003, "loss": 4.157, "step": 2607 }, { "epoch": 0.02608, "grad_norm": 0.435537189245224, "learning_rate": 0.003, "loss": 4.1355, "step": 2608 }, { "epoch": 0.02609, "grad_norm": 0.4369211792945862, "learning_rate": 0.003, "loss": 4.1788, "step": 2609 }, { "epoch": 0.0261, "grad_norm": 0.46976083517074585, "learning_rate": 0.003, "loss": 4.1283, "step": 2610 }, { "epoch": 0.02611, "grad_norm": 0.5262687802314758, "learning_rate": 0.003, "loss": 4.1186, "step": 2611 }, { "epoch": 0.02612, "grad_norm": 0.7051580548286438, "learning_rate": 0.003, "loss": 4.1591, "step": 2612 }, { "epoch": 0.02613, "grad_norm": 0.8811540603637695, "learning_rate": 0.003, "loss": 4.1618, "step": 2613 }, { "epoch": 0.02614, "grad_norm": 0.8296162486076355, "learning_rate": 0.003, "loss": 4.1426, "step": 2614 }, { "epoch": 0.02615, "grad_norm": 0.5731111764907837, "learning_rate": 0.003, "loss": 4.1239, "step": 2615 }, { "epoch": 0.02616, "grad_norm": 0.6465540528297424, "learning_rate": 0.003, "loss": 4.1387, "step": 2616 }, { "epoch": 0.02617, "grad_norm": 0.7469308972358704, "learning_rate": 0.003, "loss": 4.1301, "step": 2617 }, { "epoch": 0.02618, "grad_norm": 0.7493590712547302, "learning_rate": 0.003, "loss": 4.1589, "step": 2618 }, { "epoch": 0.02619, "grad_norm": 0.6884176731109619, "learning_rate": 0.003, "loss": 4.1704, "step": 2619 }, { "epoch": 0.0262, "grad_norm": 0.6231204271316528, "learning_rate": 0.003, "loss": 4.1306, "step": 2620 }, { "epoch": 0.02621, "grad_norm": 0.5638678669929504, "learning_rate": 0.003, "loss": 4.1548, "step": 2621 }, { "epoch": 0.02622, "grad_norm": 0.6251258254051208, "learning_rate": 0.003, "loss": 4.1545, "step": 2622 }, { "epoch": 0.02623, "grad_norm": 0.6526143550872803, "learning_rate": 0.003, "loss": 4.1604, "step": 2623 }, { "epoch": 0.02624, "grad_norm": 0.7207937836647034, "learning_rate": 0.003, "loss": 4.1973, "step": 2624 }, { "epoch": 0.02625, "grad_norm": 0.7291254997253418, "learning_rate": 0.003, "loss": 4.1951, "step": 2625 }, { "epoch": 0.02626, "grad_norm": 0.6461899280548096, "learning_rate": 0.003, "loss": 4.1285, "step": 2626 }, { "epoch": 0.02627, "grad_norm": 0.7499544620513916, "learning_rate": 0.003, "loss": 4.1449, "step": 2627 }, { "epoch": 0.02628, "grad_norm": 0.7836819887161255, "learning_rate": 0.003, "loss": 4.1795, "step": 2628 }, { "epoch": 0.02629, "grad_norm": 0.7647614479064941, "learning_rate": 0.003, "loss": 4.1169, "step": 2629 }, { "epoch": 0.0263, "grad_norm": 0.6021679639816284, "learning_rate": 0.003, "loss": 4.1405, "step": 2630 }, { "epoch": 0.02631, "grad_norm": 0.5774012804031372, "learning_rate": 0.003, "loss": 4.1643, "step": 2631 }, { "epoch": 0.02632, "grad_norm": 0.5787044167518616, "learning_rate": 0.003, "loss": 4.1232, "step": 2632 }, { "epoch": 0.02633, "grad_norm": 0.6322444677352905, "learning_rate": 0.003, "loss": 4.14, "step": 2633 }, { "epoch": 0.02634, "grad_norm": 0.6454491019248962, "learning_rate": 0.003, "loss": 4.161, "step": 2634 }, { "epoch": 0.02635, "grad_norm": 0.6432276368141174, "learning_rate": 0.003, "loss": 4.1552, "step": 2635 }, { "epoch": 0.02636, "grad_norm": 0.7024194002151489, "learning_rate": 0.003, "loss": 4.1424, "step": 2636 }, { "epoch": 0.02637, "grad_norm": 0.8076905012130737, "learning_rate": 0.003, "loss": 4.1774, "step": 2637 }, { "epoch": 0.02638, "grad_norm": 0.7787435054779053, "learning_rate": 0.003, "loss": 4.1404, "step": 2638 }, { "epoch": 0.02639, "grad_norm": 0.6584726572036743, "learning_rate": 0.003, "loss": 4.1295, "step": 2639 }, { "epoch": 0.0264, "grad_norm": 0.6226917505264282, "learning_rate": 0.003, "loss": 4.1266, "step": 2640 }, { "epoch": 0.02641, "grad_norm": 0.5553937554359436, "learning_rate": 0.003, "loss": 4.1385, "step": 2641 }, { "epoch": 0.02642, "grad_norm": 0.5754583477973938, "learning_rate": 0.003, "loss": 4.1442, "step": 2642 }, { "epoch": 0.02643, "grad_norm": 0.6242693662643433, "learning_rate": 0.003, "loss": 4.1325, "step": 2643 }, { "epoch": 0.02644, "grad_norm": 0.6401628255844116, "learning_rate": 0.003, "loss": 4.1486, "step": 2644 }, { "epoch": 0.02645, "grad_norm": 0.6735085844993591, "learning_rate": 0.003, "loss": 4.1629, "step": 2645 }, { "epoch": 0.02646, "grad_norm": 0.7502104640007019, "learning_rate": 0.003, "loss": 4.1325, "step": 2646 }, { "epoch": 0.02647, "grad_norm": 0.9201660752296448, "learning_rate": 0.003, "loss": 4.1609, "step": 2647 }, { "epoch": 0.02648, "grad_norm": 0.8611843585968018, "learning_rate": 0.003, "loss": 4.1815, "step": 2648 }, { "epoch": 0.02649, "grad_norm": 0.6259444355964661, "learning_rate": 0.003, "loss": 4.1591, "step": 2649 }, { "epoch": 0.0265, "grad_norm": 0.6097885966300964, "learning_rate": 0.003, "loss": 4.1187, "step": 2650 }, { "epoch": 0.02651, "grad_norm": 0.696152925491333, "learning_rate": 0.003, "loss": 4.1424, "step": 2651 }, { "epoch": 0.02652, "grad_norm": 0.6412234902381897, "learning_rate": 0.003, "loss": 4.1679, "step": 2652 }, { "epoch": 0.02653, "grad_norm": 0.6418108940124512, "learning_rate": 0.003, "loss": 4.1255, "step": 2653 }, { "epoch": 0.02654, "grad_norm": 0.5534683465957642, "learning_rate": 0.003, "loss": 4.1338, "step": 2654 }, { "epoch": 0.02655, "grad_norm": 0.5514299273490906, "learning_rate": 0.003, "loss": 4.1394, "step": 2655 }, { "epoch": 0.02656, "grad_norm": 0.48489049077033997, "learning_rate": 0.003, "loss": 4.1108, "step": 2656 }, { "epoch": 0.02657, "grad_norm": 0.5331873297691345, "learning_rate": 0.003, "loss": 4.1423, "step": 2657 }, { "epoch": 0.02658, "grad_norm": 0.5732285380363464, "learning_rate": 0.003, "loss": 4.149, "step": 2658 }, { "epoch": 0.02659, "grad_norm": 0.6610122323036194, "learning_rate": 0.003, "loss": 4.1479, "step": 2659 }, { "epoch": 0.0266, "grad_norm": 0.7037291526794434, "learning_rate": 0.003, "loss": 4.1333, "step": 2660 }, { "epoch": 0.02661, "grad_norm": 0.8676835894584656, "learning_rate": 0.003, "loss": 4.161, "step": 2661 }, { "epoch": 0.02662, "grad_norm": 0.9771322011947632, "learning_rate": 0.003, "loss": 4.1542, "step": 2662 }, { "epoch": 0.02663, "grad_norm": 1.1501520872116089, "learning_rate": 0.003, "loss": 4.1488, "step": 2663 }, { "epoch": 0.02664, "grad_norm": 0.8324311971664429, "learning_rate": 0.003, "loss": 4.1441, "step": 2664 }, { "epoch": 0.02665, "grad_norm": 0.852374255657196, "learning_rate": 0.003, "loss": 4.1931, "step": 2665 }, { "epoch": 0.02666, "grad_norm": 1.0820196866989136, "learning_rate": 0.003, "loss": 4.198, "step": 2666 }, { "epoch": 0.02667, "grad_norm": 0.7490083575248718, "learning_rate": 0.003, "loss": 4.1632, "step": 2667 }, { "epoch": 0.02668, "grad_norm": 0.679391622543335, "learning_rate": 0.003, "loss": 4.1331, "step": 2668 }, { "epoch": 0.02669, "grad_norm": 0.6063788533210754, "learning_rate": 0.003, "loss": 4.1519, "step": 2669 }, { "epoch": 0.0267, "grad_norm": 0.6576395034790039, "learning_rate": 0.003, "loss": 4.1517, "step": 2670 }, { "epoch": 0.02671, "grad_norm": 0.8710551261901855, "learning_rate": 0.003, "loss": 4.1782, "step": 2671 }, { "epoch": 0.02672, "grad_norm": 1.043360710144043, "learning_rate": 0.003, "loss": 4.179, "step": 2672 }, { "epoch": 0.02673, "grad_norm": 0.8524636030197144, "learning_rate": 0.003, "loss": 4.1702, "step": 2673 }, { "epoch": 0.02674, "grad_norm": 0.7167258858680725, "learning_rate": 0.003, "loss": 4.135, "step": 2674 }, { "epoch": 0.02675, "grad_norm": 0.8229854106903076, "learning_rate": 0.003, "loss": 4.1392, "step": 2675 }, { "epoch": 0.02676, "grad_norm": 0.8725418448448181, "learning_rate": 0.003, "loss": 4.1608, "step": 2676 }, { "epoch": 0.02677, "grad_norm": 0.8751718997955322, "learning_rate": 0.003, "loss": 4.189, "step": 2677 }, { "epoch": 0.02678, "grad_norm": 0.9639006853103638, "learning_rate": 0.003, "loss": 4.1897, "step": 2678 }, { "epoch": 0.02679, "grad_norm": 0.9028424024581909, "learning_rate": 0.003, "loss": 4.1596, "step": 2679 }, { "epoch": 0.0268, "grad_norm": 0.8419076204299927, "learning_rate": 0.003, "loss": 4.1719, "step": 2680 }, { "epoch": 0.02681, "grad_norm": 0.8327796459197998, "learning_rate": 0.003, "loss": 4.1966, "step": 2681 }, { "epoch": 0.02682, "grad_norm": 0.6412435173988342, "learning_rate": 0.003, "loss": 4.1667, "step": 2682 }, { "epoch": 0.02683, "grad_norm": 0.5319331288337708, "learning_rate": 0.003, "loss": 4.1829, "step": 2683 }, { "epoch": 0.02684, "grad_norm": 0.5018413662910461, "learning_rate": 0.003, "loss": 4.1559, "step": 2684 }, { "epoch": 0.02685, "grad_norm": 0.4933565557003021, "learning_rate": 0.003, "loss": 4.1592, "step": 2685 }, { "epoch": 0.02686, "grad_norm": 0.5294405221939087, "learning_rate": 0.003, "loss": 4.1404, "step": 2686 }, { "epoch": 0.02687, "grad_norm": 0.6580759882926941, "learning_rate": 0.003, "loss": 4.1532, "step": 2687 }, { "epoch": 0.02688, "grad_norm": 0.7823148369789124, "learning_rate": 0.003, "loss": 4.1334, "step": 2688 }, { "epoch": 0.02689, "grad_norm": 0.8333227038383484, "learning_rate": 0.003, "loss": 4.1677, "step": 2689 }, { "epoch": 0.0269, "grad_norm": 0.6743582487106323, "learning_rate": 0.003, "loss": 4.1651, "step": 2690 }, { "epoch": 0.02691, "grad_norm": 0.5275107622146606, "learning_rate": 0.003, "loss": 4.154, "step": 2691 }, { "epoch": 0.02692, "grad_norm": 0.585838258266449, "learning_rate": 0.003, "loss": 4.1589, "step": 2692 }, { "epoch": 0.02693, "grad_norm": 0.6433229446411133, "learning_rate": 0.003, "loss": 4.1748, "step": 2693 }, { "epoch": 0.02694, "grad_norm": 0.5260990858078003, "learning_rate": 0.003, "loss": 4.1297, "step": 2694 }, { "epoch": 0.02695, "grad_norm": 0.49656516313552856, "learning_rate": 0.003, "loss": 4.1442, "step": 2695 }, { "epoch": 0.02696, "grad_norm": 0.46913599967956543, "learning_rate": 0.003, "loss": 4.1441, "step": 2696 }, { "epoch": 0.02697, "grad_norm": 0.4466596841812134, "learning_rate": 0.003, "loss": 4.1534, "step": 2697 }, { "epoch": 0.02698, "grad_norm": 0.4291961193084717, "learning_rate": 0.003, "loss": 4.1359, "step": 2698 }, { "epoch": 0.02699, "grad_norm": 0.43313321471214294, "learning_rate": 0.003, "loss": 4.1533, "step": 2699 }, { "epoch": 0.027, "grad_norm": 0.4425189793109894, "learning_rate": 0.003, "loss": 4.1247, "step": 2700 }, { "epoch": 0.02701, "grad_norm": 0.41129767894744873, "learning_rate": 0.003, "loss": 4.112, "step": 2701 }, { "epoch": 0.02702, "grad_norm": 0.3028920888900757, "learning_rate": 0.003, "loss": 4.1613, "step": 2702 }, { "epoch": 0.02703, "grad_norm": 0.36113715171813965, "learning_rate": 0.003, "loss": 4.1575, "step": 2703 }, { "epoch": 0.02704, "grad_norm": 0.42614156007766724, "learning_rate": 0.003, "loss": 4.1013, "step": 2704 }, { "epoch": 0.02705, "grad_norm": 0.4877123534679413, "learning_rate": 0.003, "loss": 4.1578, "step": 2705 }, { "epoch": 0.02706, "grad_norm": 0.6430928707122803, "learning_rate": 0.003, "loss": 4.1234, "step": 2706 }, { "epoch": 0.02707, "grad_norm": 0.7538681626319885, "learning_rate": 0.003, "loss": 4.1289, "step": 2707 }, { "epoch": 0.02708, "grad_norm": 0.9270827770233154, "learning_rate": 0.003, "loss": 4.1612, "step": 2708 }, { "epoch": 0.02709, "grad_norm": 1.0371726751327515, "learning_rate": 0.003, "loss": 4.1442, "step": 2709 }, { "epoch": 0.0271, "grad_norm": 0.7252341508865356, "learning_rate": 0.003, "loss": 4.1418, "step": 2710 }, { "epoch": 0.02711, "grad_norm": 0.6913469433784485, "learning_rate": 0.003, "loss": 4.1271, "step": 2711 }, { "epoch": 0.02712, "grad_norm": 0.7029024362564087, "learning_rate": 0.003, "loss": 4.142, "step": 2712 }, { "epoch": 0.02713, "grad_norm": 0.6771293878555298, "learning_rate": 0.003, "loss": 4.1313, "step": 2713 }, { "epoch": 0.02714, "grad_norm": 0.6258983016014099, "learning_rate": 0.003, "loss": 4.1322, "step": 2714 }, { "epoch": 0.02715, "grad_norm": 0.7235097289085388, "learning_rate": 0.003, "loss": 4.1319, "step": 2715 }, { "epoch": 0.02716, "grad_norm": 0.8248934745788574, "learning_rate": 0.003, "loss": 4.1475, "step": 2716 }, { "epoch": 0.02717, "grad_norm": 0.8002756237983704, "learning_rate": 0.003, "loss": 4.1502, "step": 2717 }, { "epoch": 0.02718, "grad_norm": 0.7360194325447083, "learning_rate": 0.003, "loss": 4.1647, "step": 2718 }, { "epoch": 0.02719, "grad_norm": 0.8694934844970703, "learning_rate": 0.003, "loss": 4.145, "step": 2719 }, { "epoch": 0.0272, "grad_norm": 0.8528920412063599, "learning_rate": 0.003, "loss": 4.137, "step": 2720 }, { "epoch": 0.02721, "grad_norm": 1.0115561485290527, "learning_rate": 0.003, "loss": 4.1594, "step": 2721 }, { "epoch": 0.02722, "grad_norm": 1.050325632095337, "learning_rate": 0.003, "loss": 4.1686, "step": 2722 }, { "epoch": 0.02723, "grad_norm": 0.8528364896774292, "learning_rate": 0.003, "loss": 4.1629, "step": 2723 }, { "epoch": 0.02724, "grad_norm": 0.8148915767669678, "learning_rate": 0.003, "loss": 4.1665, "step": 2724 }, { "epoch": 0.02725, "grad_norm": 0.707343339920044, "learning_rate": 0.003, "loss": 4.1539, "step": 2725 }, { "epoch": 0.02726, "grad_norm": 0.7041299939155579, "learning_rate": 0.003, "loss": 4.1281, "step": 2726 }, { "epoch": 0.02727, "grad_norm": 0.5910518765449524, "learning_rate": 0.003, "loss": 4.1551, "step": 2727 }, { "epoch": 0.02728, "grad_norm": 0.6046881079673767, "learning_rate": 0.003, "loss": 4.1422, "step": 2728 }, { "epoch": 0.02729, "grad_norm": 0.5683781504631042, "learning_rate": 0.003, "loss": 4.1716, "step": 2729 }, { "epoch": 0.0273, "grad_norm": 0.5593625903129578, "learning_rate": 0.003, "loss": 4.143, "step": 2730 }, { "epoch": 0.02731, "grad_norm": 0.5543289184570312, "learning_rate": 0.003, "loss": 4.1417, "step": 2731 }, { "epoch": 0.02732, "grad_norm": 0.5714956521987915, "learning_rate": 0.003, "loss": 4.135, "step": 2732 }, { "epoch": 0.02733, "grad_norm": 0.5906261205673218, "learning_rate": 0.003, "loss": 4.1645, "step": 2733 }, { "epoch": 0.02734, "grad_norm": 0.7874712944030762, "learning_rate": 0.003, "loss": 4.1208, "step": 2734 }, { "epoch": 0.02735, "grad_norm": 0.9163006544113159, "learning_rate": 0.003, "loss": 4.174, "step": 2735 }, { "epoch": 0.02736, "grad_norm": 0.8792824745178223, "learning_rate": 0.003, "loss": 4.1495, "step": 2736 }, { "epoch": 0.02737, "grad_norm": 0.7901833057403564, "learning_rate": 0.003, "loss": 4.1377, "step": 2737 }, { "epoch": 0.02738, "grad_norm": 0.8548569083213806, "learning_rate": 0.003, "loss": 4.1629, "step": 2738 }, { "epoch": 0.02739, "grad_norm": 0.7134705185890198, "learning_rate": 0.003, "loss": 4.1544, "step": 2739 }, { "epoch": 0.0274, "grad_norm": 0.6430677175521851, "learning_rate": 0.003, "loss": 4.139, "step": 2740 }, { "epoch": 0.02741, "grad_norm": 0.5635836720466614, "learning_rate": 0.003, "loss": 4.1495, "step": 2741 }, { "epoch": 0.02742, "grad_norm": 0.5489112138748169, "learning_rate": 0.003, "loss": 4.1367, "step": 2742 }, { "epoch": 0.02743, "grad_norm": 0.5366271734237671, "learning_rate": 0.003, "loss": 4.1572, "step": 2743 }, { "epoch": 0.02744, "grad_norm": 0.489491730928421, "learning_rate": 0.003, "loss": 4.1372, "step": 2744 }, { "epoch": 0.02745, "grad_norm": 0.47551363706588745, "learning_rate": 0.003, "loss": 4.1468, "step": 2745 }, { "epoch": 0.02746, "grad_norm": 0.39822816848754883, "learning_rate": 0.003, "loss": 4.1191, "step": 2746 }, { "epoch": 0.02747, "grad_norm": 0.3974153399467468, "learning_rate": 0.003, "loss": 4.1538, "step": 2747 }, { "epoch": 0.02748, "grad_norm": 0.47189366817474365, "learning_rate": 0.003, "loss": 4.107, "step": 2748 }, { "epoch": 0.02749, "grad_norm": 0.5153442025184631, "learning_rate": 0.003, "loss": 4.1113, "step": 2749 }, { "epoch": 0.0275, "grad_norm": 0.565412700176239, "learning_rate": 0.003, "loss": 4.1461, "step": 2750 }, { "epoch": 0.02751, "grad_norm": 0.6354933977127075, "learning_rate": 0.003, "loss": 4.1638, "step": 2751 }, { "epoch": 0.02752, "grad_norm": 0.6897895932197571, "learning_rate": 0.003, "loss": 4.1494, "step": 2752 }, { "epoch": 0.02753, "grad_norm": 0.6377696394920349, "learning_rate": 0.003, "loss": 4.1306, "step": 2753 }, { "epoch": 0.02754, "grad_norm": 0.5976959466934204, "learning_rate": 0.003, "loss": 4.1139, "step": 2754 }, { "epoch": 0.02755, "grad_norm": 0.702274739742279, "learning_rate": 0.003, "loss": 4.1146, "step": 2755 }, { "epoch": 0.02756, "grad_norm": 0.7118279337882996, "learning_rate": 0.003, "loss": 4.1439, "step": 2756 }, { "epoch": 0.02757, "grad_norm": 0.660001277923584, "learning_rate": 0.003, "loss": 4.1563, "step": 2757 }, { "epoch": 0.02758, "grad_norm": 0.7159073948860168, "learning_rate": 0.003, "loss": 4.106, "step": 2758 }, { "epoch": 0.02759, "grad_norm": 0.8859509825706482, "learning_rate": 0.003, "loss": 4.1452, "step": 2759 }, { "epoch": 0.0276, "grad_norm": 0.9749924540519714, "learning_rate": 0.003, "loss": 4.1562, "step": 2760 }, { "epoch": 0.02761, "grad_norm": 0.9062779545783997, "learning_rate": 0.003, "loss": 4.1742, "step": 2761 }, { "epoch": 0.02762, "grad_norm": 0.8129538893699646, "learning_rate": 0.003, "loss": 4.1407, "step": 2762 }, { "epoch": 0.02763, "grad_norm": 0.7007825970649719, "learning_rate": 0.003, "loss": 4.1479, "step": 2763 }, { "epoch": 0.02764, "grad_norm": 0.8174701929092407, "learning_rate": 0.003, "loss": 4.1454, "step": 2764 }, { "epoch": 0.02765, "grad_norm": 0.8394188284873962, "learning_rate": 0.003, "loss": 4.1542, "step": 2765 }, { "epoch": 0.02766, "grad_norm": 0.8616805672645569, "learning_rate": 0.003, "loss": 4.1254, "step": 2766 }, { "epoch": 0.02767, "grad_norm": 0.8441250920295715, "learning_rate": 0.003, "loss": 4.1869, "step": 2767 }, { "epoch": 0.02768, "grad_norm": 0.7080286741256714, "learning_rate": 0.003, "loss": 4.1142, "step": 2768 }, { "epoch": 0.02769, "grad_norm": 0.708198070526123, "learning_rate": 0.003, "loss": 4.174, "step": 2769 }, { "epoch": 0.0277, "grad_norm": 0.7906506061553955, "learning_rate": 0.003, "loss": 4.1426, "step": 2770 }, { "epoch": 0.02771, "grad_norm": 0.7239770889282227, "learning_rate": 0.003, "loss": 4.1314, "step": 2771 }, { "epoch": 0.02772, "grad_norm": 0.5857943296432495, "learning_rate": 0.003, "loss": 4.1172, "step": 2772 }, { "epoch": 0.02773, "grad_norm": 0.5903287529945374, "learning_rate": 0.003, "loss": 4.1512, "step": 2773 }, { "epoch": 0.02774, "grad_norm": 0.6580007672309875, "learning_rate": 0.003, "loss": 4.176, "step": 2774 }, { "epoch": 0.02775, "grad_norm": 0.6529462933540344, "learning_rate": 0.003, "loss": 4.1423, "step": 2775 }, { "epoch": 0.02776, "grad_norm": 0.7036746144294739, "learning_rate": 0.003, "loss": 4.138, "step": 2776 }, { "epoch": 0.02777, "grad_norm": 0.7649575471878052, "learning_rate": 0.003, "loss": 4.1221, "step": 2777 }, { "epoch": 0.02778, "grad_norm": 0.6718697547912598, "learning_rate": 0.003, "loss": 4.1607, "step": 2778 }, { "epoch": 0.02779, "grad_norm": 0.587509036064148, "learning_rate": 0.003, "loss": 4.1283, "step": 2779 }, { "epoch": 0.0278, "grad_norm": 0.5817381143569946, "learning_rate": 0.003, "loss": 4.1256, "step": 2780 }, { "epoch": 0.02781, "grad_norm": 0.5770044326782227, "learning_rate": 0.003, "loss": 4.1078, "step": 2781 }, { "epoch": 0.02782, "grad_norm": 0.687736451625824, "learning_rate": 0.003, "loss": 4.1341, "step": 2782 }, { "epoch": 0.02783, "grad_norm": 0.6473103165626526, "learning_rate": 0.003, "loss": 4.1386, "step": 2783 }, { "epoch": 0.02784, "grad_norm": 0.5239824056625366, "learning_rate": 0.003, "loss": 4.1293, "step": 2784 }, { "epoch": 0.02785, "grad_norm": 0.4588249921798706, "learning_rate": 0.003, "loss": 4.1447, "step": 2785 }, { "epoch": 0.02786, "grad_norm": 0.4880278408527374, "learning_rate": 0.003, "loss": 4.1394, "step": 2786 }, { "epoch": 0.02787, "grad_norm": 0.502045750617981, "learning_rate": 0.003, "loss": 4.1396, "step": 2787 }, { "epoch": 0.02788, "grad_norm": 0.6067155003547668, "learning_rate": 0.003, "loss": 4.1294, "step": 2788 }, { "epoch": 0.02789, "grad_norm": 0.7814184427261353, "learning_rate": 0.003, "loss": 4.1409, "step": 2789 }, { "epoch": 0.0279, "grad_norm": 1.1954095363616943, "learning_rate": 0.003, "loss": 4.132, "step": 2790 }, { "epoch": 0.02791, "grad_norm": 0.7128247022628784, "learning_rate": 0.003, "loss": 4.1352, "step": 2791 }, { "epoch": 0.02792, "grad_norm": 0.6396022439002991, "learning_rate": 0.003, "loss": 4.1495, "step": 2792 }, { "epoch": 0.02793, "grad_norm": 0.8452949523925781, "learning_rate": 0.003, "loss": 4.1513, "step": 2793 }, { "epoch": 0.02794, "grad_norm": 0.7837446928024292, "learning_rate": 0.003, "loss": 4.1464, "step": 2794 }, { "epoch": 0.02795, "grad_norm": 0.7633419036865234, "learning_rate": 0.003, "loss": 4.1621, "step": 2795 }, { "epoch": 0.02796, "grad_norm": 0.7906386852264404, "learning_rate": 0.003, "loss": 4.1576, "step": 2796 }, { "epoch": 0.02797, "grad_norm": 0.7294182777404785, "learning_rate": 0.003, "loss": 4.1672, "step": 2797 }, { "epoch": 0.02798, "grad_norm": 0.7420005798339844, "learning_rate": 0.003, "loss": 4.1646, "step": 2798 }, { "epoch": 0.02799, "grad_norm": 0.722224235534668, "learning_rate": 0.003, "loss": 4.1492, "step": 2799 }, { "epoch": 0.028, "grad_norm": 0.677977442741394, "learning_rate": 0.003, "loss": 4.1596, "step": 2800 }, { "epoch": 0.02801, "grad_norm": 0.6493065357208252, "learning_rate": 0.003, "loss": 4.1339, "step": 2801 }, { "epoch": 0.02802, "grad_norm": 0.6755994558334351, "learning_rate": 0.003, "loss": 4.1363, "step": 2802 }, { "epoch": 0.02803, "grad_norm": 0.6203683614730835, "learning_rate": 0.003, "loss": 4.1536, "step": 2803 }, { "epoch": 0.02804, "grad_norm": 0.5352902412414551, "learning_rate": 0.003, "loss": 4.1231, "step": 2804 }, { "epoch": 0.02805, "grad_norm": 0.4998801350593567, "learning_rate": 0.003, "loss": 4.1579, "step": 2805 }, { "epoch": 0.02806, "grad_norm": 0.43145909905433655, "learning_rate": 0.003, "loss": 4.13, "step": 2806 }, { "epoch": 0.02807, "grad_norm": 0.4628302752971649, "learning_rate": 0.003, "loss": 4.1337, "step": 2807 }, { "epoch": 0.02808, "grad_norm": 0.4608153998851776, "learning_rate": 0.003, "loss": 4.1137, "step": 2808 }, { "epoch": 0.02809, "grad_norm": 0.47452715039253235, "learning_rate": 0.003, "loss": 4.1551, "step": 2809 }, { "epoch": 0.0281, "grad_norm": 0.5173685550689697, "learning_rate": 0.003, "loss": 4.1083, "step": 2810 }, { "epoch": 0.02811, "grad_norm": 0.5137503147125244, "learning_rate": 0.003, "loss": 4.1165, "step": 2811 }, { "epoch": 0.02812, "grad_norm": 0.5507237315177917, "learning_rate": 0.003, "loss": 4.1344, "step": 2812 }, { "epoch": 0.02813, "grad_norm": 0.667034387588501, "learning_rate": 0.003, "loss": 4.1267, "step": 2813 }, { "epoch": 0.02814, "grad_norm": 0.8353255987167358, "learning_rate": 0.003, "loss": 4.1329, "step": 2814 }, { "epoch": 0.02815, "grad_norm": 0.8815486431121826, "learning_rate": 0.003, "loss": 4.1736, "step": 2815 }, { "epoch": 0.02816, "grad_norm": 0.7531535029411316, "learning_rate": 0.003, "loss": 4.1477, "step": 2816 }, { "epoch": 0.02817, "grad_norm": 0.6786168217658997, "learning_rate": 0.003, "loss": 4.1043, "step": 2817 }, { "epoch": 0.02818, "grad_norm": 0.5562347173690796, "learning_rate": 0.003, "loss": 4.1127, "step": 2818 }, { "epoch": 0.02819, "grad_norm": 0.6471225619316101, "learning_rate": 0.003, "loss": 4.1416, "step": 2819 }, { "epoch": 0.0282, "grad_norm": 0.7505398392677307, "learning_rate": 0.003, "loss": 4.1422, "step": 2820 }, { "epoch": 0.02821, "grad_norm": 0.845917820930481, "learning_rate": 0.003, "loss": 4.1632, "step": 2821 }, { "epoch": 0.02822, "grad_norm": 0.8887820839881897, "learning_rate": 0.003, "loss": 4.1419, "step": 2822 }, { "epoch": 0.02823, "grad_norm": 0.953726589679718, "learning_rate": 0.003, "loss": 4.1686, "step": 2823 }, { "epoch": 0.02824, "grad_norm": 0.9745505452156067, "learning_rate": 0.003, "loss": 4.1627, "step": 2824 }, { "epoch": 0.02825, "grad_norm": 0.9194844961166382, "learning_rate": 0.003, "loss": 4.1655, "step": 2825 }, { "epoch": 0.02826, "grad_norm": 0.9426382184028625, "learning_rate": 0.003, "loss": 4.1439, "step": 2826 }, { "epoch": 0.02827, "grad_norm": 0.8329664468765259, "learning_rate": 0.003, "loss": 4.1305, "step": 2827 }, { "epoch": 0.02828, "grad_norm": 0.7950755953788757, "learning_rate": 0.003, "loss": 4.1685, "step": 2828 }, { "epoch": 0.02829, "grad_norm": 0.7973052263259888, "learning_rate": 0.003, "loss": 4.1943, "step": 2829 }, { "epoch": 0.0283, "grad_norm": 0.7938761115074158, "learning_rate": 0.003, "loss": 4.1561, "step": 2830 }, { "epoch": 0.02831, "grad_norm": 0.900173008441925, "learning_rate": 0.003, "loss": 4.1604, "step": 2831 }, { "epoch": 0.02832, "grad_norm": 0.9947631359100342, "learning_rate": 0.003, "loss": 4.1755, "step": 2832 }, { "epoch": 0.02833, "grad_norm": 0.8765170574188232, "learning_rate": 0.003, "loss": 4.1539, "step": 2833 }, { "epoch": 0.02834, "grad_norm": 0.8053457140922546, "learning_rate": 0.003, "loss": 4.1358, "step": 2834 }, { "epoch": 0.02835, "grad_norm": 0.728701651096344, "learning_rate": 0.003, "loss": 4.147, "step": 2835 }, { "epoch": 0.02836, "grad_norm": 0.6180861592292786, "learning_rate": 0.003, "loss": 4.1615, "step": 2836 }, { "epoch": 0.02837, "grad_norm": 0.5929837226867676, "learning_rate": 0.003, "loss": 4.1342, "step": 2837 }, { "epoch": 0.02838, "grad_norm": 0.5861077308654785, "learning_rate": 0.003, "loss": 4.0992, "step": 2838 }, { "epoch": 0.02839, "grad_norm": 0.5293443202972412, "learning_rate": 0.003, "loss": 4.0976, "step": 2839 }, { "epoch": 0.0284, "grad_norm": 0.5219327211380005, "learning_rate": 0.003, "loss": 4.1106, "step": 2840 }, { "epoch": 0.02841, "grad_norm": 0.494223028421402, "learning_rate": 0.003, "loss": 4.1465, "step": 2841 }, { "epoch": 0.02842, "grad_norm": 0.5202710628509521, "learning_rate": 0.003, "loss": 4.1271, "step": 2842 }, { "epoch": 0.02843, "grad_norm": 0.6077046990394592, "learning_rate": 0.003, "loss": 4.1525, "step": 2843 }, { "epoch": 0.02844, "grad_norm": 0.6248916983604431, "learning_rate": 0.003, "loss": 4.1488, "step": 2844 }, { "epoch": 0.02845, "grad_norm": 0.5447239875793457, "learning_rate": 0.003, "loss": 4.124, "step": 2845 }, { "epoch": 0.02846, "grad_norm": 0.5102211236953735, "learning_rate": 0.003, "loss": 4.1178, "step": 2846 }, { "epoch": 0.02847, "grad_norm": 0.47447827458381653, "learning_rate": 0.003, "loss": 4.171, "step": 2847 }, { "epoch": 0.02848, "grad_norm": 0.5006018877029419, "learning_rate": 0.003, "loss": 4.1279, "step": 2848 }, { "epoch": 0.02849, "grad_norm": 0.6120002865791321, "learning_rate": 0.003, "loss": 4.1185, "step": 2849 }, { "epoch": 0.0285, "grad_norm": 0.6846702098846436, "learning_rate": 0.003, "loss": 4.1242, "step": 2850 }, { "epoch": 0.02851, "grad_norm": 0.7738217115402222, "learning_rate": 0.003, "loss": 4.1646, "step": 2851 }, { "epoch": 0.02852, "grad_norm": 0.8055019974708557, "learning_rate": 0.003, "loss": 4.1502, "step": 2852 }, { "epoch": 0.02853, "grad_norm": 0.8890880942344666, "learning_rate": 0.003, "loss": 4.1216, "step": 2853 }, { "epoch": 0.02854, "grad_norm": 0.8885093927383423, "learning_rate": 0.003, "loss": 4.1639, "step": 2854 }, { "epoch": 0.02855, "grad_norm": 0.8279180526733398, "learning_rate": 0.003, "loss": 4.1584, "step": 2855 }, { "epoch": 0.02856, "grad_norm": 0.7752195596694946, "learning_rate": 0.003, "loss": 4.1614, "step": 2856 }, { "epoch": 0.02857, "grad_norm": 0.8694232702255249, "learning_rate": 0.003, "loss": 4.1419, "step": 2857 }, { "epoch": 0.02858, "grad_norm": 0.9445026516914368, "learning_rate": 0.003, "loss": 4.1366, "step": 2858 }, { "epoch": 0.02859, "grad_norm": 0.8522076606750488, "learning_rate": 0.003, "loss": 4.1462, "step": 2859 }, { "epoch": 0.0286, "grad_norm": 0.8931873440742493, "learning_rate": 0.003, "loss": 4.1628, "step": 2860 }, { "epoch": 0.02861, "grad_norm": 0.8646109700202942, "learning_rate": 0.003, "loss": 4.1343, "step": 2861 }, { "epoch": 0.02862, "grad_norm": 0.7844035625457764, "learning_rate": 0.003, "loss": 4.1497, "step": 2862 }, { "epoch": 0.02863, "grad_norm": 0.7147074341773987, "learning_rate": 0.003, "loss": 4.169, "step": 2863 }, { "epoch": 0.02864, "grad_norm": 0.7001998424530029, "learning_rate": 0.003, "loss": 4.1435, "step": 2864 }, { "epoch": 0.02865, "grad_norm": 0.78515625, "learning_rate": 0.003, "loss": 4.1756, "step": 2865 }, { "epoch": 0.02866, "grad_norm": 0.6479020714759827, "learning_rate": 0.003, "loss": 4.1532, "step": 2866 }, { "epoch": 0.02867, "grad_norm": 0.720973014831543, "learning_rate": 0.003, "loss": 4.1431, "step": 2867 }, { "epoch": 0.02868, "grad_norm": 0.6694818735122681, "learning_rate": 0.003, "loss": 4.1474, "step": 2868 }, { "epoch": 0.02869, "grad_norm": 0.5998570919036865, "learning_rate": 0.003, "loss": 4.1334, "step": 2869 }, { "epoch": 0.0287, "grad_norm": 0.5683268308639526, "learning_rate": 0.003, "loss": 4.1536, "step": 2870 }, { "epoch": 0.02871, "grad_norm": 0.5723297595977783, "learning_rate": 0.003, "loss": 4.1376, "step": 2871 }, { "epoch": 0.02872, "grad_norm": 0.5388701558113098, "learning_rate": 0.003, "loss": 4.1708, "step": 2872 }, { "epoch": 0.02873, "grad_norm": 0.46427324414253235, "learning_rate": 0.003, "loss": 4.1389, "step": 2873 }, { "epoch": 0.02874, "grad_norm": 0.4973464012145996, "learning_rate": 0.003, "loss": 4.1539, "step": 2874 }, { "epoch": 0.02875, "grad_norm": 0.5175668001174927, "learning_rate": 0.003, "loss": 4.1568, "step": 2875 }, { "epoch": 0.02876, "grad_norm": 0.5334407091140747, "learning_rate": 0.003, "loss": 4.1386, "step": 2876 }, { "epoch": 0.02877, "grad_norm": 0.5334872007369995, "learning_rate": 0.003, "loss": 4.1197, "step": 2877 }, { "epoch": 0.02878, "grad_norm": 0.5646567344665527, "learning_rate": 0.003, "loss": 4.1174, "step": 2878 }, { "epoch": 0.02879, "grad_norm": 0.560286819934845, "learning_rate": 0.003, "loss": 4.1083, "step": 2879 }, { "epoch": 0.0288, "grad_norm": 0.6549745202064514, "learning_rate": 0.003, "loss": 4.1188, "step": 2880 }, { "epoch": 0.02881, "grad_norm": 0.8416467905044556, "learning_rate": 0.003, "loss": 4.1365, "step": 2881 }, { "epoch": 0.02882, "grad_norm": 0.7693246603012085, "learning_rate": 0.003, "loss": 4.1523, "step": 2882 }, { "epoch": 0.02883, "grad_norm": 0.6049852967262268, "learning_rate": 0.003, "loss": 4.1296, "step": 2883 }, { "epoch": 0.02884, "grad_norm": 0.561359703540802, "learning_rate": 0.003, "loss": 4.1649, "step": 2884 }, { "epoch": 0.02885, "grad_norm": 0.630006730556488, "learning_rate": 0.003, "loss": 4.1114, "step": 2885 }, { "epoch": 0.02886, "grad_norm": 0.6453554630279541, "learning_rate": 0.003, "loss": 4.1416, "step": 2886 }, { "epoch": 0.02887, "grad_norm": 0.5455800890922546, "learning_rate": 0.003, "loss": 4.1255, "step": 2887 }, { "epoch": 0.02888, "grad_norm": 0.4928850531578064, "learning_rate": 0.003, "loss": 4.1194, "step": 2888 }, { "epoch": 0.02889, "grad_norm": 0.4772005081176758, "learning_rate": 0.003, "loss": 4.1074, "step": 2889 }, { "epoch": 0.0289, "grad_norm": 0.6117534041404724, "learning_rate": 0.003, "loss": 4.1446, "step": 2890 }, { "epoch": 0.02891, "grad_norm": 0.8205873370170593, "learning_rate": 0.003, "loss": 4.1647, "step": 2891 }, { "epoch": 0.02892, "grad_norm": 0.8640782833099365, "learning_rate": 0.003, "loss": 4.1212, "step": 2892 }, { "epoch": 0.02893, "grad_norm": 0.8006044626235962, "learning_rate": 0.003, "loss": 4.13, "step": 2893 }, { "epoch": 0.02894, "grad_norm": 0.8116544485092163, "learning_rate": 0.003, "loss": 4.1376, "step": 2894 }, { "epoch": 0.02895, "grad_norm": 0.8600641489028931, "learning_rate": 0.003, "loss": 4.1864, "step": 2895 }, { "epoch": 0.02896, "grad_norm": 0.6702893376350403, "learning_rate": 0.003, "loss": 4.1344, "step": 2896 }, { "epoch": 0.02897, "grad_norm": 0.6657308340072632, "learning_rate": 0.003, "loss": 4.1186, "step": 2897 }, { "epoch": 0.02898, "grad_norm": 0.662657618522644, "learning_rate": 0.003, "loss": 4.1623, "step": 2898 }, { "epoch": 0.02899, "grad_norm": 0.7254528999328613, "learning_rate": 0.003, "loss": 4.148, "step": 2899 }, { "epoch": 0.029, "grad_norm": 0.7490177750587463, "learning_rate": 0.003, "loss": 4.1372, "step": 2900 }, { "epoch": 0.02901, "grad_norm": 0.7398015856742859, "learning_rate": 0.003, "loss": 4.1513, "step": 2901 }, { "epoch": 0.02902, "grad_norm": 0.8604675531387329, "learning_rate": 0.003, "loss": 4.1725, "step": 2902 }, { "epoch": 0.02903, "grad_norm": 0.9788482785224915, "learning_rate": 0.003, "loss": 4.1436, "step": 2903 }, { "epoch": 0.02904, "grad_norm": 0.9012083411216736, "learning_rate": 0.003, "loss": 4.1596, "step": 2904 }, { "epoch": 0.02905, "grad_norm": 0.6638503074645996, "learning_rate": 0.003, "loss": 4.1651, "step": 2905 }, { "epoch": 0.02906, "grad_norm": 0.640064001083374, "learning_rate": 0.003, "loss": 4.1474, "step": 2906 }, { "epoch": 0.02907, "grad_norm": 0.6573886871337891, "learning_rate": 0.003, "loss": 4.1195, "step": 2907 }, { "epoch": 0.02908, "grad_norm": 0.6549690961837769, "learning_rate": 0.003, "loss": 4.1423, "step": 2908 }, { "epoch": 0.02909, "grad_norm": 0.6671884059906006, "learning_rate": 0.003, "loss": 4.1361, "step": 2909 }, { "epoch": 0.0291, "grad_norm": 0.6334506273269653, "learning_rate": 0.003, "loss": 4.1462, "step": 2910 }, { "epoch": 0.02911, "grad_norm": 0.6618163585662842, "learning_rate": 0.003, "loss": 4.1423, "step": 2911 }, { "epoch": 0.02912, "grad_norm": 0.7299465537071228, "learning_rate": 0.003, "loss": 4.1166, "step": 2912 }, { "epoch": 0.02913, "grad_norm": 0.7096904516220093, "learning_rate": 0.003, "loss": 4.1481, "step": 2913 }, { "epoch": 0.02914, "grad_norm": 0.7228156924247742, "learning_rate": 0.003, "loss": 4.1341, "step": 2914 }, { "epoch": 0.02915, "grad_norm": 0.6965159177780151, "learning_rate": 0.003, "loss": 4.1301, "step": 2915 }, { "epoch": 0.02916, "grad_norm": 0.5864951014518738, "learning_rate": 0.003, "loss": 4.1326, "step": 2916 }, { "epoch": 0.02917, "grad_norm": 0.682627260684967, "learning_rate": 0.003, "loss": 4.1647, "step": 2917 }, { "epoch": 0.02918, "grad_norm": 0.7035491466522217, "learning_rate": 0.003, "loss": 4.1198, "step": 2918 }, { "epoch": 0.02919, "grad_norm": 0.6258479952812195, "learning_rate": 0.003, "loss": 4.1613, "step": 2919 }, { "epoch": 0.0292, "grad_norm": 0.7004368305206299, "learning_rate": 0.003, "loss": 4.1034, "step": 2920 }, { "epoch": 0.02921, "grad_norm": 0.7259970903396606, "learning_rate": 0.003, "loss": 4.1461, "step": 2921 }, { "epoch": 0.02922, "grad_norm": 0.8203588724136353, "learning_rate": 0.003, "loss": 4.1484, "step": 2922 }, { "epoch": 0.02923, "grad_norm": 0.7598622441291809, "learning_rate": 0.003, "loss": 4.1197, "step": 2923 }, { "epoch": 0.02924, "grad_norm": 0.713235080242157, "learning_rate": 0.003, "loss": 4.1518, "step": 2924 }, { "epoch": 0.02925, "grad_norm": 0.6576932668685913, "learning_rate": 0.003, "loss": 4.1548, "step": 2925 }, { "epoch": 0.02926, "grad_norm": 0.5912261009216309, "learning_rate": 0.003, "loss": 4.1321, "step": 2926 }, { "epoch": 0.02927, "grad_norm": 0.6221820712089539, "learning_rate": 0.003, "loss": 4.1364, "step": 2927 }, { "epoch": 0.02928, "grad_norm": 0.5222737193107605, "learning_rate": 0.003, "loss": 4.1557, "step": 2928 }, { "epoch": 0.02929, "grad_norm": 0.448812872171402, "learning_rate": 0.003, "loss": 4.1384, "step": 2929 }, { "epoch": 0.0293, "grad_norm": 0.4934072196483612, "learning_rate": 0.003, "loss": 4.1309, "step": 2930 }, { "epoch": 0.02931, "grad_norm": 0.6587796211242676, "learning_rate": 0.003, "loss": 4.136, "step": 2931 }, { "epoch": 0.02932, "grad_norm": 0.8658855557441711, "learning_rate": 0.003, "loss": 4.1325, "step": 2932 }, { "epoch": 0.02933, "grad_norm": 1.1174193620681763, "learning_rate": 0.003, "loss": 4.1685, "step": 2933 }, { "epoch": 0.02934, "grad_norm": 0.7016493678092957, "learning_rate": 0.003, "loss": 4.145, "step": 2934 }, { "epoch": 0.02935, "grad_norm": 0.543066680431366, "learning_rate": 0.003, "loss": 4.1272, "step": 2935 }, { "epoch": 0.02936, "grad_norm": 0.6172811388969421, "learning_rate": 0.003, "loss": 4.1458, "step": 2936 }, { "epoch": 0.02937, "grad_norm": 0.7183346152305603, "learning_rate": 0.003, "loss": 4.1303, "step": 2937 }, { "epoch": 0.02938, "grad_norm": 0.7085850834846497, "learning_rate": 0.003, "loss": 4.1452, "step": 2938 }, { "epoch": 0.02939, "grad_norm": 0.5843592882156372, "learning_rate": 0.003, "loss": 4.1302, "step": 2939 }, { "epoch": 0.0294, "grad_norm": 0.5836790800094604, "learning_rate": 0.003, "loss": 4.1531, "step": 2940 }, { "epoch": 0.02941, "grad_norm": 0.6397603154182434, "learning_rate": 0.003, "loss": 4.1546, "step": 2941 }, { "epoch": 0.02942, "grad_norm": 0.6955589056015015, "learning_rate": 0.003, "loss": 4.1404, "step": 2942 }, { "epoch": 0.02943, "grad_norm": 0.754485011100769, "learning_rate": 0.003, "loss": 4.1328, "step": 2943 }, { "epoch": 0.02944, "grad_norm": 0.8499577045440674, "learning_rate": 0.003, "loss": 4.1566, "step": 2944 }, { "epoch": 0.02945, "grad_norm": 0.8511131405830383, "learning_rate": 0.003, "loss": 4.1375, "step": 2945 }, { "epoch": 0.02946, "grad_norm": 0.7081684470176697, "learning_rate": 0.003, "loss": 4.1304, "step": 2946 }, { "epoch": 0.02947, "grad_norm": 0.6551468968391418, "learning_rate": 0.003, "loss": 4.1627, "step": 2947 }, { "epoch": 0.02948, "grad_norm": 0.6386260390281677, "learning_rate": 0.003, "loss": 4.1399, "step": 2948 }, { "epoch": 0.02949, "grad_norm": 0.5764749050140381, "learning_rate": 0.003, "loss": 4.113, "step": 2949 }, { "epoch": 0.0295, "grad_norm": 0.5615102648735046, "learning_rate": 0.003, "loss": 4.1347, "step": 2950 }, { "epoch": 0.02951, "grad_norm": 0.5903642773628235, "learning_rate": 0.003, "loss": 4.088, "step": 2951 }, { "epoch": 0.02952, "grad_norm": 0.6850778460502625, "learning_rate": 0.003, "loss": 4.1243, "step": 2952 }, { "epoch": 0.02953, "grad_norm": 0.6652215123176575, "learning_rate": 0.003, "loss": 4.1312, "step": 2953 }, { "epoch": 0.02954, "grad_norm": 0.6463000178337097, "learning_rate": 0.003, "loss": 4.1725, "step": 2954 }, { "epoch": 0.02955, "grad_norm": 0.7008712291717529, "learning_rate": 0.003, "loss": 4.1435, "step": 2955 }, { "epoch": 0.02956, "grad_norm": 0.7221593260765076, "learning_rate": 0.003, "loss": 4.1396, "step": 2956 }, { "epoch": 0.02957, "grad_norm": 0.8538970351219177, "learning_rate": 0.003, "loss": 4.1508, "step": 2957 }, { "epoch": 0.02958, "grad_norm": 0.9920998811721802, "learning_rate": 0.003, "loss": 4.1749, "step": 2958 }, { "epoch": 0.02959, "grad_norm": 1.0068269968032837, "learning_rate": 0.003, "loss": 4.1654, "step": 2959 }, { "epoch": 0.0296, "grad_norm": 0.9310505986213684, "learning_rate": 0.003, "loss": 4.1729, "step": 2960 }, { "epoch": 0.02961, "grad_norm": 0.878132164478302, "learning_rate": 0.003, "loss": 4.1458, "step": 2961 }, { "epoch": 0.02962, "grad_norm": 0.953723132610321, "learning_rate": 0.003, "loss": 4.1532, "step": 2962 }, { "epoch": 0.02963, "grad_norm": 0.9499011039733887, "learning_rate": 0.003, "loss": 4.1333, "step": 2963 }, { "epoch": 0.02964, "grad_norm": 0.8707635402679443, "learning_rate": 0.003, "loss": 4.1169, "step": 2964 }, { "epoch": 0.02965, "grad_norm": 0.8199546337127686, "learning_rate": 0.003, "loss": 4.1628, "step": 2965 }, { "epoch": 0.02966, "grad_norm": 0.7343395948410034, "learning_rate": 0.003, "loss": 4.1728, "step": 2966 }, { "epoch": 0.02967, "grad_norm": 0.6565076112747192, "learning_rate": 0.003, "loss": 4.1547, "step": 2967 }, { "epoch": 0.02968, "grad_norm": 0.6714766025543213, "learning_rate": 0.003, "loss": 4.1518, "step": 2968 }, { "epoch": 0.02969, "grad_norm": 0.6478135585784912, "learning_rate": 0.003, "loss": 4.1489, "step": 2969 }, { "epoch": 0.0297, "grad_norm": 0.5846438407897949, "learning_rate": 0.003, "loss": 4.1316, "step": 2970 }, { "epoch": 0.02971, "grad_norm": 0.5542798042297363, "learning_rate": 0.003, "loss": 4.1482, "step": 2971 }, { "epoch": 0.02972, "grad_norm": 0.5505024790763855, "learning_rate": 0.003, "loss": 4.1172, "step": 2972 }, { "epoch": 0.02973, "grad_norm": 0.5356234908103943, "learning_rate": 0.003, "loss": 4.1011, "step": 2973 }, { "epoch": 0.02974, "grad_norm": 0.6036863327026367, "learning_rate": 0.003, "loss": 4.1403, "step": 2974 }, { "epoch": 0.02975, "grad_norm": 0.48086002469062805, "learning_rate": 0.003, "loss": 4.1216, "step": 2975 }, { "epoch": 0.02976, "grad_norm": 0.43674689531326294, "learning_rate": 0.003, "loss": 4.1282, "step": 2976 }, { "epoch": 0.02977, "grad_norm": 0.4434605836868286, "learning_rate": 0.003, "loss": 4.1165, "step": 2977 }, { "epoch": 0.02978, "grad_norm": 0.5155782103538513, "learning_rate": 0.003, "loss": 4.1373, "step": 2978 }, { "epoch": 0.02979, "grad_norm": 0.6957648396492004, "learning_rate": 0.003, "loss": 4.1259, "step": 2979 }, { "epoch": 0.0298, "grad_norm": 0.8462650179862976, "learning_rate": 0.003, "loss": 4.139, "step": 2980 }, { "epoch": 0.02981, "grad_norm": 0.7909703254699707, "learning_rate": 0.003, "loss": 4.146, "step": 2981 }, { "epoch": 0.02982, "grad_norm": 0.5594244003295898, "learning_rate": 0.003, "loss": 4.1346, "step": 2982 }, { "epoch": 0.02983, "grad_norm": 0.5850778222084045, "learning_rate": 0.003, "loss": 4.1518, "step": 2983 }, { "epoch": 0.02984, "grad_norm": 0.6843441128730774, "learning_rate": 0.003, "loss": 4.1276, "step": 2984 }, { "epoch": 0.02985, "grad_norm": 0.6832881569862366, "learning_rate": 0.003, "loss": 4.1591, "step": 2985 }, { "epoch": 0.02986, "grad_norm": 0.5421028733253479, "learning_rate": 0.003, "loss": 4.1172, "step": 2986 }, { "epoch": 0.02987, "grad_norm": 0.551857590675354, "learning_rate": 0.003, "loss": 4.1095, "step": 2987 }, { "epoch": 0.02988, "grad_norm": 0.6235337853431702, "learning_rate": 0.003, "loss": 4.1248, "step": 2988 }, { "epoch": 0.02989, "grad_norm": 0.7048717737197876, "learning_rate": 0.003, "loss": 4.1396, "step": 2989 }, { "epoch": 0.0299, "grad_norm": 0.7494181990623474, "learning_rate": 0.003, "loss": 4.1016, "step": 2990 }, { "epoch": 0.02991, "grad_norm": 0.7830327749252319, "learning_rate": 0.003, "loss": 4.1547, "step": 2991 }, { "epoch": 0.02992, "grad_norm": 0.8855887651443481, "learning_rate": 0.003, "loss": 4.1523, "step": 2992 }, { "epoch": 0.02993, "grad_norm": 0.888844907283783, "learning_rate": 0.003, "loss": 4.1255, "step": 2993 }, { "epoch": 0.02994, "grad_norm": 0.822543203830719, "learning_rate": 0.003, "loss": 4.1363, "step": 2994 }, { "epoch": 0.02995, "grad_norm": 0.8102987408638, "learning_rate": 0.003, "loss": 4.1743, "step": 2995 }, { "epoch": 0.02996, "grad_norm": 0.8138471841812134, "learning_rate": 0.003, "loss": 4.1624, "step": 2996 }, { "epoch": 0.02997, "grad_norm": 0.7990952134132385, "learning_rate": 0.003, "loss": 4.1243, "step": 2997 }, { "epoch": 0.02998, "grad_norm": 0.7419268488883972, "learning_rate": 0.003, "loss": 4.124, "step": 2998 }, { "epoch": 0.02999, "grad_norm": 0.6637421250343323, "learning_rate": 0.003, "loss": 4.1145, "step": 2999 }, { "epoch": 0.03, "grad_norm": 0.5517862439155579, "learning_rate": 0.003, "loss": 4.1563, "step": 3000 }, { "epoch": 0.03001, "grad_norm": 0.5322889089584351, "learning_rate": 0.003, "loss": 4.1217, "step": 3001 }, { "epoch": 0.03002, "grad_norm": 0.5043152570724487, "learning_rate": 0.003, "loss": 4.1356, "step": 3002 }, { "epoch": 0.03003, "grad_norm": 0.5661661028862, "learning_rate": 0.003, "loss": 4.1145, "step": 3003 }, { "epoch": 0.03004, "grad_norm": 0.6002155542373657, "learning_rate": 0.003, "loss": 4.0956, "step": 3004 }, { "epoch": 0.03005, "grad_norm": 0.5543268918991089, "learning_rate": 0.003, "loss": 4.1446, "step": 3005 }, { "epoch": 0.03006, "grad_norm": 0.6193807721138, "learning_rate": 0.003, "loss": 4.1405, "step": 3006 }, { "epoch": 0.03007, "grad_norm": 0.7129995822906494, "learning_rate": 0.003, "loss": 4.1141, "step": 3007 }, { "epoch": 0.03008, "grad_norm": 0.7921411991119385, "learning_rate": 0.003, "loss": 4.1465, "step": 3008 }, { "epoch": 0.03009, "grad_norm": 0.6867934465408325, "learning_rate": 0.003, "loss": 4.1179, "step": 3009 }, { "epoch": 0.0301, "grad_norm": 0.630652666091919, "learning_rate": 0.003, "loss": 4.1398, "step": 3010 }, { "epoch": 0.03011, "grad_norm": 0.7961447834968567, "learning_rate": 0.003, "loss": 4.1298, "step": 3011 }, { "epoch": 0.03012, "grad_norm": 0.6278183460235596, "learning_rate": 0.003, "loss": 4.1307, "step": 3012 }, { "epoch": 0.03013, "grad_norm": 0.5435069799423218, "learning_rate": 0.003, "loss": 4.087, "step": 3013 }, { "epoch": 0.03014, "grad_norm": 0.5863070487976074, "learning_rate": 0.003, "loss": 4.1185, "step": 3014 }, { "epoch": 0.03015, "grad_norm": 0.6343687176704407, "learning_rate": 0.003, "loss": 4.1167, "step": 3015 }, { "epoch": 0.03016, "grad_norm": 0.72650146484375, "learning_rate": 0.003, "loss": 4.1264, "step": 3016 }, { "epoch": 0.03017, "grad_norm": 0.719595193862915, "learning_rate": 0.003, "loss": 4.126, "step": 3017 }, { "epoch": 0.03018, "grad_norm": 0.5911171436309814, "learning_rate": 0.003, "loss": 4.1164, "step": 3018 }, { "epoch": 0.03019, "grad_norm": 0.5423696637153625, "learning_rate": 0.003, "loss": 4.1307, "step": 3019 }, { "epoch": 0.0302, "grad_norm": 0.5470551252365112, "learning_rate": 0.003, "loss": 4.1232, "step": 3020 }, { "epoch": 0.03021, "grad_norm": 0.7124951481819153, "learning_rate": 0.003, "loss": 4.1496, "step": 3021 }, { "epoch": 0.03022, "grad_norm": 0.8745585680007935, "learning_rate": 0.003, "loss": 4.1587, "step": 3022 }, { "epoch": 0.03023, "grad_norm": 0.866553008556366, "learning_rate": 0.003, "loss": 4.1734, "step": 3023 }, { "epoch": 0.03024, "grad_norm": 0.7500194907188416, "learning_rate": 0.003, "loss": 4.1646, "step": 3024 }, { "epoch": 0.03025, "grad_norm": 0.7470998764038086, "learning_rate": 0.003, "loss": 4.1428, "step": 3025 }, { "epoch": 0.03026, "grad_norm": 0.8869830369949341, "learning_rate": 0.003, "loss": 4.1211, "step": 3026 }, { "epoch": 0.03027, "grad_norm": 0.7815242409706116, "learning_rate": 0.003, "loss": 4.1415, "step": 3027 }, { "epoch": 0.03028, "grad_norm": 0.6851547956466675, "learning_rate": 0.003, "loss": 4.098, "step": 3028 }, { "epoch": 0.03029, "grad_norm": 0.6831088066101074, "learning_rate": 0.003, "loss": 4.1417, "step": 3029 }, { "epoch": 0.0303, "grad_norm": 0.6287125945091248, "learning_rate": 0.003, "loss": 4.1228, "step": 3030 }, { "epoch": 0.03031, "grad_norm": 0.5370122194290161, "learning_rate": 0.003, "loss": 4.1032, "step": 3031 }, { "epoch": 0.03032, "grad_norm": 0.5675404071807861, "learning_rate": 0.003, "loss": 4.1322, "step": 3032 }, { "epoch": 0.03033, "grad_norm": 0.7573124766349792, "learning_rate": 0.003, "loss": 4.1348, "step": 3033 }, { "epoch": 0.03034, "grad_norm": 0.8983549475669861, "learning_rate": 0.003, "loss": 4.1466, "step": 3034 }, { "epoch": 0.03035, "grad_norm": 1.0109939575195312, "learning_rate": 0.003, "loss": 4.1821, "step": 3035 }, { "epoch": 0.03036, "grad_norm": 0.9054379463195801, "learning_rate": 0.003, "loss": 4.1495, "step": 3036 }, { "epoch": 0.03037, "grad_norm": 0.9631518125534058, "learning_rate": 0.003, "loss": 4.1395, "step": 3037 }, { "epoch": 0.03038, "grad_norm": 0.9668822288513184, "learning_rate": 0.003, "loss": 4.1401, "step": 3038 }, { "epoch": 0.03039, "grad_norm": 0.9021666049957275, "learning_rate": 0.003, "loss": 4.1497, "step": 3039 }, { "epoch": 0.0304, "grad_norm": 0.8186821937561035, "learning_rate": 0.003, "loss": 4.1515, "step": 3040 }, { "epoch": 0.03041, "grad_norm": 0.7148540616035461, "learning_rate": 0.003, "loss": 4.1558, "step": 3041 }, { "epoch": 0.03042, "grad_norm": 0.7441778182983398, "learning_rate": 0.003, "loss": 4.18, "step": 3042 }, { "epoch": 0.03043, "grad_norm": 0.7269161343574524, "learning_rate": 0.003, "loss": 4.1303, "step": 3043 }, { "epoch": 0.03044, "grad_norm": 0.7291993498802185, "learning_rate": 0.003, "loss": 4.1294, "step": 3044 }, { "epoch": 0.03045, "grad_norm": 0.8323465585708618, "learning_rate": 0.003, "loss": 4.1387, "step": 3045 }, { "epoch": 0.03046, "grad_norm": 0.9850740432739258, "learning_rate": 0.003, "loss": 4.1535, "step": 3046 }, { "epoch": 0.03047, "grad_norm": 1.0652803182601929, "learning_rate": 0.003, "loss": 4.1672, "step": 3047 }, { "epoch": 0.03048, "grad_norm": 0.8505216836929321, "learning_rate": 0.003, "loss": 4.145, "step": 3048 }, { "epoch": 0.03049, "grad_norm": 0.7812585234642029, "learning_rate": 0.003, "loss": 4.13, "step": 3049 }, { "epoch": 0.0305, "grad_norm": 0.8137039542198181, "learning_rate": 0.003, "loss": 4.148, "step": 3050 }, { "epoch": 0.03051, "grad_norm": 0.8009527325630188, "learning_rate": 0.003, "loss": 4.157, "step": 3051 }, { "epoch": 0.03052, "grad_norm": 0.6827675104141235, "learning_rate": 0.003, "loss": 4.1565, "step": 3052 }, { "epoch": 0.03053, "grad_norm": 0.5847889184951782, "learning_rate": 0.003, "loss": 4.1564, "step": 3053 }, { "epoch": 0.03054, "grad_norm": 0.5359497666358948, "learning_rate": 0.003, "loss": 4.1362, "step": 3054 }, { "epoch": 0.03055, "grad_norm": 0.5127337574958801, "learning_rate": 0.003, "loss": 4.1349, "step": 3055 }, { "epoch": 0.03056, "grad_norm": 0.4672796130180359, "learning_rate": 0.003, "loss": 4.1421, "step": 3056 }, { "epoch": 0.03057, "grad_norm": 0.4908005893230438, "learning_rate": 0.003, "loss": 4.141, "step": 3057 }, { "epoch": 0.03058, "grad_norm": 0.4413350522518158, "learning_rate": 0.003, "loss": 4.1103, "step": 3058 }, { "epoch": 0.03059, "grad_norm": 0.42543938755989075, "learning_rate": 0.003, "loss": 4.1211, "step": 3059 }, { "epoch": 0.0306, "grad_norm": 0.43946075439453125, "learning_rate": 0.003, "loss": 4.107, "step": 3060 }, { "epoch": 0.03061, "grad_norm": 0.4698311388492584, "learning_rate": 0.003, "loss": 4.1318, "step": 3061 }, { "epoch": 0.03062, "grad_norm": 0.44352906942367554, "learning_rate": 0.003, "loss": 4.154, "step": 3062 }, { "epoch": 0.03063, "grad_norm": 0.4036288857460022, "learning_rate": 0.003, "loss": 4.0965, "step": 3063 }, { "epoch": 0.03064, "grad_norm": 0.34206268191337585, "learning_rate": 0.003, "loss": 4.1283, "step": 3064 }, { "epoch": 0.03065, "grad_norm": 0.31049665808677673, "learning_rate": 0.003, "loss": 4.094, "step": 3065 }, { "epoch": 0.03066, "grad_norm": 0.309965580701828, "learning_rate": 0.003, "loss": 4.137, "step": 3066 }, { "epoch": 0.03067, "grad_norm": 0.31760501861572266, "learning_rate": 0.003, "loss": 4.1421, "step": 3067 }, { "epoch": 0.03068, "grad_norm": 0.34679949283599854, "learning_rate": 0.003, "loss": 4.1118, "step": 3068 }, { "epoch": 0.03069, "grad_norm": 0.491832435131073, "learning_rate": 0.003, "loss": 4.0983, "step": 3069 }, { "epoch": 0.0307, "grad_norm": 0.7506191730499268, "learning_rate": 0.003, "loss": 4.1154, "step": 3070 }, { "epoch": 0.03071, "grad_norm": 0.9722976684570312, "learning_rate": 0.003, "loss": 4.1252, "step": 3071 }, { "epoch": 0.03072, "grad_norm": 0.9531710147857666, "learning_rate": 0.003, "loss": 4.1347, "step": 3072 }, { "epoch": 0.03073, "grad_norm": 0.7240149974822998, "learning_rate": 0.003, "loss": 4.1264, "step": 3073 }, { "epoch": 0.03074, "grad_norm": 0.8381486535072327, "learning_rate": 0.003, "loss": 4.1328, "step": 3074 }, { "epoch": 0.03075, "grad_norm": 1.0160551071166992, "learning_rate": 0.003, "loss": 4.163, "step": 3075 }, { "epoch": 0.03076, "grad_norm": 0.9568767547607422, "learning_rate": 0.003, "loss": 4.1535, "step": 3076 }, { "epoch": 0.03077, "grad_norm": 0.855299174785614, "learning_rate": 0.003, "loss": 4.152, "step": 3077 }, { "epoch": 0.03078, "grad_norm": 0.9153781533241272, "learning_rate": 0.003, "loss": 4.1418, "step": 3078 }, { "epoch": 0.03079, "grad_norm": 0.8517905473709106, "learning_rate": 0.003, "loss": 4.1284, "step": 3079 }, { "epoch": 0.0308, "grad_norm": 0.7446357607841492, "learning_rate": 0.003, "loss": 4.1544, "step": 3080 }, { "epoch": 0.03081, "grad_norm": 0.8622949719429016, "learning_rate": 0.003, "loss": 4.1305, "step": 3081 }, { "epoch": 0.03082, "grad_norm": 0.923214316368103, "learning_rate": 0.003, "loss": 4.1406, "step": 3082 }, { "epoch": 0.03083, "grad_norm": 0.7624456882476807, "learning_rate": 0.003, "loss": 4.1375, "step": 3083 }, { "epoch": 0.03084, "grad_norm": 0.7506299018859863, "learning_rate": 0.003, "loss": 4.1539, "step": 3084 }, { "epoch": 0.03085, "grad_norm": 0.7482979893684387, "learning_rate": 0.003, "loss": 4.149, "step": 3085 }, { "epoch": 0.03086, "grad_norm": 0.8742256164550781, "learning_rate": 0.003, "loss": 4.1735, "step": 3086 }, { "epoch": 0.03087, "grad_norm": 0.8986655473709106, "learning_rate": 0.003, "loss": 4.1492, "step": 3087 }, { "epoch": 0.03088, "grad_norm": 0.6525835990905762, "learning_rate": 0.003, "loss": 4.1581, "step": 3088 }, { "epoch": 0.03089, "grad_norm": 0.5235843062400818, "learning_rate": 0.003, "loss": 4.119, "step": 3089 }, { "epoch": 0.0309, "grad_norm": 0.5896126627922058, "learning_rate": 0.003, "loss": 4.1262, "step": 3090 }, { "epoch": 0.03091, "grad_norm": 0.6240640878677368, "learning_rate": 0.003, "loss": 4.0867, "step": 3091 }, { "epoch": 0.03092, "grad_norm": 0.7089477777481079, "learning_rate": 0.003, "loss": 4.1188, "step": 3092 }, { "epoch": 0.03093, "grad_norm": 0.7679318785667419, "learning_rate": 0.003, "loss": 4.1512, "step": 3093 }, { "epoch": 0.03094, "grad_norm": 0.829313337802887, "learning_rate": 0.003, "loss": 4.1391, "step": 3094 }, { "epoch": 0.03095, "grad_norm": 0.858989953994751, "learning_rate": 0.003, "loss": 4.1537, "step": 3095 }, { "epoch": 0.03096, "grad_norm": 0.8490680456161499, "learning_rate": 0.003, "loss": 4.1348, "step": 3096 }, { "epoch": 0.03097, "grad_norm": 0.7456005811691284, "learning_rate": 0.003, "loss": 4.1564, "step": 3097 }, { "epoch": 0.03098, "grad_norm": 0.6156277656555176, "learning_rate": 0.003, "loss": 4.1447, "step": 3098 }, { "epoch": 0.03099, "grad_norm": 0.7040721774101257, "learning_rate": 0.003, "loss": 4.1403, "step": 3099 }, { "epoch": 0.031, "grad_norm": 0.7396160960197449, "learning_rate": 0.003, "loss": 4.1364, "step": 3100 }, { "epoch": 0.03101, "grad_norm": 0.6194980144500732, "learning_rate": 0.003, "loss": 4.1244, "step": 3101 }, { "epoch": 0.03102, "grad_norm": 0.5160166025161743, "learning_rate": 0.003, "loss": 4.1235, "step": 3102 }, { "epoch": 0.03103, "grad_norm": 0.4513045847415924, "learning_rate": 0.003, "loss": 4.1085, "step": 3103 }, { "epoch": 0.03104, "grad_norm": 0.46098387241363525, "learning_rate": 0.003, "loss": 4.1146, "step": 3104 }, { "epoch": 0.03105, "grad_norm": 0.46439290046691895, "learning_rate": 0.003, "loss": 4.0932, "step": 3105 }, { "epoch": 0.03106, "grad_norm": 0.48921629786491394, "learning_rate": 0.003, "loss": 4.112, "step": 3106 }, { "epoch": 0.03107, "grad_norm": 0.4456900954246521, "learning_rate": 0.003, "loss": 4.0905, "step": 3107 }, { "epoch": 0.03108, "grad_norm": 0.3602052330970764, "learning_rate": 0.003, "loss": 4.0993, "step": 3108 }, { "epoch": 0.03109, "grad_norm": 0.37874293327331543, "learning_rate": 0.003, "loss": 4.1048, "step": 3109 }, { "epoch": 0.0311, "grad_norm": 0.34511056542396545, "learning_rate": 0.003, "loss": 4.1247, "step": 3110 }, { "epoch": 0.03111, "grad_norm": 0.31971776485443115, "learning_rate": 0.003, "loss": 4.1015, "step": 3111 }, { "epoch": 0.03112, "grad_norm": 0.38327884674072266, "learning_rate": 0.003, "loss": 4.123, "step": 3112 }, { "epoch": 0.03113, "grad_norm": 0.4181249141693115, "learning_rate": 0.003, "loss": 4.1094, "step": 3113 }, { "epoch": 0.03114, "grad_norm": 0.5446284413337708, "learning_rate": 0.003, "loss": 4.1157, "step": 3114 }, { "epoch": 0.03115, "grad_norm": 0.7353031635284424, "learning_rate": 0.003, "loss": 4.1229, "step": 3115 }, { "epoch": 0.03116, "grad_norm": 0.9837905168533325, "learning_rate": 0.003, "loss": 4.1171, "step": 3116 }, { "epoch": 0.03117, "grad_norm": 1.0615694522857666, "learning_rate": 0.003, "loss": 4.1341, "step": 3117 }, { "epoch": 0.03118, "grad_norm": 0.7411212921142578, "learning_rate": 0.003, "loss": 4.122, "step": 3118 }, { "epoch": 0.03119, "grad_norm": 0.6940107941627502, "learning_rate": 0.003, "loss": 4.1372, "step": 3119 }, { "epoch": 0.0312, "grad_norm": 0.6913151144981384, "learning_rate": 0.003, "loss": 4.1423, "step": 3120 }, { "epoch": 0.03121, "grad_norm": 0.6097826361656189, "learning_rate": 0.003, "loss": 4.1172, "step": 3121 }, { "epoch": 0.03122, "grad_norm": 0.6615216732025146, "learning_rate": 0.003, "loss": 4.1389, "step": 3122 }, { "epoch": 0.03123, "grad_norm": 0.799041211605072, "learning_rate": 0.003, "loss": 4.1385, "step": 3123 }, { "epoch": 0.03124, "grad_norm": 0.8251768350601196, "learning_rate": 0.003, "loss": 4.1199, "step": 3124 }, { "epoch": 0.03125, "grad_norm": 0.7816169857978821, "learning_rate": 0.003, "loss": 4.1194, "step": 3125 }, { "epoch": 0.03126, "grad_norm": 0.7771303057670593, "learning_rate": 0.003, "loss": 4.1237, "step": 3126 }, { "epoch": 0.03127, "grad_norm": 0.7820833921432495, "learning_rate": 0.003, "loss": 4.1321, "step": 3127 }, { "epoch": 0.03128, "grad_norm": 0.7861484885215759, "learning_rate": 0.003, "loss": 4.1231, "step": 3128 }, { "epoch": 0.03129, "grad_norm": 0.7761921286582947, "learning_rate": 0.003, "loss": 4.1391, "step": 3129 }, { "epoch": 0.0313, "grad_norm": 0.7497063875198364, "learning_rate": 0.003, "loss": 4.1467, "step": 3130 }, { "epoch": 0.03131, "grad_norm": 0.673405647277832, "learning_rate": 0.003, "loss": 4.1248, "step": 3131 }, { "epoch": 0.03132, "grad_norm": 0.6740285158157349, "learning_rate": 0.003, "loss": 4.1467, "step": 3132 }, { "epoch": 0.03133, "grad_norm": 0.7402722835540771, "learning_rate": 0.003, "loss": 4.1395, "step": 3133 }, { "epoch": 0.03134, "grad_norm": 0.8292396068572998, "learning_rate": 0.003, "loss": 4.1408, "step": 3134 }, { "epoch": 0.03135, "grad_norm": 0.8309870958328247, "learning_rate": 0.003, "loss": 4.1144, "step": 3135 }, { "epoch": 0.03136, "grad_norm": 0.9130949378013611, "learning_rate": 0.003, "loss": 4.1334, "step": 3136 }, { "epoch": 0.03137, "grad_norm": 0.8444306254386902, "learning_rate": 0.003, "loss": 4.1331, "step": 3137 }, { "epoch": 0.03138, "grad_norm": 0.8266807198524475, "learning_rate": 0.003, "loss": 4.1348, "step": 3138 }, { "epoch": 0.03139, "grad_norm": 0.9213491082191467, "learning_rate": 0.003, "loss": 4.1459, "step": 3139 }, { "epoch": 0.0314, "grad_norm": 0.8733784556388855, "learning_rate": 0.003, "loss": 4.1429, "step": 3140 }, { "epoch": 0.03141, "grad_norm": 0.848698079586029, "learning_rate": 0.003, "loss": 4.1763, "step": 3141 }, { "epoch": 0.03142, "grad_norm": 0.8152028918266296, "learning_rate": 0.003, "loss": 4.1512, "step": 3142 }, { "epoch": 0.03143, "grad_norm": 0.690168023109436, "learning_rate": 0.003, "loss": 4.1361, "step": 3143 }, { "epoch": 0.03144, "grad_norm": 0.6327980160713196, "learning_rate": 0.003, "loss": 4.1047, "step": 3144 }, { "epoch": 0.03145, "grad_norm": 0.5650879144668579, "learning_rate": 0.003, "loss": 4.1275, "step": 3145 }, { "epoch": 0.03146, "grad_norm": 0.5615544319152832, "learning_rate": 0.003, "loss": 4.1349, "step": 3146 }, { "epoch": 0.03147, "grad_norm": 0.5901345014572144, "learning_rate": 0.003, "loss": 4.1335, "step": 3147 }, { "epoch": 0.03148, "grad_norm": 0.5519311428070068, "learning_rate": 0.003, "loss": 4.1346, "step": 3148 }, { "epoch": 0.03149, "grad_norm": 0.5862347483634949, "learning_rate": 0.003, "loss": 4.1404, "step": 3149 }, { "epoch": 0.0315, "grad_norm": 0.48593467473983765, "learning_rate": 0.003, "loss": 4.1245, "step": 3150 }, { "epoch": 0.03151, "grad_norm": 0.44726112484931946, "learning_rate": 0.003, "loss": 4.1298, "step": 3151 }, { "epoch": 0.03152, "grad_norm": 0.5725032687187195, "learning_rate": 0.003, "loss": 4.1316, "step": 3152 }, { "epoch": 0.03153, "grad_norm": 0.7204610109329224, "learning_rate": 0.003, "loss": 4.1445, "step": 3153 }, { "epoch": 0.03154, "grad_norm": 0.7772231101989746, "learning_rate": 0.003, "loss": 4.1406, "step": 3154 }, { "epoch": 0.03155, "grad_norm": 0.7544244527816772, "learning_rate": 0.003, "loss": 4.1298, "step": 3155 }, { "epoch": 0.03156, "grad_norm": 0.6843472719192505, "learning_rate": 0.003, "loss": 4.1127, "step": 3156 }, { "epoch": 0.03157, "grad_norm": 0.5659379363059998, "learning_rate": 0.003, "loss": 4.1338, "step": 3157 }, { "epoch": 0.03158, "grad_norm": 0.560870885848999, "learning_rate": 0.003, "loss": 4.1158, "step": 3158 }, { "epoch": 0.03159, "grad_norm": 0.6483632922172546, "learning_rate": 0.003, "loss": 4.1202, "step": 3159 }, { "epoch": 0.0316, "grad_norm": 0.7656121253967285, "learning_rate": 0.003, "loss": 4.1378, "step": 3160 }, { "epoch": 0.03161, "grad_norm": 0.7832038998603821, "learning_rate": 0.003, "loss": 4.148, "step": 3161 }, { "epoch": 0.03162, "grad_norm": 0.5725152492523193, "learning_rate": 0.003, "loss": 4.1453, "step": 3162 }, { "epoch": 0.03163, "grad_norm": 0.5066990256309509, "learning_rate": 0.003, "loss": 4.1262, "step": 3163 }, { "epoch": 0.03164, "grad_norm": 0.5056422352790833, "learning_rate": 0.003, "loss": 4.1234, "step": 3164 }, { "epoch": 0.03165, "grad_norm": 0.5810751914978027, "learning_rate": 0.003, "loss": 4.0986, "step": 3165 }, { "epoch": 0.03166, "grad_norm": 0.6706964373588562, "learning_rate": 0.003, "loss": 4.1145, "step": 3166 }, { "epoch": 0.03167, "grad_norm": 0.8062258958816528, "learning_rate": 0.003, "loss": 4.1086, "step": 3167 }, { "epoch": 0.03168, "grad_norm": 0.8774738311767578, "learning_rate": 0.003, "loss": 4.1414, "step": 3168 }, { "epoch": 0.03169, "grad_norm": 0.817283034324646, "learning_rate": 0.003, "loss": 4.1495, "step": 3169 }, { "epoch": 0.0317, "grad_norm": 0.8635371327400208, "learning_rate": 0.003, "loss": 4.1057, "step": 3170 }, { "epoch": 0.03171, "grad_norm": 0.7711028456687927, "learning_rate": 0.003, "loss": 4.1463, "step": 3171 }, { "epoch": 0.03172, "grad_norm": 0.754484236240387, "learning_rate": 0.003, "loss": 4.1332, "step": 3172 }, { "epoch": 0.03173, "grad_norm": 0.6678244471549988, "learning_rate": 0.003, "loss": 4.1048, "step": 3173 }, { "epoch": 0.03174, "grad_norm": 0.588426411151886, "learning_rate": 0.003, "loss": 4.1179, "step": 3174 }, { "epoch": 0.03175, "grad_norm": 0.5396115779876709, "learning_rate": 0.003, "loss": 4.1398, "step": 3175 }, { "epoch": 0.03176, "grad_norm": 0.5115644931793213, "learning_rate": 0.003, "loss": 4.1025, "step": 3176 }, { "epoch": 0.03177, "grad_norm": 0.5066182017326355, "learning_rate": 0.003, "loss": 4.1086, "step": 3177 }, { "epoch": 0.03178, "grad_norm": 0.6152738928794861, "learning_rate": 0.003, "loss": 4.1574, "step": 3178 }, { "epoch": 0.03179, "grad_norm": 0.7892536520957947, "learning_rate": 0.003, "loss": 4.1254, "step": 3179 }, { "epoch": 0.0318, "grad_norm": 0.9117100834846497, "learning_rate": 0.003, "loss": 4.1312, "step": 3180 }, { "epoch": 0.03181, "grad_norm": 0.8305923938751221, "learning_rate": 0.003, "loss": 4.1452, "step": 3181 }, { "epoch": 0.03182, "grad_norm": 0.7435977458953857, "learning_rate": 0.003, "loss": 4.1356, "step": 3182 }, { "epoch": 0.03183, "grad_norm": 0.9169098138809204, "learning_rate": 0.003, "loss": 4.1455, "step": 3183 }, { "epoch": 0.03184, "grad_norm": 1.1370512247085571, "learning_rate": 0.003, "loss": 4.1304, "step": 3184 }, { "epoch": 0.03185, "grad_norm": 0.7811375856399536, "learning_rate": 0.003, "loss": 4.1419, "step": 3185 }, { "epoch": 0.03186, "grad_norm": 0.6801838278770447, "learning_rate": 0.003, "loss": 4.1382, "step": 3186 }, { "epoch": 0.03187, "grad_norm": 0.7519158124923706, "learning_rate": 0.003, "loss": 4.103, "step": 3187 }, { "epoch": 0.03188, "grad_norm": 0.7719635963439941, "learning_rate": 0.003, "loss": 4.1382, "step": 3188 }, { "epoch": 0.03189, "grad_norm": 0.7892667055130005, "learning_rate": 0.003, "loss": 4.1273, "step": 3189 }, { "epoch": 0.0319, "grad_norm": 0.7617167234420776, "learning_rate": 0.003, "loss": 4.1219, "step": 3190 }, { "epoch": 0.03191, "grad_norm": 0.729390561580658, "learning_rate": 0.003, "loss": 4.1513, "step": 3191 }, { "epoch": 0.03192, "grad_norm": 0.6866948008537292, "learning_rate": 0.003, "loss": 4.1252, "step": 3192 }, { "epoch": 0.03193, "grad_norm": 0.8149189352989197, "learning_rate": 0.003, "loss": 4.1294, "step": 3193 }, { "epoch": 0.03194, "grad_norm": 0.7727823257446289, "learning_rate": 0.003, "loss": 4.1567, "step": 3194 }, { "epoch": 0.03195, "grad_norm": 0.6907305121421814, "learning_rate": 0.003, "loss": 4.1187, "step": 3195 }, { "epoch": 0.03196, "grad_norm": 0.6480019688606262, "learning_rate": 0.003, "loss": 4.129, "step": 3196 }, { "epoch": 0.03197, "grad_norm": 0.6377413272857666, "learning_rate": 0.003, "loss": 4.1009, "step": 3197 }, { "epoch": 0.03198, "grad_norm": 0.5232990384101868, "learning_rate": 0.003, "loss": 4.1265, "step": 3198 }, { "epoch": 0.03199, "grad_norm": 0.5117045044898987, "learning_rate": 0.003, "loss": 4.1001, "step": 3199 }, { "epoch": 0.032, "grad_norm": 0.49118801951408386, "learning_rate": 0.003, "loss": 4.1204, "step": 3200 }, { "epoch": 0.03201, "grad_norm": 0.39169371128082275, "learning_rate": 0.003, "loss": 4.0803, "step": 3201 }, { "epoch": 0.03202, "grad_norm": 0.4503386616706848, "learning_rate": 0.003, "loss": 4.1192, "step": 3202 }, { "epoch": 0.03203, "grad_norm": 0.44623667001724243, "learning_rate": 0.003, "loss": 4.1033, "step": 3203 }, { "epoch": 0.03204, "grad_norm": 0.4461396634578705, "learning_rate": 0.003, "loss": 4.1193, "step": 3204 }, { "epoch": 0.03205, "grad_norm": 0.47087642550468445, "learning_rate": 0.003, "loss": 4.1152, "step": 3205 }, { "epoch": 0.03206, "grad_norm": 0.4962460696697235, "learning_rate": 0.003, "loss": 4.1085, "step": 3206 }, { "epoch": 0.03207, "grad_norm": 0.6064698100090027, "learning_rate": 0.003, "loss": 4.0901, "step": 3207 }, { "epoch": 0.03208, "grad_norm": 0.759096622467041, "learning_rate": 0.003, "loss": 4.1186, "step": 3208 }, { "epoch": 0.03209, "grad_norm": 0.7810276746749878, "learning_rate": 0.003, "loss": 4.1579, "step": 3209 }, { "epoch": 0.0321, "grad_norm": 0.7179496884346008, "learning_rate": 0.003, "loss": 4.1363, "step": 3210 }, { "epoch": 0.03211, "grad_norm": 0.7148532867431641, "learning_rate": 0.003, "loss": 4.1099, "step": 3211 }, { "epoch": 0.03212, "grad_norm": 0.7214945554733276, "learning_rate": 0.003, "loss": 4.1198, "step": 3212 }, { "epoch": 0.03213, "grad_norm": 0.7809195518493652, "learning_rate": 0.003, "loss": 4.1251, "step": 3213 }, { "epoch": 0.03214, "grad_norm": 0.9291434288024902, "learning_rate": 0.003, "loss": 4.1015, "step": 3214 }, { "epoch": 0.03215, "grad_norm": 0.9452233910560608, "learning_rate": 0.003, "loss": 4.1164, "step": 3215 }, { "epoch": 0.03216, "grad_norm": 0.8493017554283142, "learning_rate": 0.003, "loss": 4.125, "step": 3216 }, { "epoch": 0.03217, "grad_norm": 0.7973296046257019, "learning_rate": 0.003, "loss": 4.1215, "step": 3217 }, { "epoch": 0.03218, "grad_norm": 0.7930925488471985, "learning_rate": 0.003, "loss": 4.1166, "step": 3218 }, { "epoch": 0.03219, "grad_norm": 0.7060784101486206, "learning_rate": 0.003, "loss": 4.1215, "step": 3219 }, { "epoch": 0.0322, "grad_norm": 0.6504276394844055, "learning_rate": 0.003, "loss": 4.1329, "step": 3220 }, { "epoch": 0.03221, "grad_norm": 0.7263512015342712, "learning_rate": 0.003, "loss": 4.1414, "step": 3221 }, { "epoch": 0.03222, "grad_norm": 0.7812563180923462, "learning_rate": 0.003, "loss": 4.1338, "step": 3222 }, { "epoch": 0.03223, "grad_norm": 0.718917727470398, "learning_rate": 0.003, "loss": 4.1374, "step": 3223 }, { "epoch": 0.03224, "grad_norm": 0.6486620306968689, "learning_rate": 0.003, "loss": 4.0945, "step": 3224 }, { "epoch": 0.03225, "grad_norm": 0.633409857749939, "learning_rate": 0.003, "loss": 4.1275, "step": 3225 }, { "epoch": 0.03226, "grad_norm": 0.6219311356544495, "learning_rate": 0.003, "loss": 4.1074, "step": 3226 }, { "epoch": 0.03227, "grad_norm": 0.6718069911003113, "learning_rate": 0.003, "loss": 4.1299, "step": 3227 }, { "epoch": 0.03228, "grad_norm": 0.6600261926651001, "learning_rate": 0.003, "loss": 4.1046, "step": 3228 }, { "epoch": 0.03229, "grad_norm": 0.7300699353218079, "learning_rate": 0.003, "loss": 4.1237, "step": 3229 }, { "epoch": 0.0323, "grad_norm": 0.8185812830924988, "learning_rate": 0.003, "loss": 4.1184, "step": 3230 }, { "epoch": 0.03231, "grad_norm": 0.8448948860168457, "learning_rate": 0.003, "loss": 4.1359, "step": 3231 }, { "epoch": 0.03232, "grad_norm": 0.8037069439888, "learning_rate": 0.003, "loss": 4.1226, "step": 3232 }, { "epoch": 0.03233, "grad_norm": 0.6786398887634277, "learning_rate": 0.003, "loss": 4.1384, "step": 3233 }, { "epoch": 0.03234, "grad_norm": 0.6006938219070435, "learning_rate": 0.003, "loss": 4.1167, "step": 3234 }, { "epoch": 0.03235, "grad_norm": 0.651039183139801, "learning_rate": 0.003, "loss": 4.1271, "step": 3235 }, { "epoch": 0.03236, "grad_norm": 0.635384202003479, "learning_rate": 0.003, "loss": 4.1167, "step": 3236 }, { "epoch": 0.03237, "grad_norm": 0.6635240912437439, "learning_rate": 0.003, "loss": 4.1401, "step": 3237 }, { "epoch": 0.03238, "grad_norm": 0.6811038851737976, "learning_rate": 0.003, "loss": 4.1468, "step": 3238 }, { "epoch": 0.03239, "grad_norm": 0.7615686655044556, "learning_rate": 0.003, "loss": 4.1295, "step": 3239 }, { "epoch": 0.0324, "grad_norm": 0.8041428923606873, "learning_rate": 0.003, "loss": 4.151, "step": 3240 }, { "epoch": 0.03241, "grad_norm": 0.8027405738830566, "learning_rate": 0.003, "loss": 4.1432, "step": 3241 }, { "epoch": 0.03242, "grad_norm": 0.9180432558059692, "learning_rate": 0.003, "loss": 4.1256, "step": 3242 }, { "epoch": 0.03243, "grad_norm": 0.9836052060127258, "learning_rate": 0.003, "loss": 4.1186, "step": 3243 }, { "epoch": 0.03244, "grad_norm": 0.9347205758094788, "learning_rate": 0.003, "loss": 4.1385, "step": 3244 }, { "epoch": 0.03245, "grad_norm": 1.1407207250595093, "learning_rate": 0.003, "loss": 4.1692, "step": 3245 }, { "epoch": 0.03246, "grad_norm": 0.7863346934318542, "learning_rate": 0.003, "loss": 4.1416, "step": 3246 }, { "epoch": 0.03247, "grad_norm": 0.6830831170082092, "learning_rate": 0.003, "loss": 4.1458, "step": 3247 }, { "epoch": 0.03248, "grad_norm": 0.6007001996040344, "learning_rate": 0.003, "loss": 4.1605, "step": 3248 }, { "epoch": 0.03249, "grad_norm": 0.5720266103744507, "learning_rate": 0.003, "loss": 4.1242, "step": 3249 }, { "epoch": 0.0325, "grad_norm": 0.5476762056350708, "learning_rate": 0.003, "loss": 4.1536, "step": 3250 }, { "epoch": 0.03251, "grad_norm": 0.5844308137893677, "learning_rate": 0.003, "loss": 4.1498, "step": 3251 }, { "epoch": 0.03252, "grad_norm": 0.6554009318351746, "learning_rate": 0.003, "loss": 4.1172, "step": 3252 }, { "epoch": 0.03253, "grad_norm": 0.7417848110198975, "learning_rate": 0.003, "loss": 4.1203, "step": 3253 }, { "epoch": 0.03254, "grad_norm": 0.7940112352371216, "learning_rate": 0.003, "loss": 4.1189, "step": 3254 }, { "epoch": 0.03255, "grad_norm": 0.7129653096199036, "learning_rate": 0.003, "loss": 4.1292, "step": 3255 }, { "epoch": 0.03256, "grad_norm": 0.5819328427314758, "learning_rate": 0.003, "loss": 4.1213, "step": 3256 }, { "epoch": 0.03257, "grad_norm": 0.4663737118244171, "learning_rate": 0.003, "loss": 4.1081, "step": 3257 }, { "epoch": 0.03258, "grad_norm": 0.423995703458786, "learning_rate": 0.003, "loss": 4.1155, "step": 3258 }, { "epoch": 0.03259, "grad_norm": 0.43535032868385315, "learning_rate": 0.003, "loss": 4.1178, "step": 3259 }, { "epoch": 0.0326, "grad_norm": 0.39455822110176086, "learning_rate": 0.003, "loss": 4.1154, "step": 3260 }, { "epoch": 0.03261, "grad_norm": 0.40695109963417053, "learning_rate": 0.003, "loss": 4.088, "step": 3261 }, { "epoch": 0.03262, "grad_norm": 0.3774632513523102, "learning_rate": 0.003, "loss": 4.113, "step": 3262 }, { "epoch": 0.03263, "grad_norm": 0.46376833319664, "learning_rate": 0.003, "loss": 4.1225, "step": 3263 }, { "epoch": 0.03264, "grad_norm": 0.5567134618759155, "learning_rate": 0.003, "loss": 4.136, "step": 3264 }, { "epoch": 0.03265, "grad_norm": 0.6502901315689087, "learning_rate": 0.003, "loss": 4.1336, "step": 3265 }, { "epoch": 0.03266, "grad_norm": 0.8112798929214478, "learning_rate": 0.003, "loss": 4.0905, "step": 3266 }, { "epoch": 0.03267, "grad_norm": 0.9981924295425415, "learning_rate": 0.003, "loss": 4.1139, "step": 3267 }, { "epoch": 0.03268, "grad_norm": 0.9973412752151489, "learning_rate": 0.003, "loss": 4.1068, "step": 3268 }, { "epoch": 0.03269, "grad_norm": 0.8641186356544495, "learning_rate": 0.003, "loss": 4.1296, "step": 3269 }, { "epoch": 0.0327, "grad_norm": 0.7888898253440857, "learning_rate": 0.003, "loss": 4.1558, "step": 3270 }, { "epoch": 0.03271, "grad_norm": 0.8556457757949829, "learning_rate": 0.003, "loss": 4.1218, "step": 3271 }, { "epoch": 0.03272, "grad_norm": 0.8728947639465332, "learning_rate": 0.003, "loss": 4.1311, "step": 3272 }, { "epoch": 0.03273, "grad_norm": 0.8285180926322937, "learning_rate": 0.003, "loss": 4.133, "step": 3273 }, { "epoch": 0.03274, "grad_norm": 0.6249201893806458, "learning_rate": 0.003, "loss": 4.1423, "step": 3274 }, { "epoch": 0.03275, "grad_norm": 0.5604618787765503, "learning_rate": 0.003, "loss": 4.1388, "step": 3275 }, { "epoch": 0.03276, "grad_norm": 0.6792795062065125, "learning_rate": 0.003, "loss": 4.1194, "step": 3276 }, { "epoch": 0.03277, "grad_norm": 0.7676330804824829, "learning_rate": 0.003, "loss": 4.1239, "step": 3277 }, { "epoch": 0.03278, "grad_norm": 0.8335491418838501, "learning_rate": 0.003, "loss": 4.138, "step": 3278 }, { "epoch": 0.03279, "grad_norm": 0.7494176626205444, "learning_rate": 0.003, "loss": 4.1244, "step": 3279 }, { "epoch": 0.0328, "grad_norm": 0.5933589935302734, "learning_rate": 0.003, "loss": 4.1027, "step": 3280 }, { "epoch": 0.03281, "grad_norm": 0.49470627307891846, "learning_rate": 0.003, "loss": 4.1456, "step": 3281 }, { "epoch": 0.03282, "grad_norm": 0.5273627638816833, "learning_rate": 0.003, "loss": 4.1014, "step": 3282 }, { "epoch": 0.03283, "grad_norm": 0.5509713292121887, "learning_rate": 0.003, "loss": 4.0972, "step": 3283 }, { "epoch": 0.03284, "grad_norm": 0.544580340385437, "learning_rate": 0.003, "loss": 4.1383, "step": 3284 }, { "epoch": 0.03285, "grad_norm": 0.5744161009788513, "learning_rate": 0.003, "loss": 4.0849, "step": 3285 }, { "epoch": 0.03286, "grad_norm": 0.531225860118866, "learning_rate": 0.003, "loss": 4.1237, "step": 3286 }, { "epoch": 0.03287, "grad_norm": 0.49495846033096313, "learning_rate": 0.003, "loss": 4.13, "step": 3287 }, { "epoch": 0.03288, "grad_norm": 0.5203666687011719, "learning_rate": 0.003, "loss": 4.1111, "step": 3288 }, { "epoch": 0.03289, "grad_norm": 0.5833369493484497, "learning_rate": 0.003, "loss": 4.1168, "step": 3289 }, { "epoch": 0.0329, "grad_norm": 0.6156966686248779, "learning_rate": 0.003, "loss": 4.1476, "step": 3290 }, { "epoch": 0.03291, "grad_norm": 0.7284314036369324, "learning_rate": 0.003, "loss": 4.1346, "step": 3291 }, { "epoch": 0.03292, "grad_norm": 0.8301785588264465, "learning_rate": 0.003, "loss": 4.1017, "step": 3292 }, { "epoch": 0.03293, "grad_norm": 0.8880192041397095, "learning_rate": 0.003, "loss": 4.1255, "step": 3293 }, { "epoch": 0.03294, "grad_norm": 0.7777594923973083, "learning_rate": 0.003, "loss": 4.1268, "step": 3294 }, { "epoch": 0.03295, "grad_norm": 0.7415675520896912, "learning_rate": 0.003, "loss": 4.1178, "step": 3295 }, { "epoch": 0.03296, "grad_norm": 0.5945268273353577, "learning_rate": 0.003, "loss": 4.1056, "step": 3296 }, { "epoch": 0.03297, "grad_norm": 0.6904322504997253, "learning_rate": 0.003, "loss": 4.1182, "step": 3297 }, { "epoch": 0.03298, "grad_norm": 0.8207030892372131, "learning_rate": 0.003, "loss": 4.1412, "step": 3298 }, { "epoch": 0.03299, "grad_norm": 1.0657931566238403, "learning_rate": 0.003, "loss": 4.1346, "step": 3299 }, { "epoch": 0.033, "grad_norm": 0.9290387034416199, "learning_rate": 0.003, "loss": 4.1222, "step": 3300 }, { "epoch": 0.03301, "grad_norm": 0.6823791861534119, "learning_rate": 0.003, "loss": 4.1464, "step": 3301 }, { "epoch": 0.03302, "grad_norm": 0.5951451659202576, "learning_rate": 0.003, "loss": 4.1188, "step": 3302 }, { "epoch": 0.03303, "grad_norm": 0.5956482887268066, "learning_rate": 0.003, "loss": 4.1233, "step": 3303 }, { "epoch": 0.03304, "grad_norm": 0.5632296204566956, "learning_rate": 0.003, "loss": 4.1161, "step": 3304 }, { "epoch": 0.03305, "grad_norm": 0.5975555181503296, "learning_rate": 0.003, "loss": 4.1217, "step": 3305 }, { "epoch": 0.03306, "grad_norm": 0.5866029262542725, "learning_rate": 0.003, "loss": 4.1387, "step": 3306 }, { "epoch": 0.03307, "grad_norm": 0.586901843547821, "learning_rate": 0.003, "loss": 4.1345, "step": 3307 }, { "epoch": 0.03308, "grad_norm": 0.5834401249885559, "learning_rate": 0.003, "loss": 4.1153, "step": 3308 }, { "epoch": 0.03309, "grad_norm": 0.6751212477684021, "learning_rate": 0.003, "loss": 4.1037, "step": 3309 }, { "epoch": 0.0331, "grad_norm": 0.5830533504486084, "learning_rate": 0.003, "loss": 4.1068, "step": 3310 }, { "epoch": 0.03311, "grad_norm": 0.6124199628829956, "learning_rate": 0.003, "loss": 4.1271, "step": 3311 }, { "epoch": 0.03312, "grad_norm": 0.5900564193725586, "learning_rate": 0.003, "loss": 4.0968, "step": 3312 }, { "epoch": 0.03313, "grad_norm": 0.6442069411277771, "learning_rate": 0.003, "loss": 4.1271, "step": 3313 }, { "epoch": 0.03314, "grad_norm": 0.7764161229133606, "learning_rate": 0.003, "loss": 4.0759, "step": 3314 }, { "epoch": 0.03315, "grad_norm": 0.8021034002304077, "learning_rate": 0.003, "loss": 4.13, "step": 3315 }, { "epoch": 0.03316, "grad_norm": 0.8197445869445801, "learning_rate": 0.003, "loss": 4.1343, "step": 3316 }, { "epoch": 0.03317, "grad_norm": 0.7437990307807922, "learning_rate": 0.003, "loss": 4.1238, "step": 3317 }, { "epoch": 0.03318, "grad_norm": 0.7505268454551697, "learning_rate": 0.003, "loss": 4.1055, "step": 3318 }, { "epoch": 0.03319, "grad_norm": 0.7677083611488342, "learning_rate": 0.003, "loss": 4.0993, "step": 3319 }, { "epoch": 0.0332, "grad_norm": 0.8221232891082764, "learning_rate": 0.003, "loss": 4.1277, "step": 3320 }, { "epoch": 0.03321, "grad_norm": 0.8141359090805054, "learning_rate": 0.003, "loss": 4.1433, "step": 3321 }, { "epoch": 0.03322, "grad_norm": 0.7189428806304932, "learning_rate": 0.003, "loss": 4.0938, "step": 3322 }, { "epoch": 0.03323, "grad_norm": 0.6044638156890869, "learning_rate": 0.003, "loss": 4.1329, "step": 3323 }, { "epoch": 0.03324, "grad_norm": 0.5276358127593994, "learning_rate": 0.003, "loss": 4.1001, "step": 3324 }, { "epoch": 0.03325, "grad_norm": 0.5766726136207581, "learning_rate": 0.003, "loss": 4.134, "step": 3325 }, { "epoch": 0.03326, "grad_norm": 0.6931579113006592, "learning_rate": 0.003, "loss": 4.0949, "step": 3326 }, { "epoch": 0.03327, "grad_norm": 0.8419184684753418, "learning_rate": 0.003, "loss": 4.1308, "step": 3327 }, { "epoch": 0.03328, "grad_norm": 0.82012540102005, "learning_rate": 0.003, "loss": 4.1529, "step": 3328 }, { "epoch": 0.03329, "grad_norm": 0.6669595837593079, "learning_rate": 0.003, "loss": 4.1055, "step": 3329 }, { "epoch": 0.0333, "grad_norm": 0.6745646595954895, "learning_rate": 0.003, "loss": 4.1079, "step": 3330 }, { "epoch": 0.03331, "grad_norm": 0.7668302655220032, "learning_rate": 0.003, "loss": 4.1109, "step": 3331 }, { "epoch": 0.03332, "grad_norm": 0.7806735634803772, "learning_rate": 0.003, "loss": 4.1058, "step": 3332 }, { "epoch": 0.03333, "grad_norm": 0.8616372346878052, "learning_rate": 0.003, "loss": 4.0992, "step": 3333 }, { "epoch": 0.03334, "grad_norm": 0.8143258690834045, "learning_rate": 0.003, "loss": 4.1291, "step": 3334 }, { "epoch": 0.03335, "grad_norm": 0.7264704704284668, "learning_rate": 0.003, "loss": 4.1332, "step": 3335 }, { "epoch": 0.03336, "grad_norm": 0.7028628587722778, "learning_rate": 0.003, "loss": 4.134, "step": 3336 }, { "epoch": 0.03337, "grad_norm": 0.7172381281852722, "learning_rate": 0.003, "loss": 4.1099, "step": 3337 }, { "epoch": 0.03338, "grad_norm": 0.7452136278152466, "learning_rate": 0.003, "loss": 4.1223, "step": 3338 }, { "epoch": 0.03339, "grad_norm": 0.6151822209358215, "learning_rate": 0.003, "loss": 4.122, "step": 3339 }, { "epoch": 0.0334, "grad_norm": 0.48978814482688904, "learning_rate": 0.003, "loss": 4.0957, "step": 3340 }, { "epoch": 0.03341, "grad_norm": 0.5616298317909241, "learning_rate": 0.003, "loss": 4.1166, "step": 3341 }, { "epoch": 0.03342, "grad_norm": 0.5989934802055359, "learning_rate": 0.003, "loss": 4.1047, "step": 3342 }, { "epoch": 0.03343, "grad_norm": 0.6791746616363525, "learning_rate": 0.003, "loss": 4.0819, "step": 3343 }, { "epoch": 0.03344, "grad_norm": 0.8041501641273499, "learning_rate": 0.003, "loss": 4.139, "step": 3344 }, { "epoch": 0.03345, "grad_norm": 0.9025998711585999, "learning_rate": 0.003, "loss": 4.1226, "step": 3345 }, { "epoch": 0.03346, "grad_norm": 0.8547161221504211, "learning_rate": 0.003, "loss": 4.1382, "step": 3346 }, { "epoch": 0.03347, "grad_norm": 0.8230910301208496, "learning_rate": 0.003, "loss": 4.1211, "step": 3347 }, { "epoch": 0.03348, "grad_norm": 0.6764378547668457, "learning_rate": 0.003, "loss": 4.1086, "step": 3348 }, { "epoch": 0.03349, "grad_norm": 0.6756121516227722, "learning_rate": 0.003, "loss": 4.1537, "step": 3349 }, { "epoch": 0.0335, "grad_norm": 0.90619957447052, "learning_rate": 0.003, "loss": 4.1174, "step": 3350 }, { "epoch": 0.03351, "grad_norm": 0.995084822177887, "learning_rate": 0.003, "loss": 4.1296, "step": 3351 }, { "epoch": 0.03352, "grad_norm": 0.9932165741920471, "learning_rate": 0.003, "loss": 4.1476, "step": 3352 }, { "epoch": 0.03353, "grad_norm": 0.7695693373680115, "learning_rate": 0.003, "loss": 4.144, "step": 3353 }, { "epoch": 0.03354, "grad_norm": 0.757735013961792, "learning_rate": 0.003, "loss": 4.1066, "step": 3354 }, { "epoch": 0.03355, "grad_norm": 0.7958679795265198, "learning_rate": 0.003, "loss": 4.1313, "step": 3355 }, { "epoch": 0.03356, "grad_norm": 0.8041818141937256, "learning_rate": 0.003, "loss": 4.1556, "step": 3356 }, { "epoch": 0.03357, "grad_norm": 0.6908081769943237, "learning_rate": 0.003, "loss": 4.1258, "step": 3357 }, { "epoch": 0.03358, "grad_norm": 0.5655892491340637, "learning_rate": 0.003, "loss": 4.1399, "step": 3358 }, { "epoch": 0.03359, "grad_norm": 0.6353917717933655, "learning_rate": 0.003, "loss": 4.1288, "step": 3359 }, { "epoch": 0.0336, "grad_norm": 0.6901819705963135, "learning_rate": 0.003, "loss": 4.1421, "step": 3360 }, { "epoch": 0.03361, "grad_norm": 0.6260518431663513, "learning_rate": 0.003, "loss": 4.1414, "step": 3361 }, { "epoch": 0.03362, "grad_norm": 0.5238988399505615, "learning_rate": 0.003, "loss": 4.132, "step": 3362 }, { "epoch": 0.03363, "grad_norm": 0.4695539176464081, "learning_rate": 0.003, "loss": 4.0823, "step": 3363 }, { "epoch": 0.03364, "grad_norm": 0.4180772602558136, "learning_rate": 0.003, "loss": 4.103, "step": 3364 }, { "epoch": 0.03365, "grad_norm": 0.46931684017181396, "learning_rate": 0.003, "loss": 4.098, "step": 3365 }, { "epoch": 0.03366, "grad_norm": 0.4340416491031647, "learning_rate": 0.003, "loss": 4.0693, "step": 3366 }, { "epoch": 0.03367, "grad_norm": 0.3824833929538727, "learning_rate": 0.003, "loss": 4.0882, "step": 3367 }, { "epoch": 0.03368, "grad_norm": 0.391565203666687, "learning_rate": 0.003, "loss": 4.1353, "step": 3368 }, { "epoch": 0.03369, "grad_norm": 0.3979616165161133, "learning_rate": 0.003, "loss": 4.093, "step": 3369 }, { "epoch": 0.0337, "grad_norm": 0.41400259733200073, "learning_rate": 0.003, "loss": 4.1266, "step": 3370 }, { "epoch": 0.03371, "grad_norm": 0.4535299241542816, "learning_rate": 0.003, "loss": 4.0609, "step": 3371 }, { "epoch": 0.03372, "grad_norm": 0.4900164008140564, "learning_rate": 0.003, "loss": 4.091, "step": 3372 }, { "epoch": 0.03373, "grad_norm": 0.48760801553726196, "learning_rate": 0.003, "loss": 4.124, "step": 3373 }, { "epoch": 0.03374, "grad_norm": 0.42620474100112915, "learning_rate": 0.003, "loss": 4.1035, "step": 3374 }, { "epoch": 0.03375, "grad_norm": 0.539411187171936, "learning_rate": 0.003, "loss": 4.0937, "step": 3375 }, { "epoch": 0.03376, "grad_norm": 0.7004280686378479, "learning_rate": 0.003, "loss": 4.0758, "step": 3376 }, { "epoch": 0.03377, "grad_norm": 0.9512114524841309, "learning_rate": 0.003, "loss": 4.1273, "step": 3377 }, { "epoch": 0.03378, "grad_norm": 1.173850178718567, "learning_rate": 0.003, "loss": 4.1444, "step": 3378 }, { "epoch": 0.03379, "grad_norm": 0.8208590149879456, "learning_rate": 0.003, "loss": 4.1088, "step": 3379 }, { "epoch": 0.0338, "grad_norm": 0.9167894721031189, "learning_rate": 0.003, "loss": 4.138, "step": 3380 }, { "epoch": 0.03381, "grad_norm": 0.8736410140991211, "learning_rate": 0.003, "loss": 4.1428, "step": 3381 }, { "epoch": 0.03382, "grad_norm": 0.9413759112358093, "learning_rate": 0.003, "loss": 4.1405, "step": 3382 }, { "epoch": 0.03383, "grad_norm": 0.9034369587898254, "learning_rate": 0.003, "loss": 4.102, "step": 3383 }, { "epoch": 0.03384, "grad_norm": 0.7863962650299072, "learning_rate": 0.003, "loss": 4.1256, "step": 3384 }, { "epoch": 0.03385, "grad_norm": 0.6559106707572937, "learning_rate": 0.003, "loss": 4.1243, "step": 3385 }, { "epoch": 0.03386, "grad_norm": 0.5466395020484924, "learning_rate": 0.003, "loss": 4.116, "step": 3386 }, { "epoch": 0.03387, "grad_norm": 0.5657241344451904, "learning_rate": 0.003, "loss": 4.1195, "step": 3387 }, { "epoch": 0.03388, "grad_norm": 0.6148836016654968, "learning_rate": 0.003, "loss": 4.1344, "step": 3388 }, { "epoch": 0.03389, "grad_norm": 0.6511346101760864, "learning_rate": 0.003, "loss": 4.1482, "step": 3389 }, { "epoch": 0.0339, "grad_norm": 0.7301871180534363, "learning_rate": 0.003, "loss": 4.14, "step": 3390 }, { "epoch": 0.03391, "grad_norm": 0.7425968647003174, "learning_rate": 0.003, "loss": 4.1178, "step": 3391 }, { "epoch": 0.03392, "grad_norm": 0.8300238251686096, "learning_rate": 0.003, "loss": 4.1282, "step": 3392 }, { "epoch": 0.03393, "grad_norm": 0.9328773617744446, "learning_rate": 0.003, "loss": 4.1432, "step": 3393 }, { "epoch": 0.03394, "grad_norm": 0.8949936628341675, "learning_rate": 0.003, "loss": 4.145, "step": 3394 }, { "epoch": 0.03395, "grad_norm": 0.9411712884902954, "learning_rate": 0.003, "loss": 4.1549, "step": 3395 }, { "epoch": 0.03396, "grad_norm": 0.9026573896408081, "learning_rate": 0.003, "loss": 4.1395, "step": 3396 }, { "epoch": 0.03397, "grad_norm": 0.8574052453041077, "learning_rate": 0.003, "loss": 4.1065, "step": 3397 }, { "epoch": 0.03398, "grad_norm": 0.9310076236724854, "learning_rate": 0.003, "loss": 4.1626, "step": 3398 }, { "epoch": 0.03399, "grad_norm": 0.9323184490203857, "learning_rate": 0.003, "loss": 4.1821, "step": 3399 }, { "epoch": 0.034, "grad_norm": 0.996589720249176, "learning_rate": 0.003, "loss": 4.1643, "step": 3400 }, { "epoch": 0.03401, "grad_norm": 0.8762171268463135, "learning_rate": 0.003, "loss": 4.1361, "step": 3401 }, { "epoch": 0.03402, "grad_norm": 0.7612597942352295, "learning_rate": 0.003, "loss": 4.162, "step": 3402 }, { "epoch": 0.03403, "grad_norm": 0.7073224782943726, "learning_rate": 0.003, "loss": 4.1193, "step": 3403 }, { "epoch": 0.03404, "grad_norm": 0.8644006848335266, "learning_rate": 0.003, "loss": 4.1576, "step": 3404 }, { "epoch": 0.03405, "grad_norm": 0.9565165638923645, "learning_rate": 0.003, "loss": 4.1456, "step": 3405 }, { "epoch": 0.03406, "grad_norm": 0.9732779264450073, "learning_rate": 0.003, "loss": 4.1351, "step": 3406 }, { "epoch": 0.03407, "grad_norm": 0.8747183084487915, "learning_rate": 0.003, "loss": 4.1492, "step": 3407 }, { "epoch": 0.03408, "grad_norm": 0.8171975612640381, "learning_rate": 0.003, "loss": 4.1301, "step": 3408 }, { "epoch": 0.03409, "grad_norm": 0.8472719788551331, "learning_rate": 0.003, "loss": 4.139, "step": 3409 }, { "epoch": 0.0341, "grad_norm": 0.772420346736908, "learning_rate": 0.003, "loss": 4.1296, "step": 3410 }, { "epoch": 0.03411, "grad_norm": 0.7122489809989929, "learning_rate": 0.003, "loss": 4.1412, "step": 3411 }, { "epoch": 0.03412, "grad_norm": 0.7397792339324951, "learning_rate": 0.003, "loss": 4.1595, "step": 3412 }, { "epoch": 0.03413, "grad_norm": 0.5002322793006897, "learning_rate": 0.003, "loss": 4.1603, "step": 3413 }, { "epoch": 0.03414, "grad_norm": 0.533718466758728, "learning_rate": 0.003, "loss": 4.1493, "step": 3414 }, { "epoch": 0.03415, "grad_norm": 0.5009657144546509, "learning_rate": 0.003, "loss": 4.1281, "step": 3415 }, { "epoch": 0.03416, "grad_norm": 0.49424099922180176, "learning_rate": 0.003, "loss": 4.1289, "step": 3416 }, { "epoch": 0.03417, "grad_norm": 0.5071776509284973, "learning_rate": 0.003, "loss": 4.1225, "step": 3417 }, { "epoch": 0.03418, "grad_norm": 0.600383996963501, "learning_rate": 0.003, "loss": 4.0956, "step": 3418 }, { "epoch": 0.03419, "grad_norm": 0.5728192329406738, "learning_rate": 0.003, "loss": 4.1155, "step": 3419 }, { "epoch": 0.0342, "grad_norm": 0.5360076427459717, "learning_rate": 0.003, "loss": 4.1246, "step": 3420 }, { "epoch": 0.03421, "grad_norm": 0.5283966660499573, "learning_rate": 0.003, "loss": 4.1131, "step": 3421 }, { "epoch": 0.03422, "grad_norm": 0.5383856892585754, "learning_rate": 0.003, "loss": 4.1282, "step": 3422 }, { "epoch": 0.03423, "grad_norm": 0.6316571831703186, "learning_rate": 0.003, "loss": 4.1096, "step": 3423 }, { "epoch": 0.03424, "grad_norm": 0.7047085762023926, "learning_rate": 0.003, "loss": 4.1375, "step": 3424 }, { "epoch": 0.03425, "grad_norm": 0.7236632704734802, "learning_rate": 0.003, "loss": 4.1018, "step": 3425 }, { "epoch": 0.03426, "grad_norm": 0.6646005511283875, "learning_rate": 0.003, "loss": 4.1234, "step": 3426 }, { "epoch": 0.03427, "grad_norm": 0.5403919816017151, "learning_rate": 0.003, "loss": 4.0894, "step": 3427 }, { "epoch": 0.03428, "grad_norm": 0.5291489958763123, "learning_rate": 0.003, "loss": 4.0887, "step": 3428 }, { "epoch": 0.03429, "grad_norm": 0.5223709940910339, "learning_rate": 0.003, "loss": 4.1048, "step": 3429 }, { "epoch": 0.0343, "grad_norm": 0.5112771987915039, "learning_rate": 0.003, "loss": 4.0996, "step": 3430 }, { "epoch": 0.03431, "grad_norm": 0.4984135031700134, "learning_rate": 0.003, "loss": 4.0918, "step": 3431 }, { "epoch": 0.03432, "grad_norm": 0.6651791334152222, "learning_rate": 0.003, "loss": 4.1044, "step": 3432 }, { "epoch": 0.03433, "grad_norm": 0.8698033094406128, "learning_rate": 0.003, "loss": 4.0843, "step": 3433 }, { "epoch": 0.03434, "grad_norm": 0.9080641865730286, "learning_rate": 0.003, "loss": 4.1135, "step": 3434 }, { "epoch": 0.03435, "grad_norm": 0.6624755263328552, "learning_rate": 0.003, "loss": 4.0888, "step": 3435 }, { "epoch": 0.03436, "grad_norm": 0.5985552668571472, "learning_rate": 0.003, "loss": 4.1135, "step": 3436 }, { "epoch": 0.03437, "grad_norm": 0.74169921875, "learning_rate": 0.003, "loss": 4.1061, "step": 3437 }, { "epoch": 0.03438, "grad_norm": 0.6739313006401062, "learning_rate": 0.003, "loss": 4.1204, "step": 3438 }, { "epoch": 0.03439, "grad_norm": 0.6183229684829712, "learning_rate": 0.003, "loss": 4.1164, "step": 3439 }, { "epoch": 0.0344, "grad_norm": 0.6372612118721008, "learning_rate": 0.003, "loss": 4.0928, "step": 3440 }, { "epoch": 0.03441, "grad_norm": 0.5561286211013794, "learning_rate": 0.003, "loss": 4.1126, "step": 3441 }, { "epoch": 0.03442, "grad_norm": 0.5146679282188416, "learning_rate": 0.003, "loss": 4.0767, "step": 3442 }, { "epoch": 0.03443, "grad_norm": 0.5086862444877625, "learning_rate": 0.003, "loss": 4.108, "step": 3443 }, { "epoch": 0.03444, "grad_norm": 0.5029706358909607, "learning_rate": 0.003, "loss": 4.117, "step": 3444 }, { "epoch": 0.03445, "grad_norm": 0.5076051950454712, "learning_rate": 0.003, "loss": 4.0821, "step": 3445 }, { "epoch": 0.03446, "grad_norm": 0.5356073379516602, "learning_rate": 0.003, "loss": 4.0732, "step": 3446 }, { "epoch": 0.03447, "grad_norm": 0.5907533764839172, "learning_rate": 0.003, "loss": 4.1219, "step": 3447 }, { "epoch": 0.03448, "grad_norm": 0.6191877722740173, "learning_rate": 0.003, "loss": 4.0951, "step": 3448 }, { "epoch": 0.03449, "grad_norm": 0.6186185479164124, "learning_rate": 0.003, "loss": 4.0808, "step": 3449 }, { "epoch": 0.0345, "grad_norm": 0.6868272423744202, "learning_rate": 0.003, "loss": 4.119, "step": 3450 }, { "epoch": 0.03451, "grad_norm": 0.5868905186653137, "learning_rate": 0.003, "loss": 4.0896, "step": 3451 }, { "epoch": 0.03452, "grad_norm": 0.5602003931999207, "learning_rate": 0.003, "loss": 4.1187, "step": 3452 }, { "epoch": 0.03453, "grad_norm": 0.6827725172042847, "learning_rate": 0.003, "loss": 4.0989, "step": 3453 }, { "epoch": 0.03454, "grad_norm": 0.6988645792007446, "learning_rate": 0.003, "loss": 4.084, "step": 3454 }, { "epoch": 0.03455, "grad_norm": 0.7984029650688171, "learning_rate": 0.003, "loss": 4.0883, "step": 3455 }, { "epoch": 0.03456, "grad_norm": 0.8589499592781067, "learning_rate": 0.003, "loss": 4.1126, "step": 3456 }, { "epoch": 0.03457, "grad_norm": 0.8572336435317993, "learning_rate": 0.003, "loss": 4.0784, "step": 3457 }, { "epoch": 0.03458, "grad_norm": 0.7515364289283752, "learning_rate": 0.003, "loss": 4.1174, "step": 3458 }, { "epoch": 0.03459, "grad_norm": 0.7732160091400146, "learning_rate": 0.003, "loss": 4.1453, "step": 3459 }, { "epoch": 0.0346, "grad_norm": 1.0194605588912964, "learning_rate": 0.003, "loss": 4.1595, "step": 3460 }, { "epoch": 0.03461, "grad_norm": 1.2320705652236938, "learning_rate": 0.003, "loss": 4.1536, "step": 3461 }, { "epoch": 0.03462, "grad_norm": 0.8935246467590332, "learning_rate": 0.003, "loss": 4.134, "step": 3462 }, { "epoch": 0.03463, "grad_norm": 0.8400885462760925, "learning_rate": 0.003, "loss": 4.1144, "step": 3463 }, { "epoch": 0.03464, "grad_norm": 0.8417510390281677, "learning_rate": 0.003, "loss": 4.1325, "step": 3464 }, { "epoch": 0.03465, "grad_norm": 0.9221370220184326, "learning_rate": 0.003, "loss": 4.1429, "step": 3465 }, { "epoch": 0.03466, "grad_norm": 0.9339236617088318, "learning_rate": 0.003, "loss": 4.1417, "step": 3466 }, { "epoch": 0.03467, "grad_norm": 1.027878999710083, "learning_rate": 0.003, "loss": 4.1322, "step": 3467 }, { "epoch": 0.03468, "grad_norm": 0.9010937213897705, "learning_rate": 0.003, "loss": 4.1478, "step": 3468 }, { "epoch": 0.03469, "grad_norm": 1.0625158548355103, "learning_rate": 0.003, "loss": 4.1472, "step": 3469 }, { "epoch": 0.0347, "grad_norm": 0.9459049701690674, "learning_rate": 0.003, "loss": 4.1579, "step": 3470 }, { "epoch": 0.03471, "grad_norm": 0.8470619320869446, "learning_rate": 0.003, "loss": 4.1443, "step": 3471 }, { "epoch": 0.03472, "grad_norm": 0.7621662020683289, "learning_rate": 0.003, "loss": 4.1408, "step": 3472 }, { "epoch": 0.03473, "grad_norm": 0.7190437316894531, "learning_rate": 0.003, "loss": 4.1319, "step": 3473 }, { "epoch": 0.03474, "grad_norm": 0.6625324487686157, "learning_rate": 0.003, "loss": 4.1605, "step": 3474 }, { "epoch": 0.03475, "grad_norm": 0.5887710452079773, "learning_rate": 0.003, "loss": 4.1257, "step": 3475 }, { "epoch": 0.03476, "grad_norm": 0.6497656106948853, "learning_rate": 0.003, "loss": 4.1307, "step": 3476 }, { "epoch": 0.03477, "grad_norm": 0.6939454078674316, "learning_rate": 0.003, "loss": 4.1271, "step": 3477 }, { "epoch": 0.03478, "grad_norm": 0.7538445591926575, "learning_rate": 0.003, "loss": 4.1361, "step": 3478 }, { "epoch": 0.03479, "grad_norm": 0.8461519479751587, "learning_rate": 0.003, "loss": 4.1151, "step": 3479 }, { "epoch": 0.0348, "grad_norm": 0.9156594276428223, "learning_rate": 0.003, "loss": 4.1158, "step": 3480 }, { "epoch": 0.03481, "grad_norm": 0.830324649810791, "learning_rate": 0.003, "loss": 4.1316, "step": 3481 }, { "epoch": 0.03482, "grad_norm": 0.6343877911567688, "learning_rate": 0.003, "loss": 4.1326, "step": 3482 }, { "epoch": 0.03483, "grad_norm": 0.5798615217208862, "learning_rate": 0.003, "loss": 4.1065, "step": 3483 }, { "epoch": 0.03484, "grad_norm": 0.6012097001075745, "learning_rate": 0.003, "loss": 4.1507, "step": 3484 }, { "epoch": 0.03485, "grad_norm": 0.534057080745697, "learning_rate": 0.003, "loss": 4.14, "step": 3485 }, { "epoch": 0.03486, "grad_norm": 0.5091946721076965, "learning_rate": 0.003, "loss": 4.1087, "step": 3486 }, { "epoch": 0.03487, "grad_norm": 0.45456182956695557, "learning_rate": 0.003, "loss": 4.1009, "step": 3487 }, { "epoch": 0.03488, "grad_norm": 0.4439913034439087, "learning_rate": 0.003, "loss": 4.0973, "step": 3488 }, { "epoch": 0.03489, "grad_norm": 0.48145756125450134, "learning_rate": 0.003, "loss": 4.1128, "step": 3489 }, { "epoch": 0.0349, "grad_norm": 0.543171763420105, "learning_rate": 0.003, "loss": 4.1103, "step": 3490 }, { "epoch": 0.03491, "grad_norm": 0.4846112132072449, "learning_rate": 0.003, "loss": 4.1225, "step": 3491 }, { "epoch": 0.03492, "grad_norm": 0.4532843828201294, "learning_rate": 0.003, "loss": 4.1096, "step": 3492 }, { "epoch": 0.03493, "grad_norm": 0.452328622341156, "learning_rate": 0.003, "loss": 4.0984, "step": 3493 }, { "epoch": 0.03494, "grad_norm": 0.43499302864074707, "learning_rate": 0.003, "loss": 4.0996, "step": 3494 }, { "epoch": 0.03495, "grad_norm": 0.3597606122493744, "learning_rate": 0.003, "loss": 4.1056, "step": 3495 }, { "epoch": 0.03496, "grad_norm": 0.36562833189964294, "learning_rate": 0.003, "loss": 4.0783, "step": 3496 }, { "epoch": 0.03497, "grad_norm": 0.3867475986480713, "learning_rate": 0.003, "loss": 4.1148, "step": 3497 }, { "epoch": 0.03498, "grad_norm": 0.42188137769699097, "learning_rate": 0.003, "loss": 4.0881, "step": 3498 }, { "epoch": 0.03499, "grad_norm": 0.5299180746078491, "learning_rate": 0.003, "loss": 4.0947, "step": 3499 }, { "epoch": 0.035, "grad_norm": 0.6962635517120361, "learning_rate": 0.003, "loss": 4.1169, "step": 3500 }, { "epoch": 0.03501, "grad_norm": 0.9552702307701111, "learning_rate": 0.003, "loss": 4.1206, "step": 3501 }, { "epoch": 0.03502, "grad_norm": 1.1519496440887451, "learning_rate": 0.003, "loss": 4.1306, "step": 3502 }, { "epoch": 0.03503, "grad_norm": 0.6880185604095459, "learning_rate": 0.003, "loss": 4.1312, "step": 3503 }, { "epoch": 0.03504, "grad_norm": 0.5892576575279236, "learning_rate": 0.003, "loss": 4.1025, "step": 3504 }, { "epoch": 0.03505, "grad_norm": 0.6985999345779419, "learning_rate": 0.003, "loss": 4.1215, "step": 3505 }, { "epoch": 0.03506, "grad_norm": 0.7576223611831665, "learning_rate": 0.003, "loss": 4.1306, "step": 3506 }, { "epoch": 0.03507, "grad_norm": 0.6434160470962524, "learning_rate": 0.003, "loss": 4.1089, "step": 3507 }, { "epoch": 0.03508, "grad_norm": 0.6596179604530334, "learning_rate": 0.003, "loss": 4.0875, "step": 3508 }, { "epoch": 0.03509, "grad_norm": 0.6995351314544678, "learning_rate": 0.003, "loss": 4.1043, "step": 3509 }, { "epoch": 0.0351, "grad_norm": 0.6974444389343262, "learning_rate": 0.003, "loss": 4.0959, "step": 3510 }, { "epoch": 0.03511, "grad_norm": 0.6773884296417236, "learning_rate": 0.003, "loss": 4.1118, "step": 3511 }, { "epoch": 0.03512, "grad_norm": 0.6800361275672913, "learning_rate": 0.003, "loss": 4.101, "step": 3512 }, { "epoch": 0.03513, "grad_norm": 0.6331777572631836, "learning_rate": 0.003, "loss": 4.1017, "step": 3513 }, { "epoch": 0.03514, "grad_norm": 0.6398311853408813, "learning_rate": 0.003, "loss": 4.1107, "step": 3514 }, { "epoch": 0.03515, "grad_norm": 0.652564525604248, "learning_rate": 0.003, "loss": 4.123, "step": 3515 }, { "epoch": 0.03516, "grad_norm": 0.6278548836708069, "learning_rate": 0.003, "loss": 4.1476, "step": 3516 }, { "epoch": 0.03517, "grad_norm": 0.5660470128059387, "learning_rate": 0.003, "loss": 4.0959, "step": 3517 }, { "epoch": 0.03518, "grad_norm": 0.6680671572685242, "learning_rate": 0.003, "loss": 4.088, "step": 3518 }, { "epoch": 0.03519, "grad_norm": 0.8297867178916931, "learning_rate": 0.003, "loss": 4.1339, "step": 3519 }, { "epoch": 0.0352, "grad_norm": 1.0089613199234009, "learning_rate": 0.003, "loss": 4.1251, "step": 3520 }, { "epoch": 0.03521, "grad_norm": 0.8898510336875916, "learning_rate": 0.003, "loss": 4.1207, "step": 3521 }, { "epoch": 0.03522, "grad_norm": 0.7937150001525879, "learning_rate": 0.003, "loss": 4.1173, "step": 3522 }, { "epoch": 0.03523, "grad_norm": 0.9151129126548767, "learning_rate": 0.003, "loss": 4.0883, "step": 3523 }, { "epoch": 0.03524, "grad_norm": 1.016127109527588, "learning_rate": 0.003, "loss": 4.1424, "step": 3524 }, { "epoch": 0.03525, "grad_norm": 0.8876500129699707, "learning_rate": 0.003, "loss": 4.1096, "step": 3525 }, { "epoch": 0.03526, "grad_norm": 0.9732938408851624, "learning_rate": 0.003, "loss": 4.1249, "step": 3526 }, { "epoch": 0.03527, "grad_norm": 0.9365341067314148, "learning_rate": 0.003, "loss": 4.1567, "step": 3527 }, { "epoch": 0.03528, "grad_norm": 0.7851080298423767, "learning_rate": 0.003, "loss": 4.1377, "step": 3528 }, { "epoch": 0.03529, "grad_norm": 0.7726654410362244, "learning_rate": 0.003, "loss": 4.1279, "step": 3529 }, { "epoch": 0.0353, "grad_norm": 0.8136371374130249, "learning_rate": 0.003, "loss": 4.1286, "step": 3530 }, { "epoch": 0.03531, "grad_norm": 0.9276587963104248, "learning_rate": 0.003, "loss": 4.146, "step": 3531 }, { "epoch": 0.03532, "grad_norm": 1.0662339925765991, "learning_rate": 0.003, "loss": 4.1553, "step": 3532 }, { "epoch": 0.03533, "grad_norm": 0.9835387468338013, "learning_rate": 0.003, "loss": 4.154, "step": 3533 }, { "epoch": 0.03534, "grad_norm": 0.8469735383987427, "learning_rate": 0.003, "loss": 4.1438, "step": 3534 }, { "epoch": 0.03535, "grad_norm": 0.8011348247528076, "learning_rate": 0.003, "loss": 4.1115, "step": 3535 }, { "epoch": 0.03536, "grad_norm": 0.7366276383399963, "learning_rate": 0.003, "loss": 4.113, "step": 3536 }, { "epoch": 0.03537, "grad_norm": 0.7694060802459717, "learning_rate": 0.003, "loss": 4.1186, "step": 3537 }, { "epoch": 0.03538, "grad_norm": 0.7417018413543701, "learning_rate": 0.003, "loss": 4.1239, "step": 3538 }, { "epoch": 0.03539, "grad_norm": 0.6806791424751282, "learning_rate": 0.003, "loss": 4.1095, "step": 3539 }, { "epoch": 0.0354, "grad_norm": 0.7284207940101624, "learning_rate": 0.003, "loss": 4.1493, "step": 3540 }, { "epoch": 0.03541, "grad_norm": 0.8043148517608643, "learning_rate": 0.003, "loss": 4.1355, "step": 3541 }, { "epoch": 0.03542, "grad_norm": 0.9069803357124329, "learning_rate": 0.003, "loss": 4.121, "step": 3542 }, { "epoch": 0.03543, "grad_norm": 0.8361301422119141, "learning_rate": 0.003, "loss": 4.1305, "step": 3543 }, { "epoch": 0.03544, "grad_norm": 0.6755576133728027, "learning_rate": 0.003, "loss": 4.1316, "step": 3544 }, { "epoch": 0.03545, "grad_norm": 0.6478848457336426, "learning_rate": 0.003, "loss": 4.0998, "step": 3545 }, { "epoch": 0.03546, "grad_norm": 0.6325247287750244, "learning_rate": 0.003, "loss": 4.1169, "step": 3546 }, { "epoch": 0.03547, "grad_norm": 0.551747739315033, "learning_rate": 0.003, "loss": 4.1233, "step": 3547 }, { "epoch": 0.03548, "grad_norm": 0.550105094909668, "learning_rate": 0.003, "loss": 4.1261, "step": 3548 }, { "epoch": 0.03549, "grad_norm": 0.5281518697738647, "learning_rate": 0.003, "loss": 4.1188, "step": 3549 }, { "epoch": 0.0355, "grad_norm": 0.5446397066116333, "learning_rate": 0.003, "loss": 4.0722, "step": 3550 }, { "epoch": 0.03551, "grad_norm": 0.5065484046936035, "learning_rate": 0.003, "loss": 4.1297, "step": 3551 }, { "epoch": 0.03552, "grad_norm": 0.5231883525848389, "learning_rate": 0.003, "loss": 4.0898, "step": 3552 }, { "epoch": 0.03553, "grad_norm": 0.5496423840522766, "learning_rate": 0.003, "loss": 4.1036, "step": 3553 }, { "epoch": 0.03554, "grad_norm": 0.5717417597770691, "learning_rate": 0.003, "loss": 4.1073, "step": 3554 }, { "epoch": 0.03555, "grad_norm": 0.5578296184539795, "learning_rate": 0.003, "loss": 4.1184, "step": 3555 }, { "epoch": 0.03556, "grad_norm": 0.5329107642173767, "learning_rate": 0.003, "loss": 4.0988, "step": 3556 }, { "epoch": 0.03557, "grad_norm": 0.6033030152320862, "learning_rate": 0.003, "loss": 4.1004, "step": 3557 }, { "epoch": 0.03558, "grad_norm": 0.7457759380340576, "learning_rate": 0.003, "loss": 4.135, "step": 3558 }, { "epoch": 0.03559, "grad_norm": 0.9078201055526733, "learning_rate": 0.003, "loss": 4.1229, "step": 3559 }, { "epoch": 0.0356, "grad_norm": 0.848138689994812, "learning_rate": 0.003, "loss": 4.1262, "step": 3560 }, { "epoch": 0.03561, "grad_norm": 0.6350159645080566, "learning_rate": 0.003, "loss": 4.1425, "step": 3561 }, { "epoch": 0.03562, "grad_norm": 0.6772452592849731, "learning_rate": 0.003, "loss": 4.1085, "step": 3562 }, { "epoch": 0.03563, "grad_norm": 0.7804591655731201, "learning_rate": 0.003, "loss": 4.1453, "step": 3563 }, { "epoch": 0.03564, "grad_norm": 0.725709080696106, "learning_rate": 0.003, "loss": 4.1193, "step": 3564 }, { "epoch": 0.03565, "grad_norm": 0.6160607933998108, "learning_rate": 0.003, "loss": 4.1088, "step": 3565 }, { "epoch": 0.03566, "grad_norm": 0.6539621353149414, "learning_rate": 0.003, "loss": 4.1136, "step": 3566 }, { "epoch": 0.03567, "grad_norm": 0.7242776155471802, "learning_rate": 0.003, "loss": 4.1107, "step": 3567 }, { "epoch": 0.03568, "grad_norm": 0.623746395111084, "learning_rate": 0.003, "loss": 4.1043, "step": 3568 }, { "epoch": 0.03569, "grad_norm": 0.570114016532898, "learning_rate": 0.003, "loss": 4.0963, "step": 3569 }, { "epoch": 0.0357, "grad_norm": 0.5575445294380188, "learning_rate": 0.003, "loss": 4.098, "step": 3570 }, { "epoch": 0.03571, "grad_norm": 0.5092292428016663, "learning_rate": 0.003, "loss": 4.1047, "step": 3571 }, { "epoch": 0.03572, "grad_norm": 0.5417683720588684, "learning_rate": 0.003, "loss": 4.1243, "step": 3572 }, { "epoch": 0.03573, "grad_norm": 0.5097172856330872, "learning_rate": 0.003, "loss": 4.0926, "step": 3573 }, { "epoch": 0.03574, "grad_norm": 0.4699859917163849, "learning_rate": 0.003, "loss": 4.1165, "step": 3574 }, { "epoch": 0.03575, "grad_norm": 0.47701337933540344, "learning_rate": 0.003, "loss": 4.0951, "step": 3575 }, { "epoch": 0.03576, "grad_norm": 0.48855581879615784, "learning_rate": 0.003, "loss": 4.079, "step": 3576 }, { "epoch": 0.03577, "grad_norm": 0.5532284379005432, "learning_rate": 0.003, "loss": 4.1121, "step": 3577 }, { "epoch": 0.03578, "grad_norm": 0.6470955610275269, "learning_rate": 0.003, "loss": 4.1044, "step": 3578 }, { "epoch": 0.03579, "grad_norm": 0.7264450788497925, "learning_rate": 0.003, "loss": 4.1219, "step": 3579 }, { "epoch": 0.0358, "grad_norm": 0.8104549646377563, "learning_rate": 0.003, "loss": 4.083, "step": 3580 }, { "epoch": 0.03581, "grad_norm": 0.7132370471954346, "learning_rate": 0.003, "loss": 4.0948, "step": 3581 }, { "epoch": 0.03582, "grad_norm": 0.5359878540039062, "learning_rate": 0.003, "loss": 4.0911, "step": 3582 }, { "epoch": 0.03583, "grad_norm": 0.45527881383895874, "learning_rate": 0.003, "loss": 4.0899, "step": 3583 }, { "epoch": 0.03584, "grad_norm": 0.5485943555831909, "learning_rate": 0.003, "loss": 4.0809, "step": 3584 }, { "epoch": 0.03585, "grad_norm": 0.5606132745742798, "learning_rate": 0.003, "loss": 4.0774, "step": 3585 }, { "epoch": 0.03586, "grad_norm": 0.5852906703948975, "learning_rate": 0.003, "loss": 4.1008, "step": 3586 }, { "epoch": 0.03587, "grad_norm": 0.6464812159538269, "learning_rate": 0.003, "loss": 4.1013, "step": 3587 }, { "epoch": 0.03588, "grad_norm": 0.6532333493232727, "learning_rate": 0.003, "loss": 4.1008, "step": 3588 }, { "epoch": 0.03589, "grad_norm": 0.729999303817749, "learning_rate": 0.003, "loss": 4.1197, "step": 3589 }, { "epoch": 0.0359, "grad_norm": 0.716708779335022, "learning_rate": 0.003, "loss": 4.0883, "step": 3590 }, { "epoch": 0.03591, "grad_norm": 0.6319497227668762, "learning_rate": 0.003, "loss": 4.1188, "step": 3591 }, { "epoch": 0.03592, "grad_norm": 0.7355323433876038, "learning_rate": 0.003, "loss": 4.1029, "step": 3592 }, { "epoch": 0.03593, "grad_norm": 0.74644935131073, "learning_rate": 0.003, "loss": 4.1041, "step": 3593 }, { "epoch": 0.03594, "grad_norm": 0.7125880122184753, "learning_rate": 0.003, "loss": 4.1094, "step": 3594 }, { "epoch": 0.03595, "grad_norm": 0.6679682731628418, "learning_rate": 0.003, "loss": 4.0993, "step": 3595 }, { "epoch": 0.03596, "grad_norm": 0.6546993851661682, "learning_rate": 0.003, "loss": 4.1358, "step": 3596 }, { "epoch": 0.03597, "grad_norm": 0.6869426369667053, "learning_rate": 0.003, "loss": 4.1088, "step": 3597 }, { "epoch": 0.03598, "grad_norm": 0.6870181560516357, "learning_rate": 0.003, "loss": 4.1155, "step": 3598 }, { "epoch": 0.03599, "grad_norm": 0.6337956190109253, "learning_rate": 0.003, "loss": 4.1088, "step": 3599 }, { "epoch": 0.036, "grad_norm": 0.7691097855567932, "learning_rate": 0.003, "loss": 4.1371, "step": 3600 }, { "epoch": 0.03601, "grad_norm": 0.938689649105072, "learning_rate": 0.003, "loss": 4.1168, "step": 3601 }, { "epoch": 0.03602, "grad_norm": 1.0124480724334717, "learning_rate": 0.003, "loss": 4.1053, "step": 3602 }, { "epoch": 0.03603, "grad_norm": 1.0977243185043335, "learning_rate": 0.003, "loss": 4.1007, "step": 3603 }, { "epoch": 0.03604, "grad_norm": 0.7794393301010132, "learning_rate": 0.003, "loss": 4.131, "step": 3604 }, { "epoch": 0.03605, "grad_norm": 0.5185660719871521, "learning_rate": 0.003, "loss": 4.114, "step": 3605 }, { "epoch": 0.03606, "grad_norm": 0.5366716384887695, "learning_rate": 0.003, "loss": 4.0997, "step": 3606 }, { "epoch": 0.03607, "grad_norm": 0.5930801630020142, "learning_rate": 0.003, "loss": 4.0952, "step": 3607 }, { "epoch": 0.03608, "grad_norm": 0.5576140284538269, "learning_rate": 0.003, "loss": 4.0965, "step": 3608 }, { "epoch": 0.03609, "grad_norm": 0.5525704622268677, "learning_rate": 0.003, "loss": 4.1239, "step": 3609 }, { "epoch": 0.0361, "grad_norm": 0.5817636847496033, "learning_rate": 0.003, "loss": 4.126, "step": 3610 }, { "epoch": 0.03611, "grad_norm": 0.5388094782829285, "learning_rate": 0.003, "loss": 4.0762, "step": 3611 }, { "epoch": 0.03612, "grad_norm": 0.5407794117927551, "learning_rate": 0.003, "loss": 4.0741, "step": 3612 }, { "epoch": 0.03613, "grad_norm": 0.7358308434486389, "learning_rate": 0.003, "loss": 4.1212, "step": 3613 }, { "epoch": 0.03614, "grad_norm": 0.7770156264305115, "learning_rate": 0.003, "loss": 4.0986, "step": 3614 }, { "epoch": 0.03615, "grad_norm": 0.7676093578338623, "learning_rate": 0.003, "loss": 4.1123, "step": 3615 }, { "epoch": 0.03616, "grad_norm": 0.8209259510040283, "learning_rate": 0.003, "loss": 4.1253, "step": 3616 }, { "epoch": 0.03617, "grad_norm": 0.7589568495750427, "learning_rate": 0.003, "loss": 4.1202, "step": 3617 }, { "epoch": 0.03618, "grad_norm": 0.7976088523864746, "learning_rate": 0.003, "loss": 4.1145, "step": 3618 }, { "epoch": 0.03619, "grad_norm": 0.7740630507469177, "learning_rate": 0.003, "loss": 4.1146, "step": 3619 }, { "epoch": 0.0362, "grad_norm": 0.8788385391235352, "learning_rate": 0.003, "loss": 4.1243, "step": 3620 }, { "epoch": 0.03621, "grad_norm": 1.0235563516616821, "learning_rate": 0.003, "loss": 4.1324, "step": 3621 }, { "epoch": 0.03622, "grad_norm": 1.101138949394226, "learning_rate": 0.003, "loss": 4.1112, "step": 3622 }, { "epoch": 0.03623, "grad_norm": 0.782410204410553, "learning_rate": 0.003, "loss": 4.0938, "step": 3623 }, { "epoch": 0.03624, "grad_norm": 0.5812339186668396, "learning_rate": 0.003, "loss": 4.0954, "step": 3624 }, { "epoch": 0.03625, "grad_norm": 0.679203987121582, "learning_rate": 0.003, "loss": 4.1317, "step": 3625 }, { "epoch": 0.03626, "grad_norm": 0.7614912986755371, "learning_rate": 0.003, "loss": 4.1097, "step": 3626 }, { "epoch": 0.03627, "grad_norm": 0.7893409729003906, "learning_rate": 0.003, "loss": 4.1109, "step": 3627 }, { "epoch": 0.03628, "grad_norm": 0.6757506132125854, "learning_rate": 0.003, "loss": 4.1182, "step": 3628 }, { "epoch": 0.03629, "grad_norm": 0.5744339227676392, "learning_rate": 0.003, "loss": 4.1332, "step": 3629 }, { "epoch": 0.0363, "grad_norm": 0.5694675445556641, "learning_rate": 0.003, "loss": 4.1144, "step": 3630 }, { "epoch": 0.03631, "grad_norm": 0.5684549808502197, "learning_rate": 0.003, "loss": 4.107, "step": 3631 }, { "epoch": 0.03632, "grad_norm": 0.7418304085731506, "learning_rate": 0.003, "loss": 4.1039, "step": 3632 }, { "epoch": 0.03633, "grad_norm": 0.8812902569770813, "learning_rate": 0.003, "loss": 4.1369, "step": 3633 }, { "epoch": 0.03634, "grad_norm": 0.960989236831665, "learning_rate": 0.003, "loss": 4.1198, "step": 3634 }, { "epoch": 0.03635, "grad_norm": 0.8940312266349792, "learning_rate": 0.003, "loss": 4.1037, "step": 3635 }, { "epoch": 0.03636, "grad_norm": 0.7907798886299133, "learning_rate": 0.003, "loss": 4.1228, "step": 3636 }, { "epoch": 0.03637, "grad_norm": 0.7844222187995911, "learning_rate": 0.003, "loss": 4.1135, "step": 3637 }, { "epoch": 0.03638, "grad_norm": 0.8917554616928101, "learning_rate": 0.003, "loss": 4.1269, "step": 3638 }, { "epoch": 0.03639, "grad_norm": 0.8074480891227722, "learning_rate": 0.003, "loss": 4.1473, "step": 3639 }, { "epoch": 0.0364, "grad_norm": 0.9071139097213745, "learning_rate": 0.003, "loss": 4.149, "step": 3640 }, { "epoch": 0.03641, "grad_norm": 0.749542236328125, "learning_rate": 0.003, "loss": 4.1275, "step": 3641 }, { "epoch": 0.03642, "grad_norm": 0.7375038862228394, "learning_rate": 0.003, "loss": 4.1468, "step": 3642 }, { "epoch": 0.03643, "grad_norm": 0.7622751593589783, "learning_rate": 0.003, "loss": 4.1104, "step": 3643 }, { "epoch": 0.03644, "grad_norm": 0.7410848140716553, "learning_rate": 0.003, "loss": 4.1341, "step": 3644 }, { "epoch": 0.03645, "grad_norm": 0.7992194890975952, "learning_rate": 0.003, "loss": 4.1055, "step": 3645 }, { "epoch": 0.03646, "grad_norm": 0.7577066421508789, "learning_rate": 0.003, "loss": 4.124, "step": 3646 }, { "epoch": 0.03647, "grad_norm": 0.7054228782653809, "learning_rate": 0.003, "loss": 4.1186, "step": 3647 }, { "epoch": 0.03648, "grad_norm": 0.664077877998352, "learning_rate": 0.003, "loss": 4.1069, "step": 3648 }, { "epoch": 0.03649, "grad_norm": 0.5441340208053589, "learning_rate": 0.003, "loss": 4.126, "step": 3649 }, { "epoch": 0.0365, "grad_norm": 0.5746814012527466, "learning_rate": 0.003, "loss": 4.1046, "step": 3650 }, { "epoch": 0.03651, "grad_norm": 0.6534304618835449, "learning_rate": 0.003, "loss": 4.1274, "step": 3651 }, { "epoch": 0.03652, "grad_norm": 0.6664396524429321, "learning_rate": 0.003, "loss": 4.1084, "step": 3652 }, { "epoch": 0.03653, "grad_norm": 0.6238181591033936, "learning_rate": 0.003, "loss": 4.0937, "step": 3653 }, { "epoch": 0.03654, "grad_norm": 0.5265892148017883, "learning_rate": 0.003, "loss": 4.0921, "step": 3654 }, { "epoch": 0.03655, "grad_norm": 0.4858669936656952, "learning_rate": 0.003, "loss": 4.1136, "step": 3655 }, { "epoch": 0.03656, "grad_norm": 0.41965577006340027, "learning_rate": 0.003, "loss": 4.0995, "step": 3656 }, { "epoch": 0.03657, "grad_norm": 0.4169410765171051, "learning_rate": 0.003, "loss": 4.0719, "step": 3657 }, { "epoch": 0.03658, "grad_norm": 0.4434378445148468, "learning_rate": 0.003, "loss": 4.1361, "step": 3658 }, { "epoch": 0.03659, "grad_norm": 0.4580453634262085, "learning_rate": 0.003, "loss": 4.0843, "step": 3659 }, { "epoch": 0.0366, "grad_norm": 0.4966508448123932, "learning_rate": 0.003, "loss": 4.1149, "step": 3660 }, { "epoch": 0.03661, "grad_norm": 0.6382727026939392, "learning_rate": 0.003, "loss": 4.0967, "step": 3661 }, { "epoch": 0.03662, "grad_norm": 0.8349222540855408, "learning_rate": 0.003, "loss": 4.0986, "step": 3662 }, { "epoch": 0.03663, "grad_norm": 0.931993842124939, "learning_rate": 0.003, "loss": 4.1239, "step": 3663 }, { "epoch": 0.03664, "grad_norm": 0.837236762046814, "learning_rate": 0.003, "loss": 4.1023, "step": 3664 }, { "epoch": 0.03665, "grad_norm": 0.6735215783119202, "learning_rate": 0.003, "loss": 4.12, "step": 3665 }, { "epoch": 0.03666, "grad_norm": 0.7142363786697388, "learning_rate": 0.003, "loss": 4.0773, "step": 3666 }, { "epoch": 0.03667, "grad_norm": 0.7672339081764221, "learning_rate": 0.003, "loss": 4.0907, "step": 3667 }, { "epoch": 0.03668, "grad_norm": 0.660099446773529, "learning_rate": 0.003, "loss": 4.1026, "step": 3668 }, { "epoch": 0.03669, "grad_norm": 0.664168119430542, "learning_rate": 0.003, "loss": 4.08, "step": 3669 }, { "epoch": 0.0367, "grad_norm": 0.5606299042701721, "learning_rate": 0.003, "loss": 4.1162, "step": 3670 }, { "epoch": 0.03671, "grad_norm": 0.5958951711654663, "learning_rate": 0.003, "loss": 4.1041, "step": 3671 }, { "epoch": 0.03672, "grad_norm": 0.7330699563026428, "learning_rate": 0.003, "loss": 4.1123, "step": 3672 }, { "epoch": 0.03673, "grad_norm": 0.7183411717414856, "learning_rate": 0.003, "loss": 4.1238, "step": 3673 }, { "epoch": 0.03674, "grad_norm": 0.6538311243057251, "learning_rate": 0.003, "loss": 4.0817, "step": 3674 }, { "epoch": 0.03675, "grad_norm": 0.6396141052246094, "learning_rate": 0.003, "loss": 4.1215, "step": 3675 }, { "epoch": 0.03676, "grad_norm": 0.5857633948326111, "learning_rate": 0.003, "loss": 4.0802, "step": 3676 }, { "epoch": 0.03677, "grad_norm": 0.5811484456062317, "learning_rate": 0.003, "loss": 4.0927, "step": 3677 }, { "epoch": 0.03678, "grad_norm": 0.5666897892951965, "learning_rate": 0.003, "loss": 4.121, "step": 3678 }, { "epoch": 0.03679, "grad_norm": 0.6068264245986938, "learning_rate": 0.003, "loss": 4.1091, "step": 3679 }, { "epoch": 0.0368, "grad_norm": 0.6787545084953308, "learning_rate": 0.003, "loss": 4.1035, "step": 3680 }, { "epoch": 0.03681, "grad_norm": 0.8176649212837219, "learning_rate": 0.003, "loss": 4.0805, "step": 3681 }, { "epoch": 0.03682, "grad_norm": 1.0537775754928589, "learning_rate": 0.003, "loss": 4.1282, "step": 3682 }, { "epoch": 0.03683, "grad_norm": 0.9726674556732178, "learning_rate": 0.003, "loss": 4.0925, "step": 3683 }, { "epoch": 0.03684, "grad_norm": 0.7920905351638794, "learning_rate": 0.003, "loss": 4.1132, "step": 3684 }, { "epoch": 0.03685, "grad_norm": 0.6423739194869995, "learning_rate": 0.003, "loss": 4.094, "step": 3685 }, { "epoch": 0.03686, "grad_norm": 0.6905859112739563, "learning_rate": 0.003, "loss": 4.1182, "step": 3686 }, { "epoch": 0.03687, "grad_norm": 0.7596766948699951, "learning_rate": 0.003, "loss": 4.1374, "step": 3687 }, { "epoch": 0.03688, "grad_norm": 0.7518109679222107, "learning_rate": 0.003, "loss": 4.1276, "step": 3688 }, { "epoch": 0.03689, "grad_norm": 0.764574408531189, "learning_rate": 0.003, "loss": 4.0944, "step": 3689 }, { "epoch": 0.0369, "grad_norm": 0.721488893032074, "learning_rate": 0.003, "loss": 4.1267, "step": 3690 }, { "epoch": 0.03691, "grad_norm": 0.7047094702720642, "learning_rate": 0.003, "loss": 4.1095, "step": 3691 }, { "epoch": 0.03692, "grad_norm": 0.8295153975486755, "learning_rate": 0.003, "loss": 4.1489, "step": 3692 }, { "epoch": 0.03693, "grad_norm": 0.899239182472229, "learning_rate": 0.003, "loss": 4.1229, "step": 3693 }, { "epoch": 0.03694, "grad_norm": 0.9156467914581299, "learning_rate": 0.003, "loss": 4.1211, "step": 3694 }, { "epoch": 0.03695, "grad_norm": 0.8504350781440735, "learning_rate": 0.003, "loss": 4.1158, "step": 3695 }, { "epoch": 0.03696, "grad_norm": 0.7041043639183044, "learning_rate": 0.003, "loss": 4.1126, "step": 3696 }, { "epoch": 0.03697, "grad_norm": 0.643920361995697, "learning_rate": 0.003, "loss": 4.0985, "step": 3697 }, { "epoch": 0.03698, "grad_norm": 0.7144352197647095, "learning_rate": 0.003, "loss": 4.1359, "step": 3698 }, { "epoch": 0.03699, "grad_norm": 0.7363175749778748, "learning_rate": 0.003, "loss": 4.1234, "step": 3699 }, { "epoch": 0.037, "grad_norm": 0.649755597114563, "learning_rate": 0.003, "loss": 4.1119, "step": 3700 }, { "epoch": 0.03701, "grad_norm": 0.5577970743179321, "learning_rate": 0.003, "loss": 4.1128, "step": 3701 }, { "epoch": 0.03702, "grad_norm": 0.5865651369094849, "learning_rate": 0.003, "loss": 4.1004, "step": 3702 }, { "epoch": 0.03703, "grad_norm": 0.6238481402397156, "learning_rate": 0.003, "loss": 4.1129, "step": 3703 }, { "epoch": 0.03704, "grad_norm": 0.630403995513916, "learning_rate": 0.003, "loss": 4.1145, "step": 3704 }, { "epoch": 0.03705, "grad_norm": 0.6488320231437683, "learning_rate": 0.003, "loss": 4.1311, "step": 3705 }, { "epoch": 0.03706, "grad_norm": 0.6384865045547485, "learning_rate": 0.003, "loss": 4.1146, "step": 3706 }, { "epoch": 0.03707, "grad_norm": 0.6757420897483826, "learning_rate": 0.003, "loss": 4.0963, "step": 3707 }, { "epoch": 0.03708, "grad_norm": 0.7628651857376099, "learning_rate": 0.003, "loss": 4.1187, "step": 3708 }, { "epoch": 0.03709, "grad_norm": 0.6610810160636902, "learning_rate": 0.003, "loss": 4.0902, "step": 3709 }, { "epoch": 0.0371, "grad_norm": 0.6568567156791687, "learning_rate": 0.003, "loss": 4.1283, "step": 3710 }, { "epoch": 0.03711, "grad_norm": 0.6123287677764893, "learning_rate": 0.003, "loss": 4.0969, "step": 3711 }, { "epoch": 0.03712, "grad_norm": 0.5023053288459778, "learning_rate": 0.003, "loss": 4.0948, "step": 3712 }, { "epoch": 0.03713, "grad_norm": 0.561567485332489, "learning_rate": 0.003, "loss": 4.0848, "step": 3713 }, { "epoch": 0.03714, "grad_norm": 0.6204001903533936, "learning_rate": 0.003, "loss": 4.0978, "step": 3714 }, { "epoch": 0.03715, "grad_norm": 0.7001908421516418, "learning_rate": 0.003, "loss": 4.1084, "step": 3715 }, { "epoch": 0.03716, "grad_norm": 0.7154534459114075, "learning_rate": 0.003, "loss": 4.1031, "step": 3716 }, { "epoch": 0.03717, "grad_norm": 0.8945140242576599, "learning_rate": 0.003, "loss": 4.1087, "step": 3717 }, { "epoch": 0.03718, "grad_norm": 0.9586088061332703, "learning_rate": 0.003, "loss": 4.0868, "step": 3718 }, { "epoch": 0.03719, "grad_norm": 0.8600691556930542, "learning_rate": 0.003, "loss": 4.1178, "step": 3719 }, { "epoch": 0.0372, "grad_norm": 0.7804197669029236, "learning_rate": 0.003, "loss": 4.1204, "step": 3720 }, { "epoch": 0.03721, "grad_norm": 0.9556355476379395, "learning_rate": 0.003, "loss": 4.1145, "step": 3721 }, { "epoch": 0.03722, "grad_norm": 0.9721794724464417, "learning_rate": 0.003, "loss": 4.1185, "step": 3722 }, { "epoch": 0.03723, "grad_norm": 0.9020527005195618, "learning_rate": 0.003, "loss": 4.1534, "step": 3723 }, { "epoch": 0.03724, "grad_norm": 0.7264366745948792, "learning_rate": 0.003, "loss": 4.1229, "step": 3724 }, { "epoch": 0.03725, "grad_norm": 0.6015045046806335, "learning_rate": 0.003, "loss": 4.1288, "step": 3725 }, { "epoch": 0.03726, "grad_norm": 0.5826119780540466, "learning_rate": 0.003, "loss": 4.1017, "step": 3726 }, { "epoch": 0.03727, "grad_norm": 0.5803874731063843, "learning_rate": 0.003, "loss": 4.1349, "step": 3727 }, { "epoch": 0.03728, "grad_norm": 0.5483809113502502, "learning_rate": 0.003, "loss": 4.0828, "step": 3728 }, { "epoch": 0.03729, "grad_norm": 0.49769601225852966, "learning_rate": 0.003, "loss": 4.117, "step": 3729 }, { "epoch": 0.0373, "grad_norm": 0.5014387965202332, "learning_rate": 0.003, "loss": 4.0982, "step": 3730 }, { "epoch": 0.03731, "grad_norm": 0.5072956681251526, "learning_rate": 0.003, "loss": 4.1216, "step": 3731 }, { "epoch": 0.03732, "grad_norm": 0.5933743119239807, "learning_rate": 0.003, "loss": 4.1209, "step": 3732 }, { "epoch": 0.03733, "grad_norm": 0.644372820854187, "learning_rate": 0.003, "loss": 4.0984, "step": 3733 }, { "epoch": 0.03734, "grad_norm": 0.6615225076675415, "learning_rate": 0.003, "loss": 4.0993, "step": 3734 }, { "epoch": 0.03735, "grad_norm": 0.7122288942337036, "learning_rate": 0.003, "loss": 4.0961, "step": 3735 }, { "epoch": 0.03736, "grad_norm": 0.7745519876480103, "learning_rate": 0.003, "loss": 4.1273, "step": 3736 }, { "epoch": 0.03737, "grad_norm": 0.7451056241989136, "learning_rate": 0.003, "loss": 4.1138, "step": 3737 }, { "epoch": 0.03738, "grad_norm": 0.6421622037887573, "learning_rate": 0.003, "loss": 4.127, "step": 3738 }, { "epoch": 0.03739, "grad_norm": 0.6206536889076233, "learning_rate": 0.003, "loss": 4.0911, "step": 3739 }, { "epoch": 0.0374, "grad_norm": 0.6064566373825073, "learning_rate": 0.003, "loss": 4.1106, "step": 3740 }, { "epoch": 0.03741, "grad_norm": 0.726904034614563, "learning_rate": 0.003, "loss": 4.0978, "step": 3741 }, { "epoch": 0.03742, "grad_norm": 0.8068590760231018, "learning_rate": 0.003, "loss": 4.0937, "step": 3742 }, { "epoch": 0.03743, "grad_norm": 0.6390416622161865, "learning_rate": 0.003, "loss": 4.0884, "step": 3743 }, { "epoch": 0.03744, "grad_norm": 0.6150771975517273, "learning_rate": 0.003, "loss": 4.1172, "step": 3744 }, { "epoch": 0.03745, "grad_norm": 0.664985716342926, "learning_rate": 0.003, "loss": 4.1024, "step": 3745 }, { "epoch": 0.03746, "grad_norm": 0.7317749857902527, "learning_rate": 0.003, "loss": 4.1139, "step": 3746 }, { "epoch": 0.03747, "grad_norm": 0.7559656500816345, "learning_rate": 0.003, "loss": 4.119, "step": 3747 }, { "epoch": 0.03748, "grad_norm": 0.6956848502159119, "learning_rate": 0.003, "loss": 4.0925, "step": 3748 }, { "epoch": 0.03749, "grad_norm": 0.8748852014541626, "learning_rate": 0.003, "loss": 4.1089, "step": 3749 }, { "epoch": 0.0375, "grad_norm": 0.8952407240867615, "learning_rate": 0.003, "loss": 4.1339, "step": 3750 }, { "epoch": 0.03751, "grad_norm": 0.9176827669143677, "learning_rate": 0.003, "loss": 4.0869, "step": 3751 }, { "epoch": 0.03752, "grad_norm": 1.0543770790100098, "learning_rate": 0.003, "loss": 4.1191, "step": 3752 }, { "epoch": 0.03753, "grad_norm": 0.9552967548370361, "learning_rate": 0.003, "loss": 4.1269, "step": 3753 }, { "epoch": 0.03754, "grad_norm": 0.7647460699081421, "learning_rate": 0.003, "loss": 4.1277, "step": 3754 }, { "epoch": 0.03755, "grad_norm": 0.8259355425834656, "learning_rate": 0.003, "loss": 4.1264, "step": 3755 }, { "epoch": 0.03756, "grad_norm": 0.8404808044433594, "learning_rate": 0.003, "loss": 4.0998, "step": 3756 }, { "epoch": 0.03757, "grad_norm": 0.7295672297477722, "learning_rate": 0.003, "loss": 4.1265, "step": 3757 }, { "epoch": 0.03758, "grad_norm": 0.6218863129615784, "learning_rate": 0.003, "loss": 4.0653, "step": 3758 }, { "epoch": 0.03759, "grad_norm": 0.5583391785621643, "learning_rate": 0.003, "loss": 4.1224, "step": 3759 }, { "epoch": 0.0376, "grad_norm": 0.6009096503257751, "learning_rate": 0.003, "loss": 4.1056, "step": 3760 }, { "epoch": 0.03761, "grad_norm": 0.7737796902656555, "learning_rate": 0.003, "loss": 4.0853, "step": 3761 }, { "epoch": 0.03762, "grad_norm": 0.8101951479911804, "learning_rate": 0.003, "loss": 4.1395, "step": 3762 }, { "epoch": 0.03763, "grad_norm": 0.6714347004890442, "learning_rate": 0.003, "loss": 4.1198, "step": 3763 }, { "epoch": 0.03764, "grad_norm": 0.4961341321468353, "learning_rate": 0.003, "loss": 4.125, "step": 3764 }, { "epoch": 0.03765, "grad_norm": 0.5392913818359375, "learning_rate": 0.003, "loss": 4.1471, "step": 3765 }, { "epoch": 0.03766, "grad_norm": 0.6497456431388855, "learning_rate": 0.003, "loss": 4.0875, "step": 3766 }, { "epoch": 0.03767, "grad_norm": 0.6586916446685791, "learning_rate": 0.003, "loss": 4.1283, "step": 3767 }, { "epoch": 0.03768, "grad_norm": 0.6380057334899902, "learning_rate": 0.003, "loss": 4.1067, "step": 3768 }, { "epoch": 0.03769, "grad_norm": 0.6120327711105347, "learning_rate": 0.003, "loss": 4.0939, "step": 3769 }, { "epoch": 0.0377, "grad_norm": 0.5535104274749756, "learning_rate": 0.003, "loss": 4.074, "step": 3770 }, { "epoch": 0.03771, "grad_norm": 0.5369603633880615, "learning_rate": 0.003, "loss": 4.0896, "step": 3771 }, { "epoch": 0.03772, "grad_norm": 0.5378307700157166, "learning_rate": 0.003, "loss": 4.1075, "step": 3772 }, { "epoch": 0.03773, "grad_norm": 0.5350049734115601, "learning_rate": 0.003, "loss": 4.1151, "step": 3773 }, { "epoch": 0.03774, "grad_norm": 0.48990651965141296, "learning_rate": 0.003, "loss": 4.0971, "step": 3774 }, { "epoch": 0.03775, "grad_norm": 0.4350149929523468, "learning_rate": 0.003, "loss": 4.112, "step": 3775 }, { "epoch": 0.03776, "grad_norm": 0.455160915851593, "learning_rate": 0.003, "loss": 4.0888, "step": 3776 }, { "epoch": 0.03777, "grad_norm": 0.4597632586956024, "learning_rate": 0.003, "loss": 4.1041, "step": 3777 }, { "epoch": 0.03778, "grad_norm": 0.5477138757705688, "learning_rate": 0.003, "loss": 4.0821, "step": 3778 }, { "epoch": 0.03779, "grad_norm": 0.7227920293807983, "learning_rate": 0.003, "loss": 4.1017, "step": 3779 }, { "epoch": 0.0378, "grad_norm": 0.9108830690383911, "learning_rate": 0.003, "loss": 4.1068, "step": 3780 }, { "epoch": 0.03781, "grad_norm": 1.0170267820358276, "learning_rate": 0.003, "loss": 4.1423, "step": 3781 }, { "epoch": 0.03782, "grad_norm": 0.9500049948692322, "learning_rate": 0.003, "loss": 4.1159, "step": 3782 }, { "epoch": 0.03783, "grad_norm": 0.7215931415557861, "learning_rate": 0.003, "loss": 4.1121, "step": 3783 }, { "epoch": 0.03784, "grad_norm": 0.6211499571800232, "learning_rate": 0.003, "loss": 4.1103, "step": 3784 }, { "epoch": 0.03785, "grad_norm": 0.6421756148338318, "learning_rate": 0.003, "loss": 4.0943, "step": 3785 }, { "epoch": 0.03786, "grad_norm": 0.6241987943649292, "learning_rate": 0.003, "loss": 4.1119, "step": 3786 }, { "epoch": 0.03787, "grad_norm": 0.6087479591369629, "learning_rate": 0.003, "loss": 4.0958, "step": 3787 }, { "epoch": 0.03788, "grad_norm": 0.5939789414405823, "learning_rate": 0.003, "loss": 4.1271, "step": 3788 }, { "epoch": 0.03789, "grad_norm": 0.5391459465026855, "learning_rate": 0.003, "loss": 4.0592, "step": 3789 }, { "epoch": 0.0379, "grad_norm": 0.5861315131187439, "learning_rate": 0.003, "loss": 4.0863, "step": 3790 }, { "epoch": 0.03791, "grad_norm": 0.6246805191040039, "learning_rate": 0.003, "loss": 4.0974, "step": 3791 }, { "epoch": 0.03792, "grad_norm": 0.7353853583335876, "learning_rate": 0.003, "loss": 4.1322, "step": 3792 }, { "epoch": 0.03793, "grad_norm": 0.6765242218971252, "learning_rate": 0.003, "loss": 4.1235, "step": 3793 }, { "epoch": 0.03794, "grad_norm": 0.7689555883407593, "learning_rate": 0.003, "loss": 4.1271, "step": 3794 }, { "epoch": 0.03795, "grad_norm": 0.9137020707130432, "learning_rate": 0.003, "loss": 4.121, "step": 3795 }, { "epoch": 0.03796, "grad_norm": 1.0011204481124878, "learning_rate": 0.003, "loss": 4.1083, "step": 3796 }, { "epoch": 0.03797, "grad_norm": 1.0785057544708252, "learning_rate": 0.003, "loss": 4.1042, "step": 3797 }, { "epoch": 0.03798, "grad_norm": 1.0564720630645752, "learning_rate": 0.003, "loss": 4.1262, "step": 3798 }, { "epoch": 0.03799, "grad_norm": 0.9513949155807495, "learning_rate": 0.003, "loss": 4.1296, "step": 3799 }, { "epoch": 0.038, "grad_norm": 0.7991365790367126, "learning_rate": 0.003, "loss": 4.136, "step": 3800 }, { "epoch": 0.03801, "grad_norm": 0.942852795124054, "learning_rate": 0.003, "loss": 4.1481, "step": 3801 }, { "epoch": 0.03802, "grad_norm": 0.9442994594573975, "learning_rate": 0.003, "loss": 4.1224, "step": 3802 }, { "epoch": 0.03803, "grad_norm": 0.917535662651062, "learning_rate": 0.003, "loss": 4.1541, "step": 3803 }, { "epoch": 0.03804, "grad_norm": 1.0844844579696655, "learning_rate": 0.003, "loss": 4.1317, "step": 3804 }, { "epoch": 0.03805, "grad_norm": 0.8430477380752563, "learning_rate": 0.003, "loss": 4.1379, "step": 3805 }, { "epoch": 0.03806, "grad_norm": 0.8205291032791138, "learning_rate": 0.003, "loss": 4.1397, "step": 3806 }, { "epoch": 0.03807, "grad_norm": 0.8324175477027893, "learning_rate": 0.003, "loss": 4.1299, "step": 3807 }, { "epoch": 0.03808, "grad_norm": 0.8151392936706543, "learning_rate": 0.003, "loss": 4.146, "step": 3808 }, { "epoch": 0.03809, "grad_norm": 0.6972253918647766, "learning_rate": 0.003, "loss": 4.1248, "step": 3809 }, { "epoch": 0.0381, "grad_norm": 0.7419064044952393, "learning_rate": 0.003, "loss": 4.109, "step": 3810 }, { "epoch": 0.03811, "grad_norm": 0.72950679063797, "learning_rate": 0.003, "loss": 4.1397, "step": 3811 }, { "epoch": 0.03812, "grad_norm": 0.7456114888191223, "learning_rate": 0.003, "loss": 4.1298, "step": 3812 }, { "epoch": 0.03813, "grad_norm": 0.868884801864624, "learning_rate": 0.003, "loss": 4.1041, "step": 3813 }, { "epoch": 0.03814, "grad_norm": 1.0820525884628296, "learning_rate": 0.003, "loss": 4.1402, "step": 3814 }, { "epoch": 0.03815, "grad_norm": 1.0806629657745361, "learning_rate": 0.003, "loss": 4.1304, "step": 3815 }, { "epoch": 0.03816, "grad_norm": 0.820555567741394, "learning_rate": 0.003, "loss": 4.126, "step": 3816 }, { "epoch": 0.03817, "grad_norm": 0.7203388810157776, "learning_rate": 0.003, "loss": 4.1016, "step": 3817 }, { "epoch": 0.03818, "grad_norm": 0.6214293241500854, "learning_rate": 0.003, "loss": 4.1479, "step": 3818 }, { "epoch": 0.03819, "grad_norm": 0.5392916798591614, "learning_rate": 0.003, "loss": 4.0977, "step": 3819 }, { "epoch": 0.0382, "grad_norm": 0.5882942080497742, "learning_rate": 0.003, "loss": 4.1451, "step": 3820 }, { "epoch": 0.03821, "grad_norm": 0.6876145005226135, "learning_rate": 0.003, "loss": 4.0921, "step": 3821 }, { "epoch": 0.03822, "grad_norm": 0.6543509364128113, "learning_rate": 0.003, "loss": 4.1133, "step": 3822 }, { "epoch": 0.03823, "grad_norm": 0.6853277087211609, "learning_rate": 0.003, "loss": 4.1091, "step": 3823 }, { "epoch": 0.03824, "grad_norm": 0.6686453819274902, "learning_rate": 0.003, "loss": 4.1061, "step": 3824 }, { "epoch": 0.03825, "grad_norm": 0.5822768211364746, "learning_rate": 0.003, "loss": 4.1176, "step": 3825 }, { "epoch": 0.03826, "grad_norm": 0.5187621712684631, "learning_rate": 0.003, "loss": 4.101, "step": 3826 }, { "epoch": 0.03827, "grad_norm": 0.48330262303352356, "learning_rate": 0.003, "loss": 4.125, "step": 3827 }, { "epoch": 0.03828, "grad_norm": 0.525285542011261, "learning_rate": 0.003, "loss": 4.0998, "step": 3828 }, { "epoch": 0.03829, "grad_norm": 0.5435876846313477, "learning_rate": 0.003, "loss": 4.1066, "step": 3829 }, { "epoch": 0.0383, "grad_norm": 0.5318546295166016, "learning_rate": 0.003, "loss": 4.0943, "step": 3830 }, { "epoch": 0.03831, "grad_norm": 0.5064510703086853, "learning_rate": 0.003, "loss": 4.0935, "step": 3831 }, { "epoch": 0.03832, "grad_norm": 0.5135751962661743, "learning_rate": 0.003, "loss": 4.0901, "step": 3832 }, { "epoch": 0.03833, "grad_norm": 0.5150834321975708, "learning_rate": 0.003, "loss": 4.091, "step": 3833 }, { "epoch": 0.03834, "grad_norm": 0.5075490474700928, "learning_rate": 0.003, "loss": 4.1195, "step": 3834 }, { "epoch": 0.03835, "grad_norm": 0.47964203357696533, "learning_rate": 0.003, "loss": 4.1071, "step": 3835 }, { "epoch": 0.03836, "grad_norm": 0.5092160105705261, "learning_rate": 0.003, "loss": 4.1087, "step": 3836 }, { "epoch": 0.03837, "grad_norm": 0.5252288579940796, "learning_rate": 0.003, "loss": 4.0969, "step": 3837 }, { "epoch": 0.03838, "grad_norm": 0.4719623029232025, "learning_rate": 0.003, "loss": 4.0815, "step": 3838 }, { "epoch": 0.03839, "grad_norm": 0.525349497795105, "learning_rate": 0.003, "loss": 4.09, "step": 3839 }, { "epoch": 0.0384, "grad_norm": 0.5973276495933533, "learning_rate": 0.003, "loss": 4.0965, "step": 3840 }, { "epoch": 0.03841, "grad_norm": 0.7389470338821411, "learning_rate": 0.003, "loss": 4.101, "step": 3841 }, { "epoch": 0.03842, "grad_norm": 0.8890230059623718, "learning_rate": 0.003, "loss": 4.1096, "step": 3842 }, { "epoch": 0.03843, "grad_norm": 0.9227127432823181, "learning_rate": 0.003, "loss": 4.0975, "step": 3843 }, { "epoch": 0.03844, "grad_norm": 0.8832306861877441, "learning_rate": 0.003, "loss": 4.1101, "step": 3844 }, { "epoch": 0.03845, "grad_norm": 0.711075484752655, "learning_rate": 0.003, "loss": 4.104, "step": 3845 }, { "epoch": 0.03846, "grad_norm": 0.5776762962341309, "learning_rate": 0.003, "loss": 4.1135, "step": 3846 }, { "epoch": 0.03847, "grad_norm": 0.6540825963020325, "learning_rate": 0.003, "loss": 4.1065, "step": 3847 }, { "epoch": 0.03848, "grad_norm": 0.6692461967468262, "learning_rate": 0.003, "loss": 4.0898, "step": 3848 }, { "epoch": 0.03849, "grad_norm": 0.6830011010169983, "learning_rate": 0.003, "loss": 4.1044, "step": 3849 }, { "epoch": 0.0385, "grad_norm": 0.6109870076179504, "learning_rate": 0.003, "loss": 4.1145, "step": 3850 }, { "epoch": 0.03851, "grad_norm": 0.6136434078216553, "learning_rate": 0.003, "loss": 4.0754, "step": 3851 }, { "epoch": 0.03852, "grad_norm": 0.6604334712028503, "learning_rate": 0.003, "loss": 4.1124, "step": 3852 }, { "epoch": 0.03853, "grad_norm": 0.59455806016922, "learning_rate": 0.003, "loss": 4.1074, "step": 3853 }, { "epoch": 0.03854, "grad_norm": 0.575217604637146, "learning_rate": 0.003, "loss": 4.0982, "step": 3854 }, { "epoch": 0.03855, "grad_norm": 0.6192313432693481, "learning_rate": 0.003, "loss": 4.1173, "step": 3855 }, { "epoch": 0.03856, "grad_norm": 0.6025145649909973, "learning_rate": 0.003, "loss": 4.0898, "step": 3856 }, { "epoch": 0.03857, "grad_norm": 0.6573590636253357, "learning_rate": 0.003, "loss": 4.0973, "step": 3857 }, { "epoch": 0.03858, "grad_norm": 0.6135656833648682, "learning_rate": 0.003, "loss": 4.0911, "step": 3858 }, { "epoch": 0.03859, "grad_norm": 0.6176097989082336, "learning_rate": 0.003, "loss": 4.1146, "step": 3859 }, { "epoch": 0.0386, "grad_norm": 0.7532007098197937, "learning_rate": 0.003, "loss": 4.0797, "step": 3860 }, { "epoch": 0.03861, "grad_norm": 0.8719074130058289, "learning_rate": 0.003, "loss": 4.1223, "step": 3861 }, { "epoch": 0.03862, "grad_norm": 1.079362154006958, "learning_rate": 0.003, "loss": 4.1073, "step": 3862 }, { "epoch": 0.03863, "grad_norm": 0.9231005311012268, "learning_rate": 0.003, "loss": 4.1031, "step": 3863 }, { "epoch": 0.03864, "grad_norm": 0.776627242565155, "learning_rate": 0.003, "loss": 4.105, "step": 3864 }, { "epoch": 0.03865, "grad_norm": 0.6355084180831909, "learning_rate": 0.003, "loss": 4.1132, "step": 3865 }, { "epoch": 0.03866, "grad_norm": 0.6550437211990356, "learning_rate": 0.003, "loss": 4.0906, "step": 3866 }, { "epoch": 0.03867, "grad_norm": 0.8415738940238953, "learning_rate": 0.003, "loss": 4.0778, "step": 3867 }, { "epoch": 0.03868, "grad_norm": 0.8901708126068115, "learning_rate": 0.003, "loss": 4.1199, "step": 3868 }, { "epoch": 0.03869, "grad_norm": 0.697814404964447, "learning_rate": 0.003, "loss": 4.1022, "step": 3869 }, { "epoch": 0.0387, "grad_norm": 0.6080979108810425, "learning_rate": 0.003, "loss": 4.1098, "step": 3870 }, { "epoch": 0.03871, "grad_norm": 0.6336243748664856, "learning_rate": 0.003, "loss": 4.0855, "step": 3871 }, { "epoch": 0.03872, "grad_norm": 0.6221011877059937, "learning_rate": 0.003, "loss": 4.0776, "step": 3872 }, { "epoch": 0.03873, "grad_norm": 0.6594142317771912, "learning_rate": 0.003, "loss": 4.1194, "step": 3873 }, { "epoch": 0.03874, "grad_norm": 0.5940245389938354, "learning_rate": 0.003, "loss": 4.0914, "step": 3874 }, { "epoch": 0.03875, "grad_norm": 0.47639554738998413, "learning_rate": 0.003, "loss": 4.0897, "step": 3875 }, { "epoch": 0.03876, "grad_norm": 0.5326244831085205, "learning_rate": 0.003, "loss": 4.123, "step": 3876 }, { "epoch": 0.03877, "grad_norm": 0.5539395213127136, "learning_rate": 0.003, "loss": 4.1029, "step": 3877 }, { "epoch": 0.03878, "grad_norm": 0.604924201965332, "learning_rate": 0.003, "loss": 4.1151, "step": 3878 }, { "epoch": 0.03879, "grad_norm": 0.6767284870147705, "learning_rate": 0.003, "loss": 4.1189, "step": 3879 }, { "epoch": 0.0388, "grad_norm": 0.8324589133262634, "learning_rate": 0.003, "loss": 4.0894, "step": 3880 }, { "epoch": 0.03881, "grad_norm": 1.0423182249069214, "learning_rate": 0.003, "loss": 4.1216, "step": 3881 }, { "epoch": 0.03882, "grad_norm": 1.036028504371643, "learning_rate": 0.003, "loss": 4.1333, "step": 3882 }, { "epoch": 0.03883, "grad_norm": 0.8236280679702759, "learning_rate": 0.003, "loss": 4.0962, "step": 3883 }, { "epoch": 0.03884, "grad_norm": 0.6207178831100464, "learning_rate": 0.003, "loss": 4.1053, "step": 3884 }, { "epoch": 0.03885, "grad_norm": 0.6633720397949219, "learning_rate": 0.003, "loss": 4.122, "step": 3885 }, { "epoch": 0.03886, "grad_norm": 0.6237534880638123, "learning_rate": 0.003, "loss": 4.0972, "step": 3886 }, { "epoch": 0.03887, "grad_norm": 0.49867168068885803, "learning_rate": 0.003, "loss": 4.1034, "step": 3887 }, { "epoch": 0.03888, "grad_norm": 0.4786747395992279, "learning_rate": 0.003, "loss": 4.0787, "step": 3888 }, { "epoch": 0.03889, "grad_norm": 0.5030584931373596, "learning_rate": 0.003, "loss": 4.0971, "step": 3889 }, { "epoch": 0.0389, "grad_norm": 0.6045381426811218, "learning_rate": 0.003, "loss": 4.0885, "step": 3890 }, { "epoch": 0.03891, "grad_norm": 0.6243774890899658, "learning_rate": 0.003, "loss": 4.0961, "step": 3891 }, { "epoch": 0.03892, "grad_norm": 0.7076771855354309, "learning_rate": 0.003, "loss": 4.1158, "step": 3892 }, { "epoch": 0.03893, "grad_norm": 0.6316627264022827, "learning_rate": 0.003, "loss": 4.0896, "step": 3893 }, { "epoch": 0.03894, "grad_norm": 0.6711505055427551, "learning_rate": 0.003, "loss": 4.1012, "step": 3894 }, { "epoch": 0.03895, "grad_norm": 0.7016406655311584, "learning_rate": 0.003, "loss": 4.0776, "step": 3895 }, { "epoch": 0.03896, "grad_norm": 0.6848783493041992, "learning_rate": 0.003, "loss": 4.1034, "step": 3896 }, { "epoch": 0.03897, "grad_norm": 0.701120913028717, "learning_rate": 0.003, "loss": 4.0781, "step": 3897 }, { "epoch": 0.03898, "grad_norm": 0.678917407989502, "learning_rate": 0.003, "loss": 4.1108, "step": 3898 }, { "epoch": 0.03899, "grad_norm": 0.7172825336456299, "learning_rate": 0.003, "loss": 4.1019, "step": 3899 }, { "epoch": 0.039, "grad_norm": 0.6747111082077026, "learning_rate": 0.003, "loss": 4.1015, "step": 3900 }, { "epoch": 0.03901, "grad_norm": 0.7526534795761108, "learning_rate": 0.003, "loss": 4.1173, "step": 3901 }, { "epoch": 0.03902, "grad_norm": 0.7362321615219116, "learning_rate": 0.003, "loss": 4.0883, "step": 3902 }, { "epoch": 0.03903, "grad_norm": 0.8664571642875671, "learning_rate": 0.003, "loss": 4.1302, "step": 3903 }, { "epoch": 0.03904, "grad_norm": 1.0233482122421265, "learning_rate": 0.003, "loss": 4.1165, "step": 3904 }, { "epoch": 0.03905, "grad_norm": 0.9382357001304626, "learning_rate": 0.003, "loss": 4.1356, "step": 3905 }, { "epoch": 0.03906, "grad_norm": 0.687033474445343, "learning_rate": 0.003, "loss": 4.1007, "step": 3906 }, { "epoch": 0.03907, "grad_norm": 0.6406064033508301, "learning_rate": 0.003, "loss": 4.1115, "step": 3907 }, { "epoch": 0.03908, "grad_norm": 0.6577279567718506, "learning_rate": 0.003, "loss": 4.0944, "step": 3908 }, { "epoch": 0.03909, "grad_norm": 0.6069698333740234, "learning_rate": 0.003, "loss": 4.1004, "step": 3909 }, { "epoch": 0.0391, "grad_norm": 0.625200629234314, "learning_rate": 0.003, "loss": 4.1285, "step": 3910 }, { "epoch": 0.03911, "grad_norm": 0.6532104015350342, "learning_rate": 0.003, "loss": 4.1117, "step": 3911 }, { "epoch": 0.03912, "grad_norm": 0.5893357992172241, "learning_rate": 0.003, "loss": 4.1101, "step": 3912 }, { "epoch": 0.03913, "grad_norm": 0.6330502033233643, "learning_rate": 0.003, "loss": 4.0822, "step": 3913 }, { "epoch": 0.03914, "grad_norm": 0.7185488343238831, "learning_rate": 0.003, "loss": 4.1191, "step": 3914 }, { "epoch": 0.03915, "grad_norm": 0.8594509363174438, "learning_rate": 0.003, "loss": 4.0951, "step": 3915 }, { "epoch": 0.03916, "grad_norm": 0.9027094841003418, "learning_rate": 0.003, "loss": 4.1242, "step": 3916 }, { "epoch": 0.03917, "grad_norm": 0.9196935892105103, "learning_rate": 0.003, "loss": 4.1213, "step": 3917 }, { "epoch": 0.03918, "grad_norm": 1.003941535949707, "learning_rate": 0.003, "loss": 4.1127, "step": 3918 }, { "epoch": 0.03919, "grad_norm": 1.0010123252868652, "learning_rate": 0.003, "loss": 4.0827, "step": 3919 }, { "epoch": 0.0392, "grad_norm": 0.9059627056121826, "learning_rate": 0.003, "loss": 4.1073, "step": 3920 }, { "epoch": 0.03921, "grad_norm": 0.95637047290802, "learning_rate": 0.003, "loss": 4.1083, "step": 3921 }, { "epoch": 0.03922, "grad_norm": 0.8252183198928833, "learning_rate": 0.003, "loss": 4.1469, "step": 3922 }, { "epoch": 0.03923, "grad_norm": 0.8936133980751038, "learning_rate": 0.003, "loss": 4.103, "step": 3923 }, { "epoch": 0.03924, "grad_norm": 0.7836267352104187, "learning_rate": 0.003, "loss": 4.1067, "step": 3924 }, { "epoch": 0.03925, "grad_norm": 0.7859375476837158, "learning_rate": 0.003, "loss": 4.1147, "step": 3925 }, { "epoch": 0.03926, "grad_norm": 0.7374272346496582, "learning_rate": 0.003, "loss": 4.1025, "step": 3926 }, { "epoch": 0.03927, "grad_norm": 0.8471856117248535, "learning_rate": 0.003, "loss": 4.1198, "step": 3927 }, { "epoch": 0.03928, "grad_norm": 0.85481858253479, "learning_rate": 0.003, "loss": 4.1374, "step": 3928 }, { "epoch": 0.03929, "grad_norm": 0.907612144947052, "learning_rate": 0.003, "loss": 4.1445, "step": 3929 }, { "epoch": 0.0393, "grad_norm": 0.7699798345565796, "learning_rate": 0.003, "loss": 4.1283, "step": 3930 }, { "epoch": 0.03931, "grad_norm": 0.6045697331428528, "learning_rate": 0.003, "loss": 4.1255, "step": 3931 }, { "epoch": 0.03932, "grad_norm": 0.6712193489074707, "learning_rate": 0.003, "loss": 4.1268, "step": 3932 }, { "epoch": 0.03933, "grad_norm": 0.8346224427223206, "learning_rate": 0.003, "loss": 4.1207, "step": 3933 }, { "epoch": 0.03934, "grad_norm": 0.8903274536132812, "learning_rate": 0.003, "loss": 4.12, "step": 3934 }, { "epoch": 0.03935, "grad_norm": 0.7318519353866577, "learning_rate": 0.003, "loss": 4.1156, "step": 3935 }, { "epoch": 0.03936, "grad_norm": 0.6603662967681885, "learning_rate": 0.003, "loss": 4.0714, "step": 3936 }, { "epoch": 0.03937, "grad_norm": 0.6446847915649414, "learning_rate": 0.003, "loss": 4.1072, "step": 3937 }, { "epoch": 0.03938, "grad_norm": 0.6806734204292297, "learning_rate": 0.003, "loss": 4.1099, "step": 3938 }, { "epoch": 0.03939, "grad_norm": 0.5945215225219727, "learning_rate": 0.003, "loss": 4.088, "step": 3939 }, { "epoch": 0.0394, "grad_norm": 0.5750917196273804, "learning_rate": 0.003, "loss": 4.116, "step": 3940 }, { "epoch": 0.03941, "grad_norm": 0.5394409894943237, "learning_rate": 0.003, "loss": 4.1041, "step": 3941 }, { "epoch": 0.03942, "grad_norm": 0.49402347207069397, "learning_rate": 0.003, "loss": 4.1168, "step": 3942 }, { "epoch": 0.03943, "grad_norm": 0.39833274483680725, "learning_rate": 0.003, "loss": 4.1017, "step": 3943 }, { "epoch": 0.03944, "grad_norm": 0.4659424126148224, "learning_rate": 0.003, "loss": 4.1164, "step": 3944 }, { "epoch": 0.03945, "grad_norm": 0.4315028488636017, "learning_rate": 0.003, "loss": 4.0865, "step": 3945 }, { "epoch": 0.03946, "grad_norm": 0.40673503279685974, "learning_rate": 0.003, "loss": 4.1113, "step": 3946 }, { "epoch": 0.03947, "grad_norm": 0.38524332642555237, "learning_rate": 0.003, "loss": 4.1127, "step": 3947 }, { "epoch": 0.03948, "grad_norm": 0.33258089423179626, "learning_rate": 0.003, "loss": 4.1268, "step": 3948 }, { "epoch": 0.03949, "grad_norm": 0.38337442278862, "learning_rate": 0.003, "loss": 4.0688, "step": 3949 }, { "epoch": 0.0395, "grad_norm": 0.46301016211509705, "learning_rate": 0.003, "loss": 4.0994, "step": 3950 }, { "epoch": 0.03951, "grad_norm": 0.7521165013313293, "learning_rate": 0.003, "loss": 4.1331, "step": 3951 }, { "epoch": 0.03952, "grad_norm": 1.1936590671539307, "learning_rate": 0.003, "loss": 4.1283, "step": 3952 }, { "epoch": 0.03953, "grad_norm": 0.9014281034469604, "learning_rate": 0.003, "loss": 4.1033, "step": 3953 }, { "epoch": 0.03954, "grad_norm": 0.5299249887466431, "learning_rate": 0.003, "loss": 4.0948, "step": 3954 }, { "epoch": 0.03955, "grad_norm": 0.621684193611145, "learning_rate": 0.003, "loss": 4.0979, "step": 3955 }, { "epoch": 0.03956, "grad_norm": 0.7184932827949524, "learning_rate": 0.003, "loss": 4.101, "step": 3956 }, { "epoch": 0.03957, "grad_norm": 0.6237415075302124, "learning_rate": 0.003, "loss": 4.1048, "step": 3957 }, { "epoch": 0.03958, "grad_norm": 0.5475152134895325, "learning_rate": 0.003, "loss": 4.0784, "step": 3958 }, { "epoch": 0.03959, "grad_norm": 0.5959770083427429, "learning_rate": 0.003, "loss": 4.0812, "step": 3959 }, { "epoch": 0.0396, "grad_norm": 0.6074774861335754, "learning_rate": 0.003, "loss": 4.1001, "step": 3960 }, { "epoch": 0.03961, "grad_norm": 0.6236320734024048, "learning_rate": 0.003, "loss": 4.0973, "step": 3961 }, { "epoch": 0.03962, "grad_norm": 0.6013439297676086, "learning_rate": 0.003, "loss": 4.1081, "step": 3962 }, { "epoch": 0.03963, "grad_norm": 0.6263412237167358, "learning_rate": 0.003, "loss": 4.0929, "step": 3963 }, { "epoch": 0.03964, "grad_norm": 0.656822919845581, "learning_rate": 0.003, "loss": 4.0974, "step": 3964 }, { "epoch": 0.03965, "grad_norm": 0.6711769700050354, "learning_rate": 0.003, "loss": 4.096, "step": 3965 }, { "epoch": 0.03966, "grad_norm": 0.7558622360229492, "learning_rate": 0.003, "loss": 4.1394, "step": 3966 }, { "epoch": 0.03967, "grad_norm": 0.8678721785545349, "learning_rate": 0.003, "loss": 4.101, "step": 3967 }, { "epoch": 0.03968, "grad_norm": 0.9608453512191772, "learning_rate": 0.003, "loss": 4.1351, "step": 3968 }, { "epoch": 0.03969, "grad_norm": 0.92333984375, "learning_rate": 0.003, "loss": 4.1176, "step": 3969 }, { "epoch": 0.0397, "grad_norm": 1.0910831689834595, "learning_rate": 0.003, "loss": 4.1298, "step": 3970 }, { "epoch": 0.03971, "grad_norm": 0.8948251008987427, "learning_rate": 0.003, "loss": 4.1057, "step": 3971 }, { "epoch": 0.03972, "grad_norm": 0.8917779922485352, "learning_rate": 0.003, "loss": 4.1289, "step": 3972 }, { "epoch": 0.03973, "grad_norm": 0.9352078437805176, "learning_rate": 0.003, "loss": 4.1347, "step": 3973 }, { "epoch": 0.03974, "grad_norm": 0.8667543530464172, "learning_rate": 0.003, "loss": 4.1303, "step": 3974 }, { "epoch": 0.03975, "grad_norm": 0.7579936385154724, "learning_rate": 0.003, "loss": 4.1131, "step": 3975 }, { "epoch": 0.03976, "grad_norm": 0.7740127444267273, "learning_rate": 0.003, "loss": 4.1498, "step": 3976 }, { "epoch": 0.03977, "grad_norm": 0.7652050256729126, "learning_rate": 0.003, "loss": 4.1352, "step": 3977 }, { "epoch": 0.03978, "grad_norm": 0.7869767546653748, "learning_rate": 0.003, "loss": 4.1027, "step": 3978 }, { "epoch": 0.03979, "grad_norm": 0.8766190409660339, "learning_rate": 0.003, "loss": 4.1282, "step": 3979 }, { "epoch": 0.0398, "grad_norm": 0.9912131428718567, "learning_rate": 0.003, "loss": 4.1235, "step": 3980 }, { "epoch": 0.03981, "grad_norm": 1.0468101501464844, "learning_rate": 0.003, "loss": 4.1214, "step": 3981 }, { "epoch": 0.03982, "grad_norm": 0.8397706747055054, "learning_rate": 0.003, "loss": 4.1007, "step": 3982 }, { "epoch": 0.03983, "grad_norm": 0.6968387365341187, "learning_rate": 0.003, "loss": 4.1115, "step": 3983 }, { "epoch": 0.03984, "grad_norm": 0.6742430925369263, "learning_rate": 0.003, "loss": 4.1251, "step": 3984 }, { "epoch": 0.03985, "grad_norm": 0.7048434019088745, "learning_rate": 0.003, "loss": 4.1183, "step": 3985 }, { "epoch": 0.03986, "grad_norm": 0.7287977337837219, "learning_rate": 0.003, "loss": 4.1185, "step": 3986 }, { "epoch": 0.03987, "grad_norm": 0.6307860016822815, "learning_rate": 0.003, "loss": 4.1175, "step": 3987 }, { "epoch": 0.03988, "grad_norm": 0.6006014943122864, "learning_rate": 0.003, "loss": 4.1214, "step": 3988 }, { "epoch": 0.03989, "grad_norm": 0.614997386932373, "learning_rate": 0.003, "loss": 4.1068, "step": 3989 }, { "epoch": 0.0399, "grad_norm": 0.6875625252723694, "learning_rate": 0.003, "loss": 4.082, "step": 3990 }, { "epoch": 0.03991, "grad_norm": 0.7122326493263245, "learning_rate": 0.003, "loss": 4.1265, "step": 3991 }, { "epoch": 0.03992, "grad_norm": 0.8080908060073853, "learning_rate": 0.003, "loss": 4.1106, "step": 3992 }, { "epoch": 0.03993, "grad_norm": 0.7636401653289795, "learning_rate": 0.003, "loss": 4.1051, "step": 3993 }, { "epoch": 0.03994, "grad_norm": 0.7076770663261414, "learning_rate": 0.003, "loss": 4.1036, "step": 3994 }, { "epoch": 0.03995, "grad_norm": 0.5683047771453857, "learning_rate": 0.003, "loss": 4.0834, "step": 3995 }, { "epoch": 0.03996, "grad_norm": 0.4724688231945038, "learning_rate": 0.003, "loss": 4.1167, "step": 3996 }, { "epoch": 0.03997, "grad_norm": 0.5588562488555908, "learning_rate": 0.003, "loss": 4.116, "step": 3997 }, { "epoch": 0.03998, "grad_norm": 0.5613643527030945, "learning_rate": 0.003, "loss": 4.117, "step": 3998 }, { "epoch": 0.03999, "grad_norm": 0.5133464932441711, "learning_rate": 0.003, "loss": 4.1186, "step": 3999 }, { "epoch": 0.04, "grad_norm": 0.46894940733909607, "learning_rate": 0.003, "loss": 4.1184, "step": 4000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.00842963206144e+17, "train_batch_size": 256, "trial_name": null, "trial_params": null }