{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 23217, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001292156609381057, "grad_norm": 0.9450203776359558, "learning_rate": 4.999997711254574e-05, "loss": 1.1858, "num_input_tokens_seen": 81920, "step": 10 }, { "epoch": 0.002584313218762114, "grad_norm": 0.6408165097236633, "learning_rate": 4.999990845022488e-05, "loss": 1.5568, "num_input_tokens_seen": 163840, "step": 20 }, { "epoch": 0.003876469828143171, "grad_norm": 0.5522646307945251, "learning_rate": 4.999979401316311e-05, "loss": 1.2099, "num_input_tokens_seen": 245760, "step": 30 }, { "epoch": 0.005168626437524228, "grad_norm": 0.5057278871536255, "learning_rate": 4.999963380156999e-05, "loss": 1.1902, "num_input_tokens_seen": 327680, "step": 40 }, { "epoch": 0.006460783046905285, "grad_norm": 0.40978753566741943, "learning_rate": 4.9999427815738856e-05, "loss": 1.0802, "num_input_tokens_seen": 409600, "step": 50 }, { "epoch": 0.007752939656286342, "grad_norm": 0.4875193238258362, "learning_rate": 4.999917605604688e-05, "loss": 1.0393, "num_input_tokens_seen": 491520, "step": 60 }, { "epoch": 0.009045096265667399, "grad_norm": 0.5074454545974731, "learning_rate": 4.999887852295502e-05, "loss": 1.1473, "num_input_tokens_seen": 573440, "step": 70 }, { "epoch": 0.010337252875048455, "grad_norm": 0.6053259968757629, "learning_rate": 4.9998535217008054e-05, "loss": 0.8868, "num_input_tokens_seen": 655360, "step": 80 }, { "epoch": 0.011629409484429512, "grad_norm": 0.5783246159553528, "learning_rate": 4.999814613883459e-05, "loss": 0.9585, "num_input_tokens_seen": 737280, "step": 90 }, { "epoch": 0.01292156609381057, "grad_norm": 0.5514815449714661, "learning_rate": 4.999771128914701e-05, "loss": 1.0241, "num_input_tokens_seen": 819200, "step": 100 }, { "epoch": 0.014213722703191626, "grad_norm": 0.6433015465736389, "learning_rate": 4.999723066874154e-05, "loss": 1.034, "num_input_tokens_seen": 901120, "step": 110 }, { "epoch": 0.015505879312572683, "grad_norm": 0.5091115832328796, "learning_rate": 4.9996704278498185e-05, "loss": 1.2244, "num_input_tokens_seen": 983040, "step": 120 }, { "epoch": 0.016798035921953742, "grad_norm": 0.40583527088165283, "learning_rate": 4.9996132119380764e-05, "loss": 1.266, "num_input_tokens_seen": 1064960, "step": 130 }, { "epoch": 0.018090192531334797, "grad_norm": 0.6629675626754761, "learning_rate": 4.999551419243691e-05, "loss": 1.1523, "num_input_tokens_seen": 1146880, "step": 140 }, { "epoch": 0.019382349140715856, "grad_norm": 0.6925719976425171, "learning_rate": 4.9994850498798026e-05, "loss": 1.1496, "num_input_tokens_seen": 1228800, "step": 150 }, { "epoch": 0.02067450575009691, "grad_norm": 0.43508079648017883, "learning_rate": 4.999414103967934e-05, "loss": 0.9302, "num_input_tokens_seen": 1310720, "step": 160 }, { "epoch": 0.02196666235947797, "grad_norm": 0.6177295446395874, "learning_rate": 4.9993385816379876e-05, "loss": 0.9475, "num_input_tokens_seen": 1392640, "step": 170 }, { "epoch": 0.023258818968859025, "grad_norm": 0.43350470066070557, "learning_rate": 4.999258483028243e-05, "loss": 1.0703, "num_input_tokens_seen": 1474560, "step": 180 }, { "epoch": 0.024550975578240083, "grad_norm": 0.8424047231674194, "learning_rate": 4.999173808285362e-05, "loss": 1.1235, "num_input_tokens_seen": 1556480, "step": 190 }, { "epoch": 0.02584313218762114, "grad_norm": 0.5010475516319275, "learning_rate": 4.999084557564383e-05, "loss": 1.3278, "num_input_tokens_seen": 1638400, "step": 200 }, { "epoch": 0.027135288797002197, "grad_norm": 0.5917531847953796, "learning_rate": 4.9989907310287243e-05, "loss": 1.1928, "num_input_tokens_seen": 1720320, "step": 210 }, { "epoch": 0.028427445406383253, "grad_norm": 0.40859103202819824, "learning_rate": 4.998892328850181e-05, "loss": 1.215, "num_input_tokens_seen": 1802240, "step": 220 }, { "epoch": 0.02971960201576431, "grad_norm": 0.7122620344161987, "learning_rate": 4.9987893512089276e-05, "loss": 1.3318, "num_input_tokens_seen": 1884160, "step": 230 }, { "epoch": 0.031011758625145366, "grad_norm": 0.6804201602935791, "learning_rate": 4.998681798293516e-05, "loss": 1.3226, "num_input_tokens_seen": 1966080, "step": 240 }, { "epoch": 0.03230391523452642, "grad_norm": 0.5440483093261719, "learning_rate": 4.998569670300876e-05, "loss": 1.1795, "num_input_tokens_seen": 2048000, "step": 250 }, { "epoch": 0.033596071843907484, "grad_norm": 1.6514626741409302, "learning_rate": 4.9984529674363114e-05, "loss": 0.9646, "num_input_tokens_seen": 2129920, "step": 260 }, { "epoch": 0.03488822845328854, "grad_norm": 0.46813198924064636, "learning_rate": 4.998331689913506e-05, "loss": 1.1236, "num_input_tokens_seen": 2211840, "step": 270 }, { "epoch": 0.036180385062669594, "grad_norm": 0.6068572998046875, "learning_rate": 4.998205837954518e-05, "loss": 0.9761, "num_input_tokens_seen": 2293760, "step": 280 }, { "epoch": 0.03747254167205065, "grad_norm": 0.4501520097255707, "learning_rate": 4.998075411789783e-05, "loss": 1.0846, "num_input_tokens_seen": 2375680, "step": 290 }, { "epoch": 0.03876469828143171, "grad_norm": 0.6213204264640808, "learning_rate": 4.9979404116581104e-05, "loss": 0.9604, "num_input_tokens_seen": 2457600, "step": 300 }, { "epoch": 0.04005685489081277, "grad_norm": 0.5889080166816711, "learning_rate": 4.9978008378066844e-05, "loss": 1.135, "num_input_tokens_seen": 2539520, "step": 310 }, { "epoch": 0.04134901150019382, "grad_norm": 0.5029394030570984, "learning_rate": 4.997656690491064e-05, "loss": 1.0162, "num_input_tokens_seen": 2621440, "step": 320 }, { "epoch": 0.04264116810957488, "grad_norm": 0.4100414216518402, "learning_rate": 4.9975079699751825e-05, "loss": 1.0928, "num_input_tokens_seen": 2703360, "step": 330 }, { "epoch": 0.04393332471895594, "grad_norm": 0.5180526971817017, "learning_rate": 4.997354676531348e-05, "loss": 1.4001, "num_input_tokens_seen": 2785280, "step": 340 }, { "epoch": 0.045225481328336994, "grad_norm": 0.47328633069992065, "learning_rate": 4.997196810440239e-05, "loss": 1.2969, "num_input_tokens_seen": 2867200, "step": 350 }, { "epoch": 0.04651763793771805, "grad_norm": 0.4311719834804535, "learning_rate": 4.997034371990907e-05, "loss": 1.0298, "num_input_tokens_seen": 2949120, "step": 360 }, { "epoch": 0.04780979454709911, "grad_norm": 0.8229668736457825, "learning_rate": 4.9968673614807787e-05, "loss": 1.1618, "num_input_tokens_seen": 3031040, "step": 370 }, { "epoch": 0.04910195115648017, "grad_norm": 0.4691760540008545, "learning_rate": 4.9966957792156475e-05, "loss": 0.8982, "num_input_tokens_seen": 3112960, "step": 380 }, { "epoch": 0.05039410776586122, "grad_norm": 0.539076566696167, "learning_rate": 4.99651962550968e-05, "loss": 1.15, "num_input_tokens_seen": 3194880, "step": 390 }, { "epoch": 0.05168626437524228, "grad_norm": 0.3812846541404724, "learning_rate": 4.996338900685414e-05, "loss": 1.3498, "num_input_tokens_seen": 3276800, "step": 400 }, { "epoch": 0.05297842098462334, "grad_norm": 0.4904411733150482, "learning_rate": 4.996153605073756e-05, "loss": 0.8624, "num_input_tokens_seen": 3358720, "step": 410 }, { "epoch": 0.054270577594004395, "grad_norm": 0.49068745970726013, "learning_rate": 4.9959637390139814e-05, "loss": 1.1246, "num_input_tokens_seen": 3440640, "step": 420 }, { "epoch": 0.05556273420338545, "grad_norm": 0.47231003642082214, "learning_rate": 4.995769302853733e-05, "loss": 0.8941, "num_input_tokens_seen": 3522560, "step": 430 }, { "epoch": 0.056854890812766505, "grad_norm": 0.4713283181190491, "learning_rate": 4.995570296949024e-05, "loss": 1.1223, "num_input_tokens_seen": 3604480, "step": 440 }, { "epoch": 0.05814704742214757, "grad_norm": 0.9544013738632202, "learning_rate": 4.995366721664234e-05, "loss": 0.7063, "num_input_tokens_seen": 3686400, "step": 450 }, { "epoch": 0.05943920403152862, "grad_norm": 0.43465104699134827, "learning_rate": 4.995158577372107e-05, "loss": 0.8703, "num_input_tokens_seen": 3768320, "step": 460 }, { "epoch": 0.06073136064090968, "grad_norm": 6.194292068481445, "learning_rate": 4.9949458644537556e-05, "loss": 0.8798, "num_input_tokens_seen": 3850240, "step": 470 }, { "epoch": 0.06202351725029073, "grad_norm": 0.4813326895236969, "learning_rate": 4.9947285832986553e-05, "loss": 1.154, "num_input_tokens_seen": 3932160, "step": 480 }, { "epoch": 0.0633156738596718, "grad_norm": 0.37111344933509827, "learning_rate": 4.9945067343046494e-05, "loss": 1.179, "num_input_tokens_seen": 4014080, "step": 490 }, { "epoch": 0.06460783046905284, "grad_norm": 0.473379909992218, "learning_rate": 4.9942803178779396e-05, "loss": 1.3203, "num_input_tokens_seen": 4096000, "step": 500 }, { "epoch": 0.0658999870784339, "grad_norm": 0.31630370020866394, "learning_rate": 4.994049334433095e-05, "loss": 0.95, "num_input_tokens_seen": 4177920, "step": 510 }, { "epoch": 0.06719214368781497, "grad_norm": 0.6394200921058655, "learning_rate": 4.9938137843930466e-05, "loss": 0.721, "num_input_tokens_seen": 4259840, "step": 520 }, { "epoch": 0.06848430029719602, "grad_norm": 0.4896668493747711, "learning_rate": 4.993573668189083e-05, "loss": 1.1505, "num_input_tokens_seen": 4341760, "step": 530 }, { "epoch": 0.06977645690657708, "grad_norm": 0.4696863293647766, "learning_rate": 4.9933289862608584e-05, "loss": 1.1169, "num_input_tokens_seen": 4423680, "step": 540 }, { "epoch": 0.07106861351595814, "grad_norm": 0.4316001236438751, "learning_rate": 4.9930797390563834e-05, "loss": 1.0893, "num_input_tokens_seen": 4505600, "step": 550 }, { "epoch": 0.07236077012533919, "grad_norm": 0.47244027256965637, "learning_rate": 4.9928259270320295e-05, "loss": 0.6521, "num_input_tokens_seen": 4587520, "step": 560 }, { "epoch": 0.07365292673472025, "grad_norm": 0.4212469160556793, "learning_rate": 4.992567550652525e-05, "loss": 0.9833, "num_input_tokens_seen": 4669440, "step": 570 }, { "epoch": 0.0749450833441013, "grad_norm": 0.49635788798332214, "learning_rate": 4.992304610390955e-05, "loss": 1.1048, "num_input_tokens_seen": 4751360, "step": 580 }, { "epoch": 0.07623723995348236, "grad_norm": 0.46898409724235535, "learning_rate": 4.9920371067287645e-05, "loss": 0.8898, "num_input_tokens_seen": 4833280, "step": 590 }, { "epoch": 0.07752939656286342, "grad_norm": 0.7145845293998718, "learning_rate": 4.9917650401557505e-05, "loss": 0.9906, "num_input_tokens_seen": 4915200, "step": 600 }, { "epoch": 0.07882155317224447, "grad_norm": 7.353865623474121, "learning_rate": 4.9914884111700656e-05, "loss": 0.9923, "num_input_tokens_seen": 4997120, "step": 610 }, { "epoch": 0.08011370978162553, "grad_norm": 0.7385011911392212, "learning_rate": 4.991207220278217e-05, "loss": 0.8434, "num_input_tokens_seen": 5079040, "step": 620 }, { "epoch": 0.0814058663910066, "grad_norm": 0.6693135499954224, "learning_rate": 4.990921467995064e-05, "loss": 0.6813, "num_input_tokens_seen": 5160960, "step": 630 }, { "epoch": 0.08269802300038764, "grad_norm": 0.4748353958129883, "learning_rate": 4.9906311548438184e-05, "loss": 0.8735, "num_input_tokens_seen": 5242880, "step": 640 }, { "epoch": 0.0839901796097687, "grad_norm": 0.5311540365219116, "learning_rate": 4.990336281356042e-05, "loss": 1.1035, "num_input_tokens_seen": 5324800, "step": 650 }, { "epoch": 0.08528233621914975, "grad_norm": 0.4842514991760254, "learning_rate": 4.9900368480716466e-05, "loss": 0.8329, "num_input_tokens_seen": 5406720, "step": 660 }, { "epoch": 0.08657449282853082, "grad_norm": 0.6414400935173035, "learning_rate": 4.9897328555388943e-05, "loss": 1.1088, "num_input_tokens_seen": 5488640, "step": 670 }, { "epoch": 0.08786664943791188, "grad_norm": 0.39652836322784424, "learning_rate": 4.989424304314395e-05, "loss": 1.0048, "num_input_tokens_seen": 5570560, "step": 680 }, { "epoch": 0.08915880604729293, "grad_norm": 0.6044394969940186, "learning_rate": 4.9891111949631023e-05, "loss": 0.9396, "num_input_tokens_seen": 5652480, "step": 690 }, { "epoch": 0.09045096265667399, "grad_norm": 0.4656500816345215, "learning_rate": 4.988793528058321e-05, "loss": 1.1961, "num_input_tokens_seen": 5734400, "step": 700 }, { "epoch": 0.09174311926605505, "grad_norm": 0.5340274572372437, "learning_rate": 4.988471304181697e-05, "loss": 0.9563, "num_input_tokens_seen": 5816320, "step": 710 }, { "epoch": 0.0930352758754361, "grad_norm": 0.4175763726234436, "learning_rate": 4.988144523923221e-05, "loss": 0.9487, "num_input_tokens_seen": 5898240, "step": 720 }, { "epoch": 0.09432743248481716, "grad_norm": 0.4877130389213562, "learning_rate": 4.987813187881226e-05, "loss": 0.9847, "num_input_tokens_seen": 5980160, "step": 730 }, { "epoch": 0.09561958909419822, "grad_norm": 0.5183830261230469, "learning_rate": 4.987477296662387e-05, "loss": 1.3118, "num_input_tokens_seen": 6062080, "step": 740 }, { "epoch": 0.09691174570357927, "grad_norm": 0.4533514678478241, "learning_rate": 4.987136850881721e-05, "loss": 1.0996, "num_input_tokens_seen": 6144000, "step": 750 }, { "epoch": 0.09820390231296033, "grad_norm": 0.4317517578601837, "learning_rate": 4.986791851162582e-05, "loss": 0.9023, "num_input_tokens_seen": 6225920, "step": 760 }, { "epoch": 0.09949605892234138, "grad_norm": 0.7972986698150635, "learning_rate": 4.986442298136663e-05, "loss": 1.0904, "num_input_tokens_seen": 6307840, "step": 770 }, { "epoch": 0.10078821553172244, "grad_norm": 1.0747971534729004, "learning_rate": 4.986088192443995e-05, "loss": 0.9374, "num_input_tokens_seen": 6389760, "step": 780 }, { "epoch": 0.1020803721411035, "grad_norm": 0.3895331919193268, "learning_rate": 4.985729534732944e-05, "loss": 0.8173, "num_input_tokens_seen": 6471680, "step": 790 }, { "epoch": 0.10337252875048455, "grad_norm": 0.6985827684402466, "learning_rate": 4.98536632566021e-05, "loss": 0.9162, "num_input_tokens_seen": 6553600, "step": 800 }, { "epoch": 0.10466468535986562, "grad_norm": 0.4658990502357483, "learning_rate": 4.9849985658908296e-05, "loss": 0.7986, "num_input_tokens_seen": 6635520, "step": 810 }, { "epoch": 0.10595684196924668, "grad_norm": 0.3981533944606781, "learning_rate": 4.9846262560981674e-05, "loss": 0.7034, "num_input_tokens_seen": 6717440, "step": 820 }, { "epoch": 0.10724899857862773, "grad_norm": 0.6919500231742859, "learning_rate": 4.9842493969639215e-05, "loss": 0.9466, "num_input_tokens_seen": 6799360, "step": 830 }, { "epoch": 0.10854115518800879, "grad_norm": 0.5051977038383484, "learning_rate": 4.9838679891781214e-05, "loss": 1.2844, "num_input_tokens_seen": 6881280, "step": 840 }, { "epoch": 0.10983331179738984, "grad_norm": 0.49048569798469543, "learning_rate": 4.983482033439122e-05, "loss": 0.7703, "num_input_tokens_seen": 6963200, "step": 850 }, { "epoch": 0.1111254684067709, "grad_norm": 0.29757964611053467, "learning_rate": 4.9830915304536065e-05, "loss": 1.2312, "num_input_tokens_seen": 7045120, "step": 860 }, { "epoch": 0.11241762501615196, "grad_norm": 0.4364021122455597, "learning_rate": 4.982696480936586e-05, "loss": 1.3658, "num_input_tokens_seen": 7127040, "step": 870 }, { "epoch": 0.11370978162553301, "grad_norm": 0.7192522287368774, "learning_rate": 4.9822968856113926e-05, "loss": 1.0453, "num_input_tokens_seen": 7208960, "step": 880 }, { "epoch": 0.11500193823491407, "grad_norm": 0.5548785924911499, "learning_rate": 4.9818927452096855e-05, "loss": 1.1852, "num_input_tokens_seen": 7290880, "step": 890 }, { "epoch": 0.11629409484429513, "grad_norm": 0.4914171099662781, "learning_rate": 4.981484060471444e-05, "loss": 1.2232, "num_input_tokens_seen": 7372800, "step": 900 }, { "epoch": 0.11758625145367618, "grad_norm": 0.4073733687400818, "learning_rate": 4.981070832144967e-05, "loss": 1.1144, "num_input_tokens_seen": 7454720, "step": 910 }, { "epoch": 0.11887840806305724, "grad_norm": 0.7221460342407227, "learning_rate": 4.980653060986877e-05, "loss": 1.0344, "num_input_tokens_seen": 7536640, "step": 920 }, { "epoch": 0.1201705646724383, "grad_norm": 0.5011032223701477, "learning_rate": 4.9802307477621084e-05, "loss": 1.1751, "num_input_tokens_seen": 7618560, "step": 930 }, { "epoch": 0.12146272128181936, "grad_norm": 0.4509833753108978, "learning_rate": 4.9798038932439175e-05, "loss": 1.1844, "num_input_tokens_seen": 7700480, "step": 940 }, { "epoch": 0.12275487789120042, "grad_norm": 0.6086503863334656, "learning_rate": 4.979372498213871e-05, "loss": 1.0034, "num_input_tokens_seen": 7782400, "step": 950 }, { "epoch": 0.12404703450058147, "grad_norm": 0.9258208274841309, "learning_rate": 4.978936563461854e-05, "loss": 0.9102, "num_input_tokens_seen": 7864320, "step": 960 }, { "epoch": 0.12533919110996253, "grad_norm": 0.5192177295684814, "learning_rate": 4.97849608978606e-05, "loss": 0.8154, "num_input_tokens_seen": 7946240, "step": 970 }, { "epoch": 0.1266313477193436, "grad_norm": 0.37925633788108826, "learning_rate": 4.978051077992994e-05, "loss": 1.0143, "num_input_tokens_seen": 8028160, "step": 980 }, { "epoch": 0.12792350432872465, "grad_norm": 2.0898265838623047, "learning_rate": 4.9776015288974736e-05, "loss": 0.7723, "num_input_tokens_seen": 8110080, "step": 990 }, { "epoch": 0.1292156609381057, "grad_norm": 0.667012631893158, "learning_rate": 4.9771474433226194e-05, "loss": 1.2866, "num_input_tokens_seen": 8192000, "step": 1000 }, { "epoch": 0.13050781754748675, "grad_norm": 0.4613621234893799, "learning_rate": 4.976688822099861e-05, "loss": 1.4743, "num_input_tokens_seen": 8273920, "step": 1010 }, { "epoch": 0.1317999741568678, "grad_norm": 0.7849400043487549, "learning_rate": 4.976225666068932e-05, "loss": 0.904, "num_input_tokens_seen": 8355840, "step": 1020 }, { "epoch": 0.13309213076624887, "grad_norm": 0.6418063044548035, "learning_rate": 4.9757579760778697e-05, "loss": 0.9119, "num_input_tokens_seen": 8437760, "step": 1030 }, { "epoch": 0.13438428737562993, "grad_norm": 0.47972187399864197, "learning_rate": 4.9752857529830125e-05, "loss": 1.1209, "num_input_tokens_seen": 8519680, "step": 1040 }, { "epoch": 0.135676443985011, "grad_norm": 0.4660983681678772, "learning_rate": 4.9748089976489996e-05, "loss": 1.2202, "num_input_tokens_seen": 8601600, "step": 1050 }, { "epoch": 0.13696860059439203, "grad_norm": 0.4959300756454468, "learning_rate": 4.9743277109487674e-05, "loss": 1.2102, "num_input_tokens_seen": 8683520, "step": 1060 }, { "epoch": 0.1382607572037731, "grad_norm": 0.541445791721344, "learning_rate": 4.973841893763551e-05, "loss": 0.9835, "num_input_tokens_seen": 8765440, "step": 1070 }, { "epoch": 0.13955291381315416, "grad_norm": 0.48562517762184143, "learning_rate": 4.9733515469828795e-05, "loss": 0.9155, "num_input_tokens_seen": 8847360, "step": 1080 }, { "epoch": 0.14084507042253522, "grad_norm": 0.4447590708732605, "learning_rate": 4.972856671504576e-05, "loss": 1.196, "num_input_tokens_seen": 8929280, "step": 1090 }, { "epoch": 0.14213722703191628, "grad_norm": 0.5334303379058838, "learning_rate": 4.9723572682347566e-05, "loss": 0.8913, "num_input_tokens_seen": 9011200, "step": 1100 }, { "epoch": 0.14342938364129731, "grad_norm": 0.376122385263443, "learning_rate": 4.971853338087825e-05, "loss": 1.1649, "num_input_tokens_seen": 9093120, "step": 1110 }, { "epoch": 0.14472154025067838, "grad_norm": 0.5344268083572388, "learning_rate": 4.971344881986477e-05, "loss": 1.084, "num_input_tokens_seen": 9175040, "step": 1120 }, { "epoch": 0.14601369686005944, "grad_norm": 0.8270778059959412, "learning_rate": 4.9708319008616926e-05, "loss": 0.9945, "num_input_tokens_seen": 9256960, "step": 1130 }, { "epoch": 0.1473058534694405, "grad_norm": 0.6520871520042419, "learning_rate": 4.97031439565274e-05, "loss": 0.8976, "num_input_tokens_seen": 9338880, "step": 1140 }, { "epoch": 0.14859801007882156, "grad_norm": 0.4764862656593323, "learning_rate": 4.969792367307168e-05, "loss": 1.063, "num_input_tokens_seen": 9420800, "step": 1150 }, { "epoch": 0.1498901666882026, "grad_norm": 0.48043620586395264, "learning_rate": 4.9692658167808094e-05, "loss": 0.8809, "num_input_tokens_seen": 9502720, "step": 1160 }, { "epoch": 0.15118232329758366, "grad_norm": 0.44339507818222046, "learning_rate": 4.9687347450377755e-05, "loss": 1.3418, "num_input_tokens_seen": 9584640, "step": 1170 }, { "epoch": 0.15247447990696472, "grad_norm": 0.40918466448783875, "learning_rate": 4.968199153050457e-05, "loss": 0.6324, "num_input_tokens_seen": 9666560, "step": 1180 }, { "epoch": 0.15376663651634578, "grad_norm": 0.5043272972106934, "learning_rate": 4.967659041799522e-05, "loss": 0.9144, "num_input_tokens_seen": 9748480, "step": 1190 }, { "epoch": 0.15505879312572685, "grad_norm": 0.47593727707862854, "learning_rate": 4.9671144122739106e-05, "loss": 0.8396, "num_input_tokens_seen": 9830400, "step": 1200 }, { "epoch": 0.1563509497351079, "grad_norm": 0.3913171887397766, "learning_rate": 4.966565265470838e-05, "loss": 1.0608, "num_input_tokens_seen": 9912320, "step": 1210 }, { "epoch": 0.15764310634448894, "grad_norm": 0.8421456813812256, "learning_rate": 4.9660116023957906e-05, "loss": 0.9964, "num_input_tokens_seen": 9994240, "step": 1220 }, { "epoch": 0.15893526295387, "grad_norm": 0.46809521317481995, "learning_rate": 4.9654534240625225e-05, "loss": 0.8427, "num_input_tokens_seen": 10076160, "step": 1230 }, { "epoch": 0.16022741956325107, "grad_norm": 0.5621390342712402, "learning_rate": 4.964890731493057e-05, "loss": 1.2353, "num_input_tokens_seen": 10158080, "step": 1240 }, { "epoch": 0.16151957617263213, "grad_norm": 0.4590226113796234, "learning_rate": 4.964323525717681e-05, "loss": 1.0439, "num_input_tokens_seen": 10240000, "step": 1250 }, { "epoch": 0.1628117327820132, "grad_norm": 0.40506160259246826, "learning_rate": 4.9637518077749476e-05, "loss": 0.914, "num_input_tokens_seen": 10321920, "step": 1260 }, { "epoch": 0.16410388939139423, "grad_norm": 0.8865328431129456, "learning_rate": 4.96317557871167e-05, "loss": 1.0189, "num_input_tokens_seen": 10403840, "step": 1270 }, { "epoch": 0.1653960460007753, "grad_norm": 0.37955254316329956, "learning_rate": 4.9625948395829216e-05, "loss": 1.0049, "num_input_tokens_seen": 10485760, "step": 1280 }, { "epoch": 0.16668820261015635, "grad_norm": 0.5044606328010559, "learning_rate": 4.962009591452032e-05, "loss": 0.941, "num_input_tokens_seen": 10567680, "step": 1290 }, { "epoch": 0.1679803592195374, "grad_norm": 0.6987674236297607, "learning_rate": 4.96141983539059e-05, "loss": 1.1209, "num_input_tokens_seen": 10649600, "step": 1300 }, { "epoch": 0.16927251582891847, "grad_norm": 1.0083003044128418, "learning_rate": 4.960825572478436e-05, "loss": 1.0478, "num_input_tokens_seen": 10731520, "step": 1310 }, { "epoch": 0.1705646724382995, "grad_norm": 0.3237228989601135, "learning_rate": 4.960226803803664e-05, "loss": 0.9342, "num_input_tokens_seen": 10813440, "step": 1320 }, { "epoch": 0.17185682904768057, "grad_norm": 1.4129977226257324, "learning_rate": 4.959623530462617e-05, "loss": 1.0727, "num_input_tokens_seen": 10895360, "step": 1330 }, { "epoch": 0.17314898565706163, "grad_norm": 0.3343133330345154, "learning_rate": 4.9590157535598855e-05, "loss": 1.0719, "num_input_tokens_seen": 10977280, "step": 1340 }, { "epoch": 0.1744411422664427, "grad_norm": 0.3956698775291443, "learning_rate": 4.958403474208308e-05, "loss": 1.0195, "num_input_tokens_seen": 11059200, "step": 1350 }, { "epoch": 0.17573329887582376, "grad_norm": 0.45558327436447144, "learning_rate": 4.957786693528965e-05, "loss": 1.1703, "num_input_tokens_seen": 11141120, "step": 1360 }, { "epoch": 0.17702545548520482, "grad_norm": 0.4159637689590454, "learning_rate": 4.95716541265118e-05, "loss": 0.8547, "num_input_tokens_seen": 11223040, "step": 1370 }, { "epoch": 0.17831761209458585, "grad_norm": 0.510883629322052, "learning_rate": 4.9565396327125155e-05, "loss": 1.056, "num_input_tokens_seen": 11304960, "step": 1380 }, { "epoch": 0.17960976870396692, "grad_norm": 0.4751374125480652, "learning_rate": 4.955909354858772e-05, "loss": 0.8731, "num_input_tokens_seen": 11386880, "step": 1390 }, { "epoch": 0.18090192531334798, "grad_norm": 0.3273867070674896, "learning_rate": 4.955274580243987e-05, "loss": 0.7233, "num_input_tokens_seen": 11468800, "step": 1400 }, { "epoch": 0.18219408192272904, "grad_norm": 0.38997891545295715, "learning_rate": 4.95463531003043e-05, "loss": 0.8429, "num_input_tokens_seen": 11550720, "step": 1410 }, { "epoch": 0.1834862385321101, "grad_norm": 0.4781230092048645, "learning_rate": 4.953991545388603e-05, "loss": 0.9205, "num_input_tokens_seen": 11632640, "step": 1420 }, { "epoch": 0.18477839514149114, "grad_norm": 0.39320600032806396, "learning_rate": 4.9533432874972366e-05, "loss": 1.0245, "num_input_tokens_seen": 11714560, "step": 1430 }, { "epoch": 0.1860705517508722, "grad_norm": 1.2111022472381592, "learning_rate": 4.952690537543287e-05, "loss": 1.161, "num_input_tokens_seen": 11796480, "step": 1440 }, { "epoch": 0.18736270836025326, "grad_norm": 0.42389845848083496, "learning_rate": 4.952033296721938e-05, "loss": 1.4307, "num_input_tokens_seen": 11878400, "step": 1450 }, { "epoch": 0.18865486496963432, "grad_norm": 0.3861338496208191, "learning_rate": 4.951371566236597e-05, "loss": 0.9983, "num_input_tokens_seen": 11960320, "step": 1460 }, { "epoch": 0.18994702157901538, "grad_norm": 0.39819157123565674, "learning_rate": 4.9507053472988867e-05, "loss": 0.927, "num_input_tokens_seen": 12042240, "step": 1470 }, { "epoch": 0.19123917818839645, "grad_norm": 0.47134819626808167, "learning_rate": 4.9500346411286534e-05, "loss": 1.0184, "num_input_tokens_seen": 12124160, "step": 1480 }, { "epoch": 0.19253133479777748, "grad_norm": 0.6282069683074951, "learning_rate": 4.949359448953959e-05, "loss": 0.9956, "num_input_tokens_seen": 12206080, "step": 1490 }, { "epoch": 0.19382349140715854, "grad_norm": 0.48138657212257385, "learning_rate": 4.9486797720110746e-05, "loss": 0.7369, "num_input_tokens_seen": 12288000, "step": 1500 }, { "epoch": 0.1951156480165396, "grad_norm": 0.42543670535087585, "learning_rate": 4.947995611544489e-05, "loss": 0.9916, "num_input_tokens_seen": 12369920, "step": 1510 }, { "epoch": 0.19640780462592067, "grad_norm": 0.6821240782737732, "learning_rate": 4.947306968806896e-05, "loss": 1.0502, "num_input_tokens_seen": 12451840, "step": 1520 }, { "epoch": 0.19769996123530173, "grad_norm": 0.7128183245658875, "learning_rate": 4.946613845059199e-05, "loss": 1.1209, "num_input_tokens_seen": 12533760, "step": 1530 }, { "epoch": 0.19899211784468276, "grad_norm": 0.5544474720954895, "learning_rate": 4.945916241570504e-05, "loss": 1.152, "num_input_tokens_seen": 12615680, "step": 1540 }, { "epoch": 0.20028427445406383, "grad_norm": 0.4839724600315094, "learning_rate": 4.945214159618121e-05, "loss": 1.1884, "num_input_tokens_seen": 12697600, "step": 1550 }, { "epoch": 0.2015764310634449, "grad_norm": 0.381164014339447, "learning_rate": 4.9445076004875596e-05, "loss": 1.1069, "num_input_tokens_seen": 12779520, "step": 1560 }, { "epoch": 0.20286858767282595, "grad_norm": 0.5115939974784851, "learning_rate": 4.9437965654725264e-05, "loss": 1.0457, "num_input_tokens_seen": 12861440, "step": 1570 }, { "epoch": 0.204160744282207, "grad_norm": 0.45792338252067566, "learning_rate": 4.943081055874925e-05, "loss": 0.8206, "num_input_tokens_seen": 12943360, "step": 1580 }, { "epoch": 0.20545290089158805, "grad_norm": 0.5342550873756409, "learning_rate": 4.9423610730048495e-05, "loss": 1.2474, "num_input_tokens_seen": 13025280, "step": 1590 }, { "epoch": 0.2067450575009691, "grad_norm": 0.6011587381362915, "learning_rate": 4.941636618180586e-05, "loss": 0.9485, "num_input_tokens_seen": 13107200, "step": 1600 }, { "epoch": 0.20803721411035017, "grad_norm": 1.3028993606567383, "learning_rate": 4.94090769272861e-05, "loss": 1.0329, "num_input_tokens_seen": 13189120, "step": 1610 }, { "epoch": 0.20932937071973123, "grad_norm": 0.4599340260028839, "learning_rate": 4.940174297983581e-05, "loss": 1.1409, "num_input_tokens_seen": 13271040, "step": 1620 }, { "epoch": 0.2106215273291123, "grad_norm": 0.4517963230609894, "learning_rate": 4.93943643528834e-05, "loss": 0.6724, "num_input_tokens_seen": 13352960, "step": 1630 }, { "epoch": 0.21191368393849336, "grad_norm": 0.4794328212738037, "learning_rate": 4.938694105993914e-05, "loss": 0.9835, "num_input_tokens_seen": 13434880, "step": 1640 }, { "epoch": 0.2132058405478744, "grad_norm": 0.43061280250549316, "learning_rate": 4.937947311459503e-05, "loss": 1.2099, "num_input_tokens_seen": 13516800, "step": 1650 }, { "epoch": 0.21449799715725545, "grad_norm": 0.4864104986190796, "learning_rate": 4.937196053052486e-05, "loss": 0.9276, "num_input_tokens_seen": 13598720, "step": 1660 }, { "epoch": 0.21579015376663652, "grad_norm": 0.40697136521339417, "learning_rate": 4.9364403321484145e-05, "loss": 1.4808, "num_input_tokens_seen": 13680640, "step": 1670 }, { "epoch": 0.21708231037601758, "grad_norm": 0.613544762134552, "learning_rate": 4.9356801501310105e-05, "loss": 1.248, "num_input_tokens_seen": 13762560, "step": 1680 }, { "epoch": 0.21837446698539864, "grad_norm": 0.25986501574516296, "learning_rate": 4.934915508392164e-05, "loss": 0.8261, "num_input_tokens_seen": 13844480, "step": 1690 }, { "epoch": 0.21966662359477968, "grad_norm": 0.4318770468235016, "learning_rate": 4.9341464083319314e-05, "loss": 1.2263, "num_input_tokens_seen": 13926400, "step": 1700 }, { "epoch": 0.22095878020416074, "grad_norm": 0.32169216871261597, "learning_rate": 4.933372851358532e-05, "loss": 0.8885, "num_input_tokens_seen": 14008320, "step": 1710 }, { "epoch": 0.2222509368135418, "grad_norm": 0.507994532585144, "learning_rate": 4.932594838888347e-05, "loss": 1.369, "num_input_tokens_seen": 14090240, "step": 1720 }, { "epoch": 0.22354309342292286, "grad_norm": 0.5168066620826721, "learning_rate": 4.931812372345913e-05, "loss": 1.1812, "num_input_tokens_seen": 14172160, "step": 1730 }, { "epoch": 0.22483525003230392, "grad_norm": 0.5198150873184204, "learning_rate": 4.9310254531639235e-05, "loss": 0.7136, "num_input_tokens_seen": 14254080, "step": 1740 }, { "epoch": 0.22612740664168499, "grad_norm": 0.4334443509578705, "learning_rate": 4.930234082783225e-05, "loss": 1.4446, "num_input_tokens_seen": 14336000, "step": 1750 }, { "epoch": 0.22741956325106602, "grad_norm": 0.4047377109527588, "learning_rate": 4.9294382626528144e-05, "loss": 0.7729, "num_input_tokens_seen": 14417920, "step": 1760 }, { "epoch": 0.22871171986044708, "grad_norm": 1.014719009399414, "learning_rate": 4.928637994229834e-05, "loss": 0.9155, "num_input_tokens_seen": 14499840, "step": 1770 }, { "epoch": 0.23000387646982814, "grad_norm": 0.5884613394737244, "learning_rate": 4.9278332789795746e-05, "loss": 1.0874, "num_input_tokens_seen": 14581760, "step": 1780 }, { "epoch": 0.2312960330792092, "grad_norm": 0.4626663029193878, "learning_rate": 4.9270241183754637e-05, "loss": 0.7714, "num_input_tokens_seen": 14663680, "step": 1790 }, { "epoch": 0.23258818968859027, "grad_norm": 0.42093148827552795, "learning_rate": 4.9262105138990745e-05, "loss": 1.3797, "num_input_tokens_seen": 14745600, "step": 1800 }, { "epoch": 0.2338803462979713, "grad_norm": 0.3722878396511078, "learning_rate": 4.925392467040112e-05, "loss": 1.2271, "num_input_tokens_seen": 14827520, "step": 1810 }, { "epoch": 0.23517250290735237, "grad_norm": 0.4904583990573883, "learning_rate": 4.924569979296417e-05, "loss": 0.9982, "num_input_tokens_seen": 14909440, "step": 1820 }, { "epoch": 0.23646465951673343, "grad_norm": 0.5454955101013184, "learning_rate": 4.9237430521739626e-05, "loss": 0.9374, "num_input_tokens_seen": 14991360, "step": 1830 }, { "epoch": 0.2377568161261145, "grad_norm": 0.9513818621635437, "learning_rate": 4.9229116871868485e-05, "loss": 0.9464, "num_input_tokens_seen": 15073280, "step": 1840 }, { "epoch": 0.23904897273549555, "grad_norm": 1.1613630056381226, "learning_rate": 4.922075885857301e-05, "loss": 0.8913, "num_input_tokens_seen": 15155200, "step": 1850 }, { "epoch": 0.2403411293448766, "grad_norm": 0.5714117288589478, "learning_rate": 4.92123564971567e-05, "loss": 1.3031, "num_input_tokens_seen": 15237120, "step": 1860 }, { "epoch": 0.24163328595425765, "grad_norm": 0.417524516582489, "learning_rate": 4.9203909803004245e-05, "loss": 1.0151, "num_input_tokens_seen": 15319040, "step": 1870 }, { "epoch": 0.2429254425636387, "grad_norm": 0.43594273924827576, "learning_rate": 4.9195418791581504e-05, "loss": 1.0122, "num_input_tokens_seen": 15400960, "step": 1880 }, { "epoch": 0.24421759917301977, "grad_norm": 0.5096331238746643, "learning_rate": 4.918688347843549e-05, "loss": 1.3312, "num_input_tokens_seen": 15482880, "step": 1890 }, { "epoch": 0.24550975578240083, "grad_norm": 0.4702720046043396, "learning_rate": 4.917830387919434e-05, "loss": 1.4312, "num_input_tokens_seen": 15564800, "step": 1900 }, { "epoch": 0.2468019123917819, "grad_norm": 0.49178382754325867, "learning_rate": 4.9169680009567254e-05, "loss": 0.8297, "num_input_tokens_seen": 15646720, "step": 1910 }, { "epoch": 0.24809406900116293, "grad_norm": 0.4930388033390045, "learning_rate": 4.916101188534452e-05, "loss": 0.9553, "num_input_tokens_seen": 15728640, "step": 1920 }, { "epoch": 0.249386225610544, "grad_norm": 0.44128185510635376, "learning_rate": 4.9152299522397424e-05, "loss": 1.2107, "num_input_tokens_seen": 15810560, "step": 1930 }, { "epoch": 0.25067838221992506, "grad_norm": 0.45994389057159424, "learning_rate": 4.91435429366783e-05, "loss": 1.3879, "num_input_tokens_seen": 15892480, "step": 1940 }, { "epoch": 0.2519705388293061, "grad_norm": 0.46608877182006836, "learning_rate": 4.9134742144220394e-05, "loss": 1.2823, "num_input_tokens_seen": 15974400, "step": 1950 }, { "epoch": 0.2532626954386872, "grad_norm": 0.45161423087120056, "learning_rate": 4.912589716113794e-05, "loss": 0.9816, "num_input_tokens_seen": 16056320, "step": 1960 }, { "epoch": 0.2545548520480682, "grad_norm": 0.5524086952209473, "learning_rate": 4.9117008003626066e-05, "loss": 0.8746, "num_input_tokens_seen": 16138240, "step": 1970 }, { "epoch": 0.2558470086574493, "grad_norm": 0.5476992726325989, "learning_rate": 4.910807468796079e-05, "loss": 0.8788, "num_input_tokens_seen": 16220160, "step": 1980 }, { "epoch": 0.25713916526683034, "grad_norm": 0.5770102739334106, "learning_rate": 4.9099097230498974e-05, "loss": 0.9458, "num_input_tokens_seen": 16302080, "step": 1990 }, { "epoch": 0.2584313218762114, "grad_norm": 0.4459473490715027, "learning_rate": 4.909007564767831e-05, "loss": 0.7193, "num_input_tokens_seen": 16384000, "step": 2000 }, { "epoch": 0.25972347848559246, "grad_norm": 0.43885576725006104, "learning_rate": 4.90810099560173e-05, "loss": 1.1239, "num_input_tokens_seen": 16465920, "step": 2010 }, { "epoch": 0.2610156350949735, "grad_norm": 0.42829430103302, "learning_rate": 4.907190017211517e-05, "loss": 1.2067, "num_input_tokens_seen": 16547840, "step": 2020 }, { "epoch": 0.2623077917043546, "grad_norm": 0.47339022159576416, "learning_rate": 4.906274631265191e-05, "loss": 1.0478, "num_input_tokens_seen": 16629760, "step": 2030 }, { "epoch": 0.2635999483137356, "grad_norm": 0.4274667799472809, "learning_rate": 4.90535483943882e-05, "loss": 1.0864, "num_input_tokens_seen": 16711680, "step": 2040 }, { "epoch": 0.26489210492311666, "grad_norm": 0.508808434009552, "learning_rate": 4.904430643416541e-05, "loss": 0.7331, "num_input_tokens_seen": 16793600, "step": 2050 }, { "epoch": 0.26618426153249775, "grad_norm": 0.39238959550857544, "learning_rate": 4.903502044890551e-05, "loss": 0.8874, "num_input_tokens_seen": 16875520, "step": 2060 }, { "epoch": 0.2674764181418788, "grad_norm": 0.5183860659599304, "learning_rate": 4.902569045561113e-05, "loss": 1.1042, "num_input_tokens_seen": 16957440, "step": 2070 }, { "epoch": 0.26876857475125987, "grad_norm": 0.45784205198287964, "learning_rate": 4.901631647136543e-05, "loss": 1.0121, "num_input_tokens_seen": 17039360, "step": 2080 }, { "epoch": 0.2700607313606409, "grad_norm": 0.43935465812683105, "learning_rate": 4.900689851333216e-05, "loss": 1.027, "num_input_tokens_seen": 17121280, "step": 2090 }, { "epoch": 0.271352887970022, "grad_norm": 0.3319588303565979, "learning_rate": 4.899743659875556e-05, "loss": 1.1295, "num_input_tokens_seen": 17203200, "step": 2100 }, { "epoch": 0.27264504457940303, "grad_norm": 0.5109902024269104, "learning_rate": 4.8987930744960355e-05, "loss": 1.3633, "num_input_tokens_seen": 17285120, "step": 2110 }, { "epoch": 0.27393720118878406, "grad_norm": 0.5351001620292664, "learning_rate": 4.897838096935174e-05, "loss": 1.1889, "num_input_tokens_seen": 17367040, "step": 2120 }, { "epoch": 0.27522935779816515, "grad_norm": 0.4509984850883484, "learning_rate": 4.896878728941531e-05, "loss": 0.927, "num_input_tokens_seen": 17448960, "step": 2130 }, { "epoch": 0.2765215144075462, "grad_norm": 0.7170317769050598, "learning_rate": 4.8959149722717057e-05, "loss": 0.7892, "num_input_tokens_seen": 17530880, "step": 2140 }, { "epoch": 0.2778136710169273, "grad_norm": 0.5630227327346802, "learning_rate": 4.894946828690334e-05, "loss": 1.0423, "num_input_tokens_seen": 17612800, "step": 2150 }, { "epoch": 0.2791058276263083, "grad_norm": 0.6323631405830383, "learning_rate": 4.893974299970082e-05, "loss": 1.2965, "num_input_tokens_seen": 17694720, "step": 2160 }, { "epoch": 0.28039798423568935, "grad_norm": 0.5983838438987732, "learning_rate": 4.892997387891648e-05, "loss": 0.9905, "num_input_tokens_seen": 17776640, "step": 2170 }, { "epoch": 0.28169014084507044, "grad_norm": 0.4904590845108032, "learning_rate": 4.892016094243753e-05, "loss": 1.1746, "num_input_tokens_seen": 17858560, "step": 2180 }, { "epoch": 0.28298229745445147, "grad_norm": 0.39486417174339294, "learning_rate": 4.891030420823142e-05, "loss": 0.818, "num_input_tokens_seen": 17940480, "step": 2190 }, { "epoch": 0.28427445406383256, "grad_norm": 0.48404547572135925, "learning_rate": 4.89004036943458e-05, "loss": 0.997, "num_input_tokens_seen": 18022400, "step": 2200 }, { "epoch": 0.2855666106732136, "grad_norm": 0.5132070183753967, "learning_rate": 4.8890459418908476e-05, "loss": 1.3072, "num_input_tokens_seen": 18104320, "step": 2210 }, { "epoch": 0.28685876728259463, "grad_norm": 0.5053073167800903, "learning_rate": 4.888047140012737e-05, "loss": 0.8763, "num_input_tokens_seen": 18186240, "step": 2220 }, { "epoch": 0.2881509238919757, "grad_norm": 0.4907422363758087, "learning_rate": 4.8870439656290525e-05, "loss": 0.8944, "num_input_tokens_seen": 18268160, "step": 2230 }, { "epoch": 0.28944308050135675, "grad_norm": 0.4988420307636261, "learning_rate": 4.8860364205766006e-05, "loss": 0.8613, "num_input_tokens_seen": 18350080, "step": 2240 }, { "epoch": 0.29073523711073784, "grad_norm": 0.6583264470100403, "learning_rate": 4.885024506700195e-05, "loss": 0.9404, "num_input_tokens_seen": 18432000, "step": 2250 }, { "epoch": 0.2920273937201189, "grad_norm": 0.4961051344871521, "learning_rate": 4.884008225852644e-05, "loss": 1.1465, "num_input_tokens_seen": 18513920, "step": 2260 }, { "epoch": 0.2933195503294999, "grad_norm": 0.62845778465271, "learning_rate": 4.8829875798947554e-05, "loss": 1.3089, "num_input_tokens_seen": 18595840, "step": 2270 }, { "epoch": 0.294611706938881, "grad_norm": 0.557864248752594, "learning_rate": 4.8819625706953286e-05, "loss": 1.0963, "num_input_tokens_seen": 18677760, "step": 2280 }, { "epoch": 0.29590386354826204, "grad_norm": 0.4075135886669159, "learning_rate": 4.88093320013115e-05, "loss": 0.6548, "num_input_tokens_seen": 18759680, "step": 2290 }, { "epoch": 0.2971960201576431, "grad_norm": 0.4397824704647064, "learning_rate": 4.879899470086995e-05, "loss": 0.7479, "num_input_tokens_seen": 18841600, "step": 2300 }, { "epoch": 0.29848817676702416, "grad_norm": 0.2952675521373749, "learning_rate": 4.8788613824556194e-05, "loss": 1.0112, "num_input_tokens_seen": 18923520, "step": 2310 }, { "epoch": 0.2997803333764052, "grad_norm": 0.46947795152664185, "learning_rate": 4.8778189391377574e-05, "loss": 1.3439, "num_input_tokens_seen": 19005440, "step": 2320 }, { "epoch": 0.3010724899857863, "grad_norm": 0.5178149342536926, "learning_rate": 4.876772142042117e-05, "loss": 1.0994, "num_input_tokens_seen": 19087360, "step": 2330 }, { "epoch": 0.3023646465951673, "grad_norm": 0.45673584938049316, "learning_rate": 4.875720993085384e-05, "loss": 1.2049, "num_input_tokens_seen": 19169280, "step": 2340 }, { "epoch": 0.3036568032045484, "grad_norm": 1.184935212135315, "learning_rate": 4.874665494192206e-05, "loss": 0.6961, "num_input_tokens_seen": 19251200, "step": 2350 }, { "epoch": 0.30494895981392944, "grad_norm": 0.4711746275424957, "learning_rate": 4.8736056472951955e-05, "loss": 0.8927, "num_input_tokens_seen": 19333120, "step": 2360 }, { "epoch": 0.3062411164233105, "grad_norm": 0.34465235471725464, "learning_rate": 4.8725414543349326e-05, "loss": 0.9728, "num_input_tokens_seen": 19415040, "step": 2370 }, { "epoch": 0.30753327303269157, "grad_norm": 0.7332857847213745, "learning_rate": 4.871472917259947e-05, "loss": 1.3036, "num_input_tokens_seen": 19496960, "step": 2380 }, { "epoch": 0.3088254296420726, "grad_norm": 0.4272123873233795, "learning_rate": 4.870400038026728e-05, "loss": 0.9335, "num_input_tokens_seen": 19578880, "step": 2390 }, { "epoch": 0.3101175862514537, "grad_norm": 0.47827640175819397, "learning_rate": 4.869322818599714e-05, "loss": 1.1586, "num_input_tokens_seen": 19660800, "step": 2400 }, { "epoch": 0.3114097428608347, "grad_norm": 0.5260847806930542, "learning_rate": 4.868241260951289e-05, "loss": 1.2544, "num_input_tokens_seen": 19742720, "step": 2410 }, { "epoch": 0.3127018994702158, "grad_norm": 0.5642371773719788, "learning_rate": 4.867155367061781e-05, "loss": 1.1045, "num_input_tokens_seen": 19824640, "step": 2420 }, { "epoch": 0.31399405607959685, "grad_norm": 0.2645663321018219, "learning_rate": 4.8660651389194576e-05, "loss": 0.5473, "num_input_tokens_seen": 19906560, "step": 2430 }, { "epoch": 0.3152862126889779, "grad_norm": 0.47711700201034546, "learning_rate": 4.8649705785205224e-05, "loss": 1.2962, "num_input_tokens_seen": 19988480, "step": 2440 }, { "epoch": 0.316578369298359, "grad_norm": 0.46864113211631775, "learning_rate": 4.8638716878691125e-05, "loss": 0.8808, "num_input_tokens_seen": 20070400, "step": 2450 }, { "epoch": 0.31787052590774, "grad_norm": 0.45676669478416443, "learning_rate": 4.862768468977293e-05, "loss": 1.2679, "num_input_tokens_seen": 20152320, "step": 2460 }, { "epoch": 0.3191626825171211, "grad_norm": 1.7806425094604492, "learning_rate": 4.861660923865052e-05, "loss": 0.9461, "num_input_tokens_seen": 20234240, "step": 2470 }, { "epoch": 0.32045483912650213, "grad_norm": 0.491797536611557, "learning_rate": 4.860549054560301e-05, "loss": 0.7236, "num_input_tokens_seen": 20316160, "step": 2480 }, { "epoch": 0.32174699573588317, "grad_norm": 0.46949175000190735, "learning_rate": 4.8594328630988696e-05, "loss": 1.2098, "num_input_tokens_seen": 20398080, "step": 2490 }, { "epoch": 0.32303915234526426, "grad_norm": 0.4671134948730469, "learning_rate": 4.858312351524499e-05, "loss": 1.2859, "num_input_tokens_seen": 20480000, "step": 2500 }, { "epoch": 0.3243313089546453, "grad_norm": 0.6649695634841919, "learning_rate": 4.857187521888843e-05, "loss": 0.9768, "num_input_tokens_seen": 20561920, "step": 2510 }, { "epoch": 0.3256234655640264, "grad_norm": 0.42521458864212036, "learning_rate": 4.8560583762514594e-05, "loss": 0.8317, "num_input_tokens_seen": 20643840, "step": 2520 }, { "epoch": 0.3269156221734074, "grad_norm": 0.6050983667373657, "learning_rate": 4.854924916679811e-05, "loss": 0.6581, "num_input_tokens_seen": 20725760, "step": 2530 }, { "epoch": 0.32820777878278845, "grad_norm": 0.6202778220176697, "learning_rate": 4.8537871452492565e-05, "loss": 1.248, "num_input_tokens_seen": 20807680, "step": 2540 }, { "epoch": 0.32949993539216954, "grad_norm": 0.4647495746612549, "learning_rate": 4.852645064043053e-05, "loss": 0.9416, "num_input_tokens_seen": 20889600, "step": 2550 }, { "epoch": 0.3307920920015506, "grad_norm": 0.5066781044006348, "learning_rate": 4.851498675152346e-05, "loss": 1.0115, "num_input_tokens_seen": 20971520, "step": 2560 }, { "epoch": 0.33208424861093166, "grad_norm": 0.252456933259964, "learning_rate": 4.8503479806761684e-05, "loss": 0.8474, "num_input_tokens_seen": 21053440, "step": 2570 }, { "epoch": 0.3333764052203127, "grad_norm": 0.4478939175605774, "learning_rate": 4.84919298272144e-05, "loss": 1.0834, "num_input_tokens_seen": 21135360, "step": 2580 }, { "epoch": 0.33466856182969373, "grad_norm": 0.6947191953659058, "learning_rate": 4.848033683402956e-05, "loss": 1.3049, "num_input_tokens_seen": 21217280, "step": 2590 }, { "epoch": 0.3359607184390748, "grad_norm": 0.7486204504966736, "learning_rate": 4.84687008484339e-05, "loss": 1.0012, "num_input_tokens_seen": 21299200, "step": 2600 }, { "epoch": 0.33725287504845586, "grad_norm": 0.2647557854652405, "learning_rate": 4.8457021891732866e-05, "loss": 0.6511, "num_input_tokens_seen": 21381120, "step": 2610 }, { "epoch": 0.33854503165783695, "grad_norm": 0.7009825110435486, "learning_rate": 4.844529998531058e-05, "loss": 0.715, "num_input_tokens_seen": 21463040, "step": 2620 }, { "epoch": 0.339837188267218, "grad_norm": 0.5106743574142456, "learning_rate": 4.843353515062982e-05, "loss": 0.9274, "num_input_tokens_seen": 21544960, "step": 2630 }, { "epoch": 0.341129344876599, "grad_norm": 0.26822468638420105, "learning_rate": 4.842172740923194e-05, "loss": 0.6468, "num_input_tokens_seen": 21626880, "step": 2640 }, { "epoch": 0.3424215014859801, "grad_norm": 0.7012847065925598, "learning_rate": 4.840987678273688e-05, "loss": 0.9614, "num_input_tokens_seen": 21708800, "step": 2650 }, { "epoch": 0.34371365809536114, "grad_norm": 0.4316296875476837, "learning_rate": 4.8397983292843095e-05, "loss": 0.8697, "num_input_tokens_seen": 21790720, "step": 2660 }, { "epoch": 0.34500581470474223, "grad_norm": 0.3868614137172699, "learning_rate": 4.838604696132753e-05, "loss": 0.7511, "num_input_tokens_seen": 21872640, "step": 2670 }, { "epoch": 0.34629797131412327, "grad_norm": 0.3558228313922882, "learning_rate": 4.837406781004554e-05, "loss": 1.081, "num_input_tokens_seen": 21954560, "step": 2680 }, { "epoch": 0.34759012792350436, "grad_norm": 0.47105276584625244, "learning_rate": 4.836204586093092e-05, "loss": 0.9772, "num_input_tokens_seen": 22036480, "step": 2690 }, { "epoch": 0.3488822845328854, "grad_norm": 0.3401098847389221, "learning_rate": 4.8349981135995826e-05, "loss": 0.7102, "num_input_tokens_seen": 22118400, "step": 2700 }, { "epoch": 0.3501744411422664, "grad_norm": 0.4399671256542206, "learning_rate": 4.833787365733071e-05, "loss": 0.8293, "num_input_tokens_seen": 22200320, "step": 2710 }, { "epoch": 0.3514665977516475, "grad_norm": 0.27914759516716003, "learning_rate": 4.832572344710433e-05, "loss": 0.6742, "num_input_tokens_seen": 22282240, "step": 2720 }, { "epoch": 0.35275875436102855, "grad_norm": 0.6157001256942749, "learning_rate": 4.831353052756367e-05, "loss": 1.1822, "num_input_tokens_seen": 22364160, "step": 2730 }, { "epoch": 0.35405091097040964, "grad_norm": 0.45960766077041626, "learning_rate": 4.830129492103392e-05, "loss": 1.0566, "num_input_tokens_seen": 22446080, "step": 2740 }, { "epoch": 0.3553430675797907, "grad_norm": 0.35881564021110535, "learning_rate": 4.828901664991845e-05, "loss": 0.8509, "num_input_tokens_seen": 22528000, "step": 2750 }, { "epoch": 0.3566352241891717, "grad_norm": 0.5361536145210266, "learning_rate": 4.8276695736698704e-05, "loss": 0.9067, "num_input_tokens_seen": 22609920, "step": 2760 }, { "epoch": 0.3579273807985528, "grad_norm": 0.39405712485313416, "learning_rate": 4.826433220393424e-05, "loss": 0.7381, "num_input_tokens_seen": 22691840, "step": 2770 }, { "epoch": 0.35921953740793383, "grad_norm": 0.42842912673950195, "learning_rate": 4.825192607426264e-05, "loss": 0.9375, "num_input_tokens_seen": 22773760, "step": 2780 }, { "epoch": 0.3605116940173149, "grad_norm": 0.5455546975135803, "learning_rate": 4.823947737039948e-05, "loss": 1.1993, "num_input_tokens_seen": 22855680, "step": 2790 }, { "epoch": 0.36180385062669596, "grad_norm": 0.6831130981445312, "learning_rate": 4.82269861151383e-05, "loss": 0.7907, "num_input_tokens_seen": 22937600, "step": 2800 }, { "epoch": 0.363096007236077, "grad_norm": 0.5363448858261108, "learning_rate": 4.821445233135053e-05, "loss": 1.0263, "num_input_tokens_seen": 23019520, "step": 2810 }, { "epoch": 0.3643881638454581, "grad_norm": 0.48520973324775696, "learning_rate": 4.8201876041985496e-05, "loss": 0.8387, "num_input_tokens_seen": 23101440, "step": 2820 }, { "epoch": 0.3656803204548391, "grad_norm": 0.48314452171325684, "learning_rate": 4.8189257270070335e-05, "loss": 1.1519, "num_input_tokens_seen": 23183360, "step": 2830 }, { "epoch": 0.3669724770642202, "grad_norm": 0.8397269248962402, "learning_rate": 4.817659603870995e-05, "loss": 0.8801, "num_input_tokens_seen": 23265280, "step": 2840 }, { "epoch": 0.36826463367360124, "grad_norm": 0.5130114555358887, "learning_rate": 4.8163892371087045e-05, "loss": 0.8017, "num_input_tokens_seen": 23347200, "step": 2850 }, { "epoch": 0.3695567902829823, "grad_norm": 0.653531551361084, "learning_rate": 4.815114629046196e-05, "loss": 0.9614, "num_input_tokens_seen": 23429120, "step": 2860 }, { "epoch": 0.37084894689236336, "grad_norm": 0.39914658665657043, "learning_rate": 4.813835782017274e-05, "loss": 0.93, "num_input_tokens_seen": 23511040, "step": 2870 }, { "epoch": 0.3721411035017444, "grad_norm": 0.6135099530220032, "learning_rate": 4.812552698363502e-05, "loss": 0.9083, "num_input_tokens_seen": 23592960, "step": 2880 }, { "epoch": 0.3734332601111255, "grad_norm": 0.3109150826931, "learning_rate": 4.8112653804342015e-05, "loss": 0.486, "num_input_tokens_seen": 23674880, "step": 2890 }, { "epoch": 0.3747254167205065, "grad_norm": 0.4596582353115082, "learning_rate": 4.809973830586446e-05, "loss": 1.176, "num_input_tokens_seen": 23756800, "step": 2900 }, { "epoch": 0.37601757332988756, "grad_norm": 0.4620276391506195, "learning_rate": 4.8086780511850606e-05, "loss": 1.007, "num_input_tokens_seen": 23838720, "step": 2910 }, { "epoch": 0.37730972993926865, "grad_norm": 0.5108826160430908, "learning_rate": 4.807378044602611e-05, "loss": 1.0682, "num_input_tokens_seen": 23920640, "step": 2920 }, { "epoch": 0.3786018865486497, "grad_norm": 0.48416051268577576, "learning_rate": 4.806073813219404e-05, "loss": 0.9145, "num_input_tokens_seen": 24002560, "step": 2930 }, { "epoch": 0.37989404315803077, "grad_norm": 0.4877118766307831, "learning_rate": 4.8047653594234855e-05, "loss": 0.6768, "num_input_tokens_seen": 24084480, "step": 2940 }, { "epoch": 0.3811861997674118, "grad_norm": 0.7045567631721497, "learning_rate": 4.803452685610626e-05, "loss": 1.3005, "num_input_tokens_seen": 24166400, "step": 2950 }, { "epoch": 0.3824783563767929, "grad_norm": 0.6394911408424377, "learning_rate": 4.802135794184329e-05, "loss": 0.9298, "num_input_tokens_seen": 24248320, "step": 2960 }, { "epoch": 0.38377051298617393, "grad_norm": 0.5416907072067261, "learning_rate": 4.800814687555817e-05, "loss": 0.7336, "num_input_tokens_seen": 24330240, "step": 2970 }, { "epoch": 0.38506266959555496, "grad_norm": 0.47221288084983826, "learning_rate": 4.799489368144031e-05, "loss": 1.221, "num_input_tokens_seen": 24412160, "step": 2980 }, { "epoch": 0.38635482620493605, "grad_norm": 0.49521034955978394, "learning_rate": 4.798159838375626e-05, "loss": 0.9109, "num_input_tokens_seen": 24494080, "step": 2990 }, { "epoch": 0.3876469828143171, "grad_norm": 0.4463275671005249, "learning_rate": 4.796826100684967e-05, "loss": 0.6901, "num_input_tokens_seen": 24576000, "step": 3000 }, { "epoch": 0.3889391394236982, "grad_norm": 0.46501803398132324, "learning_rate": 4.795488157514122e-05, "loss": 1.0645, "num_input_tokens_seen": 24657920, "step": 3010 }, { "epoch": 0.3902312960330792, "grad_norm": 0.40190380811691284, "learning_rate": 4.794146011312861e-05, "loss": 1.0953, "num_input_tokens_seen": 24739840, "step": 3020 }, { "epoch": 0.39152345264246025, "grad_norm": 0.5038986206054688, "learning_rate": 4.7927996645386476e-05, "loss": 1.2322, "num_input_tokens_seen": 24821760, "step": 3030 }, { "epoch": 0.39281560925184134, "grad_norm": 0.4735121428966522, "learning_rate": 4.791449119656638e-05, "loss": 0.7708, "num_input_tokens_seen": 24903680, "step": 3040 }, { "epoch": 0.39410776586122237, "grad_norm": 0.7169963121414185, "learning_rate": 4.790094379139676e-05, "loss": 0.8159, "num_input_tokens_seen": 24985600, "step": 3050 }, { "epoch": 0.39539992247060346, "grad_norm": 0.9569246172904968, "learning_rate": 4.7887354454682854e-05, "loss": 0.7697, "num_input_tokens_seen": 25067520, "step": 3060 }, { "epoch": 0.3966920790799845, "grad_norm": 0.5772688388824463, "learning_rate": 4.78737232113067e-05, "loss": 1.0574, "num_input_tokens_seen": 25149440, "step": 3070 }, { "epoch": 0.39798423568936553, "grad_norm": 0.4956931173801422, "learning_rate": 4.7860050086227035e-05, "loss": 1.0678, "num_input_tokens_seen": 25231360, "step": 3080 }, { "epoch": 0.3992763922987466, "grad_norm": 0.39392775297164917, "learning_rate": 4.784633510447932e-05, "loss": 0.9143, "num_input_tokens_seen": 25313280, "step": 3090 }, { "epoch": 0.40056854890812765, "grad_norm": 0.40444672107696533, "learning_rate": 4.7832578291175626e-05, "loss": 0.8812, "num_input_tokens_seen": 25395200, "step": 3100 }, { "epoch": 0.40186070551750874, "grad_norm": 0.7210461497306824, "learning_rate": 4.781877967150463e-05, "loss": 0.6671, "num_input_tokens_seen": 25477120, "step": 3110 }, { "epoch": 0.4031528621268898, "grad_norm": 0.6195704340934753, "learning_rate": 4.7804939270731564e-05, "loss": 1.0019, "num_input_tokens_seen": 25559040, "step": 3120 }, { "epoch": 0.4044450187362708, "grad_norm": 0.42882412672042847, "learning_rate": 4.7791057114198133e-05, "loss": 0.799, "num_input_tokens_seen": 25640960, "step": 3130 }, { "epoch": 0.4057371753456519, "grad_norm": 0.3896157741546631, "learning_rate": 4.7777133227322525e-05, "loss": 1.0606, "num_input_tokens_seen": 25722880, "step": 3140 }, { "epoch": 0.40702933195503294, "grad_norm": 0.3924260139465332, "learning_rate": 4.776316763559933e-05, "loss": 0.8224, "num_input_tokens_seen": 25804800, "step": 3150 }, { "epoch": 0.408321488564414, "grad_norm": 0.44281336665153503, "learning_rate": 4.774916036459949e-05, "loss": 0.8995, "num_input_tokens_seen": 25886720, "step": 3160 }, { "epoch": 0.40961364517379506, "grad_norm": 0.5028753280639648, "learning_rate": 4.773511143997026e-05, "loss": 1.1617, "num_input_tokens_seen": 25968640, "step": 3170 }, { "epoch": 0.4109058017831761, "grad_norm": 0.6678476929664612, "learning_rate": 4.7721020887435186e-05, "loss": 0.8537, "num_input_tokens_seen": 26050560, "step": 3180 }, { "epoch": 0.4121979583925572, "grad_norm": 0.6059293746948242, "learning_rate": 4.7706888732793996e-05, "loss": 1.5184, "num_input_tokens_seen": 26132480, "step": 3190 }, { "epoch": 0.4134901150019382, "grad_norm": 0.46364808082580566, "learning_rate": 4.769271500192264e-05, "loss": 1.1, "num_input_tokens_seen": 26214400, "step": 3200 }, { "epoch": 0.4147822716113193, "grad_norm": 0.6045968532562256, "learning_rate": 4.767849972077315e-05, "loss": 1.0147, "num_input_tokens_seen": 26296320, "step": 3210 }, { "epoch": 0.41607442822070034, "grad_norm": 0.6123097538948059, "learning_rate": 4.766424291537366e-05, "loss": 1.0684, "num_input_tokens_seen": 26378240, "step": 3220 }, { "epoch": 0.41736658483008143, "grad_norm": 0.3833664059638977, "learning_rate": 4.7649944611828316e-05, "loss": 0.7629, "num_input_tokens_seen": 26460160, "step": 3230 }, { "epoch": 0.41865874143946247, "grad_norm": 0.5847949981689453, "learning_rate": 4.763560483631728e-05, "loss": 0.8127, "num_input_tokens_seen": 26542080, "step": 3240 }, { "epoch": 0.4199508980488435, "grad_norm": 0.5139411687850952, "learning_rate": 4.762122361509662e-05, "loss": 1.0787, "num_input_tokens_seen": 26624000, "step": 3250 }, { "epoch": 0.4212430546582246, "grad_norm": 0.43457046151161194, "learning_rate": 4.7606800974498287e-05, "loss": 1.089, "num_input_tokens_seen": 26705920, "step": 3260 }, { "epoch": 0.4225352112676056, "grad_norm": 0.5299356579780579, "learning_rate": 4.75923369409301e-05, "loss": 0.7698, "num_input_tokens_seen": 26787840, "step": 3270 }, { "epoch": 0.4238273678769867, "grad_norm": 0.5959427952766418, "learning_rate": 4.757783154087564e-05, "loss": 1.2242, "num_input_tokens_seen": 26869760, "step": 3280 }, { "epoch": 0.42511952448636775, "grad_norm": 0.5723779201507568, "learning_rate": 4.756328480089425e-05, "loss": 1.2536, "num_input_tokens_seen": 26951680, "step": 3290 }, { "epoch": 0.4264116810957488, "grad_norm": 0.4997044503688812, "learning_rate": 4.7548696747620956e-05, "loss": 1.0486, "num_input_tokens_seen": 27033600, "step": 3300 }, { "epoch": 0.4277038377051299, "grad_norm": 0.4314401149749756, "learning_rate": 4.753406740776643e-05, "loss": 1.1756, "num_input_tokens_seen": 27115520, "step": 3310 }, { "epoch": 0.4289959943145109, "grad_norm": 0.5475839376449585, "learning_rate": 4.7519396808116933e-05, "loss": 1.2511, "num_input_tokens_seen": 27197440, "step": 3320 }, { "epoch": 0.430288150923892, "grad_norm": 0.4945213794708252, "learning_rate": 4.750468497553429e-05, "loss": 0.8987, "num_input_tokens_seen": 27279360, "step": 3330 }, { "epoch": 0.43158030753327303, "grad_norm": 0.4221036434173584, "learning_rate": 4.74899319369558e-05, "loss": 1.0922, "num_input_tokens_seen": 27361280, "step": 3340 }, { "epoch": 0.43287246414265407, "grad_norm": 0.40295591950416565, "learning_rate": 4.7475137719394234e-05, "loss": 1.028, "num_input_tokens_seen": 27443200, "step": 3350 }, { "epoch": 0.43416462075203516, "grad_norm": 0.42078787088394165, "learning_rate": 4.746030234993775e-05, "loss": 0.7594, "num_input_tokens_seen": 27525120, "step": 3360 }, { "epoch": 0.4354567773614162, "grad_norm": 0.5644673109054565, "learning_rate": 4.7445425855749844e-05, "loss": 1.21, "num_input_tokens_seen": 27607040, "step": 3370 }, { "epoch": 0.4367489339707973, "grad_norm": 0.9208412766456604, "learning_rate": 4.743050826406934e-05, "loss": 0.8709, "num_input_tokens_seen": 27688960, "step": 3380 }, { "epoch": 0.4380410905801783, "grad_norm": 0.43149518966674805, "learning_rate": 4.741554960221027e-05, "loss": 0.8737, "num_input_tokens_seen": 27770880, "step": 3390 }, { "epoch": 0.43933324718955935, "grad_norm": 0.45487073063850403, "learning_rate": 4.7400549897561914e-05, "loss": 1.2981, "num_input_tokens_seen": 27852800, "step": 3400 }, { "epoch": 0.44062540379894044, "grad_norm": 0.587581217288971, "learning_rate": 4.7385509177588664e-05, "loss": 0.7498, "num_input_tokens_seen": 27934720, "step": 3410 }, { "epoch": 0.4419175604083215, "grad_norm": 15.474299430847168, "learning_rate": 4.7370427469830016e-05, "loss": 1.5585, "num_input_tokens_seen": 28016640, "step": 3420 }, { "epoch": 0.44320971701770256, "grad_norm": 0.41457799077033997, "learning_rate": 4.735530480190053e-05, "loss": 1.2198, "num_input_tokens_seen": 28098560, "step": 3430 }, { "epoch": 0.4445018736270836, "grad_norm": 0.41844090819358826, "learning_rate": 4.734014120148976e-05, "loss": 0.7208, "num_input_tokens_seen": 28180480, "step": 3440 }, { "epoch": 0.44579403023646463, "grad_norm": 0.3628556728363037, "learning_rate": 4.73249366963622e-05, "loss": 0.8981, "num_input_tokens_seen": 28262400, "step": 3450 }, { "epoch": 0.4470861868458457, "grad_norm": 0.6861358284950256, "learning_rate": 4.730969131435724e-05, "loss": 0.9347, "num_input_tokens_seen": 28344320, "step": 3460 }, { "epoch": 0.44837834345522676, "grad_norm": 0.48623812198638916, "learning_rate": 4.729440508338911e-05, "loss": 1.1323, "num_input_tokens_seen": 28426240, "step": 3470 }, { "epoch": 0.44967050006460785, "grad_norm": 0.41574108600616455, "learning_rate": 4.727907803144686e-05, "loss": 1.1095, "num_input_tokens_seen": 28508160, "step": 3480 }, { "epoch": 0.4509626566739889, "grad_norm": 0.5009231567382812, "learning_rate": 4.726371018659427e-05, "loss": 1.2198, "num_input_tokens_seen": 28590080, "step": 3490 }, { "epoch": 0.45225481328336997, "grad_norm": 0.48607850074768066, "learning_rate": 4.724830157696979e-05, "loss": 1.078, "num_input_tokens_seen": 28672000, "step": 3500 }, { "epoch": 0.453546969892751, "grad_norm": 0.8269723653793335, "learning_rate": 4.723285223078653e-05, "loss": 0.9332, "num_input_tokens_seen": 28753920, "step": 3510 }, { "epoch": 0.45483912650213204, "grad_norm": 0.5304045677185059, "learning_rate": 4.721736217633219e-05, "loss": 1.3173, "num_input_tokens_seen": 28835840, "step": 3520 }, { "epoch": 0.45613128311151313, "grad_norm": 0.4949178695678711, "learning_rate": 4.7201831441969016e-05, "loss": 0.9488, "num_input_tokens_seen": 28917760, "step": 3530 }, { "epoch": 0.45742343972089416, "grad_norm": 0.742077112197876, "learning_rate": 4.71862600561337e-05, "loss": 0.8988, "num_input_tokens_seen": 28999680, "step": 3540 }, { "epoch": 0.45871559633027525, "grad_norm": 0.6357890963554382, "learning_rate": 4.7170648047337415e-05, "loss": 1.401, "num_input_tokens_seen": 29081600, "step": 3550 }, { "epoch": 0.4600077529396563, "grad_norm": 0.512546956539154, "learning_rate": 4.7154995444165685e-05, "loss": 0.768, "num_input_tokens_seen": 29163520, "step": 3560 }, { "epoch": 0.4612999095490373, "grad_norm": 0.4622116684913635, "learning_rate": 4.713930227527836e-05, "loss": 0.8524, "num_input_tokens_seen": 29245440, "step": 3570 }, { "epoch": 0.4625920661584184, "grad_norm": 0.620832085609436, "learning_rate": 4.712356856940958e-05, "loss": 0.8993, "num_input_tokens_seen": 29327360, "step": 3580 }, { "epoch": 0.46388422276779945, "grad_norm": 0.4882233142852783, "learning_rate": 4.710779435536772e-05, "loss": 0.7759, "num_input_tokens_seen": 29409280, "step": 3590 }, { "epoch": 0.46517637937718054, "grad_norm": 0.39704984426498413, "learning_rate": 4.709197966203528e-05, "loss": 0.7109, "num_input_tokens_seen": 29491200, "step": 3600 }, { "epoch": 0.46646853598656157, "grad_norm": 0.6212629079818726, "learning_rate": 4.707612451836892e-05, "loss": 1.2732, "num_input_tokens_seen": 29573120, "step": 3610 }, { "epoch": 0.4677606925959426, "grad_norm": 0.4958352744579315, "learning_rate": 4.706022895339936e-05, "loss": 1.0464, "num_input_tokens_seen": 29655040, "step": 3620 }, { "epoch": 0.4690528492053237, "grad_norm": 0.5100958943367004, "learning_rate": 4.704429299623129e-05, "loss": 0.8741, "num_input_tokens_seen": 29736960, "step": 3630 }, { "epoch": 0.47034500581470473, "grad_norm": 0.4343372583389282, "learning_rate": 4.7028316676043425e-05, "loss": 0.9055, "num_input_tokens_seen": 29818880, "step": 3640 }, { "epoch": 0.4716371624240858, "grad_norm": 0.5606836676597595, "learning_rate": 4.7012300022088326e-05, "loss": 0.9934, "num_input_tokens_seen": 29900800, "step": 3650 }, { "epoch": 0.47292931903346686, "grad_norm": 0.5697008371353149, "learning_rate": 4.6996243063692446e-05, "loss": 1.0764, "num_input_tokens_seen": 29982720, "step": 3660 }, { "epoch": 0.4742214756428479, "grad_norm": 0.9284478425979614, "learning_rate": 4.6980145830255993e-05, "loss": 0.5566, "num_input_tokens_seen": 30064640, "step": 3670 }, { "epoch": 0.475513632252229, "grad_norm": 0.48587319254875183, "learning_rate": 4.6964008351252964e-05, "loss": 0.9627, "num_input_tokens_seen": 30146560, "step": 3680 }, { "epoch": 0.47680578886161, "grad_norm": 0.6687888503074646, "learning_rate": 4.694783065623102e-05, "loss": 0.8859, "num_input_tokens_seen": 30228480, "step": 3690 }, { "epoch": 0.4780979454709911, "grad_norm": 0.6682938933372498, "learning_rate": 4.6931612774811445e-05, "loss": 1.1395, "num_input_tokens_seen": 30310400, "step": 3700 }, { "epoch": 0.47939010208037214, "grad_norm": 0.5708907246589661, "learning_rate": 4.691535473668914e-05, "loss": 0.7113, "num_input_tokens_seen": 30392320, "step": 3710 }, { "epoch": 0.4806822586897532, "grad_norm": 0.5752365589141846, "learning_rate": 4.68990565716325e-05, "loss": 1.049, "num_input_tokens_seen": 30474240, "step": 3720 }, { "epoch": 0.48197441529913426, "grad_norm": 0.5052396059036255, "learning_rate": 4.688271830948342e-05, "loss": 1.1382, "num_input_tokens_seen": 30556160, "step": 3730 }, { "epoch": 0.4832665719085153, "grad_norm": 0.587308406829834, "learning_rate": 4.686633998015718e-05, "loss": 0.8307, "num_input_tokens_seen": 30638080, "step": 3740 }, { "epoch": 0.4845587285178964, "grad_norm": 0.6255913972854614, "learning_rate": 4.6849921613642456e-05, "loss": 1.0814, "num_input_tokens_seen": 30720000, "step": 3750 }, { "epoch": 0.4858508851272774, "grad_norm": 0.5788549184799194, "learning_rate": 4.683346324000122e-05, "loss": 1.0745, "num_input_tokens_seen": 30801920, "step": 3760 }, { "epoch": 0.48714304173665846, "grad_norm": 0.45166078209877014, "learning_rate": 4.6816964889368674e-05, "loss": 1.0342, "num_input_tokens_seen": 30883840, "step": 3770 }, { "epoch": 0.48843519834603955, "grad_norm": 0.38736027479171753, "learning_rate": 4.680042659195325e-05, "loss": 1.5249, "num_input_tokens_seen": 30965760, "step": 3780 }, { "epoch": 0.4897273549554206, "grad_norm": 0.5221673846244812, "learning_rate": 4.678384837803651e-05, "loss": 0.9, "num_input_tokens_seen": 31047680, "step": 3790 }, { "epoch": 0.49101951156480167, "grad_norm": 0.5739164352416992, "learning_rate": 4.67672302779731e-05, "loss": 0.835, "num_input_tokens_seen": 31129600, "step": 3800 }, { "epoch": 0.4923116681741827, "grad_norm": 0.6319538950920105, "learning_rate": 4.6750572322190716e-05, "loss": 1.2393, "num_input_tokens_seen": 31211520, "step": 3810 }, { "epoch": 0.4936038247835638, "grad_norm": 0.8037749528884888, "learning_rate": 4.673387454118999e-05, "loss": 0.5902, "num_input_tokens_seen": 31293440, "step": 3820 }, { "epoch": 0.49489598139294483, "grad_norm": 0.5781430006027222, "learning_rate": 4.671713696554452e-05, "loss": 0.8908, "num_input_tokens_seen": 31375360, "step": 3830 }, { "epoch": 0.49618813800232586, "grad_norm": 0.5275561213493347, "learning_rate": 4.6700359625900724e-05, "loss": 0.9977, "num_input_tokens_seen": 31457280, "step": 3840 }, { "epoch": 0.49748029461170695, "grad_norm": 0.7814369201660156, "learning_rate": 4.668354255297785e-05, "loss": 0.8617, "num_input_tokens_seen": 31539200, "step": 3850 }, { "epoch": 0.498772451221088, "grad_norm": 0.26794329285621643, "learning_rate": 4.666668577756793e-05, "loss": 0.7011, "num_input_tokens_seen": 31621120, "step": 3860 }, { "epoch": 0.500064607830469, "grad_norm": 0.4963877201080322, "learning_rate": 4.664978933053562e-05, "loss": 0.8713, "num_input_tokens_seen": 31703040, "step": 3870 }, { "epoch": 0.5013567644398501, "grad_norm": 0.748673141002655, "learning_rate": 4.6632853242818274e-05, "loss": 1.0985, "num_input_tokens_seen": 31784960, "step": 3880 }, { "epoch": 0.5026489210492312, "grad_norm": 0.2532234489917755, "learning_rate": 4.66158775454258e-05, "loss": 0.8102, "num_input_tokens_seen": 31866880, "step": 3890 }, { "epoch": 0.5039410776586122, "grad_norm": 0.4093916416168213, "learning_rate": 4.659886226944063e-05, "loss": 1.0378, "num_input_tokens_seen": 31948800, "step": 3900 }, { "epoch": 0.5052332342679933, "grad_norm": 0.656688392162323, "learning_rate": 4.658180744601769e-05, "loss": 0.9426, "num_input_tokens_seen": 32030720, "step": 3910 }, { "epoch": 0.5065253908773744, "grad_norm": 0.7212385535240173, "learning_rate": 4.6564713106384296e-05, "loss": 1.1089, "num_input_tokens_seen": 32112640, "step": 3920 }, { "epoch": 0.5078175474867554, "grad_norm": 0.5661940574645996, "learning_rate": 4.65475792818401e-05, "loss": 1.0339, "num_input_tokens_seen": 32194560, "step": 3930 }, { "epoch": 0.5091097040961364, "grad_norm": 0.5363628268241882, "learning_rate": 4.653040600375709e-05, "loss": 1.1407, "num_input_tokens_seen": 32276480, "step": 3940 }, { "epoch": 0.5104018607055175, "grad_norm": 0.29208219051361084, "learning_rate": 4.6513193303579476e-05, "loss": 1.1492, "num_input_tokens_seen": 32358400, "step": 3950 }, { "epoch": 0.5116940173148986, "grad_norm": 0.7336292862892151, "learning_rate": 4.6495941212823644e-05, "loss": 0.8435, "num_input_tokens_seen": 32440320, "step": 3960 }, { "epoch": 0.5129861739242796, "grad_norm": 0.45039644837379456, "learning_rate": 4.647864976307811e-05, "loss": 0.6948, "num_input_tokens_seen": 32522240, "step": 3970 }, { "epoch": 0.5142783305336607, "grad_norm": 0.4214424788951874, "learning_rate": 4.646131898600345e-05, "loss": 0.8107, "num_input_tokens_seen": 32604160, "step": 3980 }, { "epoch": 0.5155704871430418, "grad_norm": 0.5167589783668518, "learning_rate": 4.644394891333227e-05, "loss": 1.0497, "num_input_tokens_seen": 32686080, "step": 3990 }, { "epoch": 0.5168626437524227, "grad_norm": 0.5262385010719299, "learning_rate": 4.64265395768691e-05, "loss": 0.988, "num_input_tokens_seen": 32768000, "step": 4000 }, { "epoch": 0.5181548003618038, "grad_norm": 0.464677631855011, "learning_rate": 4.6409091008490365e-05, "loss": 0.9792, "num_input_tokens_seen": 32849920, "step": 4010 }, { "epoch": 0.5194469569711849, "grad_norm": 0.4901339113712311, "learning_rate": 4.639160324014433e-05, "loss": 0.7922, "num_input_tokens_seen": 32931840, "step": 4020 }, { "epoch": 0.520739113580566, "grad_norm": 0.47087928652763367, "learning_rate": 4.637407630385104e-05, "loss": 1.2068, "num_input_tokens_seen": 33013760, "step": 4030 }, { "epoch": 0.522031270189947, "grad_norm": 0.510374128818512, "learning_rate": 4.6356510231702254e-05, "loss": 0.9503, "num_input_tokens_seen": 33095680, "step": 4040 }, { "epoch": 0.5233234267993281, "grad_norm": 0.5987057685852051, "learning_rate": 4.633890505586139e-05, "loss": 0.8793, "num_input_tokens_seen": 33177600, "step": 4050 }, { "epoch": 0.5246155834087092, "grad_norm": 0.5550726652145386, "learning_rate": 4.6321260808563445e-05, "loss": 1.0048, "num_input_tokens_seen": 33259520, "step": 4060 }, { "epoch": 0.5259077400180902, "grad_norm": 0.727328896522522, "learning_rate": 4.630357752211498e-05, "loss": 0.8116, "num_input_tokens_seen": 33341440, "step": 4070 }, { "epoch": 0.5271998966274712, "grad_norm": 1.1229157447814941, "learning_rate": 4.6285855228894025e-05, "loss": 1.202, "num_input_tokens_seen": 33423360, "step": 4080 }, { "epoch": 0.5284920532368523, "grad_norm": 0.8320699334144592, "learning_rate": 4.626809396135003e-05, "loss": 1.2678, "num_input_tokens_seen": 33505280, "step": 4090 }, { "epoch": 0.5297842098462333, "grad_norm": 0.3679317533969879, "learning_rate": 4.6250293752003834e-05, "loss": 1.052, "num_input_tokens_seen": 33587200, "step": 4100 }, { "epoch": 0.5310763664556144, "grad_norm": 0.5202254056930542, "learning_rate": 4.623245463344753e-05, "loss": 0.9749, "num_input_tokens_seen": 33669120, "step": 4110 }, { "epoch": 0.5323685230649955, "grad_norm": 0.6384007930755615, "learning_rate": 4.6214576638344484e-05, "loss": 0.9361, "num_input_tokens_seen": 33751040, "step": 4120 }, { "epoch": 0.5336606796743766, "grad_norm": 0.39264151453971863, "learning_rate": 4.619665979942924e-05, "loss": 1.0865, "num_input_tokens_seen": 33832960, "step": 4130 }, { "epoch": 0.5349528362837576, "grad_norm": 0.5233849287033081, "learning_rate": 4.617870414950748e-05, "loss": 0.8994, "num_input_tokens_seen": 33914880, "step": 4140 }, { "epoch": 0.5362449928931387, "grad_norm": 0.4549662172794342, "learning_rate": 4.616070972145591e-05, "loss": 1.0843, "num_input_tokens_seen": 33996800, "step": 4150 }, { "epoch": 0.5375371495025197, "grad_norm": 0.5111933350563049, "learning_rate": 4.614267654822228e-05, "loss": 1.1799, "num_input_tokens_seen": 34078720, "step": 4160 }, { "epoch": 0.5388293061119007, "grad_norm": 0.49319344758987427, "learning_rate": 4.612460466282525e-05, "loss": 1.018, "num_input_tokens_seen": 34160640, "step": 4170 }, { "epoch": 0.5401214627212818, "grad_norm": 0.5339105725288391, "learning_rate": 4.610649409835438e-05, "loss": 1.2698, "num_input_tokens_seen": 34242560, "step": 4180 }, { "epoch": 0.5414136193306629, "grad_norm": 0.6269609928131104, "learning_rate": 4.608834488797006e-05, "loss": 1.3887, "num_input_tokens_seen": 34324480, "step": 4190 }, { "epoch": 0.542705775940044, "grad_norm": 0.5303708910942078, "learning_rate": 4.607015706490341e-05, "loss": 0.983, "num_input_tokens_seen": 34406400, "step": 4200 }, { "epoch": 0.543997932549425, "grad_norm": 0.3554169237613678, "learning_rate": 4.6051930662456276e-05, "loss": 0.8547, "num_input_tokens_seen": 34488320, "step": 4210 }, { "epoch": 0.5452900891588061, "grad_norm": 0.6087794899940491, "learning_rate": 4.603366571400114e-05, "loss": 1.1971, "num_input_tokens_seen": 34570240, "step": 4220 }, { "epoch": 0.5465822457681871, "grad_norm": 0.47708624601364136, "learning_rate": 4.601536225298104e-05, "loss": 0.927, "num_input_tokens_seen": 34652160, "step": 4230 }, { "epoch": 0.5478744023775681, "grad_norm": 0.2548501789569855, "learning_rate": 4.5997020312909565e-05, "loss": 0.8928, "num_input_tokens_seen": 34734080, "step": 4240 }, { "epoch": 0.5491665589869492, "grad_norm": 0.3759148418903351, "learning_rate": 4.597863992737072e-05, "loss": 0.9546, "num_input_tokens_seen": 34816000, "step": 4250 }, { "epoch": 0.5504587155963303, "grad_norm": 0.5236682891845703, "learning_rate": 4.5960221130018946e-05, "loss": 1.1029, "num_input_tokens_seen": 34897920, "step": 4260 }, { "epoch": 0.5517508722057113, "grad_norm": 0.5889397263526917, "learning_rate": 4.594176395457897e-05, "loss": 1.1309, "num_input_tokens_seen": 34979840, "step": 4270 }, { "epoch": 0.5530430288150924, "grad_norm": 0.5856964588165283, "learning_rate": 4.592326843484583e-05, "loss": 0.8424, "num_input_tokens_seen": 35061760, "step": 4280 }, { "epoch": 0.5543351854244735, "grad_norm": 0.5260830521583557, "learning_rate": 4.590473460468475e-05, "loss": 1.2382, "num_input_tokens_seen": 35143680, "step": 4290 }, { "epoch": 0.5556273420338546, "grad_norm": 0.5527287721633911, "learning_rate": 4.58861624980311e-05, "loss": 1.0987, "num_input_tokens_seen": 35225600, "step": 4300 }, { "epoch": 0.5569194986432355, "grad_norm": 0.6066960096359253, "learning_rate": 4.586755214889035e-05, "loss": 0.6929, "num_input_tokens_seen": 35307520, "step": 4310 }, { "epoch": 0.5582116552526166, "grad_norm": 0.8027804493904114, "learning_rate": 4.584890359133797e-05, "loss": 0.9671, "num_input_tokens_seen": 35389440, "step": 4320 }, { "epoch": 0.5595038118619977, "grad_norm": 0.5972751975059509, "learning_rate": 4.58302168595194e-05, "loss": 0.7893, "num_input_tokens_seen": 35471360, "step": 4330 }, { "epoch": 0.5607959684713787, "grad_norm": 0.36377179622650146, "learning_rate": 4.5811491987649994e-05, "loss": 0.8821, "num_input_tokens_seen": 35553280, "step": 4340 }, { "epoch": 0.5620881250807598, "grad_norm": 0.5296213626861572, "learning_rate": 4.579272901001491e-05, "loss": 0.8082, "num_input_tokens_seen": 35635200, "step": 4350 }, { "epoch": 0.5633802816901409, "grad_norm": 0.4613911509513855, "learning_rate": 4.57739279609691e-05, "loss": 0.9638, "num_input_tokens_seen": 35717120, "step": 4360 }, { "epoch": 0.5646724382995219, "grad_norm": 0.5832899212837219, "learning_rate": 4.57550888749372e-05, "loss": 1.0087, "num_input_tokens_seen": 35799040, "step": 4370 }, { "epoch": 0.5659645949089029, "grad_norm": 0.461342453956604, "learning_rate": 4.5736211786413524e-05, "loss": 1.0578, "num_input_tokens_seen": 35880960, "step": 4380 }, { "epoch": 0.567256751518284, "grad_norm": 0.5198622941970825, "learning_rate": 4.571729672996195e-05, "loss": 0.5186, "num_input_tokens_seen": 35962880, "step": 4390 }, { "epoch": 0.5685489081276651, "grad_norm": 0.4189557135105133, "learning_rate": 4.5698343740215865e-05, "loss": 1.0137, "num_input_tokens_seen": 36044800, "step": 4400 }, { "epoch": 0.5698410647370461, "grad_norm": 0.4138655364513397, "learning_rate": 4.5679352851878135e-05, "loss": 0.91, "num_input_tokens_seen": 36126720, "step": 4410 }, { "epoch": 0.5711332213464272, "grad_norm": 0.327007532119751, "learning_rate": 4.5660324099721005e-05, "loss": 0.8943, "num_input_tokens_seen": 36208640, "step": 4420 }, { "epoch": 0.5724253779558083, "grad_norm": 0.48592689633369446, "learning_rate": 4.5641257518586044e-05, "loss": 0.8402, "num_input_tokens_seen": 36290560, "step": 4430 }, { "epoch": 0.5737175345651893, "grad_norm": 0.5149102210998535, "learning_rate": 4.562215314338411e-05, "loss": 0.8945, "num_input_tokens_seen": 36372480, "step": 4440 }, { "epoch": 0.5750096911745703, "grad_norm": 0.47241947054862976, "learning_rate": 4.560301100909522e-05, "loss": 1.3013, "num_input_tokens_seen": 36454400, "step": 4450 }, { "epoch": 0.5763018477839514, "grad_norm": 0.6244943737983704, "learning_rate": 4.558383115076857e-05, "loss": 0.8028, "num_input_tokens_seen": 36536320, "step": 4460 }, { "epoch": 0.5775940043933324, "grad_norm": 0.42100536823272705, "learning_rate": 4.556461360352241e-05, "loss": 1.1116, "num_input_tokens_seen": 36618240, "step": 4470 }, { "epoch": 0.5788861610027135, "grad_norm": 0.5363230109214783, "learning_rate": 4.554535840254398e-05, "loss": 1.094, "num_input_tokens_seen": 36700160, "step": 4480 }, { "epoch": 0.5801783176120946, "grad_norm": 0.6321589350700378, "learning_rate": 4.552606558308951e-05, "loss": 0.9717, "num_input_tokens_seen": 36782080, "step": 4490 }, { "epoch": 0.5814704742214757, "grad_norm": 0.7036007046699524, "learning_rate": 4.550673518048405e-05, "loss": 1.0816, "num_input_tokens_seen": 36864000, "step": 4500 }, { "epoch": 0.5827626308308567, "grad_norm": 0.4853123724460602, "learning_rate": 4.548736723012153e-05, "loss": 0.9271, "num_input_tokens_seen": 36945920, "step": 4510 }, { "epoch": 0.5840547874402378, "grad_norm": 0.7495065331459045, "learning_rate": 4.5467961767464575e-05, "loss": 0.9318, "num_input_tokens_seen": 37027840, "step": 4520 }, { "epoch": 0.5853469440496188, "grad_norm": 0.43084239959716797, "learning_rate": 4.5448518828044515e-05, "loss": 0.7255, "num_input_tokens_seen": 37109760, "step": 4530 }, { "epoch": 0.5866391006589998, "grad_norm": 0.560870885848999, "learning_rate": 4.5429038447461315e-05, "loss": 1.1148, "num_input_tokens_seen": 37191680, "step": 4540 }, { "epoch": 0.5879312572683809, "grad_norm": 0.5736927390098572, "learning_rate": 4.540952066138347e-05, "loss": 1.1077, "num_input_tokens_seen": 37273600, "step": 4550 }, { "epoch": 0.589223413877762, "grad_norm": 0.27289411425590515, "learning_rate": 4.538996550554798e-05, "loss": 0.923, "num_input_tokens_seen": 37355520, "step": 4560 }, { "epoch": 0.5905155704871431, "grad_norm": 0.4996427893638611, "learning_rate": 4.537037301576026e-05, "loss": 0.9954, "num_input_tokens_seen": 37437440, "step": 4570 }, { "epoch": 0.5918077270965241, "grad_norm": 0.4833759367465973, "learning_rate": 4.535074322789408e-05, "loss": 0.9237, "num_input_tokens_seen": 37519360, "step": 4580 }, { "epoch": 0.5930998837059052, "grad_norm": 0.20252840220928192, "learning_rate": 4.5331076177891527e-05, "loss": 1.0009, "num_input_tokens_seen": 37601280, "step": 4590 }, { "epoch": 0.5943920403152863, "grad_norm": 0.49745073914527893, "learning_rate": 4.531137190176289e-05, "loss": 1.0067, "num_input_tokens_seen": 37683200, "step": 4600 }, { "epoch": 0.5956841969246672, "grad_norm": 0.5191871523857117, "learning_rate": 4.529163043558662e-05, "loss": 0.7292, "num_input_tokens_seen": 37765120, "step": 4610 }, { "epoch": 0.5969763535340483, "grad_norm": 0.7261500954627991, "learning_rate": 4.527185181550928e-05, "loss": 1.0878, "num_input_tokens_seen": 37847040, "step": 4620 }, { "epoch": 0.5982685101434294, "grad_norm": 0.6851155757904053, "learning_rate": 4.525203607774544e-05, "loss": 0.741, "num_input_tokens_seen": 37928960, "step": 4630 }, { "epoch": 0.5995606667528104, "grad_norm": 0.5459215044975281, "learning_rate": 4.5232183258577655e-05, "loss": 0.498, "num_input_tokens_seen": 38010880, "step": 4640 }, { "epoch": 0.6008528233621915, "grad_norm": 0.3943910300731659, "learning_rate": 4.5212293394356356e-05, "loss": 0.854, "num_input_tokens_seen": 38092800, "step": 4650 }, { "epoch": 0.6021449799715726, "grad_norm": 0.402898907661438, "learning_rate": 4.519236652149981e-05, "loss": 0.8904, "num_input_tokens_seen": 38174720, "step": 4660 }, { "epoch": 0.6034371365809537, "grad_norm": 0.23560667037963867, "learning_rate": 4.517240267649405e-05, "loss": 0.6613, "num_input_tokens_seen": 38256640, "step": 4670 }, { "epoch": 0.6047292931903346, "grad_norm": 0.3227959871292114, "learning_rate": 4.515240189589282e-05, "loss": 0.679, "num_input_tokens_seen": 38338560, "step": 4680 }, { "epoch": 0.6060214497997157, "grad_norm": 0.29034727811813354, "learning_rate": 4.5132364216317446e-05, "loss": 0.7213, "num_input_tokens_seen": 38420480, "step": 4690 }, { "epoch": 0.6073136064090968, "grad_norm": 0.5373560190200806, "learning_rate": 4.5112289674456864e-05, "loss": 1.0668, "num_input_tokens_seen": 38502400, "step": 4700 }, { "epoch": 0.6086057630184778, "grad_norm": 0.48361408710479736, "learning_rate": 4.509217830706749e-05, "loss": 0.6861, "num_input_tokens_seen": 38584320, "step": 4710 }, { "epoch": 0.6098979196278589, "grad_norm": 0.4311217963695526, "learning_rate": 4.5072030150973154e-05, "loss": 0.9106, "num_input_tokens_seen": 38666240, "step": 4720 }, { "epoch": 0.61119007623724, "grad_norm": 0.47615599632263184, "learning_rate": 4.505184524306506e-05, "loss": 0.73, "num_input_tokens_seen": 38748160, "step": 4730 }, { "epoch": 0.612482232846621, "grad_norm": 0.5375292897224426, "learning_rate": 4.50316236203017e-05, "loss": 1.3151, "num_input_tokens_seen": 38830080, "step": 4740 }, { "epoch": 0.613774389456002, "grad_norm": 0.42838889360427856, "learning_rate": 4.5011365319708796e-05, "loss": 1.0097, "num_input_tokens_seen": 38912000, "step": 4750 }, { "epoch": 0.6150665460653831, "grad_norm": 0.5048761367797852, "learning_rate": 4.499107037837922e-05, "loss": 1.0672, "num_input_tokens_seen": 38993920, "step": 4760 }, { "epoch": 0.6163587026747642, "grad_norm": 0.4704453647136688, "learning_rate": 4.497073883347293e-05, "loss": 0.8592, "num_input_tokens_seen": 39075840, "step": 4770 }, { "epoch": 0.6176508592841452, "grad_norm": 0.5054408311843872, "learning_rate": 4.495037072221692e-05, "loss": 0.979, "num_input_tokens_seen": 39157760, "step": 4780 }, { "epoch": 0.6189430158935263, "grad_norm": 0.4307916462421417, "learning_rate": 4.49299660819051e-05, "loss": 1.1349, "num_input_tokens_seen": 39239680, "step": 4790 }, { "epoch": 0.6202351725029074, "grad_norm": 0.7351888418197632, "learning_rate": 4.490952494989834e-05, "loss": 0.8819, "num_input_tokens_seen": 39321600, "step": 4800 }, { "epoch": 0.6215273291122884, "grad_norm": 0.7164278030395508, "learning_rate": 4.4889047363624236e-05, "loss": 1.0527, "num_input_tokens_seen": 39403520, "step": 4810 }, { "epoch": 0.6228194857216695, "grad_norm": 0.5169610381126404, "learning_rate": 4.486853336057719e-05, "loss": 1.1608, "num_input_tokens_seen": 39485440, "step": 4820 }, { "epoch": 0.6241116423310505, "grad_norm": 0.9453770518302917, "learning_rate": 4.484798297831826e-05, "loss": 0.9382, "num_input_tokens_seen": 39567360, "step": 4830 }, { "epoch": 0.6254037989404316, "grad_norm": 0.49842312932014465, "learning_rate": 4.482739625447514e-05, "loss": 0.9578, "num_input_tokens_seen": 39649280, "step": 4840 }, { "epoch": 0.6266959555498126, "grad_norm": 0.5001227855682373, "learning_rate": 4.480677322674202e-05, "loss": 0.5964, "num_input_tokens_seen": 39731200, "step": 4850 }, { "epoch": 0.6279881121591937, "grad_norm": 0.42781203985214233, "learning_rate": 4.4786113932879605e-05, "loss": 1.0508, "num_input_tokens_seen": 39813120, "step": 4860 }, { "epoch": 0.6292802687685748, "grad_norm": 0.3244008719921112, "learning_rate": 4.476541841071498e-05, "loss": 0.7638, "num_input_tokens_seen": 39895040, "step": 4870 }, { "epoch": 0.6305724253779558, "grad_norm": 0.3672725558280945, "learning_rate": 4.4744686698141564e-05, "loss": 0.5432, "num_input_tokens_seen": 39976960, "step": 4880 }, { "epoch": 0.6318645819873369, "grad_norm": 0.44159212708473206, "learning_rate": 4.472391883311906e-05, "loss": 1.0626, "num_input_tokens_seen": 40058880, "step": 4890 }, { "epoch": 0.633156738596718, "grad_norm": 0.5819474458694458, "learning_rate": 4.470311485367335e-05, "loss": 0.8067, "num_input_tokens_seen": 40140800, "step": 4900 }, { "epoch": 0.6344488952060989, "grad_norm": 0.618045449256897, "learning_rate": 4.468227479789644e-05, "loss": 0.8431, "num_input_tokens_seen": 40222720, "step": 4910 }, { "epoch": 0.63574105181548, "grad_norm": 0.6308985948562622, "learning_rate": 4.4661398703946396e-05, "loss": 0.92, "num_input_tokens_seen": 40304640, "step": 4920 }, { "epoch": 0.6370332084248611, "grad_norm": 0.49686864018440247, "learning_rate": 4.464048661004727e-05, "loss": 1.0994, "num_input_tokens_seen": 40386560, "step": 4930 }, { "epoch": 0.6383253650342422, "grad_norm": 0.5074322819709778, "learning_rate": 4.461953855448903e-05, "loss": 1.0147, "num_input_tokens_seen": 40468480, "step": 4940 }, { "epoch": 0.6396175216436232, "grad_norm": 0.7457041144371033, "learning_rate": 4.4598554575627495e-05, "loss": 1.1607, "num_input_tokens_seen": 40550400, "step": 4950 }, { "epoch": 0.6409096782530043, "grad_norm": 0.5672173500061035, "learning_rate": 4.4577534711884244e-05, "loss": 1.1159, "num_input_tokens_seen": 40632320, "step": 4960 }, { "epoch": 0.6422018348623854, "grad_norm": 0.3177810609340668, "learning_rate": 4.455647900174658e-05, "loss": 1.0039, "num_input_tokens_seen": 40714240, "step": 4970 }, { "epoch": 0.6434939914717663, "grad_norm": 0.6089414358139038, "learning_rate": 4.453538748376742e-05, "loss": 0.8848, "num_input_tokens_seen": 40796160, "step": 4980 }, { "epoch": 0.6447861480811474, "grad_norm": 0.44886818528175354, "learning_rate": 4.451426019656526e-05, "loss": 0.624, "num_input_tokens_seen": 40878080, "step": 4990 }, { "epoch": 0.6460783046905285, "grad_norm": 0.336834192276001, "learning_rate": 4.449309717882409e-05, "loss": 1.0137, "num_input_tokens_seen": 40960000, "step": 5000 }, { "epoch": 0.6473704612999095, "grad_norm": 0.8568751215934753, "learning_rate": 4.4471898469293324e-05, "loss": 0.9478, "num_input_tokens_seen": 41041920, "step": 5010 }, { "epoch": 0.6486626179092906, "grad_norm": 0.6555817127227783, "learning_rate": 4.4450664106787706e-05, "loss": 0.8829, "num_input_tokens_seen": 41123840, "step": 5020 }, { "epoch": 0.6499547745186717, "grad_norm": 0.656535267829895, "learning_rate": 4.442939413018728e-05, "loss": 1.0126, "num_input_tokens_seen": 41205760, "step": 5030 }, { "epoch": 0.6512469311280528, "grad_norm": 0.5044434666633606, "learning_rate": 4.44080885784373e-05, "loss": 0.8302, "num_input_tokens_seen": 41287680, "step": 5040 }, { "epoch": 0.6525390877374337, "grad_norm": 1.4959832429885864, "learning_rate": 4.4386747490548156e-05, "loss": 1.0349, "num_input_tokens_seen": 41369600, "step": 5050 }, { "epoch": 0.6538312443468148, "grad_norm": 0.623151421546936, "learning_rate": 4.43653709055953e-05, "loss": 1.1468, "num_input_tokens_seen": 41451520, "step": 5060 }, { "epoch": 0.6551234009561959, "grad_norm": 0.606468141078949, "learning_rate": 4.434395886271917e-05, "loss": 1.1647, "num_input_tokens_seen": 41533440, "step": 5070 }, { "epoch": 0.6564155575655769, "grad_norm": 0.5442026853561401, "learning_rate": 4.4322511401125156e-05, "loss": 1.2088, "num_input_tokens_seen": 41615360, "step": 5080 }, { "epoch": 0.657707714174958, "grad_norm": 0.5820073485374451, "learning_rate": 4.430102856008347e-05, "loss": 0.8394, "num_input_tokens_seen": 41697280, "step": 5090 }, { "epoch": 0.6589998707843391, "grad_norm": 0.6123668551445007, "learning_rate": 4.427951037892911e-05, "loss": 0.7364, "num_input_tokens_seen": 41779200, "step": 5100 }, { "epoch": 0.6602920273937202, "grad_norm": 0.4739777743816376, "learning_rate": 4.4257956897061805e-05, "loss": 0.8559, "num_input_tokens_seen": 41861120, "step": 5110 }, { "epoch": 0.6615841840031012, "grad_norm": 0.5354482531547546, "learning_rate": 4.423636815394588e-05, "loss": 0.9424, "num_input_tokens_seen": 41943040, "step": 5120 }, { "epoch": 0.6628763406124822, "grad_norm": 0.5862747430801392, "learning_rate": 4.4214744189110266e-05, "loss": 1.1937, "num_input_tokens_seen": 42024960, "step": 5130 }, { "epoch": 0.6641684972218633, "grad_norm": 0.4908977150917053, "learning_rate": 4.4193085042148354e-05, "loss": 1.0227, "num_input_tokens_seen": 42106880, "step": 5140 }, { "epoch": 0.6654606538312443, "grad_norm": 0.8534203171730042, "learning_rate": 4.417139075271796e-05, "loss": 1.3091, "num_input_tokens_seen": 42188800, "step": 5150 }, { "epoch": 0.6667528104406254, "grad_norm": 0.47156473994255066, "learning_rate": 4.414966136054125e-05, "loss": 0.6186, "num_input_tokens_seen": 42270720, "step": 5160 }, { "epoch": 0.6680449670500065, "grad_norm": 0.5299347043037415, "learning_rate": 4.412789690540466e-05, "loss": 0.7974, "num_input_tokens_seen": 42352640, "step": 5170 }, { "epoch": 0.6693371236593875, "grad_norm": 0.5082312226295471, "learning_rate": 4.410609742715883e-05, "loss": 0.8996, "num_input_tokens_seen": 42434560, "step": 5180 }, { "epoch": 0.6706292802687686, "grad_norm": 0.48898372054100037, "learning_rate": 4.408426296571852e-05, "loss": 1.0341, "num_input_tokens_seen": 42516480, "step": 5190 }, { "epoch": 0.6719214368781496, "grad_norm": 0.7845025658607483, "learning_rate": 4.406239356106257e-05, "loss": 0.9729, "num_input_tokens_seen": 42598400, "step": 5200 }, { "epoch": 0.6732135934875307, "grad_norm": 0.3541695475578308, "learning_rate": 4.404048925323375e-05, "loss": 0.6879, "num_input_tokens_seen": 42680320, "step": 5210 }, { "epoch": 0.6745057500969117, "grad_norm": 0.6555191874504089, "learning_rate": 4.401855008233879e-05, "loss": 1.2569, "num_input_tokens_seen": 42762240, "step": 5220 }, { "epoch": 0.6757979067062928, "grad_norm": 0.44755908846855164, "learning_rate": 4.3996576088548214e-05, "loss": 1.1061, "num_input_tokens_seen": 42844160, "step": 5230 }, { "epoch": 0.6770900633156739, "grad_norm": 0.4899725615978241, "learning_rate": 4.397456731209634e-05, "loss": 1.0198, "num_input_tokens_seen": 42926080, "step": 5240 }, { "epoch": 0.6783822199250549, "grad_norm": 0.30107763409614563, "learning_rate": 4.395252379328115e-05, "loss": 0.7051, "num_input_tokens_seen": 43008000, "step": 5250 }, { "epoch": 0.679674376534436, "grad_norm": 0.24033145606517792, "learning_rate": 4.393044557246424e-05, "loss": 0.853, "num_input_tokens_seen": 43089920, "step": 5260 }, { "epoch": 0.680966533143817, "grad_norm": 0.5303036570549011, "learning_rate": 4.3908332690070765e-05, "loss": 0.9311, "num_input_tokens_seen": 43171840, "step": 5270 }, { "epoch": 0.682258689753198, "grad_norm": 0.4759383499622345, "learning_rate": 4.388618518658932e-05, "loss": 1.3277, "num_input_tokens_seen": 43253760, "step": 5280 }, { "epoch": 0.6835508463625791, "grad_norm": 0.4998660385608673, "learning_rate": 4.3864003102571916e-05, "loss": 0.8287, "num_input_tokens_seen": 43335680, "step": 5290 }, { "epoch": 0.6848430029719602, "grad_norm": 0.47711753845214844, "learning_rate": 4.384178647863385e-05, "loss": 0.6284, "num_input_tokens_seen": 43417600, "step": 5300 }, { "epoch": 0.6861351595813413, "grad_norm": 0.6608704328536987, "learning_rate": 4.381953535545369e-05, "loss": 1.1245, "num_input_tokens_seen": 43499520, "step": 5310 }, { "epoch": 0.6874273161907223, "grad_norm": 0.6293668746948242, "learning_rate": 4.3797249773773165e-05, "loss": 0.8103, "num_input_tokens_seen": 43581440, "step": 5320 }, { "epoch": 0.6887194728001034, "grad_norm": 0.7585543990135193, "learning_rate": 4.3774929774397086e-05, "loss": 0.6222, "num_input_tokens_seen": 43663360, "step": 5330 }, { "epoch": 0.6900116294094845, "grad_norm": 0.6242161393165588, "learning_rate": 4.375257539819328e-05, "loss": 1.0592, "num_input_tokens_seen": 43745280, "step": 5340 }, { "epoch": 0.6913037860188654, "grad_norm": 0.47205057740211487, "learning_rate": 4.373018668609256e-05, "loss": 1.1099, "num_input_tokens_seen": 43827200, "step": 5350 }, { "epoch": 0.6925959426282465, "grad_norm": 0.6397084593772888, "learning_rate": 4.370776367908854e-05, "loss": 1.0354, "num_input_tokens_seen": 43909120, "step": 5360 }, { "epoch": 0.6938880992376276, "grad_norm": 0.5018695592880249, "learning_rate": 4.368530641823769e-05, "loss": 1.2438, "num_input_tokens_seen": 43991040, "step": 5370 }, { "epoch": 0.6951802558470087, "grad_norm": 0.5705529451370239, "learning_rate": 4.3662814944659156e-05, "loss": 0.9602, "num_input_tokens_seen": 44072960, "step": 5380 }, { "epoch": 0.6964724124563897, "grad_norm": 0.5414160490036011, "learning_rate": 4.364028929953476e-05, "loss": 0.9233, "num_input_tokens_seen": 44154880, "step": 5390 }, { "epoch": 0.6977645690657708, "grad_norm": 0.5052191615104675, "learning_rate": 4.361772952410886e-05, "loss": 1.1027, "num_input_tokens_seen": 44236800, "step": 5400 }, { "epoch": 0.6990567256751519, "grad_norm": 0.5868683457374573, "learning_rate": 4.359513565968832e-05, "loss": 1.0273, "num_input_tokens_seen": 44318720, "step": 5410 }, { "epoch": 0.7003488822845328, "grad_norm": 0.5551066994667053, "learning_rate": 4.357250774764245e-05, "loss": 0.8502, "num_input_tokens_seen": 44400640, "step": 5420 }, { "epoch": 0.7016410388939139, "grad_norm": 0.36578068137168884, "learning_rate": 4.354984582940285e-05, "loss": 0.7773, "num_input_tokens_seen": 44482560, "step": 5430 }, { "epoch": 0.702933195503295, "grad_norm": 0.8225210309028625, "learning_rate": 4.35271499464634e-05, "loss": 0.9399, "num_input_tokens_seen": 44564480, "step": 5440 }, { "epoch": 0.704225352112676, "grad_norm": 0.5141643285751343, "learning_rate": 4.350442014038021e-05, "loss": 0.9732, "num_input_tokens_seen": 44646400, "step": 5450 }, { "epoch": 0.7055175087220571, "grad_norm": 0.42161181569099426, "learning_rate": 4.348165645277145e-05, "loss": 0.9438, "num_input_tokens_seen": 44728320, "step": 5460 }, { "epoch": 0.7068096653314382, "grad_norm": 0.44062143564224243, "learning_rate": 4.345885892531735e-05, "loss": 1.3828, "num_input_tokens_seen": 44810240, "step": 5470 }, { "epoch": 0.7081018219408193, "grad_norm": 0.44696852564811707, "learning_rate": 4.343602759976011e-05, "loss": 1.1361, "num_input_tokens_seen": 44892160, "step": 5480 }, { "epoch": 0.7093939785502003, "grad_norm": 0.7456682920455933, "learning_rate": 4.34131625179038e-05, "loss": 0.9972, "num_input_tokens_seen": 44974080, "step": 5490 }, { "epoch": 0.7106861351595813, "grad_norm": 0.4710252285003662, "learning_rate": 4.3390263721614286e-05, "loss": 1.1289, "num_input_tokens_seen": 45056000, "step": 5500 }, { "epoch": 0.7119782917689624, "grad_norm": 0.4541804790496826, "learning_rate": 4.33673312528192e-05, "loss": 0.699, "num_input_tokens_seen": 45137920, "step": 5510 }, { "epoch": 0.7132704483783434, "grad_norm": 0.5204092860221863, "learning_rate": 4.334436515350779e-05, "loss": 1.1252, "num_input_tokens_seen": 45219840, "step": 5520 }, { "epoch": 0.7145626049877245, "grad_norm": 0.5446497201919556, "learning_rate": 4.332136546573092e-05, "loss": 1.0118, "num_input_tokens_seen": 45301760, "step": 5530 }, { "epoch": 0.7158547615971056, "grad_norm": 0.5484737753868103, "learning_rate": 4.3298332231600925e-05, "loss": 0.8958, "num_input_tokens_seen": 45383680, "step": 5540 }, { "epoch": 0.7171469182064866, "grad_norm": 0.4673605263233185, "learning_rate": 4.327526549329157e-05, "loss": 0.7937, "num_input_tokens_seen": 45465600, "step": 5550 }, { "epoch": 0.7184390748158677, "grad_norm": 0.4332837760448456, "learning_rate": 4.325216529303798e-05, "loss": 0.7012, "num_input_tokens_seen": 45547520, "step": 5560 }, { "epoch": 0.7197312314252488, "grad_norm": 0.5795179009437561, "learning_rate": 4.3229031673136514e-05, "loss": 0.7965, "num_input_tokens_seen": 45629440, "step": 5570 }, { "epoch": 0.7210233880346298, "grad_norm": 0.5822563767433167, "learning_rate": 4.320586467594476e-05, "loss": 0.9927, "num_input_tokens_seen": 45711360, "step": 5580 }, { "epoch": 0.7223155446440108, "grad_norm": 0.3716464340686798, "learning_rate": 4.3182664343881415e-05, "loss": 0.8541, "num_input_tokens_seen": 45793280, "step": 5590 }, { "epoch": 0.7236077012533919, "grad_norm": 0.6027759313583374, "learning_rate": 4.315943071942619e-05, "loss": 0.989, "num_input_tokens_seen": 45875200, "step": 5600 }, { "epoch": 0.724899857862773, "grad_norm": 0.4732694923877716, "learning_rate": 4.313616384511976e-05, "loss": 1.0818, "num_input_tokens_seen": 45957120, "step": 5610 }, { "epoch": 0.726192014472154, "grad_norm": 0.5868046283721924, "learning_rate": 4.3112863763563695e-05, "loss": 1.0183, "num_input_tokens_seen": 46039040, "step": 5620 }, { "epoch": 0.7274841710815351, "grad_norm": 0.5000009536743164, "learning_rate": 4.308953051742036e-05, "loss": 1.1861, "num_input_tokens_seen": 46120960, "step": 5630 }, { "epoch": 0.7287763276909162, "grad_norm": 0.5284093022346497, "learning_rate": 4.3066164149412844e-05, "loss": 0.8941, "num_input_tokens_seen": 46202880, "step": 5640 }, { "epoch": 0.7300684843002972, "grad_norm": 0.4090714454650879, "learning_rate": 4.304276470232488e-05, "loss": 0.8618, "num_input_tokens_seen": 46284800, "step": 5650 }, { "epoch": 0.7313606409096782, "grad_norm": 0.6819685697555542, "learning_rate": 4.3019332219000766e-05, "loss": 0.8772, "num_input_tokens_seen": 46366720, "step": 5660 }, { "epoch": 0.7326527975190593, "grad_norm": 0.8263905048370361, "learning_rate": 4.299586674234529e-05, "loss": 1.0889, "num_input_tokens_seen": 46448640, "step": 5670 }, { "epoch": 0.7339449541284404, "grad_norm": 0.5756688714027405, "learning_rate": 4.2972368315323676e-05, "loss": 0.7212, "num_input_tokens_seen": 46530560, "step": 5680 }, { "epoch": 0.7352371107378214, "grad_norm": 0.5566821694374084, "learning_rate": 4.294883698096143e-05, "loss": 0.8605, "num_input_tokens_seen": 46612480, "step": 5690 }, { "epoch": 0.7365292673472025, "grad_norm": 0.5805408358573914, "learning_rate": 4.292527278234435e-05, "loss": 0.9687, "num_input_tokens_seen": 46694400, "step": 5700 }, { "epoch": 0.7378214239565836, "grad_norm": 0.19487954676151276, "learning_rate": 4.290167576261841e-05, "loss": 0.5747, "num_input_tokens_seen": 46776320, "step": 5710 }, { "epoch": 0.7391135805659645, "grad_norm": 0.5343565940856934, "learning_rate": 4.2878045964989646e-05, "loss": 0.7948, "num_input_tokens_seen": 46858240, "step": 5720 }, { "epoch": 0.7404057371753456, "grad_norm": 2.30495023727417, "learning_rate": 4.285438343272414e-05, "loss": 1.0718, "num_input_tokens_seen": 46940160, "step": 5730 }, { "epoch": 0.7416978937847267, "grad_norm": 0.5874922871589661, "learning_rate": 4.283068820914791e-05, "loss": 0.9519, "num_input_tokens_seen": 47022080, "step": 5740 }, { "epoch": 0.7429900503941078, "grad_norm": 0.4819450378417969, "learning_rate": 4.2806960337646804e-05, "loss": 1.015, "num_input_tokens_seen": 47104000, "step": 5750 }, { "epoch": 0.7442822070034888, "grad_norm": 0.6047049164772034, "learning_rate": 4.278319986166649e-05, "loss": 0.8983, "num_input_tokens_seen": 47185920, "step": 5760 }, { "epoch": 0.7455743636128699, "grad_norm": 0.32134854793548584, "learning_rate": 4.27594068247123e-05, "loss": 0.8726, "num_input_tokens_seen": 47267840, "step": 5770 }, { "epoch": 0.746866520222251, "grad_norm": 0.7429246306419373, "learning_rate": 4.27355812703492e-05, "loss": 0.7588, "num_input_tokens_seen": 47349760, "step": 5780 }, { "epoch": 0.748158676831632, "grad_norm": 0.5023065805435181, "learning_rate": 4.2711723242201695e-05, "loss": 1.1223, "num_input_tokens_seen": 47431680, "step": 5790 }, { "epoch": 0.749450833441013, "grad_norm": 0.38930198550224304, "learning_rate": 4.268783278395374e-05, "loss": 0.7947, "num_input_tokens_seen": 47513600, "step": 5800 }, { "epoch": 0.7507429900503941, "grad_norm": 0.488908976316452, "learning_rate": 4.2663909939348684e-05, "loss": 0.9494, "num_input_tokens_seen": 47595520, "step": 5810 }, { "epoch": 0.7520351466597751, "grad_norm": 0.4929217994213104, "learning_rate": 4.263995475218917e-05, "loss": 0.7431, "num_input_tokens_seen": 47677440, "step": 5820 }, { "epoch": 0.7533273032691562, "grad_norm": 0.4818821847438812, "learning_rate": 4.2615967266337045e-05, "loss": 0.8616, "num_input_tokens_seen": 47759360, "step": 5830 }, { "epoch": 0.7546194598785373, "grad_norm": 0.5090234875679016, "learning_rate": 4.2591947525713326e-05, "loss": 1.1937, "num_input_tokens_seen": 47841280, "step": 5840 }, { "epoch": 0.7559116164879184, "grad_norm": 0.6463255882263184, "learning_rate": 4.256789557429806e-05, "loss": 0.8848, "num_input_tokens_seen": 47923200, "step": 5850 }, { "epoch": 0.7572037730972994, "grad_norm": 0.5930004119873047, "learning_rate": 4.254381145613027e-05, "loss": 0.9741, "num_input_tokens_seen": 48005120, "step": 5860 }, { "epoch": 0.7584959297066804, "grad_norm": 0.5355798602104187, "learning_rate": 4.251969521530791e-05, "loss": 1.1395, "num_input_tokens_seen": 48087040, "step": 5870 }, { "epoch": 0.7597880863160615, "grad_norm": 0.4238862693309784, "learning_rate": 4.2495546895987724e-05, "loss": 1.074, "num_input_tokens_seen": 48168960, "step": 5880 }, { "epoch": 0.7610802429254425, "grad_norm": 0.5069129467010498, "learning_rate": 4.2471366542385196e-05, "loss": 0.9192, "num_input_tokens_seen": 48250880, "step": 5890 }, { "epoch": 0.7623723995348236, "grad_norm": 0.436614066362381, "learning_rate": 4.2447154198774445e-05, "loss": 0.999, "num_input_tokens_seen": 48332800, "step": 5900 }, { "epoch": 0.7636645561442047, "grad_norm": 0.21864083409309387, "learning_rate": 4.242290990948821e-05, "loss": 1.0004, "num_input_tokens_seen": 48414720, "step": 5910 }, { "epoch": 0.7649567127535858, "grad_norm": 0.5218377113342285, "learning_rate": 4.2398633718917684e-05, "loss": 0.8302, "num_input_tokens_seen": 48496640, "step": 5920 }, { "epoch": 0.7662488693629668, "grad_norm": 0.6044358611106873, "learning_rate": 4.237432567151248e-05, "loss": 1.2325, "num_input_tokens_seen": 48578560, "step": 5930 }, { "epoch": 0.7675410259723479, "grad_norm": 1.3460071086883545, "learning_rate": 4.234998581178056e-05, "loss": 0.636, "num_input_tokens_seen": 48660480, "step": 5940 }, { "epoch": 0.768833182581729, "grad_norm": 0.33106595277786255, "learning_rate": 4.2325614184288096e-05, "loss": 0.633, "num_input_tokens_seen": 48742400, "step": 5950 }, { "epoch": 0.7701253391911099, "grad_norm": 0.6000261306762695, "learning_rate": 4.2301210833659464e-05, "loss": 0.8583, "num_input_tokens_seen": 48824320, "step": 5960 }, { "epoch": 0.771417495800491, "grad_norm": 0.545335054397583, "learning_rate": 4.227677580457711e-05, "loss": 0.9888, "num_input_tokens_seen": 48906240, "step": 5970 }, { "epoch": 0.7727096524098721, "grad_norm": 0.5507743954658508, "learning_rate": 4.2252309141781464e-05, "loss": 1.0736, "num_input_tokens_seen": 48988160, "step": 5980 }, { "epoch": 0.7740018090192531, "grad_norm": 0.6265859007835388, "learning_rate": 4.222781089007092e-05, "loss": 0.995, "num_input_tokens_seen": 49070080, "step": 5990 }, { "epoch": 0.7752939656286342, "grad_norm": 1014203.875, "learning_rate": 4.220328109430167e-05, "loss": 0.8812, "num_input_tokens_seen": 49152000, "step": 6000 }, { "epoch": 0.7765861222380153, "grad_norm": 0.7153500318527222, "learning_rate": 4.217871979938769e-05, "loss": 1.3075, "num_input_tokens_seen": 49233920, "step": 6010 }, { "epoch": 0.7778782788473964, "grad_norm": 0.7981336712837219, "learning_rate": 4.215412705030063e-05, "loss": 0.8665, "num_input_tokens_seen": 49315840, "step": 6020 }, { "epoch": 0.7791704354567773, "grad_norm": 0.6426622867584229, "learning_rate": 4.21295028920697e-05, "loss": 0.852, "num_input_tokens_seen": 49397760, "step": 6030 }, { "epoch": 0.7804625920661584, "grad_norm": 0.6837995052337646, "learning_rate": 4.210484736978166e-05, "loss": 0.9422, "num_input_tokens_seen": 49479680, "step": 6040 }, { "epoch": 0.7817547486755395, "grad_norm": 0.4627484679222107, "learning_rate": 4.208016052858067e-05, "loss": 0.8523, "num_input_tokens_seen": 49561600, "step": 6050 }, { "epoch": 0.7830469052849205, "grad_norm": 0.29264596104621887, "learning_rate": 4.2055442413668264e-05, "loss": 0.7916, "num_input_tokens_seen": 49643520, "step": 6060 }, { "epoch": 0.7843390618943016, "grad_norm": 0.4580781161785126, "learning_rate": 4.2030693070303204e-05, "loss": 1.1109, "num_input_tokens_seen": 49725440, "step": 6070 }, { "epoch": 0.7856312185036827, "grad_norm": 0.24061749875545502, "learning_rate": 4.2005912543801444e-05, "loss": 0.7227, "num_input_tokens_seen": 49807360, "step": 6080 }, { "epoch": 0.7869233751130637, "grad_norm": 0.4084072709083557, "learning_rate": 4.198110087953606e-05, "loss": 0.7407, "num_input_tokens_seen": 49889280, "step": 6090 }, { "epoch": 0.7882155317224447, "grad_norm": 0.5031906962394714, "learning_rate": 4.195625812293709e-05, "loss": 1.4336, "num_input_tokens_seen": 49971200, "step": 6100 }, { "epoch": 0.7895076883318258, "grad_norm": 0.21128413081169128, "learning_rate": 4.193138431949155e-05, "loss": 0.9719, "num_input_tokens_seen": 50053120, "step": 6110 }, { "epoch": 0.7907998449412069, "grad_norm": 0.44993460178375244, "learning_rate": 4.190647951474328e-05, "loss": 1.0592, "num_input_tokens_seen": 50135040, "step": 6120 }, { "epoch": 0.7920920015505879, "grad_norm": 0.5219196677207947, "learning_rate": 4.188154375429288e-05, "loss": 0.8778, "num_input_tokens_seen": 50216960, "step": 6130 }, { "epoch": 0.793384158159969, "grad_norm": 0.5350174307823181, "learning_rate": 4.1856577083797646e-05, "loss": 1.329, "num_input_tokens_seen": 50298880, "step": 6140 }, { "epoch": 0.7946763147693501, "grad_norm": 0.6473702192306519, "learning_rate": 4.183157954897144e-05, "loss": 1.2239, "num_input_tokens_seen": 50380800, "step": 6150 }, { "epoch": 0.7959684713787311, "grad_norm": 0.5353970527648926, "learning_rate": 4.1806551195584685e-05, "loss": 1.0103, "num_input_tokens_seen": 50462720, "step": 6160 }, { "epoch": 0.7972606279881121, "grad_norm": 0.6963522434234619, "learning_rate": 4.178149206946419e-05, "loss": 0.8958, "num_input_tokens_seen": 50544640, "step": 6170 }, { "epoch": 0.7985527845974932, "grad_norm": 0.6284576654434204, "learning_rate": 4.1756402216493115e-05, "loss": 1.0327, "num_input_tokens_seen": 50626560, "step": 6180 }, { "epoch": 0.7998449412068743, "grad_norm": 0.49156904220581055, "learning_rate": 4.17312816826109e-05, "loss": 0.9924, "num_input_tokens_seen": 50708480, "step": 6190 }, { "epoch": 0.8011370978162553, "grad_norm": 0.8656590580940247, "learning_rate": 4.1706130513813146e-05, "loss": 0.7853, "num_input_tokens_seen": 50790400, "step": 6200 }, { "epoch": 0.8024292544256364, "grad_norm": 0.5040183067321777, "learning_rate": 4.1680948756151564e-05, "loss": 0.9263, "num_input_tokens_seen": 50872320, "step": 6210 }, { "epoch": 0.8037214110350175, "grad_norm": 0.28626561164855957, "learning_rate": 4.165573645573384e-05, "loss": 0.9224, "num_input_tokens_seen": 50954240, "step": 6220 }, { "epoch": 0.8050135676443985, "grad_norm": 0.5348039865493774, "learning_rate": 4.1630493658723606e-05, "loss": 1.2382, "num_input_tokens_seen": 51036160, "step": 6230 }, { "epoch": 0.8063057242537796, "grad_norm": 0.4930530786514282, "learning_rate": 4.160522041134035e-05, "loss": 1.0701, "num_input_tokens_seen": 51118080, "step": 6240 }, { "epoch": 0.8075978808631606, "grad_norm": 0.27241694927215576, "learning_rate": 4.1579916759859286e-05, "loss": 0.9246, "num_input_tokens_seen": 51200000, "step": 6250 }, { "epoch": 0.8088900374725416, "grad_norm": 0.8106761574745178, "learning_rate": 4.155458275061129e-05, "loss": 1.1799, "num_input_tokens_seen": 51281920, "step": 6260 }, { "epoch": 0.8101821940819227, "grad_norm": 0.6330926418304443, "learning_rate": 4.152921842998287e-05, "loss": 1.2025, "num_input_tokens_seen": 51363840, "step": 6270 }, { "epoch": 0.8114743506913038, "grad_norm": 0.5390750169754028, "learning_rate": 4.150382384441598e-05, "loss": 0.7594, "num_input_tokens_seen": 51445760, "step": 6280 }, { "epoch": 0.8127665073006849, "grad_norm": 0.645873486995697, "learning_rate": 4.147839904040803e-05, "loss": 0.8714, "num_input_tokens_seen": 51527680, "step": 6290 }, { "epoch": 0.8140586639100659, "grad_norm": 0.2540639638900757, "learning_rate": 4.145294406451173e-05, "loss": 1.2455, "num_input_tokens_seen": 51609600, "step": 6300 }, { "epoch": 0.815350820519447, "grad_norm": 0.75770103931427, "learning_rate": 4.142745896333505e-05, "loss": 1.1417, "num_input_tokens_seen": 51691520, "step": 6310 }, { "epoch": 0.816642977128828, "grad_norm": 0.505504310131073, "learning_rate": 4.140194378354113e-05, "loss": 1.1736, "num_input_tokens_seen": 51773440, "step": 6320 }, { "epoch": 0.817935133738209, "grad_norm": 0.47906693816185, "learning_rate": 4.137639857184815e-05, "loss": 0.8252, "num_input_tokens_seen": 51855360, "step": 6330 }, { "epoch": 0.8192272903475901, "grad_norm": 0.7896102666854858, "learning_rate": 4.1350823375029326e-05, "loss": 1.2724, "num_input_tokens_seen": 51937280, "step": 6340 }, { "epoch": 0.8205194469569712, "grad_norm": 0.6090170741081238, "learning_rate": 4.132521823991272e-05, "loss": 0.6208, "num_input_tokens_seen": 52019200, "step": 6350 }, { "epoch": 0.8218116035663522, "grad_norm": 0.4670509099960327, "learning_rate": 4.129958321338127e-05, "loss": 0.8883, "num_input_tokens_seen": 52101120, "step": 6360 }, { "epoch": 0.8231037601757333, "grad_norm": 0.7980589270591736, "learning_rate": 4.127391834237258e-05, "loss": 0.7734, "num_input_tokens_seen": 52183040, "step": 6370 }, { "epoch": 0.8243959167851144, "grad_norm": 0.47597405314445496, "learning_rate": 4.124822367387897e-05, "loss": 0.8842, "num_input_tokens_seen": 52264960, "step": 6380 }, { "epoch": 0.8256880733944955, "grad_norm": 0.28550368547439575, "learning_rate": 4.122249925494726e-05, "loss": 0.8806, "num_input_tokens_seen": 52346880, "step": 6390 }, { "epoch": 0.8269802300038764, "grad_norm": 0.5419972538948059, "learning_rate": 4.119674513267878e-05, "loss": 0.9348, "num_input_tokens_seen": 52428800, "step": 6400 }, { "epoch": 0.8282723866132575, "grad_norm": 0.5108693838119507, "learning_rate": 4.117096135422923e-05, "loss": 0.7648, "num_input_tokens_seen": 52510720, "step": 6410 }, { "epoch": 0.8295645432226386, "grad_norm": 0.22679108381271362, "learning_rate": 4.114514796680862e-05, "loss": 0.6926, "num_input_tokens_seen": 52592640, "step": 6420 }, { "epoch": 0.8308566998320196, "grad_norm": 0.5265064239501953, "learning_rate": 4.111930501768116e-05, "loss": 1.0296, "num_input_tokens_seen": 52674560, "step": 6430 }, { "epoch": 0.8321488564414007, "grad_norm": 0.41228848695755005, "learning_rate": 4.1093432554165196e-05, "loss": 0.9102, "num_input_tokens_seen": 52756480, "step": 6440 }, { "epoch": 0.8334410130507818, "grad_norm": 0.35673919320106506, "learning_rate": 4.106753062363311e-05, "loss": 0.8159, "num_input_tokens_seen": 52838400, "step": 6450 }, { "epoch": 0.8347331696601629, "grad_norm": 0.5186113715171814, "learning_rate": 4.104159927351125e-05, "loss": 1.1631, "num_input_tokens_seen": 52920320, "step": 6460 }, { "epoch": 0.8360253262695438, "grad_norm": 0.45875003933906555, "learning_rate": 4.1015638551279825e-05, "loss": 0.8534, "num_input_tokens_seen": 53002240, "step": 6470 }, { "epoch": 0.8373174828789249, "grad_norm": 0.5119809508323669, "learning_rate": 4.098964850447281e-05, "loss": 1.0583, "num_input_tokens_seen": 53084160, "step": 6480 }, { "epoch": 0.838609639488306, "grad_norm": 0.35101157426834106, "learning_rate": 4.0963629180677896e-05, "loss": 0.8959, "num_input_tokens_seen": 53166080, "step": 6490 }, { "epoch": 0.839901796097687, "grad_norm": 0.5710439085960388, "learning_rate": 4.093758062753638e-05, "loss": 1.1624, "num_input_tokens_seen": 53248000, "step": 6500 }, { "epoch": 0.8411939527070681, "grad_norm": 0.34572339057922363, "learning_rate": 4.0911502892743035e-05, "loss": 1.2004, "num_input_tokens_seen": 53329920, "step": 6510 }, { "epoch": 0.8424861093164492, "grad_norm": 0.5202426910400391, "learning_rate": 4.088539602404613e-05, "loss": 1.2094, "num_input_tokens_seen": 53411840, "step": 6520 }, { "epoch": 0.8437782659258302, "grad_norm": 0.46138355135917664, "learning_rate": 4.085926006924723e-05, "loss": 1.0853, "num_input_tokens_seen": 53493760, "step": 6530 }, { "epoch": 0.8450704225352113, "grad_norm": 0.5127395391464233, "learning_rate": 4.083309507620118e-05, "loss": 0.8684, "num_input_tokens_seen": 53575680, "step": 6540 }, { "epoch": 0.8463625791445923, "grad_norm": 0.5306764245033264, "learning_rate": 4.080690109281597e-05, "loss": 1.1792, "num_input_tokens_seen": 53657600, "step": 6550 }, { "epoch": 0.8476547357539734, "grad_norm": 0.48974373936653137, "learning_rate": 4.078067816705272e-05, "loss": 0.8527, "num_input_tokens_seen": 53739520, "step": 6560 }, { "epoch": 0.8489468923633544, "grad_norm": 0.4819824993610382, "learning_rate": 4.075442634692548e-05, "loss": 0.9558, "num_input_tokens_seen": 53821440, "step": 6570 }, { "epoch": 0.8502390489727355, "grad_norm": 0.5945402979850769, "learning_rate": 4.072814568050125e-05, "loss": 1.0556, "num_input_tokens_seen": 53903360, "step": 6580 }, { "epoch": 0.8515312055821166, "grad_norm": 0.26948097348213196, "learning_rate": 4.070183621589983e-05, "loss": 0.9564, "num_input_tokens_seen": 53985280, "step": 6590 }, { "epoch": 0.8528233621914976, "grad_norm": 0.5950406789779663, "learning_rate": 4.067549800129375e-05, "loss": 1.2202, "num_input_tokens_seen": 54067200, "step": 6600 }, { "epoch": 0.8541155188008787, "grad_norm": 1.1179322004318237, "learning_rate": 4.06491310849082e-05, "loss": 0.9542, "num_input_tokens_seen": 54149120, "step": 6610 }, { "epoch": 0.8554076754102597, "grad_norm": 0.5142911076545715, "learning_rate": 4.0622735515020896e-05, "loss": 1.0358, "num_input_tokens_seen": 54231040, "step": 6620 }, { "epoch": 0.8566998320196407, "grad_norm": 0.6860795021057129, "learning_rate": 4.059631133996203e-05, "loss": 1.2331, "num_input_tokens_seen": 54312960, "step": 6630 }, { "epoch": 0.8579919886290218, "grad_norm": 0.47902247309684753, "learning_rate": 4.0569858608114177e-05, "loss": 0.9423, "num_input_tokens_seen": 54394880, "step": 6640 }, { "epoch": 0.8592841452384029, "grad_norm": 0.46857351064682007, "learning_rate": 4.054337736791218e-05, "loss": 1.1609, "num_input_tokens_seen": 54476800, "step": 6650 }, { "epoch": 0.860576301847784, "grad_norm": 0.5907356142997742, "learning_rate": 4.05168676678431e-05, "loss": 0.9658, "num_input_tokens_seen": 54558720, "step": 6660 }, { "epoch": 0.861868458457165, "grad_norm": 0.26403316855430603, "learning_rate": 4.04903295564461e-05, "loss": 0.7992, "num_input_tokens_seen": 54640640, "step": 6670 }, { "epoch": 0.8631606150665461, "grad_norm": 0.6622225642204285, "learning_rate": 4.046376308231237e-05, "loss": 0.9918, "num_input_tokens_seen": 54722560, "step": 6680 }, { "epoch": 0.8644527716759272, "grad_norm": 0.16060075163841248, "learning_rate": 4.0437168294085013e-05, "loss": 0.9523, "num_input_tokens_seen": 54804480, "step": 6690 }, { "epoch": 0.8657449282853081, "grad_norm": 0.5184667706489563, "learning_rate": 4.0410545240459005e-05, "loss": 1.1628, "num_input_tokens_seen": 54886400, "step": 6700 }, { "epoch": 0.8670370848946892, "grad_norm": 0.4965847134590149, "learning_rate": 4.0383893970181054e-05, "loss": 1.0468, "num_input_tokens_seen": 54968320, "step": 6710 }, { "epoch": 0.8683292415040703, "grad_norm": 6.185298442840576, "learning_rate": 4.0357214532049535e-05, "loss": 1.2028, "num_input_tokens_seen": 55050240, "step": 6720 }, { "epoch": 0.8696213981134514, "grad_norm": 0.43425482511520386, "learning_rate": 4.03305069749144e-05, "loss": 0.503, "num_input_tokens_seen": 55132160, "step": 6730 }, { "epoch": 0.8709135547228324, "grad_norm": 0.2627141773700714, "learning_rate": 4.03037713476771e-05, "loss": 1.0739, "num_input_tokens_seen": 55214080, "step": 6740 }, { "epoch": 0.8722057113322135, "grad_norm": 0.5449861288070679, "learning_rate": 4.027700769929046e-05, "loss": 0.7428, "num_input_tokens_seen": 55296000, "step": 6750 }, { "epoch": 0.8734978679415946, "grad_norm": 0.5020452737808228, "learning_rate": 4.025021607875862e-05, "loss": 1.0242, "num_input_tokens_seen": 55377920, "step": 6760 }, { "epoch": 0.8747900245509755, "grad_norm": 0.5593128800392151, "learning_rate": 4.0223396535136945e-05, "loss": 1.2703, "num_input_tokens_seen": 55459840, "step": 6770 }, { "epoch": 0.8760821811603566, "grad_norm": 0.4105585813522339, "learning_rate": 4.019654911753193e-05, "loss": 0.8773, "num_input_tokens_seen": 55541760, "step": 6780 }, { "epoch": 0.8773743377697377, "grad_norm": 0.37367942929267883, "learning_rate": 4.016967387510108e-05, "loss": 1.094, "num_input_tokens_seen": 55623680, "step": 6790 }, { "epoch": 0.8786664943791187, "grad_norm": 0.5563257932662964, "learning_rate": 4.014277085705288e-05, "loss": 0.9265, "num_input_tokens_seen": 55705600, "step": 6800 }, { "epoch": 0.8799586509884998, "grad_norm": 0.7399642467498779, "learning_rate": 4.011584011264665e-05, "loss": 0.9153, "num_input_tokens_seen": 55787520, "step": 6810 }, { "epoch": 0.8812508075978809, "grad_norm": 0.6172972321510315, "learning_rate": 4.0088881691192474e-05, "loss": 0.7931, "num_input_tokens_seen": 55869440, "step": 6820 }, { "epoch": 0.882542964207262, "grad_norm": 0.4733666181564331, "learning_rate": 4.006189564205115e-05, "loss": 0.9963, "num_input_tokens_seen": 55951360, "step": 6830 }, { "epoch": 0.883835120816643, "grad_norm": 0.6193355321884155, "learning_rate": 4.0034882014634015e-05, "loss": 0.7636, "num_input_tokens_seen": 56033280, "step": 6840 }, { "epoch": 0.885127277426024, "grad_norm": 0.4271906018257141, "learning_rate": 4.000784085840293e-05, "loss": 0.7192, "num_input_tokens_seen": 56115200, "step": 6850 }, { "epoch": 0.8864194340354051, "grad_norm": 0.5514196157455444, "learning_rate": 3.9980772222870156e-05, "loss": 0.9082, "num_input_tokens_seen": 56197120, "step": 6860 }, { "epoch": 0.8877115906447861, "grad_norm": 0.5211493968963623, "learning_rate": 3.995367615759825e-05, "loss": 1.2218, "num_input_tokens_seen": 56279040, "step": 6870 }, { "epoch": 0.8890037472541672, "grad_norm": 0.5780483484268188, "learning_rate": 3.992655271220003e-05, "loss": 1.0894, "num_input_tokens_seen": 56360960, "step": 6880 }, { "epoch": 0.8902959038635483, "grad_norm": 0.537337601184845, "learning_rate": 3.98994019363384e-05, "loss": 1.0208, "num_input_tokens_seen": 56442880, "step": 6890 }, { "epoch": 0.8915880604729293, "grad_norm": 0.4431285858154297, "learning_rate": 3.9872223879726356e-05, "loss": 0.7955, "num_input_tokens_seen": 56524800, "step": 6900 }, { "epoch": 0.8928802170823104, "grad_norm": 0.44692462682724, "learning_rate": 3.98450185921268e-05, "loss": 1.262, "num_input_tokens_seen": 56606720, "step": 6910 }, { "epoch": 0.8941723736916914, "grad_norm": 0.5244356989860535, "learning_rate": 3.981778612335253e-05, "loss": 1.1836, "num_input_tokens_seen": 56688640, "step": 6920 }, { "epoch": 0.8954645303010725, "grad_norm": 0.5537554025650024, "learning_rate": 3.979052652326609e-05, "loss": 0.7662, "num_input_tokens_seen": 56770560, "step": 6930 }, { "epoch": 0.8967566869104535, "grad_norm": 0.5707337856292725, "learning_rate": 3.976323984177971e-05, "loss": 0.7414, "num_input_tokens_seen": 56852480, "step": 6940 }, { "epoch": 0.8980488435198346, "grad_norm": 0.4836041331291199, "learning_rate": 3.97359261288552e-05, "loss": 1.0458, "num_input_tokens_seen": 56934400, "step": 6950 }, { "epoch": 0.8993410001292157, "grad_norm": 0.47009721398353577, "learning_rate": 3.970858543450387e-05, "loss": 1.0642, "num_input_tokens_seen": 57016320, "step": 6960 }, { "epoch": 0.9006331567385967, "grad_norm": 0.6852318644523621, "learning_rate": 3.968121780878643e-05, "loss": 1.3093, "num_input_tokens_seen": 57098240, "step": 6970 }, { "epoch": 0.9019253133479778, "grad_norm": 0.24057495594024658, "learning_rate": 3.965382330181291e-05, "loss": 0.6589, "num_input_tokens_seen": 57180160, "step": 6980 }, { "epoch": 0.9032174699573589, "grad_norm": 0.33139023184776306, "learning_rate": 3.962640196374254e-05, "loss": 0.6929, "num_input_tokens_seen": 57262080, "step": 6990 }, { "epoch": 0.9045096265667399, "grad_norm": 0.5860159993171692, "learning_rate": 3.9598953844783705e-05, "loss": 1.1313, "num_input_tokens_seen": 57344000, "step": 7000 }, { "epoch": 0.9058017831761209, "grad_norm": 0.36473211646080017, "learning_rate": 3.957147899519379e-05, "loss": 0.8748, "num_input_tokens_seen": 57425920, "step": 7010 }, { "epoch": 0.907093939785502, "grad_norm": 0.547675371170044, "learning_rate": 3.954397746527916e-05, "loss": 0.3491, "num_input_tokens_seen": 57507840, "step": 7020 }, { "epoch": 0.9083860963948831, "grad_norm": 0.3887852430343628, "learning_rate": 3.951644930539502e-05, "loss": 0.7414, "num_input_tokens_seen": 57589760, "step": 7030 }, { "epoch": 0.9096782530042641, "grad_norm": 0.5362502932548523, "learning_rate": 3.9488894565945305e-05, "loss": 0.8839, "num_input_tokens_seen": 57671680, "step": 7040 }, { "epoch": 0.9109704096136452, "grad_norm": 0.578965425491333, "learning_rate": 3.9461313297382666e-05, "loss": 0.5389, "num_input_tokens_seen": 57753600, "step": 7050 }, { "epoch": 0.9122625662230263, "grad_norm": 0.6477794051170349, "learning_rate": 3.94337055502083e-05, "loss": 1.0182, "num_input_tokens_seen": 57835520, "step": 7060 }, { "epoch": 0.9135547228324072, "grad_norm": 0.505005955696106, "learning_rate": 3.9406071374971887e-05, "loss": 1.0376, "num_input_tokens_seen": 57917440, "step": 7070 }, { "epoch": 0.9148468794417883, "grad_norm": 1.0744434595108032, "learning_rate": 3.93784108222715e-05, "loss": 0.6095, "num_input_tokens_seen": 57999360, "step": 7080 }, { "epoch": 0.9161390360511694, "grad_norm": 0.518518328666687, "learning_rate": 3.935072394275352e-05, "loss": 1.0714, "num_input_tokens_seen": 58081280, "step": 7090 }, { "epoch": 0.9174311926605505, "grad_norm": 0.5842453241348267, "learning_rate": 3.9323010787112505e-05, "loss": 1.116, "num_input_tokens_seen": 58163200, "step": 7100 }, { "epoch": 0.9187233492699315, "grad_norm": 0.43083658814430237, "learning_rate": 3.929527140609115e-05, "loss": 1.1167, "num_input_tokens_seen": 58245120, "step": 7110 }, { "epoch": 0.9200155058793126, "grad_norm": 2.2373194694519043, "learning_rate": 3.926750585048016e-05, "loss": 0.7755, "num_input_tokens_seen": 58327040, "step": 7120 }, { "epoch": 0.9213076624886937, "grad_norm": 0.5515663623809814, "learning_rate": 3.9239714171118167e-05, "loss": 0.7525, "num_input_tokens_seen": 58408960, "step": 7130 }, { "epoch": 0.9225998190980746, "grad_norm": 0.5348473191261292, "learning_rate": 3.921189641889163e-05, "loss": 0.9602, "num_input_tokens_seen": 58490880, "step": 7140 }, { "epoch": 0.9238919757074557, "grad_norm": 0.4317019581794739, "learning_rate": 3.918405264473476e-05, "loss": 0.7652, "num_input_tokens_seen": 58572800, "step": 7150 }, { "epoch": 0.9251841323168368, "grad_norm": 0.48419129848480225, "learning_rate": 3.9156182899629404e-05, "loss": 0.5639, "num_input_tokens_seen": 58654720, "step": 7160 }, { "epoch": 0.9264762889262178, "grad_norm": 0.33040711283683777, "learning_rate": 3.912828723460495e-05, "loss": 0.8571, "num_input_tokens_seen": 58736640, "step": 7170 }, { "epoch": 0.9277684455355989, "grad_norm": 0.46214792132377625, "learning_rate": 3.9100365700738275e-05, "loss": 0.747, "num_input_tokens_seen": 58818560, "step": 7180 }, { "epoch": 0.92906060214498, "grad_norm": 0.4923725426197052, "learning_rate": 3.907241834915359e-05, "loss": 0.8309, "num_input_tokens_seen": 58900480, "step": 7190 }, { "epoch": 0.9303527587543611, "grad_norm": 0.5313102006912231, "learning_rate": 3.904444523102242e-05, "loss": 0.9932, "num_input_tokens_seen": 58982400, "step": 7200 }, { "epoch": 0.931644915363742, "grad_norm": 0.5704349875450134, "learning_rate": 3.901644639756342e-05, "loss": 0.9913, "num_input_tokens_seen": 59064320, "step": 7210 }, { "epoch": 0.9329370719731231, "grad_norm": 0.48948806524276733, "learning_rate": 3.898842190004235e-05, "loss": 0.9762, "num_input_tokens_seen": 59146240, "step": 7220 }, { "epoch": 0.9342292285825042, "grad_norm": 0.6252711415290833, "learning_rate": 3.896037178977196e-05, "loss": 0.6812, "num_input_tokens_seen": 59228160, "step": 7230 }, { "epoch": 0.9355213851918852, "grad_norm": 0.7012498378753662, "learning_rate": 3.893229611811192e-05, "loss": 1.3312, "num_input_tokens_seen": 59310080, "step": 7240 }, { "epoch": 0.9368135418012663, "grad_norm": 0.5096800923347473, "learning_rate": 3.8904194936468665e-05, "loss": 1.0935, "num_input_tokens_seen": 59392000, "step": 7250 }, { "epoch": 0.9381056984106474, "grad_norm": 0.47949451208114624, "learning_rate": 3.887606829629536e-05, "loss": 0.7918, "num_input_tokens_seen": 59473920, "step": 7260 }, { "epoch": 0.9393978550200285, "grad_norm": 0.256607323884964, "learning_rate": 3.884791624909178e-05, "loss": 0.8013, "num_input_tokens_seen": 59555840, "step": 7270 }, { "epoch": 0.9406900116294095, "grad_norm": 0.45952385663986206, "learning_rate": 3.881973884640422e-05, "loss": 1.0558, "num_input_tokens_seen": 59637760, "step": 7280 }, { "epoch": 0.9419821682387906, "grad_norm": 0.8293887376785278, "learning_rate": 3.87915361398254e-05, "loss": 1.1583, "num_input_tokens_seen": 59719680, "step": 7290 }, { "epoch": 0.9432743248481716, "grad_norm": 0.6213748455047607, "learning_rate": 3.8763308180994384e-05, "loss": 0.4953, "num_input_tokens_seen": 59801600, "step": 7300 }, { "epoch": 0.9445664814575526, "grad_norm": 0.7034590244293213, "learning_rate": 3.873505502159645e-05, "loss": 1.1762, "num_input_tokens_seen": 59883520, "step": 7310 }, { "epoch": 0.9458586380669337, "grad_norm": 0.4893646240234375, "learning_rate": 3.8706776713363025e-05, "loss": 0.7593, "num_input_tokens_seen": 59965440, "step": 7320 }, { "epoch": 0.9471507946763148, "grad_norm": 0.7708596587181091, "learning_rate": 3.86784733080716e-05, "loss": 0.6101, "num_input_tokens_seen": 60047360, "step": 7330 }, { "epoch": 0.9484429512856958, "grad_norm": 0.541378915309906, "learning_rate": 3.86501448575456e-05, "loss": 1.1621, "num_input_tokens_seen": 60129280, "step": 7340 }, { "epoch": 0.9497351078950769, "grad_norm": 0.46613985300064087, "learning_rate": 3.862179141365431e-05, "loss": 0.6934, "num_input_tokens_seen": 60211200, "step": 7350 }, { "epoch": 0.951027264504458, "grad_norm": 0.37554696202278137, "learning_rate": 3.859341302831279e-05, "loss": 1.064, "num_input_tokens_seen": 60293120, "step": 7360 }, { "epoch": 0.952319421113839, "grad_norm": 0.7480294108390808, "learning_rate": 3.856500975348176e-05, "loss": 0.9418, "num_input_tokens_seen": 60375040, "step": 7370 }, { "epoch": 0.95361157772322, "grad_norm": 0.5717000961303711, "learning_rate": 3.8536581641167506e-05, "loss": 0.8951, "num_input_tokens_seen": 60456960, "step": 7380 }, { "epoch": 0.9549037343326011, "grad_norm": 0.45350784063339233, "learning_rate": 3.85081287434218e-05, "loss": 0.9263, "num_input_tokens_seen": 60538880, "step": 7390 }, { "epoch": 0.9561958909419822, "grad_norm": 0.839464545249939, "learning_rate": 3.84796511123418e-05, "loss": 1.1756, "num_input_tokens_seen": 60620800, "step": 7400 }, { "epoch": 0.9574880475513632, "grad_norm": 0.5966989994049072, "learning_rate": 3.845114880006994e-05, "loss": 0.9992, "num_input_tokens_seen": 60702720, "step": 7410 }, { "epoch": 0.9587802041607443, "grad_norm": 0.5134586095809937, "learning_rate": 3.842262185879384e-05, "loss": 1.1923, "num_input_tokens_seen": 60784640, "step": 7420 }, { "epoch": 0.9600723607701254, "grad_norm": 0.2532576024532318, "learning_rate": 3.8394070340746234e-05, "loss": 0.828, "num_input_tokens_seen": 60866560, "step": 7430 }, { "epoch": 0.9613645173795063, "grad_norm": 0.4352121949195862, "learning_rate": 3.836549429820485e-05, "loss": 0.9765, "num_input_tokens_seen": 60948480, "step": 7440 }, { "epoch": 0.9626566739888874, "grad_norm": 0.5217087864875793, "learning_rate": 3.833689378349231e-05, "loss": 0.9062, "num_input_tokens_seen": 61030400, "step": 7450 }, { "epoch": 0.9639488305982685, "grad_norm": 0.5556167960166931, "learning_rate": 3.830826884897606e-05, "loss": 1.0222, "num_input_tokens_seen": 61112320, "step": 7460 }, { "epoch": 0.9652409872076496, "grad_norm": 0.4434933662414551, "learning_rate": 3.827961954706825e-05, "loss": 1.118, "num_input_tokens_seen": 61194240, "step": 7470 }, { "epoch": 0.9665331438170306, "grad_norm": 0.8783986568450928, "learning_rate": 3.825094593022563e-05, "loss": 1.078, "num_input_tokens_seen": 61276160, "step": 7480 }, { "epoch": 0.9678253004264117, "grad_norm": 0.26476526260375977, "learning_rate": 3.8222248050949505e-05, "loss": 0.665, "num_input_tokens_seen": 61358080, "step": 7490 }, { "epoch": 0.9691174570357928, "grad_norm": 0.5986683368682861, "learning_rate": 3.8193525961785584e-05, "loss": 0.9969, "num_input_tokens_seen": 61440000, "step": 7500 }, { "epoch": 0.9704096136451738, "grad_norm": 0.2270795851945877, "learning_rate": 3.8164779715323905e-05, "loss": 0.9114, "num_input_tokens_seen": 61521920, "step": 7510 }, { "epoch": 0.9717017702545548, "grad_norm": 0.780129075050354, "learning_rate": 3.813600936419874e-05, "loss": 0.7856, "num_input_tokens_seen": 61603840, "step": 7520 }, { "epoch": 0.9729939268639359, "grad_norm": 0.5222942233085632, "learning_rate": 3.81072149610885e-05, "loss": 1.0216, "num_input_tokens_seen": 61685760, "step": 7530 }, { "epoch": 0.9742860834733169, "grad_norm": 0.5253265500068665, "learning_rate": 3.807839655871563e-05, "loss": 0.7388, "num_input_tokens_seen": 61767680, "step": 7540 }, { "epoch": 0.975578240082698, "grad_norm": 0.46493276953697205, "learning_rate": 3.8049554209846514e-05, "loss": 0.6208, "num_input_tokens_seen": 61849600, "step": 7550 }, { "epoch": 0.9768703966920791, "grad_norm": 0.7733160853385925, "learning_rate": 3.802068796729139e-05, "loss": 1.1114, "num_input_tokens_seen": 61931520, "step": 7560 }, { "epoch": 0.9781625533014602, "grad_norm": 0.21577703952789307, "learning_rate": 3.7991797883904254e-05, "loss": 0.9243, "num_input_tokens_seen": 62013440, "step": 7570 }, { "epoch": 0.9794547099108412, "grad_norm": 0.6388958692550659, "learning_rate": 3.796288401258272e-05, "loss": 0.8196, "num_input_tokens_seen": 62095360, "step": 7580 }, { "epoch": 0.9807468665202222, "grad_norm": 0.2966822683811188, "learning_rate": 3.7933946406268e-05, "loss": 0.8933, "num_input_tokens_seen": 62177280, "step": 7590 }, { "epoch": 0.9820390231296033, "grad_norm": 0.5425664186477661, "learning_rate": 3.790498511794473e-05, "loss": 0.9035, "num_input_tokens_seen": 62259200, "step": 7600 }, { "epoch": 0.9833311797389843, "grad_norm": 0.47928035259246826, "learning_rate": 3.787600020064095e-05, "loss": 0.9369, "num_input_tokens_seen": 62341120, "step": 7610 }, { "epoch": 0.9846233363483654, "grad_norm": 0.4411972463130951, "learning_rate": 3.7846991707427905e-05, "loss": 0.7782, "num_input_tokens_seen": 62423040, "step": 7620 }, { "epoch": 0.9859154929577465, "grad_norm": 0.5929402112960815, "learning_rate": 3.7817959691420056e-05, "loss": 0.8775, "num_input_tokens_seen": 62504960, "step": 7630 }, { "epoch": 0.9872076495671276, "grad_norm": 0.5183133482933044, "learning_rate": 3.778890420577492e-05, "loss": 0.7959, "num_input_tokens_seen": 62586880, "step": 7640 }, { "epoch": 0.9884998061765086, "grad_norm": 0.3291929364204407, "learning_rate": 3.775982530369298e-05, "loss": 1.0962, "num_input_tokens_seen": 62668800, "step": 7650 }, { "epoch": 0.9897919627858897, "grad_norm": 0.5607266426086426, "learning_rate": 3.77307230384176e-05, "loss": 1.1062, "num_input_tokens_seen": 62750720, "step": 7660 }, { "epoch": 0.9910841193952707, "grad_norm": 0.6233817338943481, "learning_rate": 3.7701597463234916e-05, "loss": 0.6531, "num_input_tokens_seen": 62832640, "step": 7670 }, { "epoch": 0.9923762760046517, "grad_norm": 0.5039672255516052, "learning_rate": 3.767244863147377e-05, "loss": 0.8184, "num_input_tokens_seen": 62914560, "step": 7680 }, { "epoch": 0.9936684326140328, "grad_norm": 0.5114099979400635, "learning_rate": 3.764327659650553e-05, "loss": 1.1191, "num_input_tokens_seen": 62996480, "step": 7690 }, { "epoch": 0.9949605892234139, "grad_norm": 0.6352154612541199, "learning_rate": 3.7614081411744116e-05, "loss": 1.1411, "num_input_tokens_seen": 63078400, "step": 7700 }, { "epoch": 0.9962527458327949, "grad_norm": 0.5263626575469971, "learning_rate": 3.75848631306458e-05, "loss": 1.0581, "num_input_tokens_seen": 63160320, "step": 7710 }, { "epoch": 0.997544902442176, "grad_norm": 0.5167150497436523, "learning_rate": 3.755562180670914e-05, "loss": 0.8535, "num_input_tokens_seen": 63242240, "step": 7720 }, { "epoch": 0.9988370590515571, "grad_norm": 0.5089849829673767, "learning_rate": 3.75263574934749e-05, "loss": 0.7162, "num_input_tokens_seen": 63324160, "step": 7730 }, { "epoch": 1.000129215660938, "grad_norm": 0.412061870098114, "learning_rate": 3.7497070244525925e-05, "loss": 0.6882, "num_input_tokens_seen": 63406080, "step": 7740 }, { "epoch": 1.0014213722703191, "grad_norm": 0.6777093410491943, "learning_rate": 3.746776011348706e-05, "loss": 1.1799, "num_input_tokens_seen": 63488000, "step": 7750 }, { "epoch": 1.0027135288797002, "grad_norm": 0.5311883091926575, "learning_rate": 3.7438427154025045e-05, "loss": 0.5552, "num_input_tokens_seen": 63569920, "step": 7760 }, { "epoch": 1.0040056854890813, "grad_norm": 0.5286181569099426, "learning_rate": 3.7409071419848436e-05, "loss": 1.1324, "num_input_tokens_seen": 63651840, "step": 7770 }, { "epoch": 1.0052978420984624, "grad_norm": 0.5085585117340088, "learning_rate": 3.7379692964707456e-05, "loss": 0.9001, "num_input_tokens_seen": 63733760, "step": 7780 }, { "epoch": 1.0065899987078435, "grad_norm": 0.3858306109905243, "learning_rate": 3.735029184239396e-05, "loss": 1.1056, "num_input_tokens_seen": 63815680, "step": 7790 }, { "epoch": 1.0078821553172244, "grad_norm": 0.5353860855102539, "learning_rate": 3.73208681067413e-05, "loss": 0.9637, "num_input_tokens_seen": 63897600, "step": 7800 }, { "epoch": 1.0091743119266054, "grad_norm": 0.3585314452648163, "learning_rate": 3.7291421811624216e-05, "loss": 0.6649, "num_input_tokens_seen": 63979520, "step": 7810 }, { "epoch": 1.0104664685359865, "grad_norm": 0.515984296798706, "learning_rate": 3.726195301095877e-05, "loss": 0.5021, "num_input_tokens_seen": 64061440, "step": 7820 }, { "epoch": 1.0117586251453676, "grad_norm": 0.5484461784362793, "learning_rate": 3.7232461758702244e-05, "loss": 0.9135, "num_input_tokens_seen": 64143360, "step": 7830 }, { "epoch": 1.0130507817547487, "grad_norm": 0.6354055404663086, "learning_rate": 3.7202948108852984e-05, "loss": 0.6548, "num_input_tokens_seen": 64225280, "step": 7840 }, { "epoch": 1.0143429383641298, "grad_norm": 0.6655207276344299, "learning_rate": 3.717341211545039e-05, "loss": 0.6679, "num_input_tokens_seen": 64307200, "step": 7850 }, { "epoch": 1.015635094973511, "grad_norm": 0.5719141364097595, "learning_rate": 3.714385383257477e-05, "loss": 1.115, "num_input_tokens_seen": 64389120, "step": 7860 }, { "epoch": 1.0169272515828918, "grad_norm": 0.6638296842575073, "learning_rate": 3.711427331434721e-05, "loss": 0.7203, "num_input_tokens_seen": 64471040, "step": 7870 }, { "epoch": 1.0182194081922729, "grad_norm": 0.5564717650413513, "learning_rate": 3.7084670614929554e-05, "loss": 0.7494, "num_input_tokens_seen": 64552960, "step": 7880 }, { "epoch": 1.019511564801654, "grad_norm": 0.42404186725616455, "learning_rate": 3.7055045788524214e-05, "loss": 0.7702, "num_input_tokens_seen": 64634880, "step": 7890 }, { "epoch": 1.020803721411035, "grad_norm": 0.5771876573562622, "learning_rate": 3.702539888937414e-05, "loss": 1.0498, "num_input_tokens_seen": 64716800, "step": 7900 }, { "epoch": 1.0220958780204161, "grad_norm": 0.6172158718109131, "learning_rate": 3.699572997176272e-05, "loss": 0.8016, "num_input_tokens_seen": 64798720, "step": 7910 }, { "epoch": 1.0233880346297972, "grad_norm": 0.6441952586174011, "learning_rate": 3.696603909001361e-05, "loss": 0.741, "num_input_tokens_seen": 64880640, "step": 7920 }, { "epoch": 1.024680191239178, "grad_norm": 0.5712997913360596, "learning_rate": 3.69363262984907e-05, "loss": 0.7687, "num_input_tokens_seen": 64962560, "step": 7930 }, { "epoch": 1.0259723478485592, "grad_norm": 0.5301061272621155, "learning_rate": 3.690659165159803e-05, "loss": 1.0326, "num_input_tokens_seen": 65044480, "step": 7940 }, { "epoch": 1.0272645044579403, "grad_norm": 0.4970210790634155, "learning_rate": 3.6876835203779615e-05, "loss": 0.961, "num_input_tokens_seen": 65126400, "step": 7950 }, { "epoch": 1.0285566610673214, "grad_norm": 0.5584379434585571, "learning_rate": 3.68470570095194e-05, "loss": 0.8982, "num_input_tokens_seen": 65208320, "step": 7960 }, { "epoch": 1.0298488176767024, "grad_norm": 0.6428322792053223, "learning_rate": 3.681725712334115e-05, "loss": 0.8534, "num_input_tokens_seen": 65290240, "step": 7970 }, { "epoch": 1.0311409742860835, "grad_norm": 0.2095443606376648, "learning_rate": 3.678743559980835e-05, "loss": 0.6313, "num_input_tokens_seen": 65372160, "step": 7980 }, { "epoch": 1.0324331308954646, "grad_norm": 0.6237980127334595, "learning_rate": 3.67575924935241e-05, "loss": 0.9277, "num_input_tokens_seen": 65454080, "step": 7990 }, { "epoch": 1.0337252875048455, "grad_norm": 0.5029511451721191, "learning_rate": 3.672772785913102e-05, "loss": 0.8789, "num_input_tokens_seen": 65536000, "step": 8000 }, { "epoch": 1.0350174441142266, "grad_norm": 0.2065262496471405, "learning_rate": 3.669784175131115e-05, "loss": 0.4837, "num_input_tokens_seen": 65617920, "step": 8010 }, { "epoch": 1.0363096007236077, "grad_norm": 0.44190657138824463, "learning_rate": 3.666793422478583e-05, "loss": 0.7244, "num_input_tokens_seen": 65699840, "step": 8020 }, { "epoch": 1.0376017573329888, "grad_norm": 0.49034252762794495, "learning_rate": 3.663800533431564e-05, "loss": 0.63, "num_input_tokens_seen": 65781760, "step": 8030 }, { "epoch": 1.0388939139423699, "grad_norm": 0.6004568934440613, "learning_rate": 3.660805513470027e-05, "loss": 1.0153, "num_input_tokens_seen": 65863680, "step": 8040 }, { "epoch": 1.040186070551751, "grad_norm": 0.49091774225234985, "learning_rate": 3.657808368077843e-05, "loss": 0.7835, "num_input_tokens_seen": 65945600, "step": 8050 }, { "epoch": 1.041478227161132, "grad_norm": 0.46571245789527893, "learning_rate": 3.654809102742773e-05, "loss": 0.7988, "num_input_tokens_seen": 66027520, "step": 8060 }, { "epoch": 1.042770383770513, "grad_norm": 0.7933652400970459, "learning_rate": 3.651807722956462e-05, "loss": 1.3038, "num_input_tokens_seen": 66109440, "step": 8070 }, { "epoch": 1.044062540379894, "grad_norm": 0.584967851638794, "learning_rate": 3.648804234214425e-05, "loss": 0.7774, "num_input_tokens_seen": 66191360, "step": 8080 }, { "epoch": 1.045354696989275, "grad_norm": 0.49486130475997925, "learning_rate": 3.645798642016039e-05, "loss": 0.7951, "num_input_tokens_seen": 66273280, "step": 8090 }, { "epoch": 1.0466468535986562, "grad_norm": 0.5906932353973389, "learning_rate": 3.642790951864532e-05, "loss": 0.7105, "num_input_tokens_seen": 66355200, "step": 8100 }, { "epoch": 1.0479390102080373, "grad_norm": 0.6410627961158752, "learning_rate": 3.639781169266975e-05, "loss": 0.644, "num_input_tokens_seen": 66437120, "step": 8110 }, { "epoch": 1.0492311668174183, "grad_norm": 0.8290396332740784, "learning_rate": 3.636769299734267e-05, "loss": 0.7002, "num_input_tokens_seen": 66519040, "step": 8120 }, { "epoch": 1.0505233234267992, "grad_norm": 0.5405248403549194, "learning_rate": 3.63375534878113e-05, "loss": 0.9435, "num_input_tokens_seen": 66600960, "step": 8130 }, { "epoch": 1.0518154800361803, "grad_norm": 0.4574602544307709, "learning_rate": 3.6307393219261e-05, "loss": 0.8839, "num_input_tokens_seen": 66682880, "step": 8140 }, { "epoch": 1.0531076366455614, "grad_norm": 0.5063459873199463, "learning_rate": 3.627721224691507e-05, "loss": 0.8676, "num_input_tokens_seen": 66764800, "step": 8150 }, { "epoch": 1.0543997932549425, "grad_norm": 0.5998566150665283, "learning_rate": 3.6247010626034795e-05, "loss": 0.6555, "num_input_tokens_seen": 66846720, "step": 8160 }, { "epoch": 1.0556919498643236, "grad_norm": 0.6061252951622009, "learning_rate": 3.621678841191922e-05, "loss": 0.8207, "num_input_tokens_seen": 66928640, "step": 8170 }, { "epoch": 1.0569841064737047, "grad_norm": 0.6531092524528503, "learning_rate": 3.618654565990511e-05, "loss": 1.0707, "num_input_tokens_seen": 67010560, "step": 8180 }, { "epoch": 1.0582762630830858, "grad_norm": 0.7719940543174744, "learning_rate": 3.615628242536682e-05, "loss": 0.7523, "num_input_tokens_seen": 67092480, "step": 8190 }, { "epoch": 1.0595684196924666, "grad_norm": 0.28535163402557373, "learning_rate": 3.612599876371625e-05, "loss": 0.7847, "num_input_tokens_seen": 67174400, "step": 8200 }, { "epoch": 1.0608605763018477, "grad_norm": 0.5150948166847229, "learning_rate": 3.609569473040265e-05, "loss": 1.0002, "num_input_tokens_seen": 67256320, "step": 8210 }, { "epoch": 1.0621527329112288, "grad_norm": 0.47803130745887756, "learning_rate": 3.6065370380912587e-05, "loss": 0.9216, "num_input_tokens_seen": 67338240, "step": 8220 }, { "epoch": 1.06344488952061, "grad_norm": 0.5799218416213989, "learning_rate": 3.603502577076986e-05, "loss": 0.9941, "num_input_tokens_seen": 67420160, "step": 8230 }, { "epoch": 1.064737046129991, "grad_norm": 0.23019935190677643, "learning_rate": 3.600466095553532e-05, "loss": 0.6576, "num_input_tokens_seen": 67502080, "step": 8240 }, { "epoch": 1.066029202739372, "grad_norm": 0.4924595057964325, "learning_rate": 3.5974275990806846e-05, "loss": 1.2263, "num_input_tokens_seen": 67584000, "step": 8250 }, { "epoch": 1.0673213593487532, "grad_norm": 0.6706606149673462, "learning_rate": 3.5943870932219184e-05, "loss": 0.7686, "num_input_tokens_seen": 67665920, "step": 8260 }, { "epoch": 1.068613515958134, "grad_norm": 0.4998605251312256, "learning_rate": 3.59134458354439e-05, "loss": 0.9912, "num_input_tokens_seen": 67747840, "step": 8270 }, { "epoch": 1.0699056725675151, "grad_norm": 0.5857017040252686, "learning_rate": 3.588300075618922e-05, "loss": 0.7774, "num_input_tokens_seen": 67829760, "step": 8280 }, { "epoch": 1.0711978291768962, "grad_norm": 0.23385043442249298, "learning_rate": 3.5852535750199977e-05, "loss": 1.002, "num_input_tokens_seen": 67911680, "step": 8290 }, { "epoch": 1.0724899857862773, "grad_norm": 0.5875013470649719, "learning_rate": 3.5822050873257494e-05, "loss": 0.767, "num_input_tokens_seen": 67993600, "step": 8300 }, { "epoch": 1.0737821423956584, "grad_norm": 0.5456327795982361, "learning_rate": 3.579154618117946e-05, "loss": 1.0006, "num_input_tokens_seen": 68075520, "step": 8310 }, { "epoch": 1.0750742990050395, "grad_norm": 0.2565229535102844, "learning_rate": 3.576102172981986e-05, "loss": 0.4659, "num_input_tokens_seen": 68157440, "step": 8320 }, { "epoch": 1.0763664556144206, "grad_norm": 0.510844886302948, "learning_rate": 3.5730477575068845e-05, "loss": 0.9332, "num_input_tokens_seen": 68239360, "step": 8330 }, { "epoch": 1.0776586122238014, "grad_norm": 0.5561206936836243, "learning_rate": 3.5699913772852664e-05, "loss": 0.7617, "num_input_tokens_seen": 68321280, "step": 8340 }, { "epoch": 1.0789507688331825, "grad_norm": 0.6247846484184265, "learning_rate": 3.566933037913351e-05, "loss": 1.1367, "num_input_tokens_seen": 68403200, "step": 8350 }, { "epoch": 1.0802429254425636, "grad_norm": 0.8798884749412537, "learning_rate": 3.5638727449909473e-05, "loss": 0.6604, "num_input_tokens_seen": 68485120, "step": 8360 }, { "epoch": 1.0815350820519447, "grad_norm": 0.44991663098335266, "learning_rate": 3.560810504121441e-05, "loss": 1.0806, "num_input_tokens_seen": 68567040, "step": 8370 }, { "epoch": 1.0828272386613258, "grad_norm": 0.3702957332134247, "learning_rate": 3.5577463209117833e-05, "loss": 0.7424, "num_input_tokens_seen": 68648960, "step": 8380 }, { "epoch": 1.0841193952707069, "grad_norm": 0.8632538318634033, "learning_rate": 3.554680200972482e-05, "loss": 0.78, "num_input_tokens_seen": 68730880, "step": 8390 }, { "epoch": 1.085411551880088, "grad_norm": 0.5223381519317627, "learning_rate": 3.551612149917593e-05, "loss": 1.0331, "num_input_tokens_seen": 68812800, "step": 8400 }, { "epoch": 1.0867037084894688, "grad_norm": 0.6399882435798645, "learning_rate": 3.548542173364705e-05, "loss": 1.3448, "num_input_tokens_seen": 68894720, "step": 8410 }, { "epoch": 1.08799586509885, "grad_norm": 0.8672426342964172, "learning_rate": 3.545470276934934e-05, "loss": 0.9444, "num_input_tokens_seen": 68976640, "step": 8420 }, { "epoch": 1.089288021708231, "grad_norm": 0.998540461063385, "learning_rate": 3.542396466252913e-05, "loss": 0.6718, "num_input_tokens_seen": 69058560, "step": 8430 }, { "epoch": 1.0905801783176121, "grad_norm": 0.8492835760116577, "learning_rate": 3.539320746946775e-05, "loss": 0.8492, "num_input_tokens_seen": 69140480, "step": 8440 }, { "epoch": 1.0918723349269932, "grad_norm": 0.47759363055229187, "learning_rate": 3.5362431246481536e-05, "loss": 0.5818, "num_input_tokens_seen": 69222400, "step": 8450 }, { "epoch": 1.0931644915363743, "grad_norm": 0.5898222327232361, "learning_rate": 3.533163604992163e-05, "loss": 0.8846, "num_input_tokens_seen": 69304320, "step": 8460 }, { "epoch": 1.0944566481457552, "grad_norm": 0.672615110874176, "learning_rate": 3.5300821936173926e-05, "loss": 0.8992, "num_input_tokens_seen": 69386240, "step": 8470 }, { "epoch": 1.0957488047551363, "grad_norm": 0.265041321516037, "learning_rate": 3.526998896165894e-05, "loss": 0.9501, "num_input_tokens_seen": 69468160, "step": 8480 }, { "epoch": 1.0970409613645173, "grad_norm": 0.5370000600814819, "learning_rate": 3.523913718283175e-05, "loss": 1.1183, "num_input_tokens_seen": 69550080, "step": 8490 }, { "epoch": 1.0983331179738984, "grad_norm": 0.6693633794784546, "learning_rate": 3.520826665618184e-05, "loss": 0.9322, "num_input_tokens_seen": 69632000, "step": 8500 }, { "epoch": 1.0996252745832795, "grad_norm": 0.7608528733253479, "learning_rate": 3.5177377438233044e-05, "loss": 0.6564, "num_input_tokens_seen": 69713920, "step": 8510 }, { "epoch": 1.1009174311926606, "grad_norm": 0.5762624740600586, "learning_rate": 3.514646958554339e-05, "loss": 1.0945, "num_input_tokens_seen": 69795840, "step": 8520 }, { "epoch": 1.1022095878020417, "grad_norm": 0.7134038209915161, "learning_rate": 3.511554315470507e-05, "loss": 0.8922, "num_input_tokens_seen": 69877760, "step": 8530 }, { "epoch": 1.1035017444114226, "grad_norm": 0.6153433322906494, "learning_rate": 3.508459820234423e-05, "loss": 0.9603, "num_input_tokens_seen": 69959680, "step": 8540 }, { "epoch": 1.1047939010208037, "grad_norm": 0.765910267829895, "learning_rate": 3.5053634785121e-05, "loss": 0.8906, "num_input_tokens_seen": 70041600, "step": 8550 }, { "epoch": 1.1060860576301847, "grad_norm": 0.4776328206062317, "learning_rate": 3.5022652959729266e-05, "loss": 0.7746, "num_input_tokens_seen": 70123520, "step": 8560 }, { "epoch": 1.1073782142395658, "grad_norm": 0.5070037245750427, "learning_rate": 3.499165278289663e-05, "loss": 0.7997, "num_input_tokens_seen": 70205440, "step": 8570 }, { "epoch": 1.108670370848947, "grad_norm": 0.759103000164032, "learning_rate": 3.496063431138431e-05, "loss": 0.7416, "num_input_tokens_seen": 70287360, "step": 8580 }, { "epoch": 1.109962527458328, "grad_norm": 0.6531251668930054, "learning_rate": 3.492959760198702e-05, "loss": 0.7489, "num_input_tokens_seen": 70369280, "step": 8590 }, { "epoch": 1.111254684067709, "grad_norm": 0.34367960691452026, "learning_rate": 3.489854271153285e-05, "loss": 0.8175, "num_input_tokens_seen": 70451200, "step": 8600 }, { "epoch": 1.11254684067709, "grad_norm": 0.7481246590614319, "learning_rate": 3.4867469696883204e-05, "loss": 0.6624, "num_input_tokens_seen": 70533120, "step": 8610 }, { "epoch": 1.113838997286471, "grad_norm": 0.6177128553390503, "learning_rate": 3.483637861493264e-05, "loss": 0.8943, "num_input_tokens_seen": 70615040, "step": 8620 }, { "epoch": 1.1151311538958522, "grad_norm": 0.699070930480957, "learning_rate": 3.480526952260884e-05, "loss": 0.9308, "num_input_tokens_seen": 70696960, "step": 8630 }, { "epoch": 1.1164233105052332, "grad_norm": 0.6072255969047546, "learning_rate": 3.477414247687241e-05, "loss": 0.6464, "num_input_tokens_seen": 70778880, "step": 8640 }, { "epoch": 1.1177154671146143, "grad_norm": 0.5646083950996399, "learning_rate": 3.4742997534716884e-05, "loss": 0.6793, "num_input_tokens_seen": 70860800, "step": 8650 }, { "epoch": 1.1190076237239954, "grad_norm": 0.571111798286438, "learning_rate": 3.471183475316851e-05, "loss": 0.95, "num_input_tokens_seen": 70942720, "step": 8660 }, { "epoch": 1.1202997803333763, "grad_norm": 0.2792120575904846, "learning_rate": 3.468065418928625e-05, "loss": 0.8991, "num_input_tokens_seen": 71024640, "step": 8670 }, { "epoch": 1.1215919369427574, "grad_norm": 0.8360409736633301, "learning_rate": 3.4649455900161596e-05, "loss": 1.0195, "num_input_tokens_seen": 71106560, "step": 8680 }, { "epoch": 1.1228840935521385, "grad_norm": 0.5351372957229614, "learning_rate": 3.461823994291849e-05, "loss": 1.0533, "num_input_tokens_seen": 71188480, "step": 8690 }, { "epoch": 1.1241762501615196, "grad_norm": 1.5099799633026123, "learning_rate": 3.458700637471325e-05, "loss": 0.7323, "num_input_tokens_seen": 71270400, "step": 8700 }, { "epoch": 1.1254684067709007, "grad_norm": 0.3392684757709503, "learning_rate": 3.455575525273442e-05, "loss": 0.4897, "num_input_tokens_seen": 71352320, "step": 8710 }, { "epoch": 1.1267605633802817, "grad_norm": 0.29004672169685364, "learning_rate": 3.4524486634202685e-05, "loss": 0.9862, "num_input_tokens_seen": 71434240, "step": 8720 }, { "epoch": 1.1280527199896628, "grad_norm": 0.9040658473968506, "learning_rate": 3.4493200576370776e-05, "loss": 0.7261, "num_input_tokens_seen": 71516160, "step": 8730 }, { "epoch": 1.1293448765990437, "grad_norm": 0.509054958820343, "learning_rate": 3.4461897136523356e-05, "loss": 0.8157, "num_input_tokens_seen": 71598080, "step": 8740 }, { "epoch": 1.1306370332084248, "grad_norm": 0.5921317338943481, "learning_rate": 3.44305763719769e-05, "loss": 1.0178, "num_input_tokens_seen": 71680000, "step": 8750 }, { "epoch": 1.1319291898178059, "grad_norm": 0.7024086117744446, "learning_rate": 3.4399238340079607e-05, "loss": 0.9631, "num_input_tokens_seen": 71761920, "step": 8760 }, { "epoch": 1.133221346427187, "grad_norm": 0.2952291667461395, "learning_rate": 3.4367883098211316e-05, "loss": 0.7918, "num_input_tokens_seen": 71843840, "step": 8770 }, { "epoch": 1.134513503036568, "grad_norm": 0.6267214417457581, "learning_rate": 3.4336510703783345e-05, "loss": 0.8197, "num_input_tokens_seen": 71925760, "step": 8780 }, { "epoch": 1.1358056596459492, "grad_norm": 0.2442624568939209, "learning_rate": 3.4305121214238446e-05, "loss": 0.6943, "num_input_tokens_seen": 72007680, "step": 8790 }, { "epoch": 1.1370978162553302, "grad_norm": 0.6830674409866333, "learning_rate": 3.427371468705065e-05, "loss": 0.9242, "num_input_tokens_seen": 72089600, "step": 8800 }, { "epoch": 1.138389972864711, "grad_norm": 0.41818463802337646, "learning_rate": 3.42422911797252e-05, "loss": 0.8938, "num_input_tokens_seen": 72171520, "step": 8810 }, { "epoch": 1.1396821294740922, "grad_norm": 0.5191003680229187, "learning_rate": 3.4210850749798415e-05, "loss": 1.0008, "num_input_tokens_seen": 72253440, "step": 8820 }, { "epoch": 1.1409742860834733, "grad_norm": 0.6296160817146301, "learning_rate": 3.417939345483762e-05, "loss": 0.6786, "num_input_tokens_seen": 72335360, "step": 8830 }, { "epoch": 1.1422664426928544, "grad_norm": 0.4805864989757538, "learning_rate": 3.4147919352440995e-05, "loss": 0.9551, "num_input_tokens_seen": 72417280, "step": 8840 }, { "epoch": 1.1435585993022355, "grad_norm": 0.64635169506073, "learning_rate": 3.411642850023751e-05, "loss": 0.7622, "num_input_tokens_seen": 72499200, "step": 8850 }, { "epoch": 1.1448507559116166, "grad_norm": 0.37850773334503174, "learning_rate": 3.40849209558868e-05, "loss": 0.5537, "num_input_tokens_seen": 72581120, "step": 8860 }, { "epoch": 1.1461429125209976, "grad_norm": 0.836184561252594, "learning_rate": 3.405339677707906e-05, "loss": 0.6828, "num_input_tokens_seen": 72663040, "step": 8870 }, { "epoch": 1.1474350691303785, "grad_norm": 0.5654054880142212, "learning_rate": 3.402185602153495e-05, "loss": 0.8754, "num_input_tokens_seen": 72744960, "step": 8880 }, { "epoch": 1.1487272257397596, "grad_norm": 0.6414130926132202, "learning_rate": 3.3990298747005485e-05, "loss": 1.1836, "num_input_tokens_seen": 72826880, "step": 8890 }, { "epoch": 1.1500193823491407, "grad_norm": 0.5052874088287354, "learning_rate": 3.395872501127191e-05, "loss": 0.9686, "num_input_tokens_seen": 72908800, "step": 8900 }, { "epoch": 1.1513115389585218, "grad_norm": 0.6401844620704651, "learning_rate": 3.392713487214561e-05, "loss": 0.9425, "num_input_tokens_seen": 72990720, "step": 8910 }, { "epoch": 1.1526036955679029, "grad_norm": 0.49744102358818054, "learning_rate": 3.389552838746804e-05, "loss": 0.6426, "num_input_tokens_seen": 73072640, "step": 8920 }, { "epoch": 1.153895852177284, "grad_norm": 0.6056483387947083, "learning_rate": 3.386390561511055e-05, "loss": 0.9311, "num_input_tokens_seen": 73154560, "step": 8930 }, { "epoch": 1.155188008786665, "grad_norm": 0.7190436720848083, "learning_rate": 3.38322666129743e-05, "loss": 0.94, "num_input_tokens_seen": 73236480, "step": 8940 }, { "epoch": 1.156480165396046, "grad_norm": 0.6182407736778259, "learning_rate": 3.380061143899021e-05, "loss": 0.8698, "num_input_tokens_seen": 73318400, "step": 8950 }, { "epoch": 1.157772322005427, "grad_norm": 0.22560502588748932, "learning_rate": 3.376894015111876e-05, "loss": 1.0229, "num_input_tokens_seen": 73400320, "step": 8960 }, { "epoch": 1.159064478614808, "grad_norm": 0.4778119921684265, "learning_rate": 3.373725280735e-05, "loss": 0.7057, "num_input_tokens_seen": 73482240, "step": 8970 }, { "epoch": 1.1603566352241892, "grad_norm": 0.5372010469436646, "learning_rate": 3.3705549465703314e-05, "loss": 0.8812, "num_input_tokens_seen": 73564160, "step": 8980 }, { "epoch": 1.1616487918335703, "grad_norm": 0.6138181090354919, "learning_rate": 3.3673830184227414e-05, "loss": 0.9767, "num_input_tokens_seen": 73646080, "step": 8990 }, { "epoch": 1.1629409484429514, "grad_norm": 0.7862790822982788, "learning_rate": 3.3642095021000184e-05, "loss": 1.0073, "num_input_tokens_seen": 73728000, "step": 9000 }, { "epoch": 1.1642331050523325, "grad_norm": 0.6123846769332886, "learning_rate": 3.36103440341286e-05, "loss": 0.5677, "num_input_tokens_seen": 73809920, "step": 9010 }, { "epoch": 1.1655252616617133, "grad_norm": 0.6137543320655823, "learning_rate": 3.35785772817486e-05, "loss": 0.5504, "num_input_tokens_seen": 73891840, "step": 9020 }, { "epoch": 1.1668174182710944, "grad_norm": 0.6806734204292297, "learning_rate": 3.3546794822024976e-05, "loss": 0.949, "num_input_tokens_seen": 73973760, "step": 9030 }, { "epoch": 1.1681095748804755, "grad_norm": 0.7158621549606323, "learning_rate": 3.351499671315131e-05, "loss": 0.9297, "num_input_tokens_seen": 74055680, "step": 9040 }, { "epoch": 1.1694017314898566, "grad_norm": 0.9826081395149231, "learning_rate": 3.348318301334983e-05, "loss": 0.993, "num_input_tokens_seen": 74137600, "step": 9050 }, { "epoch": 1.1706938880992377, "grad_norm": 0.6197934746742249, "learning_rate": 3.3451353780871286e-05, "loss": 0.6176, "num_input_tokens_seen": 74219520, "step": 9060 }, { "epoch": 1.1719860447086188, "grad_norm": 0.22937451303005219, "learning_rate": 3.341950907399489e-05, "loss": 0.5138, "num_input_tokens_seen": 74301440, "step": 9070 }, { "epoch": 1.1732782013179996, "grad_norm": 0.591966450214386, "learning_rate": 3.338764895102821e-05, "loss": 0.7563, "num_input_tokens_seen": 74383360, "step": 9080 }, { "epoch": 1.1745703579273807, "grad_norm": 0.6814236640930176, "learning_rate": 3.335577347030697e-05, "loss": 1.3017, "num_input_tokens_seen": 74465280, "step": 9090 }, { "epoch": 1.1758625145367618, "grad_norm": 0.9694181084632874, "learning_rate": 3.33238826901951e-05, "loss": 1.1248, "num_input_tokens_seen": 74547200, "step": 9100 }, { "epoch": 1.177154671146143, "grad_norm": 0.24693933129310608, "learning_rate": 3.329197666908447e-05, "loss": 0.6756, "num_input_tokens_seen": 74629120, "step": 9110 }, { "epoch": 1.178446827755524, "grad_norm": 0.5937339663505554, "learning_rate": 3.32600554653949e-05, "loss": 0.9454, "num_input_tokens_seen": 74711040, "step": 9120 }, { "epoch": 1.179738984364905, "grad_norm": 0.7767700552940369, "learning_rate": 3.322811913757401e-05, "loss": 0.863, "num_input_tokens_seen": 74792960, "step": 9130 }, { "epoch": 1.181031140974286, "grad_norm": 0.6188927292823792, "learning_rate": 3.319616774409709e-05, "loss": 0.8522, "num_input_tokens_seen": 74874880, "step": 9140 }, { "epoch": 1.182323297583667, "grad_norm": 0.5785146355628967, "learning_rate": 3.316420134346701e-05, "loss": 0.8277, "num_input_tokens_seen": 74956800, "step": 9150 }, { "epoch": 1.1836154541930481, "grad_norm": 0.6327735781669617, "learning_rate": 3.313221999421415e-05, "loss": 0.8846, "num_input_tokens_seen": 75038720, "step": 9160 }, { "epoch": 1.1849076108024292, "grad_norm": 0.338527113199234, "learning_rate": 3.310022375489623e-05, "loss": 0.6351, "num_input_tokens_seen": 75120640, "step": 9170 }, { "epoch": 1.1861997674118103, "grad_norm": 1.0302643775939941, "learning_rate": 3.306821268409827e-05, "loss": 1.023, "num_input_tokens_seen": 75202560, "step": 9180 }, { "epoch": 1.1874919240211914, "grad_norm": 0.4798928499221802, "learning_rate": 3.30361868404324e-05, "loss": 1.0803, "num_input_tokens_seen": 75284480, "step": 9190 }, { "epoch": 1.1887840806305725, "grad_norm": 0.48873379826545715, "learning_rate": 3.300414628253783e-05, "loss": 0.7852, "num_input_tokens_seen": 75366400, "step": 9200 }, { "epoch": 1.1900762372399534, "grad_norm": 0.8409120440483093, "learning_rate": 3.297209106908072e-05, "loss": 0.8063, "num_input_tokens_seen": 75448320, "step": 9210 }, { "epoch": 1.1913683938493345, "grad_norm": 0.26400497555732727, "learning_rate": 3.294002125875402e-05, "loss": 0.6504, "num_input_tokens_seen": 75530240, "step": 9220 }, { "epoch": 1.1926605504587156, "grad_norm": 0.6408680081367493, "learning_rate": 3.290793691027746e-05, "loss": 0.7654, "num_input_tokens_seen": 75612160, "step": 9230 }, { "epoch": 1.1939527070680966, "grad_norm": 0.8105805516242981, "learning_rate": 3.287583808239735e-05, "loss": 0.7852, "num_input_tokens_seen": 75694080, "step": 9240 }, { "epoch": 1.1952448636774777, "grad_norm": 0.5738030076026917, "learning_rate": 3.284372483388652e-05, "loss": 0.5354, "num_input_tokens_seen": 75776000, "step": 9250 }, { "epoch": 1.1965370202868588, "grad_norm": 0.5625064373016357, "learning_rate": 3.2811597223544234e-05, "loss": 0.8227, "num_input_tokens_seen": 75857920, "step": 9260 }, { "epoch": 1.19782917689624, "grad_norm": 0.5138773918151855, "learning_rate": 3.277945531019601e-05, "loss": 0.9902, "num_input_tokens_seen": 75939840, "step": 9270 }, { "epoch": 1.1991213335056208, "grad_norm": 0.5785451531410217, "learning_rate": 3.274729915269358e-05, "loss": 0.6786, "num_input_tokens_seen": 76021760, "step": 9280 }, { "epoch": 1.2004134901150019, "grad_norm": 0.3385809063911438, "learning_rate": 3.271512880991476e-05, "loss": 0.7933, "num_input_tokens_seen": 76103680, "step": 9290 }, { "epoch": 1.201705646724383, "grad_norm": 0.2932916581630707, "learning_rate": 3.268294434076332e-05, "loss": 0.8867, "num_input_tokens_seen": 76185600, "step": 9300 }, { "epoch": 1.202997803333764, "grad_norm": 0.5734339952468872, "learning_rate": 3.26507458041689e-05, "loss": 0.9081, "num_input_tokens_seen": 76267520, "step": 9310 }, { "epoch": 1.2042899599431451, "grad_norm": 0.6236972808837891, "learning_rate": 3.261853325908691e-05, "loss": 1.1582, "num_input_tokens_seen": 76349440, "step": 9320 }, { "epoch": 1.2055821165525262, "grad_norm": 0.6559344530105591, "learning_rate": 3.2586306764498395e-05, "loss": 1.1172, "num_input_tokens_seen": 76431360, "step": 9330 }, { "epoch": 1.2068742731619073, "grad_norm": 0.5605185627937317, "learning_rate": 3.255406637940996e-05, "loss": 0.8069, "num_input_tokens_seen": 76513280, "step": 9340 }, { "epoch": 1.2081664297712882, "grad_norm": 0.697979748249054, "learning_rate": 3.252181216285363e-05, "loss": 1.0322, "num_input_tokens_seen": 76595200, "step": 9350 }, { "epoch": 1.2094585863806693, "grad_norm": 0.7321066856384277, "learning_rate": 3.2489544173886745e-05, "loss": 1.1227, "num_input_tokens_seen": 76677120, "step": 9360 }, { "epoch": 1.2107507429900504, "grad_norm": 0.623622715473175, "learning_rate": 3.245726247159189e-05, "loss": 0.9295, "num_input_tokens_seen": 76759040, "step": 9370 }, { "epoch": 1.2120428995994315, "grad_norm": 0.35311391949653625, "learning_rate": 3.242496711507673e-05, "loss": 1.0272, "num_input_tokens_seen": 76840960, "step": 9380 }, { "epoch": 1.2133350562088125, "grad_norm": 0.5266684889793396, "learning_rate": 3.239265816347397e-05, "loss": 0.9163, "num_input_tokens_seen": 76922880, "step": 9390 }, { "epoch": 1.2146272128181936, "grad_norm": 0.4497738182544708, "learning_rate": 3.236033567594115e-05, "loss": 0.8623, "num_input_tokens_seen": 77004800, "step": 9400 }, { "epoch": 1.2159193694275747, "grad_norm": 0.5825258493423462, "learning_rate": 3.232799971166064e-05, "loss": 0.9241, "num_input_tokens_seen": 77086720, "step": 9410 }, { "epoch": 1.2172115260369556, "grad_norm": 0.9179586172103882, "learning_rate": 3.2295650329839474e-05, "loss": 0.75, "num_input_tokens_seen": 77168640, "step": 9420 }, { "epoch": 1.2185036826463367, "grad_norm": 0.2291688621044159, "learning_rate": 3.2263287589709255e-05, "loss": 0.6456, "num_input_tokens_seen": 77250560, "step": 9430 }, { "epoch": 1.2197958392557178, "grad_norm": 0.4688994288444519, "learning_rate": 3.2230911550526035e-05, "loss": 0.8976, "num_input_tokens_seen": 77332480, "step": 9440 }, { "epoch": 1.2210879958650989, "grad_norm": 0.5401532649993896, "learning_rate": 3.219852227157022e-05, "loss": 0.9984, "num_input_tokens_seen": 77414400, "step": 9450 }, { "epoch": 1.22238015247448, "grad_norm": 0.7293322086334229, "learning_rate": 3.216611981214648e-05, "loss": 1.0182, "num_input_tokens_seen": 77496320, "step": 9460 }, { "epoch": 1.223672309083861, "grad_norm": 0.19243964552879333, "learning_rate": 3.2133704231583576e-05, "loss": 0.6102, "num_input_tokens_seen": 77578240, "step": 9470 }, { "epoch": 1.2249644656932421, "grad_norm": 0.556209146976471, "learning_rate": 3.210127558923434e-05, "loss": 0.868, "num_input_tokens_seen": 77660160, "step": 9480 }, { "epoch": 1.226256622302623, "grad_norm": 0.29714149236679077, "learning_rate": 3.206883394447547e-05, "loss": 0.9574, "num_input_tokens_seen": 77742080, "step": 9490 }, { "epoch": 1.227548778912004, "grad_norm": 0.4670780599117279, "learning_rate": 3.203637935670752e-05, "loss": 0.852, "num_input_tokens_seen": 77824000, "step": 9500 }, { "epoch": 1.2288409355213852, "grad_norm": 0.5203260779380798, "learning_rate": 3.200391188535472e-05, "loss": 0.7199, "num_input_tokens_seen": 77905920, "step": 9510 }, { "epoch": 1.2301330921307663, "grad_norm": 0.28679078817367554, "learning_rate": 3.197143158986489e-05, "loss": 0.7419, "num_input_tokens_seen": 77987840, "step": 9520 }, { "epoch": 1.2314252487401474, "grad_norm": 0.5256805419921875, "learning_rate": 3.193893852970932e-05, "loss": 0.7953, "num_input_tokens_seen": 78069760, "step": 9530 }, { "epoch": 1.2327174053495285, "grad_norm": 0.8118154406547546, "learning_rate": 3.1906432764382695e-05, "loss": 0.8027, "num_input_tokens_seen": 78151680, "step": 9540 }, { "epoch": 1.2340095619589095, "grad_norm": 0.3833126425743103, "learning_rate": 3.187391435340295e-05, "loss": 0.7832, "num_input_tokens_seen": 78233600, "step": 9550 }, { "epoch": 1.2353017185682904, "grad_norm": 0.3153989315032959, "learning_rate": 3.184138335631118e-05, "loss": 0.7582, "num_input_tokens_seen": 78315520, "step": 9560 }, { "epoch": 1.2365938751776715, "grad_norm": 0.5784737467765808, "learning_rate": 3.1808839832671523e-05, "loss": 1.1442, "num_input_tokens_seen": 78397440, "step": 9570 }, { "epoch": 1.2378860317870526, "grad_norm": 0.6450271010398865, "learning_rate": 3.1776283842071045e-05, "loss": 0.9673, "num_input_tokens_seen": 78479360, "step": 9580 }, { "epoch": 1.2391781883964337, "grad_norm": 0.5860177874565125, "learning_rate": 3.174371544411964e-05, "loss": 0.8106, "num_input_tokens_seen": 78561280, "step": 9590 }, { "epoch": 1.2404703450058148, "grad_norm": 0.6213310956954956, "learning_rate": 3.1711134698449946e-05, "loss": 0.9658, "num_input_tokens_seen": 78643200, "step": 9600 }, { "epoch": 1.2417625016151959, "grad_norm": 0.5813376903533936, "learning_rate": 3.167854166471717e-05, "loss": 0.9531, "num_input_tokens_seen": 78725120, "step": 9610 }, { "epoch": 1.2430546582245767, "grad_norm": 0.3016079366207123, "learning_rate": 3.164593640259904e-05, "loss": 0.8195, "num_input_tokens_seen": 78807040, "step": 9620 }, { "epoch": 1.2443468148339578, "grad_norm": 0.7248777747154236, "learning_rate": 3.161331897179568e-05, "loss": 0.9972, "num_input_tokens_seen": 78888960, "step": 9630 }, { "epoch": 1.245638971443339, "grad_norm": 0.19406473636627197, "learning_rate": 3.1580689432029484e-05, "loss": 0.5308, "num_input_tokens_seen": 78970880, "step": 9640 }, { "epoch": 1.24693112805272, "grad_norm": 0.8860630989074707, "learning_rate": 3.154804784304502e-05, "loss": 1.0639, "num_input_tokens_seen": 79052800, "step": 9650 }, { "epoch": 1.248223284662101, "grad_norm": 0.792770504951477, "learning_rate": 3.151539426460892e-05, "loss": 1.2022, "num_input_tokens_seen": 79134720, "step": 9660 }, { "epoch": 1.2495154412714822, "grad_norm": 0.7376882433891296, "learning_rate": 3.148272875650976e-05, "loss": 0.8717, "num_input_tokens_seen": 79216640, "step": 9670 }, { "epoch": 1.250807597880863, "grad_norm": 0.5317556262016296, "learning_rate": 3.145005137855796e-05, "loss": 0.9134, "num_input_tokens_seen": 79298560, "step": 9680 }, { "epoch": 1.2520997544902444, "grad_norm": 0.2553159296512604, "learning_rate": 3.14173621905857e-05, "loss": 0.671, "num_input_tokens_seen": 79380480, "step": 9690 }, { "epoch": 1.2533919110996252, "grad_norm": 0.6660608649253845, "learning_rate": 3.138466125244674e-05, "loss": 1.0354, "num_input_tokens_seen": 79462400, "step": 9700 }, { "epoch": 1.2546840677090063, "grad_norm": 0.5901172757148743, "learning_rate": 3.13519486240164e-05, "loss": 1.3345, "num_input_tokens_seen": 79544320, "step": 9710 }, { "epoch": 1.2559762243183874, "grad_norm": 0.5691861510276794, "learning_rate": 3.1319224365191366e-05, "loss": 0.7637, "num_input_tokens_seen": 79626240, "step": 9720 }, { "epoch": 1.2572683809277685, "grad_norm": 0.5517555475234985, "learning_rate": 3.128648853588965e-05, "loss": 0.5803, "num_input_tokens_seen": 79708160, "step": 9730 }, { "epoch": 1.2585605375371496, "grad_norm": 0.9708176255226135, "learning_rate": 3.1253741196050425e-05, "loss": 0.4912, "num_input_tokens_seen": 79790080, "step": 9740 }, { "epoch": 1.2598526941465304, "grad_norm": 0.689449667930603, "learning_rate": 3.122098240563396e-05, "loss": 0.7291, "num_input_tokens_seen": 79872000, "step": 9750 }, { "epoch": 1.2611448507559115, "grad_norm": 0.8461759686470032, "learning_rate": 3.118821222462147e-05, "loss": 1.0072, "num_input_tokens_seen": 79953920, "step": 9760 }, { "epoch": 1.2624370073652926, "grad_norm": 0.9328972697257996, "learning_rate": 3.1155430713015034e-05, "loss": 0.8663, "num_input_tokens_seen": 80035840, "step": 9770 }, { "epoch": 1.2637291639746737, "grad_norm": 0.19583484530448914, "learning_rate": 3.1122637930837486e-05, "loss": 0.8375, "num_input_tokens_seen": 80117760, "step": 9780 }, { "epoch": 1.2650213205840548, "grad_norm": 0.22281506657600403, "learning_rate": 3.10898339381323e-05, "loss": 0.7423, "num_input_tokens_seen": 80199680, "step": 9790 }, { "epoch": 1.266313477193436, "grad_norm": 0.6488370299339294, "learning_rate": 3.1057018794963454e-05, "loss": 1.3639, "num_input_tokens_seen": 80281600, "step": 9800 }, { "epoch": 1.267605633802817, "grad_norm": 1.4011627435684204, "learning_rate": 3.102419256141536e-05, "loss": 0.7074, "num_input_tokens_seen": 80363520, "step": 9810 }, { "epoch": 1.2688977904121979, "grad_norm": 0.5548164248466492, "learning_rate": 3.0991355297592734e-05, "loss": 1.1645, "num_input_tokens_seen": 80445440, "step": 9820 }, { "epoch": 1.270189947021579, "grad_norm": 0.8125051856040955, "learning_rate": 3.095850706362047e-05, "loss": 0.699, "num_input_tokens_seen": 80527360, "step": 9830 }, { "epoch": 1.27148210363096, "grad_norm": 0.8188523650169373, "learning_rate": 3.092564791964358e-05, "loss": 1.1145, "num_input_tokens_seen": 80609280, "step": 9840 }, { "epoch": 1.2727742602403411, "grad_norm": 0.553065836429596, "learning_rate": 3.089277792582704e-05, "loss": 0.9243, "num_input_tokens_seen": 80691200, "step": 9850 }, { "epoch": 1.2740664168497222, "grad_norm": 0.6267945170402527, "learning_rate": 3.085989714235568e-05, "loss": 0.8664, "num_input_tokens_seen": 80773120, "step": 9860 }, { "epoch": 1.2753585734591033, "grad_norm": 0.551201343536377, "learning_rate": 3.082700562943409e-05, "loss": 0.8113, "num_input_tokens_seen": 80855040, "step": 9870 }, { "epoch": 1.2766507300684844, "grad_norm": 0.39583033323287964, "learning_rate": 3.079410344728652e-05, "loss": 1.0342, "num_input_tokens_seen": 80936960, "step": 9880 }, { "epoch": 1.2779428866778653, "grad_norm": 0.7506781816482544, "learning_rate": 3.076119065615674e-05, "loss": 0.8064, "num_input_tokens_seen": 81018880, "step": 9890 }, { "epoch": 1.2792350432872464, "grad_norm": 0.6547626852989197, "learning_rate": 3.0728267316307945e-05, "loss": 0.7267, "num_input_tokens_seen": 81100800, "step": 9900 }, { "epoch": 1.2805271998966274, "grad_norm": 0.8237736225128174, "learning_rate": 3.069533348802266e-05, "loss": 0.6364, "num_input_tokens_seen": 81182720, "step": 9910 }, { "epoch": 1.2818193565060085, "grad_norm": 0.5278318524360657, "learning_rate": 3.0662389231602595e-05, "loss": 0.7681, "num_input_tokens_seen": 81264640, "step": 9920 }, { "epoch": 1.2831115131153896, "grad_norm": 0.48821374773979187, "learning_rate": 3.062943460736857e-05, "loss": 0.6366, "num_input_tokens_seen": 81346560, "step": 9930 }, { "epoch": 1.2844036697247707, "grad_norm": 0.6118844747543335, "learning_rate": 3.059646967566038e-05, "loss": 0.8101, "num_input_tokens_seen": 81428480, "step": 9940 }, { "epoch": 1.2856958263341518, "grad_norm": 0.7095397710800171, "learning_rate": 3.0563494496836686e-05, "loss": 0.5936, "num_input_tokens_seen": 81510400, "step": 9950 }, { "epoch": 1.2869879829435327, "grad_norm": 0.8946405053138733, "learning_rate": 3.0530509131274935e-05, "loss": 0.8716, "num_input_tokens_seen": 81592320, "step": 9960 }, { "epoch": 1.2882801395529138, "grad_norm": 1.0983461141586304, "learning_rate": 3.0497513639371195e-05, "loss": 0.7882, "num_input_tokens_seen": 81674240, "step": 9970 }, { "epoch": 1.2895722961622949, "grad_norm": 0.9059731364250183, "learning_rate": 3.04645080815401e-05, "loss": 0.8302, "num_input_tokens_seen": 81756160, "step": 9980 }, { "epoch": 1.290864452771676, "grad_norm": 0.6056944131851196, "learning_rate": 3.04314925182147e-05, "loss": 0.9569, "num_input_tokens_seen": 81838080, "step": 9990 }, { "epoch": 1.292156609381057, "grad_norm": 0.536361575126648, "learning_rate": 3.0398467009846375e-05, "loss": 0.9173, "num_input_tokens_seen": 81920000, "step": 10000 }, { "epoch": 1.2934487659904381, "grad_norm": 0.6062700152397156, "learning_rate": 3.0365431616904714e-05, "loss": 0.9092, "num_input_tokens_seen": 82001920, "step": 10010 }, { "epoch": 1.2947409225998192, "grad_norm": 0.5536112189292908, "learning_rate": 3.03323863998774e-05, "loss": 0.8183, "num_input_tokens_seen": 82083840, "step": 10020 }, { "epoch": 1.2960330792092, "grad_norm": 0.6913787126541138, "learning_rate": 3.02993314192701e-05, "loss": 1.2083, "num_input_tokens_seen": 82165760, "step": 10030 }, { "epoch": 1.2973252358185812, "grad_norm": 0.8038882613182068, "learning_rate": 3.0266266735606358e-05, "loss": 1.1272, "num_input_tokens_seen": 82247680, "step": 10040 }, { "epoch": 1.2986173924279623, "grad_norm": 0.5819315910339355, "learning_rate": 3.0233192409427492e-05, "loss": 0.6936, "num_input_tokens_seen": 82329600, "step": 10050 }, { "epoch": 1.2999095490373433, "grad_norm": 0.5388988852500916, "learning_rate": 3.0200108501292466e-05, "loss": 1.0776, "num_input_tokens_seen": 82411520, "step": 10060 }, { "epoch": 1.3012017056467244, "grad_norm": 0.2774782180786133, "learning_rate": 3.0167015071777815e-05, "loss": 0.5866, "num_input_tokens_seen": 82493440, "step": 10070 }, { "epoch": 1.3024938622561053, "grad_norm": 0.20768149197101593, "learning_rate": 3.0133912181477475e-05, "loss": 0.903, "num_input_tokens_seen": 82575360, "step": 10080 }, { "epoch": 1.3037860188654866, "grad_norm": 0.5745561718940735, "learning_rate": 3.010079989100271e-05, "loss": 1.1334, "num_input_tokens_seen": 82657280, "step": 10090 }, { "epoch": 1.3050781754748675, "grad_norm": 0.5522057414054871, "learning_rate": 3.0067678260982018e-05, "loss": 0.708, "num_input_tokens_seen": 82739200, "step": 10100 }, { "epoch": 1.3063703320842486, "grad_norm": 0.5716668367385864, "learning_rate": 3.003454735206097e-05, "loss": 1.094, "num_input_tokens_seen": 82821120, "step": 10110 }, { "epoch": 1.3076624886936297, "grad_norm": 0.5987653732299805, "learning_rate": 3.000140722490215e-05, "loss": 0.7914, "num_input_tokens_seen": 82903040, "step": 10120 }, { "epoch": 1.3089546453030108, "grad_norm": 0.754988431930542, "learning_rate": 2.9968257940184997e-05, "loss": 0.882, "num_input_tokens_seen": 82984960, "step": 10130 }, { "epoch": 1.3102468019123918, "grad_norm": 0.2810363471508026, "learning_rate": 2.9935099558605728e-05, "loss": 0.7434, "num_input_tokens_seen": 83066880, "step": 10140 }, { "epoch": 1.3115389585217727, "grad_norm": 0.6438339352607727, "learning_rate": 2.9901932140877232e-05, "loss": 1.0257, "num_input_tokens_seen": 83148800, "step": 10150 }, { "epoch": 1.312831115131154, "grad_norm": 0.8908247947692871, "learning_rate": 2.9868755747728927e-05, "loss": 0.6914, "num_input_tokens_seen": 83230720, "step": 10160 }, { "epoch": 1.314123271740535, "grad_norm": 0.5516972541809082, "learning_rate": 2.9835570439906657e-05, "loss": 0.9318, "num_input_tokens_seen": 83312640, "step": 10170 }, { "epoch": 1.315415428349916, "grad_norm": 0.6137295961380005, "learning_rate": 2.9802376278172612e-05, "loss": 0.9633, "num_input_tokens_seen": 83394560, "step": 10180 }, { "epoch": 1.316707584959297, "grad_norm": 0.8933919668197632, "learning_rate": 2.976917332330517e-05, "loss": 1.1187, "num_input_tokens_seen": 83476480, "step": 10190 }, { "epoch": 1.3179997415686782, "grad_norm": 0.5550206303596497, "learning_rate": 2.973596163609883e-05, "loss": 0.9031, "num_input_tokens_seen": 83558400, "step": 10200 }, { "epoch": 1.3192918981780593, "grad_norm": 0.7092847228050232, "learning_rate": 2.970274127736406e-05, "loss": 0.8322, "num_input_tokens_seen": 83640320, "step": 10210 }, { "epoch": 1.3205840547874401, "grad_norm": 0.9514045119285583, "learning_rate": 2.966951230792722e-05, "loss": 0.7939, "num_input_tokens_seen": 83722240, "step": 10220 }, { "epoch": 1.3218762113968214, "grad_norm": 0.6002041697502136, "learning_rate": 2.9636274788630437e-05, "loss": 1.0287, "num_input_tokens_seen": 83804160, "step": 10230 }, { "epoch": 1.3231683680062023, "grad_norm": 0.3773179054260254, "learning_rate": 2.9603028780331475e-05, "loss": 0.8743, "num_input_tokens_seen": 83886080, "step": 10240 }, { "epoch": 1.3244605246155834, "grad_norm": 0.5780991911888123, "learning_rate": 2.9569774343903662e-05, "loss": 1.089, "num_input_tokens_seen": 83968000, "step": 10250 }, { "epoch": 1.3257526812249645, "grad_norm": 0.7904821634292603, "learning_rate": 2.9536511540235744e-05, "loss": 0.9047, "num_input_tokens_seen": 84049920, "step": 10260 }, { "epoch": 1.3270448378343456, "grad_norm": 0.6269042491912842, "learning_rate": 2.9503240430231803e-05, "loss": 0.7258, "num_input_tokens_seen": 84131840, "step": 10270 }, { "epoch": 1.3283369944437267, "grad_norm": 0.5256585478782654, "learning_rate": 2.9469961074811103e-05, "loss": 0.6985, "num_input_tokens_seen": 84213760, "step": 10280 }, { "epoch": 1.3296291510531075, "grad_norm": 0.6166103482246399, "learning_rate": 2.9436673534908044e-05, "loss": 0.8227, "num_input_tokens_seen": 84295680, "step": 10290 }, { "epoch": 1.3309213076624886, "grad_norm": 0.9073558449745178, "learning_rate": 2.940337787147197e-05, "loss": 0.883, "num_input_tokens_seen": 84377600, "step": 10300 }, { "epoch": 1.3322134642718697, "grad_norm": 0.6241338849067688, "learning_rate": 2.9370074145467132e-05, "loss": 1.2221, "num_input_tokens_seen": 84459520, "step": 10310 }, { "epoch": 1.3335056208812508, "grad_norm": 0.6946704983711243, "learning_rate": 2.9336762417872516e-05, "loss": 0.7841, "num_input_tokens_seen": 84541440, "step": 10320 }, { "epoch": 1.3347977774906319, "grad_norm": 1.0090748071670532, "learning_rate": 2.9303442749681787e-05, "loss": 0.8774, "num_input_tokens_seen": 84623360, "step": 10330 }, { "epoch": 1.336089934100013, "grad_norm": 0.5861837863922119, "learning_rate": 2.927011520190313e-05, "loss": 0.7681, "num_input_tokens_seen": 84705280, "step": 10340 }, { "epoch": 1.337382090709394, "grad_norm": 0.5563804507255554, "learning_rate": 2.9236779835559165e-05, "loss": 0.9598, "num_input_tokens_seen": 84787200, "step": 10350 }, { "epoch": 1.338674247318775, "grad_norm": 0.6404061317443848, "learning_rate": 2.9203436711686817e-05, "loss": 0.6958, "num_input_tokens_seen": 84869120, "step": 10360 }, { "epoch": 1.339966403928156, "grad_norm": 0.7478158473968506, "learning_rate": 2.917008589133724e-05, "loss": 1.0247, "num_input_tokens_seen": 84951040, "step": 10370 }, { "epoch": 1.3412585605375371, "grad_norm": 0.5483884811401367, "learning_rate": 2.913672743557565e-05, "loss": 0.5671, "num_input_tokens_seen": 85032960, "step": 10380 }, { "epoch": 1.3425507171469182, "grad_norm": 0.7392348647117615, "learning_rate": 2.9103361405481272e-05, "loss": 0.6446, "num_input_tokens_seen": 85114880, "step": 10390 }, { "epoch": 1.3438428737562993, "grad_norm": 1.0634355545043945, "learning_rate": 2.906998786214717e-05, "loss": 1.0231, "num_input_tokens_seen": 85196800, "step": 10400 }, { "epoch": 1.3451350303656804, "grad_norm": 0.5788999199867249, "learning_rate": 2.9036606866680187e-05, "loss": 0.8076, "num_input_tokens_seen": 85278720, "step": 10410 }, { "epoch": 1.3464271869750615, "grad_norm": 0.5242962837219238, "learning_rate": 2.90032184802008e-05, "loss": 1.0485, "num_input_tokens_seen": 85360640, "step": 10420 }, { "epoch": 1.3477193435844423, "grad_norm": 0.7182193994522095, "learning_rate": 2.8969822763843018e-05, "loss": 1.0753, "num_input_tokens_seen": 85442560, "step": 10430 }, { "epoch": 1.3490115001938234, "grad_norm": 1.0175756216049194, "learning_rate": 2.8936419778754294e-05, "loss": 0.9629, "num_input_tokens_seen": 85524480, "step": 10440 }, { "epoch": 1.3503036568032045, "grad_norm": 2.7674241065979004, "learning_rate": 2.8903009586095353e-05, "loss": 0.7803, "num_input_tokens_seen": 85606400, "step": 10450 }, { "epoch": 1.3515958134125856, "grad_norm": 0.27844467759132385, "learning_rate": 2.8869592247040138e-05, "loss": 0.9991, "num_input_tokens_seen": 85688320, "step": 10460 }, { "epoch": 1.3528879700219667, "grad_norm": 0.18930500745773315, "learning_rate": 2.883616782277569e-05, "loss": 0.7521, "num_input_tokens_seen": 85770240, "step": 10470 }, { "epoch": 1.3541801266313478, "grad_norm": 0.9251275658607483, "learning_rate": 2.8802736374501994e-05, "loss": 1.186, "num_input_tokens_seen": 85852160, "step": 10480 }, { "epoch": 1.3554722832407289, "grad_norm": 0.6336624026298523, "learning_rate": 2.8769297963431908e-05, "loss": 1.1393, "num_input_tokens_seen": 85934080, "step": 10490 }, { "epoch": 1.3567644398501097, "grad_norm": 0.6428319811820984, "learning_rate": 2.8735852650791035e-05, "loss": 0.6548, "num_input_tokens_seen": 86016000, "step": 10500 }, { "epoch": 1.3580565964594908, "grad_norm": 0.6062332391738892, "learning_rate": 2.870240049781764e-05, "loss": 0.9466, "num_input_tokens_seen": 86097920, "step": 10510 }, { "epoch": 1.359348753068872, "grad_norm": 0.6514495611190796, "learning_rate": 2.8668941565762475e-05, "loss": 0.9157, "num_input_tokens_seen": 86179840, "step": 10520 }, { "epoch": 1.360640909678253, "grad_norm": 0.593705952167511, "learning_rate": 2.8635475915888732e-05, "loss": 0.8647, "num_input_tokens_seen": 86261760, "step": 10530 }, { "epoch": 1.361933066287634, "grad_norm": 0.8103840947151184, "learning_rate": 2.8602003609471888e-05, "loss": 0.8976, "num_input_tokens_seen": 86343680, "step": 10540 }, { "epoch": 1.3632252228970152, "grad_norm": 0.28389742970466614, "learning_rate": 2.856852470779962e-05, "loss": 1.0414, "num_input_tokens_seen": 86425600, "step": 10550 }, { "epoch": 1.3645173795063963, "grad_norm": 0.7390335202217102, "learning_rate": 2.853503927217167e-05, "loss": 0.9523, "num_input_tokens_seen": 86507520, "step": 10560 }, { "epoch": 1.3658095361157772, "grad_norm": 0.5908385515213013, "learning_rate": 2.8501547363899744e-05, "loss": 0.7521, "num_input_tokens_seen": 86589440, "step": 10570 }, { "epoch": 1.3671016927251582, "grad_norm": 0.8718129992485046, "learning_rate": 2.846804904430741e-05, "loss": 0.9591, "num_input_tokens_seen": 86671360, "step": 10580 }, { "epoch": 1.3683938493345393, "grad_norm": 0.6367455124855042, "learning_rate": 2.8434544374729965e-05, "loss": 0.9854, "num_input_tokens_seen": 86753280, "step": 10590 }, { "epoch": 1.3696860059439204, "grad_norm": 0.7115527391433716, "learning_rate": 2.8401033416514345e-05, "loss": 1.0439, "num_input_tokens_seen": 86835200, "step": 10600 }, { "epoch": 1.3709781625533015, "grad_norm": 0.5063609480857849, "learning_rate": 2.8367516231018976e-05, "loss": 0.6884, "num_input_tokens_seen": 86917120, "step": 10610 }, { "epoch": 1.3722703191626824, "grad_norm": 1.5641584396362305, "learning_rate": 2.8333992879613712e-05, "loss": 0.9114, "num_input_tokens_seen": 86999040, "step": 10620 }, { "epoch": 1.3735624757720637, "grad_norm": 0.7738798260688782, "learning_rate": 2.830046342367969e-05, "loss": 0.8389, "num_input_tokens_seen": 87080960, "step": 10630 }, { "epoch": 1.3748546323814446, "grad_norm": 0.6683527827262878, "learning_rate": 2.826692792460921e-05, "loss": 0.5361, "num_input_tokens_seen": 87162880, "step": 10640 }, { "epoch": 1.3761467889908257, "grad_norm": 0.26369839906692505, "learning_rate": 2.823338644380566e-05, "loss": 1.0727, "num_input_tokens_seen": 87244800, "step": 10650 }, { "epoch": 1.3774389456002067, "grad_norm": 0.6391986012458801, "learning_rate": 2.8199839042683363e-05, "loss": 0.5925, "num_input_tokens_seen": 87326720, "step": 10660 }, { "epoch": 1.3787311022095878, "grad_norm": 0.6280574798583984, "learning_rate": 2.8166285782667483e-05, "loss": 0.9665, "num_input_tokens_seen": 87408640, "step": 10670 }, { "epoch": 1.380023258818969, "grad_norm": 0.8266412615776062, "learning_rate": 2.8132726725193926e-05, "loss": 0.9418, "num_input_tokens_seen": 87490560, "step": 10680 }, { "epoch": 1.3813154154283498, "grad_norm": 0.26182371377944946, "learning_rate": 2.8099161931709195e-05, "loss": 0.6926, "num_input_tokens_seen": 87572480, "step": 10690 }, { "epoch": 1.382607572037731, "grad_norm": 0.6281419992446899, "learning_rate": 2.806559146367031e-05, "loss": 1.0825, "num_input_tokens_seen": 87654400, "step": 10700 }, { "epoch": 1.383899728647112, "grad_norm": 0.5953611135482788, "learning_rate": 2.803201538254467e-05, "loss": 0.63, "num_input_tokens_seen": 87736320, "step": 10710 }, { "epoch": 1.385191885256493, "grad_norm": 0.5723059177398682, "learning_rate": 2.799843374980996e-05, "loss": 0.8605, "num_input_tokens_seen": 87818240, "step": 10720 }, { "epoch": 1.3864840418658742, "grad_norm": 0.2304965704679489, "learning_rate": 2.796484662695402e-05, "loss": 1.0539, "num_input_tokens_seen": 87900160, "step": 10730 }, { "epoch": 1.3877761984752552, "grad_norm": 0.6087478399276733, "learning_rate": 2.7931254075474768e-05, "loss": 0.7748, "num_input_tokens_seen": 87982080, "step": 10740 }, { "epoch": 1.3890683550846363, "grad_norm": 3.60947322845459, "learning_rate": 2.789765615688003e-05, "loss": 0.9568, "num_input_tokens_seen": 88064000, "step": 10750 }, { "epoch": 1.3903605116940172, "grad_norm": 0.5466760993003845, "learning_rate": 2.786405293268747e-05, "loss": 1.0008, "num_input_tokens_seen": 88145920, "step": 10760 }, { "epoch": 1.3916526683033985, "grad_norm": 0.8872881531715393, "learning_rate": 2.7830444464424466e-05, "loss": 0.8518, "num_input_tokens_seen": 88227840, "step": 10770 }, { "epoch": 1.3929448249127794, "grad_norm": 0.9006865620613098, "learning_rate": 2.7796830813628004e-05, "loss": 0.8596, "num_input_tokens_seen": 88309760, "step": 10780 }, { "epoch": 1.3942369815221605, "grad_norm": 0.2589983344078064, "learning_rate": 2.776321204184456e-05, "loss": 0.9577, "num_input_tokens_seen": 88391680, "step": 10790 }, { "epoch": 1.3955291381315416, "grad_norm": 0.2515423893928528, "learning_rate": 2.772958821062997e-05, "loss": 0.9272, "num_input_tokens_seen": 88473600, "step": 10800 }, { "epoch": 1.3968212947409226, "grad_norm": 0.8317649364471436, "learning_rate": 2.7695959381549364e-05, "loss": 0.8736, "num_input_tokens_seen": 88555520, "step": 10810 }, { "epoch": 1.3981134513503037, "grad_norm": 0.8371811509132385, "learning_rate": 2.7662325616176993e-05, "loss": 0.7469, "num_input_tokens_seen": 88637440, "step": 10820 }, { "epoch": 1.3994056079596846, "grad_norm": 0.5307440161705017, "learning_rate": 2.7628686976096164e-05, "loss": 0.9801, "num_input_tokens_seen": 88719360, "step": 10830 }, { "epoch": 1.4006977645690657, "grad_norm": 0.5864846110343933, "learning_rate": 2.7595043522899093e-05, "loss": 1.1059, "num_input_tokens_seen": 88801280, "step": 10840 }, { "epoch": 1.4019899211784468, "grad_norm": 0.8047347068786621, "learning_rate": 2.756139531818684e-05, "loss": 1.1236, "num_input_tokens_seen": 88883200, "step": 10850 }, { "epoch": 1.4032820777878279, "grad_norm": 0.6610074043273926, "learning_rate": 2.7527742423569124e-05, "loss": 0.7606, "num_input_tokens_seen": 88965120, "step": 10860 }, { "epoch": 1.404574234397209, "grad_norm": 0.7933541536331177, "learning_rate": 2.7494084900664273e-05, "loss": 0.7754, "num_input_tokens_seen": 89047040, "step": 10870 }, { "epoch": 1.40586639100659, "grad_norm": 0.6643183827400208, "learning_rate": 2.746042281109911e-05, "loss": 0.9668, "num_input_tokens_seen": 89128960, "step": 10880 }, { "epoch": 1.4071585476159711, "grad_norm": 0.6505457758903503, "learning_rate": 2.7426756216508776e-05, "loss": 0.9552, "num_input_tokens_seen": 89210880, "step": 10890 }, { "epoch": 1.408450704225352, "grad_norm": 0.7870205640792847, "learning_rate": 2.7393085178536686e-05, "loss": 0.8455, "num_input_tokens_seen": 89292800, "step": 10900 }, { "epoch": 1.409742860834733, "grad_norm": 0.5975139737129211, "learning_rate": 2.7359409758834397e-05, "loss": 0.515, "num_input_tokens_seen": 89374720, "step": 10910 }, { "epoch": 1.4110350174441142, "grad_norm": 0.8741236329078674, "learning_rate": 2.7325730019061474e-05, "loss": 0.5575, "num_input_tokens_seen": 89456640, "step": 10920 }, { "epoch": 1.4123271740534953, "grad_norm": 0.6208974719047546, "learning_rate": 2.729204602088539e-05, "loss": 0.7631, "num_input_tokens_seen": 89538560, "step": 10930 }, { "epoch": 1.4136193306628764, "grad_norm": 0.6551584005355835, "learning_rate": 2.7258357825981433e-05, "loss": 0.8117, "num_input_tokens_seen": 89620480, "step": 10940 }, { "epoch": 1.4149114872722575, "grad_norm": 1.0060181617736816, "learning_rate": 2.7224665496032565e-05, "loss": 0.7802, "num_input_tokens_seen": 89702400, "step": 10950 }, { "epoch": 1.4162036438816386, "grad_norm": 0.6730552315711975, "learning_rate": 2.7190969092729308e-05, "loss": 0.8345, "num_input_tokens_seen": 89784320, "step": 10960 }, { "epoch": 1.4174958004910194, "grad_norm": 0.5672726035118103, "learning_rate": 2.7157268677769666e-05, "loss": 1.1491, "num_input_tokens_seen": 89866240, "step": 10970 }, { "epoch": 1.4187879571004005, "grad_norm": 0.6032657623291016, "learning_rate": 2.712356431285896e-05, "loss": 1.3288, "num_input_tokens_seen": 89948160, "step": 10980 }, { "epoch": 1.4200801137097816, "grad_norm": 0.6020485758781433, "learning_rate": 2.7089856059709774e-05, "loss": 0.8851, "num_input_tokens_seen": 90030080, "step": 10990 }, { "epoch": 1.4213722703191627, "grad_norm": 0.7019742727279663, "learning_rate": 2.7056143980041787e-05, "loss": 0.7634, "num_input_tokens_seen": 90112000, "step": 11000 }, { "epoch": 1.4226644269285438, "grad_norm": 0.6318424940109253, "learning_rate": 2.70224281355817e-05, "loss": 0.8556, "num_input_tokens_seen": 90193920, "step": 11010 }, { "epoch": 1.4239565835379249, "grad_norm": 0.5350080132484436, "learning_rate": 2.6988708588063093e-05, "loss": 0.7443, "num_input_tokens_seen": 90275840, "step": 11020 }, { "epoch": 1.425248740147306, "grad_norm": 0.6152655482292175, "learning_rate": 2.695498539922634e-05, "loss": 0.9481, "num_input_tokens_seen": 90357760, "step": 11030 }, { "epoch": 1.4265408967566868, "grad_norm": 0.9493053555488586, "learning_rate": 2.6921258630818475e-05, "loss": 0.817, "num_input_tokens_seen": 90439680, "step": 11040 }, { "epoch": 1.427833053366068, "grad_norm": 8.474701881408691, "learning_rate": 2.6887528344593087e-05, "loss": 0.6403, "num_input_tokens_seen": 90521600, "step": 11050 }, { "epoch": 1.429125209975449, "grad_norm": 0.1764688491821289, "learning_rate": 2.685379460231021e-05, "loss": 0.8245, "num_input_tokens_seen": 90603520, "step": 11060 }, { "epoch": 1.43041736658483, "grad_norm": 1.4876654148101807, "learning_rate": 2.6820057465736197e-05, "loss": 1.2493, "num_input_tokens_seen": 90685440, "step": 11070 }, { "epoch": 1.4317095231942112, "grad_norm": 0.8256916403770447, "learning_rate": 2.6786316996643623e-05, "loss": 0.9927, "num_input_tokens_seen": 90767360, "step": 11080 }, { "epoch": 1.4330016798035923, "grad_norm": 0.5362711548805237, "learning_rate": 2.6752573256811165e-05, "loss": 0.9965, "num_input_tokens_seen": 90849280, "step": 11090 }, { "epoch": 1.4342938364129734, "grad_norm": 0.6876575350761414, "learning_rate": 2.6718826308023487e-05, "loss": 0.8742, "num_input_tokens_seen": 90931200, "step": 11100 }, { "epoch": 1.4355859930223542, "grad_norm": 0.5102024674415588, "learning_rate": 2.668507621207113e-05, "loss": 0.9033, "num_input_tokens_seen": 91013120, "step": 11110 }, { "epoch": 1.4368781496317353, "grad_norm": 1.2666608095169067, "learning_rate": 2.6651323030750396e-05, "loss": 0.7038, "num_input_tokens_seen": 91095040, "step": 11120 }, { "epoch": 1.4381703062411164, "grad_norm": 0.5506048202514648, "learning_rate": 2.6617566825863237e-05, "loss": 0.7839, "num_input_tokens_seen": 91176960, "step": 11130 }, { "epoch": 1.4394624628504975, "grad_norm": 0.5495603084564209, "learning_rate": 2.6583807659217137e-05, "loss": 0.7871, "num_input_tokens_seen": 91258880, "step": 11140 }, { "epoch": 1.4407546194598786, "grad_norm": 0.5707017183303833, "learning_rate": 2.6550045592625007e-05, "loss": 1.0343, "num_input_tokens_seen": 91340800, "step": 11150 }, { "epoch": 1.4420467760692595, "grad_norm": 0.6786594390869141, "learning_rate": 2.651628068790507e-05, "loss": 1.1501, "num_input_tokens_seen": 91422720, "step": 11160 }, { "epoch": 1.4433389326786408, "grad_norm": 1.807134747505188, "learning_rate": 2.648251300688073e-05, "loss": 0.8592, "num_input_tokens_seen": 91504640, "step": 11170 }, { "epoch": 1.4446310892880216, "grad_norm": 0.5651780366897583, "learning_rate": 2.6448742611380515e-05, "loss": 0.8201, "num_input_tokens_seen": 91586560, "step": 11180 }, { "epoch": 1.4459232458974027, "grad_norm": 0.630124568939209, "learning_rate": 2.6414969563237874e-05, "loss": 1.074, "num_input_tokens_seen": 91668480, "step": 11190 }, { "epoch": 1.4472154025067838, "grad_norm": 0.7884289026260376, "learning_rate": 2.6381193924291143e-05, "loss": 0.8012, "num_input_tokens_seen": 91750400, "step": 11200 }, { "epoch": 1.448507559116165, "grad_norm": 0.6598234176635742, "learning_rate": 2.63474157563834e-05, "loss": 1.1103, "num_input_tokens_seen": 91832320, "step": 11210 }, { "epoch": 1.449799715725546, "grad_norm": 0.7381977438926697, "learning_rate": 2.6313635121362322e-05, "loss": 0.6353, "num_input_tokens_seen": 91914240, "step": 11220 }, { "epoch": 1.4510918723349269, "grad_norm": 0.29279613494873047, "learning_rate": 2.6279852081080153e-05, "loss": 0.8131, "num_input_tokens_seen": 91996160, "step": 11230 }, { "epoch": 1.4523840289443082, "grad_norm": 0.6259168386459351, "learning_rate": 2.6246066697393494e-05, "loss": 0.8539, "num_input_tokens_seen": 92078080, "step": 11240 }, { "epoch": 1.453676185553689, "grad_norm": 0.2948238253593445, "learning_rate": 2.6212279032163283e-05, "loss": 0.799, "num_input_tokens_seen": 92160000, "step": 11250 }, { "epoch": 1.4549683421630701, "grad_norm": 0.8171486854553223, "learning_rate": 2.6178489147254598e-05, "loss": 0.5722, "num_input_tokens_seen": 92241920, "step": 11260 }, { "epoch": 1.4562604987724512, "grad_norm": 0.7403898239135742, "learning_rate": 2.6144697104536597e-05, "loss": 0.8796, "num_input_tokens_seen": 92323840, "step": 11270 }, { "epoch": 1.4575526553818323, "grad_norm": 0.6193887591362, "learning_rate": 2.6110902965882383e-05, "loss": 1.1459, "num_input_tokens_seen": 92405760, "step": 11280 }, { "epoch": 1.4588448119912134, "grad_norm": 0.9331411719322205, "learning_rate": 2.607710679316891e-05, "loss": 0.7492, "num_input_tokens_seen": 92487680, "step": 11290 }, { "epoch": 1.4601369686005943, "grad_norm": 0.6176254153251648, "learning_rate": 2.6043308648276833e-05, "loss": 0.9672, "num_input_tokens_seen": 92569600, "step": 11300 }, { "epoch": 1.4614291252099754, "grad_norm": 0.836600661277771, "learning_rate": 2.6009508593090448e-05, "loss": 0.606, "num_input_tokens_seen": 92651520, "step": 11310 }, { "epoch": 1.4627212818193565, "grad_norm": 0.6364132165908813, "learning_rate": 2.5975706689497513e-05, "loss": 0.7642, "num_input_tokens_seen": 92733440, "step": 11320 }, { "epoch": 1.4640134384287375, "grad_norm": 0.6128113865852356, "learning_rate": 2.59419029993892e-05, "loss": 0.8852, "num_input_tokens_seen": 92815360, "step": 11330 }, { "epoch": 1.4653055950381186, "grad_norm": 0.6446326375007629, "learning_rate": 2.590809758465995e-05, "loss": 1.1366, "num_input_tokens_seen": 92897280, "step": 11340 }, { "epoch": 1.4665977516474997, "grad_norm": 0.9294217824935913, "learning_rate": 2.5874290507207337e-05, "loss": 1.004, "num_input_tokens_seen": 92979200, "step": 11350 }, { "epoch": 1.4678899082568808, "grad_norm": 0.5920690298080444, "learning_rate": 2.584048182893201e-05, "loss": 1.1198, "num_input_tokens_seen": 93061120, "step": 11360 }, { "epoch": 1.4691820648662617, "grad_norm": 0.5159122943878174, "learning_rate": 2.580667161173753e-05, "loss": 0.7811, "num_input_tokens_seen": 93143040, "step": 11370 }, { "epoch": 1.4704742214756428, "grad_norm": 0.8452631235122681, "learning_rate": 2.577285991753028e-05, "loss": 1.0665, "num_input_tokens_seen": 93224960, "step": 11380 }, { "epoch": 1.4717663780850239, "grad_norm": 0.6538841724395752, "learning_rate": 2.5739046808219348e-05, "loss": 0.9232, "num_input_tokens_seen": 93306880, "step": 11390 }, { "epoch": 1.473058534694405, "grad_norm": 0.7873169779777527, "learning_rate": 2.570523234571642e-05, "loss": 0.8771, "num_input_tokens_seen": 93388800, "step": 11400 }, { "epoch": 1.474350691303786, "grad_norm": 0.36491021513938904, "learning_rate": 2.5671416591935636e-05, "loss": 0.7952, "num_input_tokens_seen": 93470720, "step": 11410 }, { "epoch": 1.4756428479131671, "grad_norm": 0.5437069535255432, "learning_rate": 2.563759960879354e-05, "loss": 0.8034, "num_input_tokens_seen": 93552640, "step": 11420 }, { "epoch": 1.4769350045225482, "grad_norm": 0.3457854986190796, "learning_rate": 2.5603781458208885e-05, "loss": 0.9861, "num_input_tokens_seen": 93634560, "step": 11430 }, { "epoch": 1.478227161131929, "grad_norm": 0.6639708280563354, "learning_rate": 2.55699622021026e-05, "loss": 0.9154, "num_input_tokens_seen": 93716480, "step": 11440 }, { "epoch": 1.4795193177413102, "grad_norm": 0.5724432468414307, "learning_rate": 2.55361419023976e-05, "loss": 0.696, "num_input_tokens_seen": 93798400, "step": 11450 }, { "epoch": 1.4808114743506913, "grad_norm": 0.8275769948959351, "learning_rate": 2.5502320621018732e-05, "loss": 1.0897, "num_input_tokens_seen": 93880320, "step": 11460 }, { "epoch": 1.4821036309600724, "grad_norm": 0.6131581664085388, "learning_rate": 2.5468498419892656e-05, "loss": 0.9171, "num_input_tokens_seen": 93962240, "step": 11470 }, { "epoch": 1.4833957875694535, "grad_norm": 1.2641193866729736, "learning_rate": 2.5434675360947692e-05, "loss": 0.875, "num_input_tokens_seen": 94044160, "step": 11480 }, { "epoch": 1.4846879441788345, "grad_norm": 0.8318386673927307, "learning_rate": 2.5400851506113728e-05, "loss": 0.7646, "num_input_tokens_seen": 94126080, "step": 11490 }, { "epoch": 1.4859801007882156, "grad_norm": 0.6216618418693542, "learning_rate": 2.5367026917322117e-05, "loss": 0.9129, "num_input_tokens_seen": 94208000, "step": 11500 }, { "epoch": 1.4872722573975965, "grad_norm": 0.6390239000320435, "learning_rate": 2.5333201656505567e-05, "loss": 1.2751, "num_input_tokens_seen": 94289920, "step": 11510 }, { "epoch": 1.4885644140069776, "grad_norm": 0.8883216381072998, "learning_rate": 2.5299375785598005e-05, "loss": 0.8298, "num_input_tokens_seen": 94371840, "step": 11520 }, { "epoch": 1.4898565706163587, "grad_norm": 0.4036071300506592, "learning_rate": 2.5265549366534475e-05, "loss": 1.0023, "num_input_tokens_seen": 94453760, "step": 11530 }, { "epoch": 1.4911487272257398, "grad_norm": 0.9503811597824097, "learning_rate": 2.5231722461251017e-05, "loss": 0.8267, "num_input_tokens_seen": 94535680, "step": 11540 }, { "epoch": 1.4924408838351209, "grad_norm": 0.9378893971443176, "learning_rate": 2.519789513168459e-05, "loss": 0.6509, "num_input_tokens_seen": 94617600, "step": 11550 }, { "epoch": 1.493733040444502, "grad_norm": 0.6294723153114319, "learning_rate": 2.5164067439772898e-05, "loss": 0.7988, "num_input_tokens_seen": 94699520, "step": 11560 }, { "epoch": 1.495025197053883, "grad_norm": 2.2070484161376953, "learning_rate": 2.5130239447454328e-05, "loss": 0.7822, "num_input_tokens_seen": 94781440, "step": 11570 }, { "epoch": 1.496317353663264, "grad_norm": 0.5767642259597778, "learning_rate": 2.509641121666781e-05, "loss": 0.9319, "num_input_tokens_seen": 94863360, "step": 11580 }, { "epoch": 1.497609510272645, "grad_norm": 0.5216675400733948, "learning_rate": 2.5062582809352704e-05, "loss": 0.8723, "num_input_tokens_seen": 94945280, "step": 11590 }, { "epoch": 1.498901666882026, "grad_norm": 0.5917890667915344, "learning_rate": 2.5028754287448695e-05, "loss": 1.0985, "num_input_tokens_seen": 95027200, "step": 11600 }, { "epoch": 1.5001938234914072, "grad_norm": 0.5695708394050598, "learning_rate": 2.4994925712895697e-05, "loss": 0.898, "num_input_tokens_seen": 95109120, "step": 11610 }, { "epoch": 1.5014859801007883, "grad_norm": 0.6653333902359009, "learning_rate": 2.4961097147633698e-05, "loss": 1.2631, "num_input_tokens_seen": 95191040, "step": 11620 }, { "epoch": 1.5027781367101691, "grad_norm": 0.7980930209159851, "learning_rate": 2.4927268653602684e-05, "loss": 0.9865, "num_input_tokens_seen": 95272960, "step": 11630 }, { "epoch": 1.5040702933195504, "grad_norm": 0.27931126952171326, "learning_rate": 2.489344029274249e-05, "loss": 1.063, "num_input_tokens_seen": 95354880, "step": 11640 }, { "epoch": 1.5053624499289313, "grad_norm": 0.4017459452152252, "learning_rate": 2.4859612126992737e-05, "loss": 0.4969, "num_input_tokens_seen": 95436800, "step": 11650 }, { "epoch": 1.5066546065383124, "grad_norm": 0.9155080318450928, "learning_rate": 2.4825784218292664e-05, "loss": 0.6354, "num_input_tokens_seen": 95518720, "step": 11660 }, { "epoch": 1.5079467631476935, "grad_norm": 0.7411128282546997, "learning_rate": 2.479195662858105e-05, "loss": 0.9183, "num_input_tokens_seen": 95600640, "step": 11670 }, { "epoch": 1.5092389197570746, "grad_norm": 1.0255645513534546, "learning_rate": 2.4758129419796094e-05, "loss": 0.8669, "num_input_tokens_seen": 95682560, "step": 11680 }, { "epoch": 1.5105310763664557, "grad_norm": 0.612771213054657, "learning_rate": 2.4724302653875275e-05, "loss": 1.1856, "num_input_tokens_seen": 95764480, "step": 11690 }, { "epoch": 1.5118232329758365, "grad_norm": 0.7555944919586182, "learning_rate": 2.4690476392755298e-05, "loss": 0.7345, "num_input_tokens_seen": 95846400, "step": 11700 }, { "epoch": 1.5131153895852179, "grad_norm": 0.7884678244590759, "learning_rate": 2.4656650698371903e-05, "loss": 0.5009, "num_input_tokens_seen": 95928320, "step": 11710 }, { "epoch": 1.5144075461945987, "grad_norm": 0.5397049188613892, "learning_rate": 2.462282563265982e-05, "loss": 0.7891, "num_input_tokens_seen": 96010240, "step": 11720 }, { "epoch": 1.5156997028039798, "grad_norm": 0.6234694123268127, "learning_rate": 2.4589001257552637e-05, "loss": 0.6393, "num_input_tokens_seen": 96092160, "step": 11730 }, { "epoch": 1.516991859413361, "grad_norm": 0.6878401637077332, "learning_rate": 2.455517763498264e-05, "loss": 0.8309, "num_input_tokens_seen": 96174080, "step": 11740 }, { "epoch": 1.518284016022742, "grad_norm": 0.23810110986232758, "learning_rate": 2.452135482688077e-05, "loss": 0.7155, "num_input_tokens_seen": 96256000, "step": 11750 }, { "epoch": 1.519576172632123, "grad_norm": 0.5660243630409241, "learning_rate": 2.4487532895176457e-05, "loss": 0.8101, "num_input_tokens_seen": 96337920, "step": 11760 }, { "epoch": 1.520868329241504, "grad_norm": 0.9258086085319519, "learning_rate": 2.4453711901797543e-05, "loss": 0.5187, "num_input_tokens_seen": 96419840, "step": 11770 }, { "epoch": 1.5221604858508853, "grad_norm": 0.5753411054611206, "learning_rate": 2.4419891908670127e-05, "loss": 1.1635, "num_input_tokens_seen": 96501760, "step": 11780 }, { "epoch": 1.5234526424602661, "grad_norm": 0.6174433827400208, "learning_rate": 2.4386072977718503e-05, "loss": 0.8433, "num_input_tokens_seen": 96583680, "step": 11790 }, { "epoch": 1.5247447990696472, "grad_norm": 0.5415393710136414, "learning_rate": 2.4352255170865025e-05, "loss": 0.9885, "num_input_tokens_seen": 96665600, "step": 11800 }, { "epoch": 1.5260369556790283, "grad_norm": 0.5386336445808411, "learning_rate": 2.4318438550029946e-05, "loss": 1.0425, "num_input_tokens_seen": 96747520, "step": 11810 }, { "epoch": 1.5273291122884094, "grad_norm": 0.721867561340332, "learning_rate": 2.4284623177131395e-05, "loss": 0.6342, "num_input_tokens_seen": 96829440, "step": 11820 }, { "epoch": 1.5286212688977905, "grad_norm": 0.5259259343147278, "learning_rate": 2.4250809114085183e-05, "loss": 1.1709, "num_input_tokens_seen": 96911360, "step": 11830 }, { "epoch": 1.5299134255071714, "grad_norm": 0.7975210547447205, "learning_rate": 2.421699642280475e-05, "loss": 1.0094, "num_input_tokens_seen": 96993280, "step": 11840 }, { "epoch": 1.5312055821165527, "grad_norm": 0.5395787954330444, "learning_rate": 2.4183185165200998e-05, "loss": 0.7204, "num_input_tokens_seen": 97075200, "step": 11850 }, { "epoch": 1.5324977387259335, "grad_norm": 0.6701288223266602, "learning_rate": 2.4149375403182216e-05, "loss": 0.7901, "num_input_tokens_seen": 97157120, "step": 11860 }, { "epoch": 1.5337898953353146, "grad_norm": 0.7676398158073425, "learning_rate": 2.4115567198653963e-05, "loss": 1.0571, "num_input_tokens_seen": 97239040, "step": 11870 }, { "epoch": 1.5350820519446957, "grad_norm": 0.38611501455307007, "learning_rate": 2.4081760613518924e-05, "loss": 0.5656, "num_input_tokens_seen": 97320960, "step": 11880 }, { "epoch": 1.5363742085540768, "grad_norm": 0.5478717088699341, "learning_rate": 2.4047955709676852e-05, "loss": 1.2245, "num_input_tokens_seen": 97402880, "step": 11890 }, { "epoch": 1.537666365163458, "grad_norm": 0.5873267650604248, "learning_rate": 2.401415254902438e-05, "loss": 0.9675, "num_input_tokens_seen": 97484800, "step": 11900 }, { "epoch": 1.5389585217728388, "grad_norm": 0.5424181222915649, "learning_rate": 2.3980351193455e-05, "loss": 1.0065, "num_input_tokens_seen": 97566720, "step": 11910 }, { "epoch": 1.54025067838222, "grad_norm": 0.47638383507728577, "learning_rate": 2.3946551704858838e-05, "loss": 0.8522, "num_input_tokens_seen": 97648640, "step": 11920 }, { "epoch": 1.541542834991601, "grad_norm": 0.6472703218460083, "learning_rate": 2.3912754145122663e-05, "loss": 0.9128, "num_input_tokens_seen": 97730560, "step": 11930 }, { "epoch": 1.542834991600982, "grad_norm": 0.6344046592712402, "learning_rate": 2.3878958576129664e-05, "loss": 0.8387, "num_input_tokens_seen": 97812480, "step": 11940 }, { "epoch": 1.5441271482103631, "grad_norm": 0.8018627762794495, "learning_rate": 2.3845165059759402e-05, "loss": 0.9149, "num_input_tokens_seen": 97894400, "step": 11950 }, { "epoch": 1.545419304819744, "grad_norm": 0.398837685585022, "learning_rate": 2.3811373657887705e-05, "loss": 0.8989, "num_input_tokens_seen": 97976320, "step": 11960 }, { "epoch": 1.5467114614291253, "grad_norm": 0.6712442636489868, "learning_rate": 2.3777584432386474e-05, "loss": 0.8954, "num_input_tokens_seen": 98058240, "step": 11970 }, { "epoch": 1.5480036180385062, "grad_norm": 0.685006856918335, "learning_rate": 2.3743797445123688e-05, "loss": 0.9203, "num_input_tokens_seen": 98140160, "step": 11980 }, { "epoch": 1.5492957746478875, "grad_norm": 1.1985323429107666, "learning_rate": 2.3710012757963175e-05, "loss": 1.0436, "num_input_tokens_seen": 98222080, "step": 11990 }, { "epoch": 1.5505879312572683, "grad_norm": 0.6150766015052795, "learning_rate": 2.367623043276459e-05, "loss": 1.0798, "num_input_tokens_seen": 98304000, "step": 12000 }, { "epoch": 1.5518800878666494, "grad_norm": 0.5276364684104919, "learning_rate": 2.364245053138323e-05, "loss": 1.0669, "num_input_tokens_seen": 98385920, "step": 12010 }, { "epoch": 1.5531722444760305, "grad_norm": 0.291031152009964, "learning_rate": 2.3608673115669978e-05, "loss": 0.8337, "num_input_tokens_seen": 98467840, "step": 12020 }, { "epoch": 1.5544644010854114, "grad_norm": 0.4851202964782715, "learning_rate": 2.3574898247471167e-05, "loss": 1.2425, "num_input_tokens_seen": 98549760, "step": 12030 }, { "epoch": 1.5557565576947927, "grad_norm": 0.6985998153686523, "learning_rate": 2.354112598862845e-05, "loss": 0.9847, "num_input_tokens_seen": 98631680, "step": 12040 }, { "epoch": 1.5570487143041736, "grad_norm": 0.26096710562705994, "learning_rate": 2.350735640097871e-05, "loss": 0.79, "num_input_tokens_seen": 98713600, "step": 12050 }, { "epoch": 1.5583408709135549, "grad_norm": 0.830028772354126, "learning_rate": 2.347358954635393e-05, "loss": 0.8655, "num_input_tokens_seen": 98795520, "step": 12060 }, { "epoch": 1.5596330275229358, "grad_norm": 0.9304326772689819, "learning_rate": 2.3439825486581116e-05, "loss": 0.7855, "num_input_tokens_seen": 98877440, "step": 12070 }, { "epoch": 1.5609251841323168, "grad_norm": 0.505251407623291, "learning_rate": 2.3406064283482115e-05, "loss": 0.911, "num_input_tokens_seen": 98959360, "step": 12080 }, { "epoch": 1.562217340741698, "grad_norm": 0.6220946907997131, "learning_rate": 2.337230599887358e-05, "loss": 0.6987, "num_input_tokens_seen": 99041280, "step": 12090 }, { "epoch": 1.5635094973510788, "grad_norm": 1.1672903299331665, "learning_rate": 2.3338550694566817e-05, "loss": 0.7693, "num_input_tokens_seen": 99123200, "step": 12100 }, { "epoch": 1.5648016539604601, "grad_norm": 0.5260895490646362, "learning_rate": 2.3304798432367645e-05, "loss": 0.5875, "num_input_tokens_seen": 99205120, "step": 12110 }, { "epoch": 1.566093810569841, "grad_norm": 0.7243925333023071, "learning_rate": 2.327104927407634e-05, "loss": 1.1252, "num_input_tokens_seen": 99287040, "step": 12120 }, { "epoch": 1.567385967179222, "grad_norm": 0.5413958430290222, "learning_rate": 2.3237303281487487e-05, "loss": 0.8912, "num_input_tokens_seen": 99368960, "step": 12130 }, { "epoch": 1.5686781237886032, "grad_norm": 0.6078057885169983, "learning_rate": 2.3203560516389882e-05, "loss": 0.6356, "num_input_tokens_seen": 99450880, "step": 12140 }, { "epoch": 1.5699702803979843, "grad_norm": 0.9383816719055176, "learning_rate": 2.3169821040566387e-05, "loss": 1.1298, "num_input_tokens_seen": 99532800, "step": 12150 }, { "epoch": 1.5712624370073653, "grad_norm": 0.6377266049385071, "learning_rate": 2.313608491579387e-05, "loss": 1.0096, "num_input_tokens_seen": 99614720, "step": 12160 }, { "epoch": 1.5725545936167462, "grad_norm": 0.5839458703994751, "learning_rate": 2.3102352203843063e-05, "loss": 0.7456, "num_input_tokens_seen": 99696640, "step": 12170 }, { "epoch": 1.5738467502261275, "grad_norm": 0.5719941854476929, "learning_rate": 2.306862296647841e-05, "loss": 0.8698, "num_input_tokens_seen": 99778560, "step": 12180 }, { "epoch": 1.5751389068355084, "grad_norm": 0.7049387693405151, "learning_rate": 2.3034897265458056e-05, "loss": 0.6883, "num_input_tokens_seen": 99860480, "step": 12190 }, { "epoch": 1.5764310634448895, "grad_norm": 0.5691668391227722, "learning_rate": 2.3001175162533606e-05, "loss": 0.8952, "num_input_tokens_seen": 99942400, "step": 12200 }, { "epoch": 1.5777232200542706, "grad_norm": 0.9318535327911377, "learning_rate": 2.2967456719450127e-05, "loss": 0.7274, "num_input_tokens_seen": 100024320, "step": 12210 }, { "epoch": 1.5790153766636517, "grad_norm": 0.4153442680835724, "learning_rate": 2.2933741997945954e-05, "loss": 0.8773, "num_input_tokens_seen": 100106240, "step": 12220 }, { "epoch": 1.5803075332730327, "grad_norm": 0.7407699227333069, "learning_rate": 2.290003105975262e-05, "loss": 0.9009, "num_input_tokens_seen": 100188160, "step": 12230 }, { "epoch": 1.5815996898824136, "grad_norm": 0.6650556325912476, "learning_rate": 2.2866323966594736e-05, "loss": 0.6566, "num_input_tokens_seen": 100270080, "step": 12240 }, { "epoch": 1.582891846491795, "grad_norm": 0.2326425462961197, "learning_rate": 2.283262078018985e-05, "loss": 0.7216, "num_input_tokens_seen": 100352000, "step": 12250 }, { "epoch": 1.5841840031011758, "grad_norm": 0.9894224405288696, "learning_rate": 2.27989215622484e-05, "loss": 0.9039, "num_input_tokens_seen": 100433920, "step": 12260 }, { "epoch": 1.5854761597105569, "grad_norm": 0.6276862621307373, "learning_rate": 2.2765226374473504e-05, "loss": 0.9027, "num_input_tokens_seen": 100515840, "step": 12270 }, { "epoch": 1.586768316319938, "grad_norm": 0.33717378973960876, "learning_rate": 2.2731535278560944e-05, "loss": 0.7404, "num_input_tokens_seen": 100597760, "step": 12280 }, { "epoch": 1.588060472929319, "grad_norm": 0.8220341205596924, "learning_rate": 2.269784833619898e-05, "loss": 0.8567, "num_input_tokens_seen": 100679680, "step": 12290 }, { "epoch": 1.5893526295387002, "grad_norm": 0.7151164412498474, "learning_rate": 2.2664165609068304e-05, "loss": 0.9177, "num_input_tokens_seen": 100761600, "step": 12300 }, { "epoch": 1.590644786148081, "grad_norm": 0.5890063047409058, "learning_rate": 2.263048715884184e-05, "loss": 0.8511, "num_input_tokens_seen": 100843520, "step": 12310 }, { "epoch": 1.5919369427574623, "grad_norm": 0.919540286064148, "learning_rate": 2.2596813047184715e-05, "loss": 0.8876, "num_input_tokens_seen": 100925440, "step": 12320 }, { "epoch": 1.5932290993668432, "grad_norm": 0.8143252730369568, "learning_rate": 2.2563143335754118e-05, "loss": 0.9139, "num_input_tokens_seen": 101007360, "step": 12330 }, { "epoch": 1.5945212559762243, "grad_norm": 0.48788896203041077, "learning_rate": 2.252947808619914e-05, "loss": 0.5727, "num_input_tokens_seen": 101089280, "step": 12340 }, { "epoch": 1.5958134125856054, "grad_norm": 0.5264637470245361, "learning_rate": 2.249581736016076e-05, "loss": 0.7672, "num_input_tokens_seen": 101171200, "step": 12350 }, { "epoch": 1.5971055691949865, "grad_norm": 0.5975572466850281, "learning_rate": 2.2462161219271622e-05, "loss": 0.8705, "num_input_tokens_seen": 101253120, "step": 12360 }, { "epoch": 1.5983977258043676, "grad_norm": 0.2425682097673416, "learning_rate": 2.242850972515601e-05, "loss": 0.8957, "num_input_tokens_seen": 101335040, "step": 12370 }, { "epoch": 1.5996898824137484, "grad_norm": 0.5979612469673157, "learning_rate": 2.2394862939429677e-05, "loss": 1.0392, "num_input_tokens_seen": 101416960, "step": 12380 }, { "epoch": 1.6009820390231297, "grad_norm": 0.6197012066841125, "learning_rate": 2.236122092369977e-05, "loss": 0.7616, "num_input_tokens_seen": 101498880, "step": 12390 }, { "epoch": 1.6022741956325106, "grad_norm": 0.734671413898468, "learning_rate": 2.2327583739564696e-05, "loss": 1.1416, "num_input_tokens_seen": 101580800, "step": 12400 }, { "epoch": 1.6035663522418917, "grad_norm": 0.4598730504512787, "learning_rate": 2.229395144861402e-05, "loss": 0.8276, "num_input_tokens_seen": 101662720, "step": 12410 }, { "epoch": 1.6048585088512728, "grad_norm": 0.7918890714645386, "learning_rate": 2.2260324112428336e-05, "loss": 1.044, "num_input_tokens_seen": 101744640, "step": 12420 }, { "epoch": 1.6061506654606539, "grad_norm": 1.4990414381027222, "learning_rate": 2.2226701792579176e-05, "loss": 0.5725, "num_input_tokens_seen": 101826560, "step": 12430 }, { "epoch": 1.607442822070035, "grad_norm": 0.7049417495727539, "learning_rate": 2.219308455062889e-05, "loss": 0.8819, "num_input_tokens_seen": 101908480, "step": 12440 }, { "epoch": 1.6087349786794158, "grad_norm": 1.0287914276123047, "learning_rate": 2.2159472448130513e-05, "loss": 0.6064, "num_input_tokens_seen": 101990400, "step": 12450 }, { "epoch": 1.6100271352887972, "grad_norm": 0.6095555424690247, "learning_rate": 2.212586554662769e-05, "loss": 0.7968, "num_input_tokens_seen": 102072320, "step": 12460 }, { "epoch": 1.611319291898178, "grad_norm": 0.7661923170089722, "learning_rate": 2.2092263907654544e-05, "loss": 0.6481, "num_input_tokens_seen": 102154240, "step": 12470 }, { "epoch": 1.612611448507559, "grad_norm": 0.6737602353096008, "learning_rate": 2.2058667592735532e-05, "loss": 0.722, "num_input_tokens_seen": 102236160, "step": 12480 }, { "epoch": 1.6139036051169402, "grad_norm": 0.5054346919059753, "learning_rate": 2.20250766633854e-05, "loss": 1.321, "num_input_tokens_seen": 102318080, "step": 12490 }, { "epoch": 1.615195761726321, "grad_norm": 0.5641369819641113, "learning_rate": 2.199149118110901e-05, "loss": 0.6964, "num_input_tokens_seen": 102400000, "step": 12500 }, { "epoch": 1.6164879183357024, "grad_norm": 0.27639254927635193, "learning_rate": 2.1957911207401267e-05, "loss": 0.965, "num_input_tokens_seen": 102481920, "step": 12510 }, { "epoch": 1.6177800749450832, "grad_norm": 0.8085689544677734, "learning_rate": 2.192433680374696e-05, "loss": 0.6768, "num_input_tokens_seen": 102563840, "step": 12520 }, { "epoch": 1.6190722315544646, "grad_norm": 0.7601416110992432, "learning_rate": 2.1890768031620705e-05, "loss": 0.975, "num_input_tokens_seen": 102645760, "step": 12530 }, { "epoch": 1.6203643881638454, "grad_norm": 0.615585207939148, "learning_rate": 2.1857204952486824e-05, "loss": 1.0603, "num_input_tokens_seen": 102727680, "step": 12540 }, { "epoch": 1.6216565447732265, "grad_norm": 0.7139294147491455, "learning_rate": 2.182364762779916e-05, "loss": 0.737, "num_input_tokens_seen": 102809600, "step": 12550 }, { "epoch": 1.6229487013826076, "grad_norm": 0.38794225454330444, "learning_rate": 2.1790096119001077e-05, "loss": 0.8256, "num_input_tokens_seen": 102891520, "step": 12560 }, { "epoch": 1.6242408579919885, "grad_norm": 0.8240430951118469, "learning_rate": 2.1756550487525247e-05, "loss": 1.0425, "num_input_tokens_seen": 102973440, "step": 12570 }, { "epoch": 1.6255330146013698, "grad_norm": 0.7485008239746094, "learning_rate": 2.1723010794793612e-05, "loss": 0.6268, "num_input_tokens_seen": 103055360, "step": 12580 }, { "epoch": 1.6268251712107507, "grad_norm": 0.5244811177253723, "learning_rate": 2.168947710221722e-05, "loss": 0.9757, "num_input_tokens_seen": 103137280, "step": 12590 }, { "epoch": 1.628117327820132, "grad_norm": 0.6735374927520752, "learning_rate": 2.165594947119613e-05, "loss": 0.7146, "num_input_tokens_seen": 103219200, "step": 12600 }, { "epoch": 1.6294094844295128, "grad_norm": 0.7091389894485474, "learning_rate": 2.1622427963119337e-05, "loss": 1.4099, "num_input_tokens_seen": 103301120, "step": 12610 }, { "epoch": 1.630701641038894, "grad_norm": 0.21646621823310852, "learning_rate": 2.1588912639364567e-05, "loss": 1.0245, "num_input_tokens_seen": 103383040, "step": 12620 }, { "epoch": 1.631993797648275, "grad_norm": 0.4036228358745575, "learning_rate": 2.1555403561298287e-05, "loss": 1.1632, "num_input_tokens_seen": 103464960, "step": 12630 }, { "epoch": 1.6332859542576559, "grad_norm": 0.5068750977516174, "learning_rate": 2.152190079027547e-05, "loss": 0.747, "num_input_tokens_seen": 103546880, "step": 12640 }, { "epoch": 1.6345781108670372, "grad_norm": 0.657037079334259, "learning_rate": 2.148840438763959e-05, "loss": 1.056, "num_input_tokens_seen": 103628800, "step": 12650 }, { "epoch": 1.635870267476418, "grad_norm": 0.7527886033058167, "learning_rate": 2.1454914414722417e-05, "loss": 0.9465, "num_input_tokens_seen": 103710720, "step": 12660 }, { "epoch": 1.6371624240857992, "grad_norm": 0.504432201385498, "learning_rate": 2.1421430932843988e-05, "loss": 1.1174, "num_input_tokens_seen": 103792640, "step": 12670 }, { "epoch": 1.6384545806951802, "grad_norm": 0.23135700821876526, "learning_rate": 2.138795400331242e-05, "loss": 0.6262, "num_input_tokens_seen": 103874560, "step": 12680 }, { "epoch": 1.6397467373045613, "grad_norm": 0.5520825982093811, "learning_rate": 2.135448368742385e-05, "loss": 1.066, "num_input_tokens_seen": 103956480, "step": 12690 }, { "epoch": 1.6410388939139424, "grad_norm": 0.7319164276123047, "learning_rate": 2.1321020046462318e-05, "loss": 0.9554, "num_input_tokens_seen": 104038400, "step": 12700 }, { "epoch": 1.6423310505233233, "grad_norm": 0.40483906865119934, "learning_rate": 2.128756314169961e-05, "loss": 1.0322, "num_input_tokens_seen": 104120320, "step": 12710 }, { "epoch": 1.6436232071327046, "grad_norm": 0.683261513710022, "learning_rate": 2.1254113034395212e-05, "loss": 0.6685, "num_input_tokens_seen": 104202240, "step": 12720 }, { "epoch": 1.6449153637420855, "grad_norm": 0.6625344157218933, "learning_rate": 2.122066978579613e-05, "loss": 0.7232, "num_input_tokens_seen": 104284160, "step": 12730 }, { "epoch": 1.6462075203514666, "grad_norm": 0.5251320600509644, "learning_rate": 2.1187233457136858e-05, "loss": 0.8379, "num_input_tokens_seen": 104366080, "step": 12740 }, { "epoch": 1.6474996769608476, "grad_norm": 0.37905430793762207, "learning_rate": 2.1153804109639157e-05, "loss": 0.7044, "num_input_tokens_seen": 104448000, "step": 12750 }, { "epoch": 1.6487918335702287, "grad_norm": 0.7667315006256104, "learning_rate": 2.1120381804512066e-05, "loss": 0.7293, "num_input_tokens_seen": 104529920, "step": 12760 }, { "epoch": 1.6500839901796098, "grad_norm": 0.8801140785217285, "learning_rate": 2.1086966602951696e-05, "loss": 0.9354, "num_input_tokens_seen": 104611840, "step": 12770 }, { "epoch": 1.6513761467889907, "grad_norm": 0.6519145369529724, "learning_rate": 2.105355856614115e-05, "loss": 0.699, "num_input_tokens_seen": 104693760, "step": 12780 }, { "epoch": 1.652668303398372, "grad_norm": 0.5816752910614014, "learning_rate": 2.1020157755250437e-05, "loss": 0.6137, "num_input_tokens_seen": 104775680, "step": 12790 }, { "epoch": 1.6539604600077529, "grad_norm": 0.8572224378585815, "learning_rate": 2.09867642314363e-05, "loss": 0.8616, "num_input_tokens_seen": 104857600, "step": 12800 }, { "epoch": 1.655252616617134, "grad_norm": 0.367939293384552, "learning_rate": 2.0953378055842183e-05, "loss": 0.668, "num_input_tokens_seen": 104939520, "step": 12810 }, { "epoch": 1.656544773226515, "grad_norm": 0.5176908373832703, "learning_rate": 2.0919999289598027e-05, "loss": 0.961, "num_input_tokens_seen": 105021440, "step": 12820 }, { "epoch": 1.6578369298358961, "grad_norm": 0.21786247193813324, "learning_rate": 2.088662799382024e-05, "loss": 0.6605, "num_input_tokens_seen": 105103360, "step": 12830 }, { "epoch": 1.6591290864452772, "grad_norm": 0.7281939387321472, "learning_rate": 2.0853264229611557e-05, "loss": 1.176, "num_input_tokens_seen": 105185280, "step": 12840 }, { "epoch": 1.660421243054658, "grad_norm": 0.6157118678092957, "learning_rate": 2.081990805806089e-05, "loss": 0.9159, "num_input_tokens_seen": 105267200, "step": 12850 }, { "epoch": 1.6617133996640394, "grad_norm": 0.3668000102043152, "learning_rate": 2.078655954024327e-05, "loss": 0.6293, "num_input_tokens_seen": 105349120, "step": 12860 }, { "epoch": 1.6630055562734203, "grad_norm": 0.6818766593933105, "learning_rate": 2.075321873721972e-05, "loss": 0.7573, "num_input_tokens_seen": 105431040, "step": 12870 }, { "epoch": 1.6642977128828014, "grad_norm": 0.6922422647476196, "learning_rate": 2.0719885710037122e-05, "loss": 0.4979, "num_input_tokens_seen": 105512960, "step": 12880 }, { "epoch": 1.6655898694921825, "grad_norm": 0.6830812692642212, "learning_rate": 2.0686560519728117e-05, "loss": 1.026, "num_input_tokens_seen": 105594880, "step": 12890 }, { "epoch": 1.6668820261015636, "grad_norm": 0.574186384677887, "learning_rate": 2.0653243227311014e-05, "loss": 0.5754, "num_input_tokens_seen": 105676800, "step": 12900 }, { "epoch": 1.6681741827109446, "grad_norm": 1.0729289054870605, "learning_rate": 2.0619933893789673e-05, "loss": 0.8647, "num_input_tokens_seen": 105758720, "step": 12910 }, { "epoch": 1.6694663393203255, "grad_norm": 0.6160525679588318, "learning_rate": 2.0586632580153328e-05, "loss": 1.2784, "num_input_tokens_seen": 105840640, "step": 12920 }, { "epoch": 1.6707584959297068, "grad_norm": 0.5736418962478638, "learning_rate": 2.0553339347376592e-05, "loss": 0.8836, "num_input_tokens_seen": 105922560, "step": 12930 }, { "epoch": 1.6720506525390877, "grad_norm": 0.743789553642273, "learning_rate": 2.0520054256419236e-05, "loss": 0.909, "num_input_tokens_seen": 106004480, "step": 12940 }, { "epoch": 1.6733428091484688, "grad_norm": 0.6692883372306824, "learning_rate": 2.0486777368226143e-05, "loss": 0.7516, "num_input_tokens_seen": 106086400, "step": 12950 }, { "epoch": 1.6746349657578499, "grad_norm": 1.0037450790405273, "learning_rate": 2.045350874372717e-05, "loss": 0.977, "num_input_tokens_seen": 106168320, "step": 12960 }, { "epoch": 1.675927122367231, "grad_norm": 0.8744035363197327, "learning_rate": 2.0420248443837048e-05, "loss": 0.8461, "num_input_tokens_seen": 106250240, "step": 12970 }, { "epoch": 1.677219278976612, "grad_norm": 0.7862047553062439, "learning_rate": 2.0386996529455276e-05, "loss": 0.7468, "num_input_tokens_seen": 106332160, "step": 12980 }, { "epoch": 1.678511435585993, "grad_norm": 0.8968716263771057, "learning_rate": 2.0353753061465972e-05, "loss": 0.7406, "num_input_tokens_seen": 106414080, "step": 12990 }, { "epoch": 1.6798035921953742, "grad_norm": 0.9072631001472473, "learning_rate": 2.0320518100737817e-05, "loss": 0.9977, "num_input_tokens_seen": 106496000, "step": 13000 }, { "epoch": 1.681095748804755, "grad_norm": 0.5240357518196106, "learning_rate": 2.0287291708123888e-05, "loss": 0.867, "num_input_tokens_seen": 106577920, "step": 13010 }, { "epoch": 1.6823879054141362, "grad_norm": 0.6482962369918823, "learning_rate": 2.0254073944461603e-05, "loss": 0.86, "num_input_tokens_seen": 106659840, "step": 13020 }, { "epoch": 1.6836800620235173, "grad_norm": 0.7693918347358704, "learning_rate": 2.0220864870572555e-05, "loss": 0.9676, "num_input_tokens_seen": 106741760, "step": 13030 }, { "epoch": 1.6849722186328981, "grad_norm": 0.6665438413619995, "learning_rate": 2.0187664547262446e-05, "loss": 0.7239, "num_input_tokens_seen": 106823680, "step": 13040 }, { "epoch": 1.6862643752422795, "grad_norm": 0.2982095777988434, "learning_rate": 2.0154473035320936e-05, "loss": 0.9287, "num_input_tokens_seen": 106905600, "step": 13050 }, { "epoch": 1.6875565318516603, "grad_norm": 14.455750465393066, "learning_rate": 2.0121290395521566e-05, "loss": 0.7107, "num_input_tokens_seen": 106987520, "step": 13060 }, { "epoch": 1.6888486884610416, "grad_norm": 0.7520779371261597, "learning_rate": 2.008811668862164e-05, "loss": 0.7824, "num_input_tokens_seen": 107069440, "step": 13070 }, { "epoch": 1.6901408450704225, "grad_norm": 0.6052248477935791, "learning_rate": 2.0054951975362067e-05, "loss": 0.6419, "num_input_tokens_seen": 107151360, "step": 13080 }, { "epoch": 1.6914330016798036, "grad_norm": 0.2903338372707367, "learning_rate": 2.0021796316467346e-05, "loss": 0.5254, "num_input_tokens_seen": 107233280, "step": 13090 }, { "epoch": 1.6927251582891847, "grad_norm": 0.5044610500335693, "learning_rate": 1.9988649772645346e-05, "loss": 0.6578, "num_input_tokens_seen": 107315200, "step": 13100 }, { "epoch": 1.6940173148985656, "grad_norm": 0.6834781169891357, "learning_rate": 1.995551240458728e-05, "loss": 0.7578, "num_input_tokens_seen": 107397120, "step": 13110 }, { "epoch": 1.6953094715079469, "grad_norm": 0.47251078486442566, "learning_rate": 1.9922384272967535e-05, "loss": 0.6271, "num_input_tokens_seen": 107479040, "step": 13120 }, { "epoch": 1.6966016281173277, "grad_norm": 0.27154985070228577, "learning_rate": 1.9889265438443607e-05, "loss": 0.6214, "num_input_tokens_seen": 107560960, "step": 13130 }, { "epoch": 1.697893784726709, "grad_norm": 0.7411037683486938, "learning_rate": 1.985615596165597e-05, "loss": 1.0424, "num_input_tokens_seen": 107642880, "step": 13140 }, { "epoch": 1.69918594133609, "grad_norm": 0.7935945391654968, "learning_rate": 1.982305590322793e-05, "loss": 1.0063, "num_input_tokens_seen": 107724800, "step": 13150 }, { "epoch": 1.700478097945471, "grad_norm": 0.5809916257858276, "learning_rate": 1.97899653237656e-05, "loss": 0.796, "num_input_tokens_seen": 107806720, "step": 13160 }, { "epoch": 1.701770254554852, "grad_norm": 0.7981992363929749, "learning_rate": 1.9756884283857685e-05, "loss": 0.8084, "num_input_tokens_seen": 107888640, "step": 13170 }, { "epoch": 1.703062411164233, "grad_norm": 0.6108459234237671, "learning_rate": 1.9723812844075473e-05, "loss": 1.0913, "num_input_tokens_seen": 107970560, "step": 13180 }, { "epoch": 1.7043545677736143, "grad_norm": 0.6195822954177856, "learning_rate": 1.9690751064972625e-05, "loss": 1.1137, "num_input_tokens_seen": 108052480, "step": 13190 }, { "epoch": 1.7056467243829951, "grad_norm": 0.6808088421821594, "learning_rate": 1.965769900708515e-05, "loss": 0.9466, "num_input_tokens_seen": 108134400, "step": 13200 }, { "epoch": 1.7069388809923762, "grad_norm": 0.6025552749633789, "learning_rate": 1.9624656730931258e-05, "loss": 1.0154, "num_input_tokens_seen": 108216320, "step": 13210 }, { "epoch": 1.7082310376017573, "grad_norm": 0.6096053123474121, "learning_rate": 1.959162429701121e-05, "loss": 0.9086, "num_input_tokens_seen": 108298240, "step": 13220 }, { "epoch": 1.7095231942111384, "grad_norm": 0.7634369730949402, "learning_rate": 1.955860176580729e-05, "loss": 0.9669, "num_input_tokens_seen": 108380160, "step": 13230 }, { "epoch": 1.7108153508205195, "grad_norm": 5.748688220977783, "learning_rate": 1.9525589197783618e-05, "loss": 0.8869, "num_input_tokens_seen": 108462080, "step": 13240 }, { "epoch": 1.7121075074299004, "grad_norm": 0.9097843170166016, "learning_rate": 1.9492586653386103e-05, "loss": 1.3718, "num_input_tokens_seen": 108544000, "step": 13250 }, { "epoch": 1.7133996640392817, "grad_norm": 0.6823452711105347, "learning_rate": 1.945959419304226e-05, "loss": 1.1025, "num_input_tokens_seen": 108625920, "step": 13260 }, { "epoch": 1.7146918206486625, "grad_norm": 0.8677635192871094, "learning_rate": 1.942661187716118e-05, "loss": 0.7662, "num_input_tokens_seen": 108707840, "step": 13270 }, { "epoch": 1.7159839772580436, "grad_norm": 0.8804795742034912, "learning_rate": 1.9393639766133363e-05, "loss": 0.9356, "num_input_tokens_seen": 108789760, "step": 13280 }, { "epoch": 1.7172761338674247, "grad_norm": 0.6428921222686768, "learning_rate": 1.936067792033061e-05, "loss": 0.608, "num_input_tokens_seen": 108871680, "step": 13290 }, { "epoch": 1.7185682904768058, "grad_norm": 0.5811804533004761, "learning_rate": 1.9327726400105963e-05, "loss": 1.0139, "num_input_tokens_seen": 108953600, "step": 13300 }, { "epoch": 1.719860447086187, "grad_norm": 1.0158241987228394, "learning_rate": 1.9294785265793514e-05, "loss": 0.7744, "num_input_tokens_seen": 109035520, "step": 13310 }, { "epoch": 1.7211526036955678, "grad_norm": 1.1166173219680786, "learning_rate": 1.9261854577708366e-05, "loss": 0.7847, "num_input_tokens_seen": 109117440, "step": 13320 }, { "epoch": 1.722444760304949, "grad_norm": 0.5615746974945068, "learning_rate": 1.9228934396146486e-05, "loss": 0.9334, "num_input_tokens_seen": 109199360, "step": 13330 }, { "epoch": 1.72373691691433, "grad_norm": 0.474103182554245, "learning_rate": 1.9196024781384607e-05, "loss": 0.8011, "num_input_tokens_seen": 109281280, "step": 13340 }, { "epoch": 1.725029073523711, "grad_norm": 0.5429681539535522, "learning_rate": 1.9163125793680125e-05, "loss": 0.8737, "num_input_tokens_seen": 109363200, "step": 13350 }, { "epoch": 1.7263212301330921, "grad_norm": 0.22766174376010895, "learning_rate": 1.9130237493270948e-05, "loss": 0.5359, "num_input_tokens_seen": 109445120, "step": 13360 }, { "epoch": 1.7276133867424732, "grad_norm": 0.9762837290763855, "learning_rate": 1.9097359940375452e-05, "loss": 0.6703, "num_input_tokens_seen": 109527040, "step": 13370 }, { "epoch": 1.7289055433518543, "grad_norm": 0.3081243932247162, "learning_rate": 1.9064493195192293e-05, "loss": 0.8231, "num_input_tokens_seen": 109608960, "step": 13380 }, { "epoch": 1.7301976999612352, "grad_norm": 0.8469387292861938, "learning_rate": 1.9031637317900386e-05, "loss": 0.7302, "num_input_tokens_seen": 109690880, "step": 13390 }, { "epoch": 1.7314898565706165, "grad_norm": 0.8543357253074646, "learning_rate": 1.8998792368658703e-05, "loss": 0.7282, "num_input_tokens_seen": 109772800, "step": 13400 }, { "epoch": 1.7327820131799974, "grad_norm": 0.5329307317733765, "learning_rate": 1.8965958407606236e-05, "loss": 0.8997, "num_input_tokens_seen": 109854720, "step": 13410 }, { "epoch": 1.7340741697893785, "grad_norm": 0.7571797966957092, "learning_rate": 1.893313549486184e-05, "loss": 0.8703, "num_input_tokens_seen": 109936640, "step": 13420 }, { "epoch": 1.7353663263987595, "grad_norm": 0.6506255865097046, "learning_rate": 1.890032369052415e-05, "loss": 0.5685, "num_input_tokens_seen": 110018560, "step": 13430 }, { "epoch": 1.7366584830081406, "grad_norm": 0.2349756509065628, "learning_rate": 1.8867523054671475e-05, "loss": 0.9621, "num_input_tokens_seen": 110100480, "step": 13440 }, { "epoch": 1.7379506396175217, "grad_norm": 0.5002673864364624, "learning_rate": 1.8834733647361635e-05, "loss": 0.8279, "num_input_tokens_seen": 110182400, "step": 13450 }, { "epoch": 1.7392427962269026, "grad_norm": 0.4068397581577301, "learning_rate": 1.880195552863194e-05, "loss": 0.6358, "num_input_tokens_seen": 110264320, "step": 13460 }, { "epoch": 1.740534952836284, "grad_norm": 0.6473715901374817, "learning_rate": 1.8769188758498973e-05, "loss": 0.8022, "num_input_tokens_seen": 110346240, "step": 13470 }, { "epoch": 1.7418271094456648, "grad_norm": 0.29056867957115173, "learning_rate": 1.8736433396958605e-05, "loss": 0.6169, "num_input_tokens_seen": 110428160, "step": 13480 }, { "epoch": 1.7431192660550459, "grad_norm": 0.49378782510757446, "learning_rate": 1.8703689503985754e-05, "loss": 0.6387, "num_input_tokens_seen": 110510080, "step": 13490 }, { "epoch": 1.744411422664427, "grad_norm": 0.570563554763794, "learning_rate": 1.867095713953439e-05, "loss": 1.0567, "num_input_tokens_seen": 110592000, "step": 13500 }, { "epoch": 1.745703579273808, "grad_norm": 0.7375537753105164, "learning_rate": 1.8638236363537348e-05, "loss": 0.5526, "num_input_tokens_seen": 110673920, "step": 13510 }, { "epoch": 1.7469957358831891, "grad_norm": 0.6009176969528198, "learning_rate": 1.8605527235906235e-05, "loss": 0.987, "num_input_tokens_seen": 110755840, "step": 13520 }, { "epoch": 1.74828789249257, "grad_norm": 0.41143083572387695, "learning_rate": 1.8572829816531364e-05, "loss": 0.604, "num_input_tokens_seen": 110837760, "step": 13530 }, { "epoch": 1.7495800491019513, "grad_norm": 1.1467337608337402, "learning_rate": 1.854014416528157e-05, "loss": 0.6048, "num_input_tokens_seen": 110919680, "step": 13540 }, { "epoch": 1.7508722057113322, "grad_norm": 0.6407749652862549, "learning_rate": 1.8507470342004182e-05, "loss": 0.8796, "num_input_tokens_seen": 111001600, "step": 13550 }, { "epoch": 1.7521643623207133, "grad_norm": 0.8775169253349304, "learning_rate": 1.847480840652483e-05, "loss": 0.8039, "num_input_tokens_seen": 111083520, "step": 13560 }, { "epoch": 1.7534565189300944, "grad_norm": 0.9216033220291138, "learning_rate": 1.844215841864741e-05, "loss": 0.6875, "num_input_tokens_seen": 111165440, "step": 13570 }, { "epoch": 1.7547486755394752, "grad_norm": 0.7694590091705322, "learning_rate": 1.8409520438153933e-05, "loss": 0.9024, "num_input_tokens_seen": 111247360, "step": 13580 }, { "epoch": 1.7560408321488565, "grad_norm": 0.25773632526397705, "learning_rate": 1.8376894524804416e-05, "loss": 0.6588, "num_input_tokens_seen": 111329280, "step": 13590 }, { "epoch": 1.7573329887582374, "grad_norm": 0.8692495226860046, "learning_rate": 1.8344280738336796e-05, "loss": 0.9931, "num_input_tokens_seen": 111411200, "step": 13600 }, { "epoch": 1.7586251453676187, "grad_norm": 0.17592206597328186, "learning_rate": 1.8311679138466772e-05, "loss": 0.9949, "num_input_tokens_seen": 111493120, "step": 13610 }, { "epoch": 1.7599173019769996, "grad_norm": 0.8434010148048401, "learning_rate": 1.827908978488779e-05, "loss": 1.0308, "num_input_tokens_seen": 111575040, "step": 13620 }, { "epoch": 1.7612094585863807, "grad_norm": 1.8556253910064697, "learning_rate": 1.8246512737270798e-05, "loss": 0.996, "num_input_tokens_seen": 111656960, "step": 13630 }, { "epoch": 1.7625016151957618, "grad_norm": 4.145997047424316, "learning_rate": 1.8213948055264278e-05, "loss": 1.0672, "num_input_tokens_seen": 111738880, "step": 13640 }, { "epoch": 1.7637937718051426, "grad_norm": 0.3059723377227783, "learning_rate": 1.8181395798494048e-05, "loss": 0.5107, "num_input_tokens_seen": 111820800, "step": 13650 }, { "epoch": 1.765085928414524, "grad_norm": 0.9706294536590576, "learning_rate": 1.8148856026563148e-05, "loss": 1.0283, "num_input_tokens_seen": 111902720, "step": 13660 }, { "epoch": 1.7663780850239048, "grad_norm": 0.6483558416366577, "learning_rate": 1.81163287990518e-05, "loss": 0.9913, "num_input_tokens_seen": 111984640, "step": 13670 }, { "epoch": 1.767670241633286, "grad_norm": 0.5920456647872925, "learning_rate": 1.8083814175517234e-05, "loss": 0.5184, "num_input_tokens_seen": 112066560, "step": 13680 }, { "epoch": 1.768962398242667, "grad_norm": 0.6596719622612, "learning_rate": 1.80513122154936e-05, "loss": 0.962, "num_input_tokens_seen": 112148480, "step": 13690 }, { "epoch": 1.770254554852048, "grad_norm": 0.7937291264533997, "learning_rate": 1.8018822978491872e-05, "loss": 0.6034, "num_input_tokens_seen": 112230400, "step": 13700 }, { "epoch": 1.7715467114614292, "grad_norm": 0.5990145206451416, "learning_rate": 1.798634652399972e-05, "loss": 0.8974, "num_input_tokens_seen": 112312320, "step": 13710 }, { "epoch": 1.77283886807081, "grad_norm": 0.6401026844978333, "learning_rate": 1.795388291148143e-05, "loss": 1.0047, "num_input_tokens_seen": 112394240, "step": 13720 }, { "epoch": 1.7741310246801913, "grad_norm": 0.6857889294624329, "learning_rate": 1.7921432200377734e-05, "loss": 0.608, "num_input_tokens_seen": 112476160, "step": 13730 }, { "epoch": 1.7754231812895722, "grad_norm": 0.6856977939605713, "learning_rate": 1.7888994450105788e-05, "loss": 1.2077, "num_input_tokens_seen": 112558080, "step": 13740 }, { "epoch": 1.7767153378989533, "grad_norm": 0.4340798258781433, "learning_rate": 1.785656972005897e-05, "loss": 1.0667, "num_input_tokens_seen": 112640000, "step": 13750 }, { "epoch": 1.7780074945083344, "grad_norm": 0.6757582426071167, "learning_rate": 1.7824158069606867e-05, "loss": 0.8964, "num_input_tokens_seen": 112721920, "step": 13760 }, { "epoch": 1.7792996511177155, "grad_norm": 0.8289663791656494, "learning_rate": 1.7791759558095077e-05, "loss": 0.6691, "num_input_tokens_seen": 112803840, "step": 13770 }, { "epoch": 1.7805918077270966, "grad_norm": 0.620309054851532, "learning_rate": 1.775937424484515e-05, "loss": 0.9059, "num_input_tokens_seen": 112885760, "step": 13780 }, { "epoch": 1.7818839643364774, "grad_norm": 0.7046340703964233, "learning_rate": 1.7727002189154502e-05, "loss": 0.724, "num_input_tokens_seen": 112967680, "step": 13790 }, { "epoch": 1.7831761209458588, "grad_norm": 0.7934147715568542, "learning_rate": 1.7694643450296216e-05, "loss": 1.064, "num_input_tokens_seen": 113049600, "step": 13800 }, { "epoch": 1.7844682775552396, "grad_norm": 0.5995201468467712, "learning_rate": 1.7662298087519052e-05, "loss": 0.9433, "num_input_tokens_seen": 113131520, "step": 13810 }, { "epoch": 1.7857604341646207, "grad_norm": 0.8406663537025452, "learning_rate": 1.762996616004723e-05, "loss": 0.8747, "num_input_tokens_seen": 113213440, "step": 13820 }, { "epoch": 1.7870525907740018, "grad_norm": 0.5735936760902405, "learning_rate": 1.7597647727080408e-05, "loss": 0.9011, "num_input_tokens_seen": 113295360, "step": 13830 }, { "epoch": 1.788344747383383, "grad_norm": 0.7899253368377686, "learning_rate": 1.7565342847793502e-05, "loss": 0.9851, "num_input_tokens_seen": 113377280, "step": 13840 }, { "epoch": 1.789636903992764, "grad_norm": 0.9319769740104675, "learning_rate": 1.7533051581336644e-05, "loss": 0.9077, "num_input_tokens_seen": 113459200, "step": 13850 }, { "epoch": 1.7909290606021449, "grad_norm": 0.6628711819648743, "learning_rate": 1.7500773986835013e-05, "loss": 0.9925, "num_input_tokens_seen": 113541120, "step": 13860 }, { "epoch": 1.7922212172115262, "grad_norm": 0.373602032661438, "learning_rate": 1.7468510123388775e-05, "loss": 0.7818, "num_input_tokens_seen": 113623040, "step": 13870 }, { "epoch": 1.793513373820907, "grad_norm": 0.574948787689209, "learning_rate": 1.743626005007294e-05, "loss": 0.9479, "num_input_tokens_seen": 113704960, "step": 13880 }, { "epoch": 1.7948055304302881, "grad_norm": 0.7120422124862671, "learning_rate": 1.740402382593727e-05, "loss": 0.9624, "num_input_tokens_seen": 113786880, "step": 13890 }, { "epoch": 1.7960976870396692, "grad_norm": 0.5288504958152771, "learning_rate": 1.7371801510006193e-05, "loss": 0.9844, "num_input_tokens_seen": 113868800, "step": 13900 }, { "epoch": 1.7973898436490503, "grad_norm": 0.5549502372741699, "learning_rate": 1.733959316127862e-05, "loss": 0.8112, "num_input_tokens_seen": 113950720, "step": 13910 }, { "epoch": 1.7986820002584314, "grad_norm": 0.6821995377540588, "learning_rate": 1.730739883872795e-05, "loss": 0.9197, "num_input_tokens_seen": 114032640, "step": 13920 }, { "epoch": 1.7999741568678123, "grad_norm": 0.44002169370651245, "learning_rate": 1.7275218601301848e-05, "loss": 0.6208, "num_input_tokens_seen": 114114560, "step": 13930 }, { "epoch": 1.8012663134771936, "grad_norm": 0.7106473445892334, "learning_rate": 1.7243052507922226e-05, "loss": 0.9374, "num_input_tokens_seen": 114196480, "step": 13940 }, { "epoch": 1.8025584700865744, "grad_norm": 0.6388192772865295, "learning_rate": 1.7210900617485075e-05, "loss": 1.0343, "num_input_tokens_seen": 114278400, "step": 13950 }, { "epoch": 1.8038506266959555, "grad_norm": 0.644347071647644, "learning_rate": 1.7178762988860393e-05, "loss": 0.8684, "num_input_tokens_seen": 114360320, "step": 13960 }, { "epoch": 1.8051427833053366, "grad_norm": 0.5687951445579529, "learning_rate": 1.7146639680892062e-05, "loss": 1.0918, "num_input_tokens_seen": 114442240, "step": 13970 }, { "epoch": 1.8064349399147177, "grad_norm": 0.775614321231842, "learning_rate": 1.711453075239773e-05, "loss": 0.7429, "num_input_tokens_seen": 114524160, "step": 13980 }, { "epoch": 1.8077270965240988, "grad_norm": 0.25670918822288513, "learning_rate": 1.7082436262168745e-05, "loss": 0.7727, "num_input_tokens_seen": 114606080, "step": 13990 }, { "epoch": 1.8090192531334797, "grad_norm": 0.9331316351890564, "learning_rate": 1.705035626896998e-05, "loss": 1.0647, "num_input_tokens_seen": 114688000, "step": 14000 }, { "epoch": 1.810311409742861, "grad_norm": 1.6310906410217285, "learning_rate": 1.7018290831539795e-05, "loss": 0.7367, "num_input_tokens_seen": 114769920, "step": 14010 }, { "epoch": 1.8116035663522418, "grad_norm": 0.9030694961547852, "learning_rate": 1.6986240008589903e-05, "loss": 0.9311, "num_input_tokens_seen": 114851840, "step": 14020 }, { "epoch": 1.812895722961623, "grad_norm": 0.6895557045936584, "learning_rate": 1.695420385880522e-05, "loss": 0.9413, "num_input_tokens_seen": 114933760, "step": 14030 }, { "epoch": 1.814187879571004, "grad_norm": 0.7710975408554077, "learning_rate": 1.6922182440843843e-05, "loss": 0.7935, "num_input_tokens_seen": 115015680, "step": 14040 }, { "epoch": 1.8154800361803851, "grad_norm": 0.979817807674408, "learning_rate": 1.689017581333685e-05, "loss": 0.9507, "num_input_tokens_seen": 115097600, "step": 14050 }, { "epoch": 1.8167721927897662, "grad_norm": 0.2595027983188629, "learning_rate": 1.685818403488827e-05, "loss": 1.1626, "num_input_tokens_seen": 115179520, "step": 14060 }, { "epoch": 1.818064349399147, "grad_norm": 0.20249100029468536, "learning_rate": 1.6826207164074924e-05, "loss": 0.4731, "num_input_tokens_seen": 115261440, "step": 14070 }, { "epoch": 1.8193565060085284, "grad_norm": 0.5475636124610901, "learning_rate": 1.6794245259446347e-05, "loss": 0.8077, "num_input_tokens_seen": 115343360, "step": 14080 }, { "epoch": 1.8206486626179093, "grad_norm": 0.6575671434402466, "learning_rate": 1.6762298379524684e-05, "loss": 0.8489, "num_input_tokens_seen": 115425280, "step": 14090 }, { "epoch": 1.8219408192272903, "grad_norm": 0.5988595485687256, "learning_rate": 1.6730366582804535e-05, "loss": 0.5274, "num_input_tokens_seen": 115507200, "step": 14100 }, { "epoch": 1.8232329758366714, "grad_norm": 0.2947266399860382, "learning_rate": 1.6698449927752924e-05, "loss": 0.659, "num_input_tokens_seen": 115589120, "step": 14110 }, { "epoch": 1.8245251324460523, "grad_norm": 0.7184574604034424, "learning_rate": 1.6666548472809104e-05, "loss": 1.1145, "num_input_tokens_seen": 115671040, "step": 14120 }, { "epoch": 1.8258172890554336, "grad_norm": 0.8786848783493042, "learning_rate": 1.6634662276384548e-05, "loss": 0.8391, "num_input_tokens_seen": 115752960, "step": 14130 }, { "epoch": 1.8271094456648145, "grad_norm": 0.6880916357040405, "learning_rate": 1.660279139686275e-05, "loss": 0.802, "num_input_tokens_seen": 115834880, "step": 14140 }, { "epoch": 1.8284016022741958, "grad_norm": 0.8588114380836487, "learning_rate": 1.657093589259917e-05, "loss": 0.7398, "num_input_tokens_seen": 115916800, "step": 14150 }, { "epoch": 1.8296937588835767, "grad_norm": 0.6294015049934387, "learning_rate": 1.6539095821921136e-05, "loss": 0.8751, "num_input_tokens_seen": 115998720, "step": 14160 }, { "epoch": 1.8309859154929577, "grad_norm": 0.8277295231819153, "learning_rate": 1.650727124312768e-05, "loss": 0.7839, "num_input_tokens_seen": 116080640, "step": 14170 }, { "epoch": 1.8322780721023388, "grad_norm": 0.5361806154251099, "learning_rate": 1.6475462214489513e-05, "loss": 0.8393, "num_input_tokens_seen": 116162560, "step": 14180 }, { "epoch": 1.8335702287117197, "grad_norm": 0.5568910241127014, "learning_rate": 1.6443668794248828e-05, "loss": 0.6922, "num_input_tokens_seen": 116244480, "step": 14190 }, { "epoch": 1.834862385321101, "grad_norm": 0.615774393081665, "learning_rate": 1.641189104061928e-05, "loss": 0.9365, "num_input_tokens_seen": 116326400, "step": 14200 }, { "epoch": 1.8361545419304819, "grad_norm": 0.5470173358917236, "learning_rate": 1.63801290117858e-05, "loss": 0.8715, "num_input_tokens_seen": 116408320, "step": 14210 }, { "epoch": 1.837446698539863, "grad_norm": 0.7241597771644592, "learning_rate": 1.6348382765904567e-05, "loss": 0.8787, "num_input_tokens_seen": 116490240, "step": 14220 }, { "epoch": 1.838738855149244, "grad_norm": 0.7052979469299316, "learning_rate": 1.631665236110283e-05, "loss": 0.8164, "num_input_tokens_seen": 116572160, "step": 14230 }, { "epoch": 1.8400310117586252, "grad_norm": 0.6551203727722168, "learning_rate": 1.6284937855478837e-05, "loss": 1.0631, "num_input_tokens_seen": 116654080, "step": 14240 }, { "epoch": 1.8413231683680062, "grad_norm": 0.6758942604064941, "learning_rate": 1.6253239307101748e-05, "loss": 1.008, "num_input_tokens_seen": 116736000, "step": 14250 }, { "epoch": 1.8426153249773871, "grad_norm": 0.588173508644104, "learning_rate": 1.6221556774011474e-05, "loss": 1.006, "num_input_tokens_seen": 116817920, "step": 14260 }, { "epoch": 1.8439074815867684, "grad_norm": 0.2741510272026062, "learning_rate": 1.6189890314218634e-05, "loss": 0.8024, "num_input_tokens_seen": 116899840, "step": 14270 }, { "epoch": 1.8451996381961493, "grad_norm": 0.38996413350105286, "learning_rate": 1.6158239985704378e-05, "loss": 0.9059, "num_input_tokens_seen": 116981760, "step": 14280 }, { "epoch": 1.8464917948055304, "grad_norm": 0.6522138118743896, "learning_rate": 1.6126605846420366e-05, "loss": 0.7783, "num_input_tokens_seen": 117063680, "step": 14290 }, { "epoch": 1.8477839514149115, "grad_norm": 0.5174360275268555, "learning_rate": 1.609498795428857e-05, "loss": 0.8708, "num_input_tokens_seen": 117145600, "step": 14300 }, { "epoch": 1.8490761080242926, "grad_norm": 0.47565585374832153, "learning_rate": 1.606338636720125e-05, "loss": 0.4722, "num_input_tokens_seen": 117227520, "step": 14310 }, { "epoch": 1.8503682646336737, "grad_norm": 0.5831138491630554, "learning_rate": 1.6031801143020785e-05, "loss": 0.9373, "num_input_tokens_seen": 117309440, "step": 14320 }, { "epoch": 1.8516604212430545, "grad_norm": 0.5426406860351562, "learning_rate": 1.6000232339579616e-05, "loss": 0.9453, "num_input_tokens_seen": 117391360, "step": 14330 }, { "epoch": 1.8529525778524358, "grad_norm": 0.899085283279419, "learning_rate": 1.5968680014680105e-05, "loss": 0.8859, "num_input_tokens_seen": 117473280, "step": 14340 }, { "epoch": 1.8542447344618167, "grad_norm": 0.5543519258499146, "learning_rate": 1.5937144226094426e-05, "loss": 1.0594, "num_input_tokens_seen": 117555200, "step": 14350 }, { "epoch": 1.8555368910711978, "grad_norm": 0.8025951385498047, "learning_rate": 1.590562503156452e-05, "loss": 0.8882, "num_input_tokens_seen": 117637120, "step": 14360 }, { "epoch": 1.8568290476805789, "grad_norm": 0.6252597570419312, "learning_rate": 1.5874122488801888e-05, "loss": 0.6613, "num_input_tokens_seen": 117719040, "step": 14370 }, { "epoch": 1.85812120428996, "grad_norm": 0.5692026615142822, "learning_rate": 1.5842636655487585e-05, "loss": 0.6339, "num_input_tokens_seen": 117800960, "step": 14380 }, { "epoch": 1.859413360899341, "grad_norm": 0.651740550994873, "learning_rate": 1.5811167589272068e-05, "loss": 1.0055, "num_input_tokens_seen": 117882880, "step": 14390 }, { "epoch": 1.860705517508722, "grad_norm": 0.6472187638282776, "learning_rate": 1.577971534777507e-05, "loss": 0.7919, "num_input_tokens_seen": 117964800, "step": 14400 }, { "epoch": 1.8619976741181032, "grad_norm": 0.5527770519256592, "learning_rate": 1.5748279988585528e-05, "loss": 0.7019, "num_input_tokens_seen": 118046720, "step": 14410 }, { "epoch": 1.863289830727484, "grad_norm": 0.3149762451648712, "learning_rate": 1.571686156926147e-05, "loss": 0.7455, "num_input_tokens_seen": 118128640, "step": 14420 }, { "epoch": 1.8645819873368652, "grad_norm": 0.6503821015357971, "learning_rate": 1.5685460147329917e-05, "loss": 0.68, "num_input_tokens_seen": 118210560, "step": 14430 }, { "epoch": 1.8658741439462463, "grad_norm": 0.6401498913764954, "learning_rate": 1.5654075780286742e-05, "loss": 0.9481, "num_input_tokens_seen": 118292480, "step": 14440 }, { "epoch": 1.8671663005556274, "grad_norm": 0.5408946871757507, "learning_rate": 1.562270852559661e-05, "loss": 1.0229, "num_input_tokens_seen": 118374400, "step": 14450 }, { "epoch": 1.8684584571650085, "grad_norm": 0.7171612977981567, "learning_rate": 1.5591358440692865e-05, "loss": 1.0381, "num_input_tokens_seen": 118456320, "step": 14460 }, { "epoch": 1.8697506137743893, "grad_norm": 0.7857020497322083, "learning_rate": 1.5560025582977377e-05, "loss": 0.988, "num_input_tokens_seen": 118538240, "step": 14470 }, { "epoch": 1.8710427703837706, "grad_norm": 0.3020257353782654, "learning_rate": 1.5528710009820513e-05, "loss": 1.0819, "num_input_tokens_seen": 118620160, "step": 14480 }, { "epoch": 1.8723349269931515, "grad_norm": 0.6323096752166748, "learning_rate": 1.5497411778560954e-05, "loss": 1.2201, "num_input_tokens_seen": 118702080, "step": 14490 }, { "epoch": 1.8736270836025326, "grad_norm": 0.7573959827423096, "learning_rate": 1.5466130946505664e-05, "loss": 0.7508, "num_input_tokens_seen": 118784000, "step": 14500 }, { "epoch": 1.8749192402119137, "grad_norm": 0.48801571130752563, "learning_rate": 1.5434867570929724e-05, "loss": 0.8983, "num_input_tokens_seen": 118865920, "step": 14510 }, { "epoch": 1.8762113968212948, "grad_norm": 0.34075334668159485, "learning_rate": 1.5403621709076247e-05, "loss": 0.6051, "num_input_tokens_seen": 118947840, "step": 14520 }, { "epoch": 1.8775035534306759, "grad_norm": 0.27687662839889526, "learning_rate": 1.5372393418156323e-05, "loss": 0.7321, "num_input_tokens_seen": 119029760, "step": 14530 }, { "epoch": 1.8787957100400567, "grad_norm": 0.7669917941093445, "learning_rate": 1.5341182755348806e-05, "loss": 0.887, "num_input_tokens_seen": 119111680, "step": 14540 }, { "epoch": 1.880087866649438, "grad_norm": 0.30796754360198975, "learning_rate": 1.530998977780033e-05, "loss": 0.7769, "num_input_tokens_seen": 119193600, "step": 14550 }, { "epoch": 1.881380023258819, "grad_norm": 0.6400482654571533, "learning_rate": 1.5278814542625107e-05, "loss": 0.7488, "num_input_tokens_seen": 119275520, "step": 14560 }, { "epoch": 1.8826721798682, "grad_norm": 0.6054263114929199, "learning_rate": 1.5247657106904891e-05, "loss": 0.8852, "num_input_tokens_seen": 119357440, "step": 14570 }, { "epoch": 1.883964336477581, "grad_norm": 0.19290970265865326, "learning_rate": 1.5216517527688818e-05, "loss": 0.8831, "num_input_tokens_seen": 119439360, "step": 14580 }, { "epoch": 1.8852564930869622, "grad_norm": 0.5725274085998535, "learning_rate": 1.5185395861993353e-05, "loss": 0.8191, "num_input_tokens_seen": 119521280, "step": 14590 }, { "epoch": 1.8865486496963433, "grad_norm": 0.7249799370765686, "learning_rate": 1.515429216680216e-05, "loss": 0.9853, "num_input_tokens_seen": 119603200, "step": 14600 }, { "epoch": 1.8878408063057242, "grad_norm": 0.7958240509033203, "learning_rate": 1.5123206499065967e-05, "loss": 0.9514, "num_input_tokens_seen": 119685120, "step": 14610 }, { "epoch": 1.8891329629151055, "grad_norm": 0.6748232245445251, "learning_rate": 1.5092138915702545e-05, "loss": 1.2975, "num_input_tokens_seen": 119767040, "step": 14620 }, { "epoch": 1.8904251195244863, "grad_norm": 0.9683583974838257, "learning_rate": 1.5061089473596501e-05, "loss": 0.8597, "num_input_tokens_seen": 119848960, "step": 14630 }, { "epoch": 1.8917172761338674, "grad_norm": 0.6267436742782593, "learning_rate": 1.5030058229599275e-05, "loss": 0.7876, "num_input_tokens_seen": 119930880, "step": 14640 }, { "epoch": 1.8930094327432485, "grad_norm": 0.18290306627750397, "learning_rate": 1.4999045240528935e-05, "loss": 0.4448, "num_input_tokens_seen": 120012800, "step": 14650 }, { "epoch": 1.8943015893526294, "grad_norm": 0.908818244934082, "learning_rate": 1.4968050563170177e-05, "loss": 0.9315, "num_input_tokens_seen": 120094720, "step": 14660 }, { "epoch": 1.8955937459620107, "grad_norm": 0.8734130859375, "learning_rate": 1.4937074254274117e-05, "loss": 1.0145, "num_input_tokens_seen": 120176640, "step": 14670 }, { "epoch": 1.8968859025713916, "grad_norm": 0.6656931042671204, "learning_rate": 1.4906116370558276e-05, "loss": 0.6827, "num_input_tokens_seen": 120258560, "step": 14680 }, { "epoch": 1.8981780591807729, "grad_norm": 0.612580418586731, "learning_rate": 1.4875176968706434e-05, "loss": 0.8853, "num_input_tokens_seen": 120340480, "step": 14690 }, { "epoch": 1.8994702157901537, "grad_norm": 0.30025026202201843, "learning_rate": 1.4844256105368504e-05, "loss": 0.7183, "num_input_tokens_seen": 120422400, "step": 14700 }, { "epoch": 1.9007623723995348, "grad_norm": 0.3702358603477478, "learning_rate": 1.4813353837160488e-05, "loss": 0.5776, "num_input_tokens_seen": 120504320, "step": 14710 }, { "epoch": 1.902054529008916, "grad_norm": 0.5429428219795227, "learning_rate": 1.4782470220664313e-05, "loss": 1.1882, "num_input_tokens_seen": 120586240, "step": 14720 }, { "epoch": 1.9033466856182968, "grad_norm": 0.8188362717628479, "learning_rate": 1.4751605312427786e-05, "loss": 0.9617, "num_input_tokens_seen": 120668160, "step": 14730 }, { "epoch": 1.904638842227678, "grad_norm": 0.8028152585029602, "learning_rate": 1.472075916896442e-05, "loss": 0.8622, "num_input_tokens_seen": 120750080, "step": 14740 }, { "epoch": 1.905930998837059, "grad_norm": 0.839156985282898, "learning_rate": 1.4689931846753402e-05, "loss": 0.9725, "num_input_tokens_seen": 120832000, "step": 14750 }, { "epoch": 1.90722315544644, "grad_norm": 0.7380186319351196, "learning_rate": 1.4659123402239454e-05, "loss": 1.2234, "num_input_tokens_seen": 120913920, "step": 14760 }, { "epoch": 1.9085153120558211, "grad_norm": 0.5705264806747437, "learning_rate": 1.4628333891832713e-05, "loss": 0.8926, "num_input_tokens_seen": 120995840, "step": 14770 }, { "epoch": 1.9098074686652022, "grad_norm": 0.7131834626197815, "learning_rate": 1.4597563371908663e-05, "loss": 1.0101, "num_input_tokens_seen": 121077760, "step": 14780 }, { "epoch": 1.9110996252745833, "grad_norm": 0.9028184413909912, "learning_rate": 1.4566811898808013e-05, "loss": 0.9358, "num_input_tokens_seen": 121159680, "step": 14790 }, { "epoch": 1.9123917818839642, "grad_norm": 0.5184687972068787, "learning_rate": 1.4536079528836605e-05, "loss": 0.7874, "num_input_tokens_seen": 121241600, "step": 14800 }, { "epoch": 1.9136839384933455, "grad_norm": 0.6445053815841675, "learning_rate": 1.4505366318265278e-05, "loss": 0.6709, "num_input_tokens_seen": 121323520, "step": 14810 }, { "epoch": 1.9149760951027264, "grad_norm": 0.3584889769554138, "learning_rate": 1.4474672323329819e-05, "loss": 0.6944, "num_input_tokens_seen": 121405440, "step": 14820 }, { "epoch": 1.9162682517121075, "grad_norm": 0.5627390742301941, "learning_rate": 1.4443997600230832e-05, "loss": 0.8012, "num_input_tokens_seen": 121487360, "step": 14830 }, { "epoch": 1.9175604083214886, "grad_norm": 0.6850423216819763, "learning_rate": 1.4413342205133604e-05, "loss": 0.9204, "num_input_tokens_seen": 121569280, "step": 14840 }, { "epoch": 1.9188525649308696, "grad_norm": 0.7621904611587524, "learning_rate": 1.4382706194168066e-05, "loss": 0.6648, "num_input_tokens_seen": 121651200, "step": 14850 }, { "epoch": 1.9201447215402507, "grad_norm": 0.636451244354248, "learning_rate": 1.4352089623428627e-05, "loss": 0.8368, "num_input_tokens_seen": 121733120, "step": 14860 }, { "epoch": 1.9214368781496316, "grad_norm": 0.5977798700332642, "learning_rate": 1.4321492548974137e-05, "loss": 1.248, "num_input_tokens_seen": 121815040, "step": 14870 }, { "epoch": 1.922729034759013, "grad_norm": 0.24377988278865814, "learning_rate": 1.42909150268277e-05, "loss": 1.0932, "num_input_tokens_seen": 121896960, "step": 14880 }, { "epoch": 1.9240211913683938, "grad_norm": 0.6114721894264221, "learning_rate": 1.4260357112976664e-05, "loss": 0.8562, "num_input_tokens_seen": 121978880, "step": 14890 }, { "epoch": 1.9253133479777749, "grad_norm": 0.5565524101257324, "learning_rate": 1.4229818863372463e-05, "loss": 0.9967, "num_input_tokens_seen": 122060800, "step": 14900 }, { "epoch": 1.926605504587156, "grad_norm": 0.9591652154922485, "learning_rate": 1.4199300333930515e-05, "loss": 1.2298, "num_input_tokens_seen": 122142720, "step": 14910 }, { "epoch": 1.927897661196537, "grad_norm": 0.7166655659675598, "learning_rate": 1.4168801580530119e-05, "loss": 0.6193, "num_input_tokens_seen": 122224640, "step": 14920 }, { "epoch": 1.9291898178059181, "grad_norm": 0.5449065566062927, "learning_rate": 1.4138322659014408e-05, "loss": 0.9493, "num_input_tokens_seen": 122306560, "step": 14930 }, { "epoch": 1.930481974415299, "grad_norm": 1.1408071517944336, "learning_rate": 1.4107863625190163e-05, "loss": 0.8788, "num_input_tokens_seen": 122388480, "step": 14940 }, { "epoch": 1.9317741310246803, "grad_norm": 0.3808801770210266, "learning_rate": 1.4077424534827752e-05, "loss": 0.7922, "num_input_tokens_seen": 122470400, "step": 14950 }, { "epoch": 1.9330662876340612, "grad_norm": 0.7727448344230652, "learning_rate": 1.4047005443661048e-05, "loss": 1.057, "num_input_tokens_seen": 122552320, "step": 14960 }, { "epoch": 1.9343584442434423, "grad_norm": 0.7547351121902466, "learning_rate": 1.4016606407387312e-05, "loss": 0.9366, "num_input_tokens_seen": 122634240, "step": 14970 }, { "epoch": 1.9356506008528234, "grad_norm": 0.9177370667457581, "learning_rate": 1.398622748166704e-05, "loss": 0.3856, "num_input_tokens_seen": 122716160, "step": 14980 }, { "epoch": 1.9369427574622045, "grad_norm": 0.519792914390564, "learning_rate": 1.3955868722123955e-05, "loss": 0.7705, "num_input_tokens_seen": 122798080, "step": 14990 }, { "epoch": 1.9382349140715855, "grad_norm": 0.8138200044631958, "learning_rate": 1.3925530184344818e-05, "loss": 0.7423, "num_input_tokens_seen": 122880000, "step": 15000 }, { "epoch": 1.9395270706809664, "grad_norm": 0.6865666508674622, "learning_rate": 1.3895211923879397e-05, "loss": 1.0697, "num_input_tokens_seen": 122961920, "step": 15010 }, { "epoch": 1.9408192272903477, "grad_norm": 0.6570258736610413, "learning_rate": 1.3864913996240304e-05, "loss": 0.967, "num_input_tokens_seen": 123043840, "step": 15020 }, { "epoch": 1.9421113838997286, "grad_norm": 1.129082441329956, "learning_rate": 1.3834636456902944e-05, "loss": 0.8996, "num_input_tokens_seen": 123125760, "step": 15030 }, { "epoch": 1.9434035405091097, "grad_norm": 0.1828085333108902, "learning_rate": 1.3804379361305363e-05, "loss": 0.5727, "num_input_tokens_seen": 123207680, "step": 15040 }, { "epoch": 1.9446956971184908, "grad_norm": 0.5514014363288879, "learning_rate": 1.3774142764848207e-05, "loss": 0.9822, "num_input_tokens_seen": 123289600, "step": 15050 }, { "epoch": 1.9459878537278719, "grad_norm": 0.51252681016922, "learning_rate": 1.3743926722894579e-05, "loss": 0.7735, "num_input_tokens_seen": 123371520, "step": 15060 }, { "epoch": 1.947280010337253, "grad_norm": 0.6373499631881714, "learning_rate": 1.3713731290769921e-05, "loss": 1.0148, "num_input_tokens_seen": 123453440, "step": 15070 }, { "epoch": 1.9485721669466338, "grad_norm": 0.7882586121559143, "learning_rate": 1.3683556523761981e-05, "loss": 0.8874, "num_input_tokens_seen": 123535360, "step": 15080 }, { "epoch": 1.9498643235560151, "grad_norm": 0.8441190719604492, "learning_rate": 1.365340247712064e-05, "loss": 0.9097, "num_input_tokens_seen": 123617280, "step": 15090 }, { "epoch": 1.951156480165396, "grad_norm": 0.6517019271850586, "learning_rate": 1.362326920605783e-05, "loss": 0.8473, "num_input_tokens_seen": 123699200, "step": 15100 }, { "epoch": 1.952448636774777, "grad_norm": 0.4873802363872528, "learning_rate": 1.3593156765747483e-05, "loss": 0.7964, "num_input_tokens_seen": 123781120, "step": 15110 }, { "epoch": 1.9537407933841582, "grad_norm": 0.6770155429840088, "learning_rate": 1.3563065211325349e-05, "loss": 0.795, "num_input_tokens_seen": 123863040, "step": 15120 }, { "epoch": 1.9550329499935393, "grad_norm": 0.4398241937160492, "learning_rate": 1.3532994597888971e-05, "loss": 0.6041, "num_input_tokens_seen": 123944960, "step": 15130 }, { "epoch": 1.9563251066029204, "grad_norm": 0.2949928939342499, "learning_rate": 1.3502944980497514e-05, "loss": 0.7019, "num_input_tokens_seen": 124026880, "step": 15140 }, { "epoch": 1.9576172632123012, "grad_norm": 0.7733688354492188, "learning_rate": 1.3472916414171738e-05, "loss": 1.2308, "num_input_tokens_seen": 124108800, "step": 15150 }, { "epoch": 1.9589094198216825, "grad_norm": 0.6464848518371582, "learning_rate": 1.3442908953893816e-05, "loss": 0.8531, "num_input_tokens_seen": 124190720, "step": 15160 }, { "epoch": 1.9602015764310634, "grad_norm": 0.7353049516677856, "learning_rate": 1.3412922654607318e-05, "loss": 0.8579, "num_input_tokens_seen": 124272640, "step": 15170 }, { "epoch": 1.9614937330404445, "grad_norm": 0.8654274344444275, "learning_rate": 1.338295757121703e-05, "loss": 0.7652, "num_input_tokens_seen": 124354560, "step": 15180 }, { "epoch": 1.9627858896498256, "grad_norm": 0.5775660872459412, "learning_rate": 1.3353013758588923e-05, "loss": 0.8972, "num_input_tokens_seen": 124436480, "step": 15190 }, { "epoch": 1.9640780462592065, "grad_norm": 0.5791903138160706, "learning_rate": 1.3323091271550011e-05, "loss": 0.8894, "num_input_tokens_seen": 124518400, "step": 15200 }, { "epoch": 1.9653702028685878, "grad_norm": 0.8218627572059631, "learning_rate": 1.3293190164888242e-05, "loss": 1.0825, "num_input_tokens_seen": 124600320, "step": 15210 }, { "epoch": 1.9666623594779686, "grad_norm": 0.3494950234889984, "learning_rate": 1.3263310493352454e-05, "loss": 0.8476, "num_input_tokens_seen": 124682240, "step": 15220 }, { "epoch": 1.96795451608735, "grad_norm": 0.5643335580825806, "learning_rate": 1.3233452311652197e-05, "loss": 0.6004, "num_input_tokens_seen": 124764160, "step": 15230 }, { "epoch": 1.9692466726967308, "grad_norm": 0.6971800923347473, "learning_rate": 1.3203615674457709e-05, "loss": 0.4468, "num_input_tokens_seen": 124846080, "step": 15240 }, { "epoch": 1.970538829306112, "grad_norm": 0.690518319606781, "learning_rate": 1.3173800636399744e-05, "loss": 0.704, "num_input_tokens_seen": 124928000, "step": 15250 }, { "epoch": 1.971830985915493, "grad_norm": 0.568469226360321, "learning_rate": 1.3144007252069552e-05, "loss": 0.5738, "num_input_tokens_seen": 125009920, "step": 15260 }, { "epoch": 1.9731231425248739, "grad_norm": 0.6457722783088684, "learning_rate": 1.3114235576018686e-05, "loss": 0.6915, "num_input_tokens_seen": 125091840, "step": 15270 }, { "epoch": 1.9744152991342552, "grad_norm": 0.6253653168678284, "learning_rate": 1.3084485662758994e-05, "loss": 0.6731, "num_input_tokens_seen": 125173760, "step": 15280 }, { "epoch": 1.975707455743636, "grad_norm": 0.6036949753761292, "learning_rate": 1.3054757566762454e-05, "loss": 0.8532, "num_input_tokens_seen": 125255680, "step": 15290 }, { "epoch": 1.9769996123530171, "grad_norm": 1.014550805091858, "learning_rate": 1.3025051342461087e-05, "loss": 1.3152, "num_input_tokens_seen": 125337600, "step": 15300 }, { "epoch": 1.9782917689623982, "grad_norm": 0.5557976961135864, "learning_rate": 1.2995367044246903e-05, "loss": 0.6177, "num_input_tokens_seen": 125419520, "step": 15310 }, { "epoch": 1.9795839255717793, "grad_norm": 1.1027765274047852, "learning_rate": 1.2965704726471729e-05, "loss": 0.7341, "num_input_tokens_seen": 125501440, "step": 15320 }, { "epoch": 1.9808760821811604, "grad_norm": 0.8762277364730835, "learning_rate": 1.2936064443447157e-05, "loss": 0.7912, "num_input_tokens_seen": 125583360, "step": 15330 }, { "epoch": 1.9821682387905413, "grad_norm": 0.5100016593933105, "learning_rate": 1.2906446249444457e-05, "loss": 1.0051, "num_input_tokens_seen": 125665280, "step": 15340 }, { "epoch": 1.9834603953999226, "grad_norm": 0.583299994468689, "learning_rate": 1.2876850198694409e-05, "loss": 0.4944, "num_input_tokens_seen": 125747200, "step": 15350 }, { "epoch": 1.9847525520093035, "grad_norm": 0.277055025100708, "learning_rate": 1.2847276345387299e-05, "loss": 0.7472, "num_input_tokens_seen": 125829120, "step": 15360 }, { "epoch": 1.9860447086186845, "grad_norm": 1.0343480110168457, "learning_rate": 1.2817724743672715e-05, "loss": 0.6023, "num_input_tokens_seen": 125911040, "step": 15370 }, { "epoch": 1.9873368652280656, "grad_norm": 0.6616178750991821, "learning_rate": 1.2788195447659562e-05, "loss": 1.1141, "num_input_tokens_seen": 125992960, "step": 15380 }, { "epoch": 1.9886290218374467, "grad_norm": 0.5394881367683411, "learning_rate": 1.2758688511415848e-05, "loss": 0.9213, "num_input_tokens_seen": 126074880, "step": 15390 }, { "epoch": 1.9899211784468278, "grad_norm": 0.7356629967689514, "learning_rate": 1.2729203988968674e-05, "loss": 0.7153, "num_input_tokens_seen": 126156800, "step": 15400 }, { "epoch": 1.9912133350562087, "grad_norm": 0.22516459226608276, "learning_rate": 1.2699741934304104e-05, "loss": 0.5926, "num_input_tokens_seen": 126238720, "step": 15410 }, { "epoch": 1.99250549166559, "grad_norm": 0.6779276728630066, "learning_rate": 1.2670302401367035e-05, "loss": 0.9406, "num_input_tokens_seen": 126320640, "step": 15420 }, { "epoch": 1.9937976482749709, "grad_norm": 0.5746923089027405, "learning_rate": 1.2640885444061163e-05, "loss": 0.9174, "num_input_tokens_seen": 126402560, "step": 15430 }, { "epoch": 1.995089804884352, "grad_norm": 0.6525429487228394, "learning_rate": 1.2611491116248802e-05, "loss": 1.254, "num_input_tokens_seen": 126484480, "step": 15440 }, { "epoch": 1.996381961493733, "grad_norm": 0.30466407537460327, "learning_rate": 1.2582119471750888e-05, "loss": 0.7516, "num_input_tokens_seen": 126566400, "step": 15450 }, { "epoch": 1.9976741181031141, "grad_norm": 0.6706373691558838, "learning_rate": 1.2552770564346781e-05, "loss": 0.9337, "num_input_tokens_seen": 126648320, "step": 15460 }, { "epoch": 1.9989662747124952, "grad_norm": 0.7642003297805786, "learning_rate": 1.2523444447774213e-05, "loss": 0.9429, "num_input_tokens_seen": 126730240, "step": 15470 }, { "epoch": 2.000258431321876, "grad_norm": 0.71830153465271, "learning_rate": 1.2494141175729216e-05, "loss": 0.9215, "num_input_tokens_seen": 126812160, "step": 15480 }, { "epoch": 2.0015505879312574, "grad_norm": 0.5191543698310852, "learning_rate": 1.2464860801865954e-05, "loss": 0.611, "num_input_tokens_seen": 126894080, "step": 15490 }, { "epoch": 2.0028427445406383, "grad_norm": 0.8038392066955566, "learning_rate": 1.2435603379796704e-05, "loss": 1.1059, "num_input_tokens_seen": 126976000, "step": 15500 }, { "epoch": 2.0041349011500196, "grad_norm": 0.5426739454269409, "learning_rate": 1.240636896309168e-05, "loss": 0.9048, "num_input_tokens_seen": 127057920, "step": 15510 }, { "epoch": 2.0054270577594004, "grad_norm": 0.6243565082550049, "learning_rate": 1.237715760527901e-05, "loss": 0.7904, "num_input_tokens_seen": 127139840, "step": 15520 }, { "epoch": 2.0067192143687813, "grad_norm": 0.35670045018196106, "learning_rate": 1.2347969359844566e-05, "loss": 0.7196, "num_input_tokens_seen": 127221760, "step": 15530 }, { "epoch": 2.0080113709781626, "grad_norm": 0.9198890924453735, "learning_rate": 1.2318804280231939e-05, "loss": 1.102, "num_input_tokens_seen": 127303680, "step": 15540 }, { "epoch": 2.0093035275875435, "grad_norm": 0.5498625636100769, "learning_rate": 1.2289662419842258e-05, "loss": 0.5505, "num_input_tokens_seen": 127385600, "step": 15550 }, { "epoch": 2.010595684196925, "grad_norm": 0.872312068939209, "learning_rate": 1.2260543832034177e-05, "loss": 0.6824, "num_input_tokens_seen": 127467520, "step": 15560 }, { "epoch": 2.0118878408063057, "grad_norm": 0.6140616536140442, "learning_rate": 1.2231448570123732e-05, "loss": 0.9454, "num_input_tokens_seen": 127549440, "step": 15570 }, { "epoch": 2.013179997415687, "grad_norm": 0.32139700651168823, "learning_rate": 1.2202376687384223e-05, "loss": 0.9467, "num_input_tokens_seen": 127631360, "step": 15580 }, { "epoch": 2.014472154025068, "grad_norm": 0.5901627540588379, "learning_rate": 1.2173328237046178e-05, "loss": 0.6482, "num_input_tokens_seen": 127713280, "step": 15590 }, { "epoch": 2.0157643106344487, "grad_norm": 0.9374563694000244, "learning_rate": 1.2144303272297186e-05, "loss": 0.547, "num_input_tokens_seen": 127795200, "step": 15600 }, { "epoch": 2.01705646724383, "grad_norm": 0.6503939032554626, "learning_rate": 1.2115301846281871e-05, "loss": 0.9459, "num_input_tokens_seen": 127877120, "step": 15610 }, { "epoch": 2.018348623853211, "grad_norm": 0.645089864730835, "learning_rate": 1.2086324012101716e-05, "loss": 0.8437, "num_input_tokens_seen": 127959040, "step": 15620 }, { "epoch": 2.019640780462592, "grad_norm": 0.6811534762382507, "learning_rate": 1.2057369822815051e-05, "loss": 0.9276, "num_input_tokens_seen": 128040960, "step": 15630 }, { "epoch": 2.020932937071973, "grad_norm": 0.5539160370826721, "learning_rate": 1.2028439331436869e-05, "loss": 0.8422, "num_input_tokens_seen": 128122880, "step": 15640 }, { "epoch": 2.0222250936813544, "grad_norm": 0.5917775630950928, "learning_rate": 1.1999532590938817e-05, "loss": 0.9474, "num_input_tokens_seen": 128204800, "step": 15650 }, { "epoch": 2.0235172502907353, "grad_norm": 0.35007065534591675, "learning_rate": 1.1970649654249017e-05, "loss": 0.5438, "num_input_tokens_seen": 128286720, "step": 15660 }, { "epoch": 2.024809406900116, "grad_norm": 0.8497019410133362, "learning_rate": 1.1941790574252013e-05, "loss": 0.5885, "num_input_tokens_seen": 128368640, "step": 15670 }, { "epoch": 2.0261015635094974, "grad_norm": 0.6658693552017212, "learning_rate": 1.1912955403788695e-05, "loss": 0.878, "num_input_tokens_seen": 128450560, "step": 15680 }, { "epoch": 2.0273937201188783, "grad_norm": 0.6294447779655457, "learning_rate": 1.1884144195656133e-05, "loss": 0.7301, "num_input_tokens_seen": 128532480, "step": 15690 }, { "epoch": 2.0286858767282596, "grad_norm": 1.484786868095398, "learning_rate": 1.1855357002607556e-05, "loss": 0.3667, "num_input_tokens_seen": 128614400, "step": 15700 }, { "epoch": 2.0299780333376405, "grad_norm": 1.5291662216186523, "learning_rate": 1.1826593877352216e-05, "loss": 0.6625, "num_input_tokens_seen": 128696320, "step": 15710 }, { "epoch": 2.031270189947022, "grad_norm": 0.41958874464035034, "learning_rate": 1.1797854872555272e-05, "loss": 0.8176, "num_input_tokens_seen": 128778240, "step": 15720 }, { "epoch": 2.0325623465564027, "grad_norm": 0.7266905307769775, "learning_rate": 1.1769140040837755e-05, "loss": 0.8307, "num_input_tokens_seen": 128860160, "step": 15730 }, { "epoch": 2.0338545031657835, "grad_norm": 0.606711208820343, "learning_rate": 1.1740449434776402e-05, "loss": 0.4084, "num_input_tokens_seen": 128942080, "step": 15740 }, { "epoch": 2.035146659775165, "grad_norm": 0.7246209979057312, "learning_rate": 1.171178310690362e-05, "loss": 0.664, "num_input_tokens_seen": 129024000, "step": 15750 }, { "epoch": 2.0364388163845457, "grad_norm": 0.588646650314331, "learning_rate": 1.1683141109707339e-05, "loss": 0.7079, "num_input_tokens_seen": 129105920, "step": 15760 }, { "epoch": 2.037730972993927, "grad_norm": 0.6580897569656372, "learning_rate": 1.165452349563095e-05, "loss": 0.9028, "num_input_tokens_seen": 129187840, "step": 15770 }, { "epoch": 2.039023129603308, "grad_norm": 0.7622004747390747, "learning_rate": 1.1625930317073221e-05, "loss": 0.6014, "num_input_tokens_seen": 129269760, "step": 15780 }, { "epoch": 2.0403152862126888, "grad_norm": 0.7260866761207581, "learning_rate": 1.159736162638813e-05, "loss": 0.7052, "num_input_tokens_seen": 129351680, "step": 15790 }, { "epoch": 2.04160744282207, "grad_norm": 0.6975520849227905, "learning_rate": 1.1568817475884868e-05, "loss": 0.5514, "num_input_tokens_seen": 129433600, "step": 15800 }, { "epoch": 2.042899599431451, "grad_norm": 1.0300558805465698, "learning_rate": 1.154029791782765e-05, "loss": 0.9024, "num_input_tokens_seen": 129515520, "step": 15810 }, { "epoch": 2.0441917560408323, "grad_norm": 0.5849384069442749, "learning_rate": 1.1511803004435704e-05, "loss": 0.8982, "num_input_tokens_seen": 129597440, "step": 15820 }, { "epoch": 2.045483912650213, "grad_norm": 0.6071699857711792, "learning_rate": 1.1483332787883096e-05, "loss": 0.8733, "num_input_tokens_seen": 129679360, "step": 15830 }, { "epoch": 2.0467760692595944, "grad_norm": 0.26067107915878296, "learning_rate": 1.1454887320298686e-05, "loss": 0.7505, "num_input_tokens_seen": 129761280, "step": 15840 }, { "epoch": 2.0480682258689753, "grad_norm": 0.8173131942749023, "learning_rate": 1.1426466653766036e-05, "loss": 0.6831, "num_input_tokens_seen": 129843200, "step": 15850 }, { "epoch": 2.049360382478356, "grad_norm": 0.7379295229911804, "learning_rate": 1.1398070840323264e-05, "loss": 0.8099, "num_input_tokens_seen": 129925120, "step": 15860 }, { "epoch": 2.0506525390877375, "grad_norm": 0.6924763917922974, "learning_rate": 1.1369699931963018e-05, "loss": 0.7674, "num_input_tokens_seen": 130007040, "step": 15870 }, { "epoch": 2.0519446956971183, "grad_norm": 0.8446022272109985, "learning_rate": 1.1341353980632313e-05, "loss": 0.9515, "num_input_tokens_seen": 130088960, "step": 15880 }, { "epoch": 2.0532368523064997, "grad_norm": 0.9424275755882263, "learning_rate": 1.1313033038232498e-05, "loss": 0.9029, "num_input_tokens_seen": 130170880, "step": 15890 }, { "epoch": 2.0545290089158805, "grad_norm": 0.9581369161605835, "learning_rate": 1.1284737156619096e-05, "loss": 0.7374, "num_input_tokens_seen": 130252800, "step": 15900 }, { "epoch": 2.055821165525262, "grad_norm": 0.6615799069404602, "learning_rate": 1.1256466387601782e-05, "loss": 0.7558, "num_input_tokens_seen": 130334720, "step": 15910 }, { "epoch": 2.0571133221346427, "grad_norm": 0.3666354715824127, "learning_rate": 1.1228220782944212e-05, "loss": 0.7905, "num_input_tokens_seen": 130416640, "step": 15920 }, { "epoch": 2.0584054787440236, "grad_norm": 0.6414427757263184, "learning_rate": 1.1200000394363996e-05, "loss": 0.9914, "num_input_tokens_seen": 130498560, "step": 15930 }, { "epoch": 2.059697635353405, "grad_norm": 0.23354114592075348, "learning_rate": 1.1171805273532567e-05, "loss": 0.7977, "num_input_tokens_seen": 130580480, "step": 15940 }, { "epoch": 2.0609897919627858, "grad_norm": 0.4102308750152588, "learning_rate": 1.1143635472075074e-05, "loss": 0.8725, "num_input_tokens_seen": 130662400, "step": 15950 }, { "epoch": 2.062281948572167, "grad_norm": 1.2794528007507324, "learning_rate": 1.1115491041570337e-05, "loss": 1.0455, "num_input_tokens_seen": 130744320, "step": 15960 }, { "epoch": 2.063574105181548, "grad_norm": 0.6293236613273621, "learning_rate": 1.1087372033550685e-05, "loss": 0.8646, "num_input_tokens_seen": 130826240, "step": 15970 }, { "epoch": 2.0648662617909292, "grad_norm": 0.7114168405532837, "learning_rate": 1.105927849950194e-05, "loss": 0.9453, "num_input_tokens_seen": 130908160, "step": 15980 }, { "epoch": 2.06615841840031, "grad_norm": 0.9560719728469849, "learning_rate": 1.103121049086324e-05, "loss": 0.822, "num_input_tokens_seen": 130990080, "step": 15990 }, { "epoch": 2.067450575009691, "grad_norm": 0.36301013827323914, "learning_rate": 1.1003168059027025e-05, "loss": 0.8453, "num_input_tokens_seen": 131072000, "step": 16000 }, { "epoch": 2.0687427316190723, "grad_norm": 0.5861186385154724, "learning_rate": 1.0975151255338867e-05, "loss": 0.9873, "num_input_tokens_seen": 131153920, "step": 16010 }, { "epoch": 2.070034888228453, "grad_norm": 0.6049230098724365, "learning_rate": 1.094716013109745e-05, "loss": 0.6615, "num_input_tokens_seen": 131235840, "step": 16020 }, { "epoch": 2.0713270448378345, "grad_norm": 0.3231169283390045, "learning_rate": 1.0919194737554409e-05, "loss": 0.5598, "num_input_tokens_seen": 131317760, "step": 16030 }, { "epoch": 2.0726192014472153, "grad_norm": 0.7107923030853271, "learning_rate": 1.0891255125914269e-05, "loss": 0.8139, "num_input_tokens_seen": 131399680, "step": 16040 }, { "epoch": 2.0739113580565967, "grad_norm": 0.196472629904747, "learning_rate": 1.0863341347334376e-05, "loss": 0.7574, "num_input_tokens_seen": 131481600, "step": 16050 }, { "epoch": 2.0752035146659775, "grad_norm": 0.9992514848709106, "learning_rate": 1.0835453452924737e-05, "loss": 0.578, "num_input_tokens_seen": 131563520, "step": 16060 }, { "epoch": 2.0764956712753584, "grad_norm": 0.8521533608436584, "learning_rate": 1.0807591493747992e-05, "loss": 0.6804, "num_input_tokens_seen": 131645440, "step": 16070 }, { "epoch": 2.0777878278847397, "grad_norm": 0.5789138078689575, "learning_rate": 1.0779755520819302e-05, "loss": 0.5871, "num_input_tokens_seen": 131727360, "step": 16080 }, { "epoch": 2.0790799844941206, "grad_norm": 0.27690398693084717, "learning_rate": 1.0751945585106205e-05, "loss": 0.8846, "num_input_tokens_seen": 131809280, "step": 16090 }, { "epoch": 2.080372141103502, "grad_norm": 0.5583693981170654, "learning_rate": 1.0724161737528616e-05, "loss": 0.9714, "num_input_tokens_seen": 131891200, "step": 16100 }, { "epoch": 2.0816642977128827, "grad_norm": 1.2210655212402344, "learning_rate": 1.0696404028958634e-05, "loss": 0.7469, "num_input_tokens_seen": 131973120, "step": 16110 }, { "epoch": 2.082956454322264, "grad_norm": 1.0592528581619263, "learning_rate": 1.0668672510220548e-05, "loss": 1.0468, "num_input_tokens_seen": 132055040, "step": 16120 }, { "epoch": 2.084248610931645, "grad_norm": 0.7899770140647888, "learning_rate": 1.0640967232090643e-05, "loss": 0.4886, "num_input_tokens_seen": 132136960, "step": 16130 }, { "epoch": 2.085540767541026, "grad_norm": 0.5646101832389832, "learning_rate": 1.0613288245297193e-05, "loss": 0.9022, "num_input_tokens_seen": 132218880, "step": 16140 }, { "epoch": 2.086832924150407, "grad_norm": 0.5926666855812073, "learning_rate": 1.0585635600520327e-05, "loss": 0.9351, "num_input_tokens_seen": 132300800, "step": 16150 }, { "epoch": 2.088125080759788, "grad_norm": 1.35215163230896, "learning_rate": 1.0558009348391926e-05, "loss": 0.7735, "num_input_tokens_seen": 132382720, "step": 16160 }, { "epoch": 2.0894172373691693, "grad_norm": 0.7683168649673462, "learning_rate": 1.053040953949557e-05, "loss": 0.6433, "num_input_tokens_seen": 132464640, "step": 16170 }, { "epoch": 2.09070939397855, "grad_norm": 0.49856263399124146, "learning_rate": 1.0502836224366389e-05, "loss": 0.9383, "num_input_tokens_seen": 132546560, "step": 16180 }, { "epoch": 2.0920015505879315, "grad_norm": 1.046005368232727, "learning_rate": 1.0475289453491038e-05, "loss": 0.8762, "num_input_tokens_seen": 132628480, "step": 16190 }, { "epoch": 2.0932937071973123, "grad_norm": 0.6279125213623047, "learning_rate": 1.0447769277307554e-05, "loss": 0.7354, "num_input_tokens_seen": 132710400, "step": 16200 }, { "epoch": 2.094585863806693, "grad_norm": 0.666612982749939, "learning_rate": 1.042027574620526e-05, "loss": 0.6087, "num_input_tokens_seen": 132792320, "step": 16210 }, { "epoch": 2.0958780204160745, "grad_norm": 0.6725567579269409, "learning_rate": 1.0392808910524735e-05, "loss": 0.8064, "num_input_tokens_seen": 132874240, "step": 16220 }, { "epoch": 2.0971701770254554, "grad_norm": 0.7392178773880005, "learning_rate": 1.0365368820557633e-05, "loss": 0.6069, "num_input_tokens_seen": 132956160, "step": 16230 }, { "epoch": 2.0984623336348367, "grad_norm": 0.7892594337463379, "learning_rate": 1.0337955526546678e-05, "loss": 0.6704, "num_input_tokens_seen": 133038080, "step": 16240 }, { "epoch": 2.0997544902442176, "grad_norm": 0.6627844572067261, "learning_rate": 1.0310569078685494e-05, "loss": 0.764, "num_input_tokens_seen": 133120000, "step": 16250 }, { "epoch": 2.1010466468535984, "grad_norm": 0.22952121496200562, "learning_rate": 1.0283209527118584e-05, "loss": 0.7597, "num_input_tokens_seen": 133201920, "step": 16260 }, { "epoch": 2.1023388034629797, "grad_norm": 1.4370572566986084, "learning_rate": 1.0255876921941165e-05, "loss": 0.8361, "num_input_tokens_seen": 133283840, "step": 16270 }, { "epoch": 2.1036309600723606, "grad_norm": 1.0919463634490967, "learning_rate": 1.0228571313199161e-05, "loss": 0.7833, "num_input_tokens_seen": 133365760, "step": 16280 }, { "epoch": 2.104923116681742, "grad_norm": 0.7926573753356934, "learning_rate": 1.0201292750889022e-05, "loss": 0.6994, "num_input_tokens_seen": 133447680, "step": 16290 }, { "epoch": 2.106215273291123, "grad_norm": 0.636835515499115, "learning_rate": 1.0174041284957703e-05, "loss": 0.549, "num_input_tokens_seen": 133529600, "step": 16300 }, { "epoch": 2.107507429900504, "grad_norm": 0.7572881579399109, "learning_rate": 1.0146816965302546e-05, "loss": 0.966, "num_input_tokens_seen": 133611520, "step": 16310 }, { "epoch": 2.108799586509885, "grad_norm": 0.6216884255409241, "learning_rate": 1.011961984177117e-05, "loss": 0.8817, "num_input_tokens_seen": 133693440, "step": 16320 }, { "epoch": 2.1100917431192663, "grad_norm": 1.0052217245101929, "learning_rate": 1.0092449964161416e-05, "loss": 0.7589, "num_input_tokens_seen": 133775360, "step": 16330 }, { "epoch": 2.111383899728647, "grad_norm": 0.8983517289161682, "learning_rate": 1.006530738222122e-05, "loss": 0.7143, "num_input_tokens_seen": 133857280, "step": 16340 }, { "epoch": 2.112676056338028, "grad_norm": 0.5024335384368896, "learning_rate": 1.0038192145648567e-05, "loss": 0.9325, "num_input_tokens_seen": 133939200, "step": 16350 }, { "epoch": 2.1139682129474093, "grad_norm": 1.1501423120498657, "learning_rate": 1.001110430409134e-05, "loss": 0.8195, "num_input_tokens_seen": 134021120, "step": 16360 }, { "epoch": 2.11526036955679, "grad_norm": 0.6280581951141357, "learning_rate": 9.98404390714729e-06, "loss": 0.6386, "num_input_tokens_seen": 134103040, "step": 16370 }, { "epoch": 2.1165525261661715, "grad_norm": 0.6069371104240417, "learning_rate": 9.95701100436389e-06, "loss": 0.8842, "num_input_tokens_seen": 134184960, "step": 16380 }, { "epoch": 2.1178446827755524, "grad_norm": 0.8397451639175415, "learning_rate": 9.930005645238302e-06, "loss": 1.0033, "num_input_tokens_seen": 134266880, "step": 16390 }, { "epoch": 2.1191368393849332, "grad_norm": 0.9031828045845032, "learning_rate": 9.903027879217237e-06, "loss": 0.7303, "num_input_tokens_seen": 134348800, "step": 16400 }, { "epoch": 2.1204289959943146, "grad_norm": 0.8852018117904663, "learning_rate": 9.876077755696868e-06, "loss": 0.4911, "num_input_tokens_seen": 134430720, "step": 16410 }, { "epoch": 2.1217211526036954, "grad_norm": 0.6135576963424683, "learning_rate": 9.849155324022799e-06, "loss": 0.8788, "num_input_tokens_seen": 134512640, "step": 16420 }, { "epoch": 2.1230133092130767, "grad_norm": 0.9423490762710571, "learning_rate": 9.82226063348988e-06, "loss": 0.7269, "num_input_tokens_seen": 134594560, "step": 16430 }, { "epoch": 2.1243054658224576, "grad_norm": 0.6482061147689819, "learning_rate": 9.795393733342203e-06, "loss": 0.8314, "num_input_tokens_seen": 134676480, "step": 16440 }, { "epoch": 2.125597622431839, "grad_norm": 1.0169423818588257, "learning_rate": 9.76855467277297e-06, "loss": 0.7974, "num_input_tokens_seen": 134758400, "step": 16450 }, { "epoch": 2.12688977904122, "grad_norm": 0.5455058813095093, "learning_rate": 9.741743500924388e-06, "loss": 0.8748, "num_input_tokens_seen": 134840320, "step": 16460 }, { "epoch": 2.1281819356506007, "grad_norm": 0.3267827033996582, "learning_rate": 9.71496026688763e-06, "loss": 0.6756, "num_input_tokens_seen": 134922240, "step": 16470 }, { "epoch": 2.129474092259982, "grad_norm": 0.6928475499153137, "learning_rate": 9.688205019702684e-06, "loss": 1.01, "num_input_tokens_seen": 135004160, "step": 16480 }, { "epoch": 2.130766248869363, "grad_norm": 0.7541202306747437, "learning_rate": 9.661477808358323e-06, "loss": 0.8252, "num_input_tokens_seen": 135086080, "step": 16490 }, { "epoch": 2.132058405478744, "grad_norm": 1.1462032794952393, "learning_rate": 9.634778681791962e-06, "loss": 0.8617, "num_input_tokens_seen": 135168000, "step": 16500 }, { "epoch": 2.133350562088125, "grad_norm": 0.8049100041389465, "learning_rate": 9.608107688889609e-06, "loss": 1.2436, "num_input_tokens_seen": 135249920, "step": 16510 }, { "epoch": 2.1346427186975063, "grad_norm": 0.6773203015327454, "learning_rate": 9.581464878485764e-06, "loss": 0.7527, "num_input_tokens_seen": 135331840, "step": 16520 }, { "epoch": 2.135934875306887, "grad_norm": 0.743137776851654, "learning_rate": 9.554850299363294e-06, "loss": 0.7333, "num_input_tokens_seen": 135413760, "step": 16530 }, { "epoch": 2.137227031916268, "grad_norm": 0.5760340094566345, "learning_rate": 9.52826400025342e-06, "loss": 1.0707, "num_input_tokens_seen": 135495680, "step": 16540 }, { "epoch": 2.1385191885256494, "grad_norm": 0.5763425230979919, "learning_rate": 9.501706029835544e-06, "loss": 0.7467, "num_input_tokens_seen": 135577600, "step": 16550 }, { "epoch": 2.1398113451350302, "grad_norm": 1.1024504899978638, "learning_rate": 9.47517643673721e-06, "loss": 0.6344, "num_input_tokens_seen": 135659520, "step": 16560 }, { "epoch": 2.1411035017444116, "grad_norm": 0.5545995235443115, "learning_rate": 9.448675269534015e-06, "loss": 0.844, "num_input_tokens_seen": 135741440, "step": 16570 }, { "epoch": 2.1423956583537924, "grad_norm": 0.710102915763855, "learning_rate": 9.422202576749492e-06, "loss": 0.9002, "num_input_tokens_seen": 135823360, "step": 16580 }, { "epoch": 2.1436878149631737, "grad_norm": 0.7064651846885681, "learning_rate": 9.395758406855053e-06, "loss": 0.9066, "num_input_tokens_seen": 135905280, "step": 16590 }, { "epoch": 2.1449799715725546, "grad_norm": 0.34538978338241577, "learning_rate": 9.369342808269862e-06, "loss": 0.8606, "num_input_tokens_seen": 135987200, "step": 16600 }, { "epoch": 2.1462721281819355, "grad_norm": 0.7004348635673523, "learning_rate": 9.342955829360806e-06, "loss": 0.8238, "num_input_tokens_seen": 136069120, "step": 16610 }, { "epoch": 2.147564284791317, "grad_norm": 0.664341390132904, "learning_rate": 9.31659751844232e-06, "loss": 0.571, "num_input_tokens_seen": 136151040, "step": 16620 }, { "epoch": 2.1488564414006976, "grad_norm": 0.9971612691879272, "learning_rate": 9.290267923776397e-06, "loss": 0.7521, "num_input_tokens_seen": 136232960, "step": 16630 }, { "epoch": 2.150148598010079, "grad_norm": 0.39424338936805725, "learning_rate": 9.263967093572412e-06, "loss": 0.3768, "num_input_tokens_seen": 136314880, "step": 16640 }, { "epoch": 2.15144075461946, "grad_norm": 0.7910518646240234, "learning_rate": 9.237695075987106e-06, "loss": 0.727, "num_input_tokens_seen": 136396800, "step": 16650 }, { "epoch": 2.152732911228841, "grad_norm": 0.6754958629608154, "learning_rate": 9.211451919124429e-06, "loss": 0.8798, "num_input_tokens_seen": 136478720, "step": 16660 }, { "epoch": 2.154025067838222, "grad_norm": 0.6639232039451599, "learning_rate": 9.185237671035512e-06, "loss": 1.1385, "num_input_tokens_seen": 136560640, "step": 16670 }, { "epoch": 2.155317224447603, "grad_norm": 0.8237486481666565, "learning_rate": 9.15905237971856e-06, "loss": 0.8854, "num_input_tokens_seen": 136642560, "step": 16680 }, { "epoch": 2.156609381056984, "grad_norm": 0.863280713558197, "learning_rate": 9.132896093118726e-06, "loss": 0.6467, "num_input_tokens_seen": 136724480, "step": 16690 }, { "epoch": 2.157901537666365, "grad_norm": 0.24760442972183228, "learning_rate": 9.10676885912809e-06, "loss": 0.7871, "num_input_tokens_seen": 136806400, "step": 16700 }, { "epoch": 2.1591936942757464, "grad_norm": 0.5246381759643555, "learning_rate": 9.080670725585511e-06, "loss": 0.3123, "num_input_tokens_seen": 136888320, "step": 16710 }, { "epoch": 2.1604858508851272, "grad_norm": 0.6992424726486206, "learning_rate": 9.054601740276586e-06, "loss": 0.7407, "num_input_tokens_seen": 136970240, "step": 16720 }, { "epoch": 2.161778007494508, "grad_norm": 0.7333977222442627, "learning_rate": 9.028561950933517e-06, "loss": 0.6363, "num_input_tokens_seen": 137052160, "step": 16730 }, { "epoch": 2.1630701641038894, "grad_norm": 0.5862302184104919, "learning_rate": 9.002551405235082e-06, "loss": 0.7121, "num_input_tokens_seen": 137134080, "step": 16740 }, { "epoch": 2.1643623207132703, "grad_norm": 0.36307257413864136, "learning_rate": 8.976570150806486e-06, "loss": 0.9597, "num_input_tokens_seen": 137216000, "step": 16750 }, { "epoch": 2.1656544773226516, "grad_norm": 0.8547699451446533, "learning_rate": 8.950618235219302e-06, "loss": 0.7746, "num_input_tokens_seen": 137297920, "step": 16760 }, { "epoch": 2.1669466339320325, "grad_norm": 0.5659860372543335, "learning_rate": 8.924695705991407e-06, "loss": 0.4788, "num_input_tokens_seen": 137379840, "step": 16770 }, { "epoch": 2.1682387905414138, "grad_norm": 0.7544788122177124, "learning_rate": 8.898802610586843e-06, "loss": 0.6243, "num_input_tokens_seen": 137461760, "step": 16780 }, { "epoch": 2.1695309471507946, "grad_norm": 0.728805661201477, "learning_rate": 8.872938996415791e-06, "loss": 0.9596, "num_input_tokens_seen": 137543680, "step": 16790 }, { "epoch": 2.170823103760176, "grad_norm": 0.7234178185462952, "learning_rate": 8.847104910834414e-06, "loss": 0.83, "num_input_tokens_seen": 137625600, "step": 16800 }, { "epoch": 2.172115260369557, "grad_norm": 0.29097720980644226, "learning_rate": 8.821300401144836e-06, "loss": 0.8546, "num_input_tokens_seen": 137707520, "step": 16810 }, { "epoch": 2.1734074169789377, "grad_norm": 0.6291516423225403, "learning_rate": 8.795525514595032e-06, "loss": 0.7418, "num_input_tokens_seen": 137789440, "step": 16820 }, { "epoch": 2.174699573588319, "grad_norm": 0.7936385273933411, "learning_rate": 8.769780298378705e-06, "loss": 0.9961, "num_input_tokens_seen": 137871360, "step": 16830 }, { "epoch": 2.1759917301977, "grad_norm": 0.8866328001022339, "learning_rate": 8.74406479963527e-06, "loss": 0.5871, "num_input_tokens_seen": 137953280, "step": 16840 }, { "epoch": 2.177283886807081, "grad_norm": 1.1491591930389404, "learning_rate": 8.718379065449694e-06, "loss": 1.1771, "num_input_tokens_seen": 138035200, "step": 16850 }, { "epoch": 2.178576043416462, "grad_norm": 0.9200372099876404, "learning_rate": 8.69272314285248e-06, "loss": 0.5562, "num_input_tokens_seen": 138117120, "step": 16860 }, { "epoch": 2.179868200025843, "grad_norm": 0.7428059577941895, "learning_rate": 8.667097078819511e-06, "loss": 1.0242, "num_input_tokens_seen": 138199040, "step": 16870 }, { "epoch": 2.1811603566352242, "grad_norm": 0.7725750803947449, "learning_rate": 8.641500920272022e-06, "loss": 0.8353, "num_input_tokens_seen": 138280960, "step": 16880 }, { "epoch": 2.182452513244605, "grad_norm": 0.8504687547683716, "learning_rate": 8.6159347140765e-06, "loss": 0.7933, "num_input_tokens_seen": 138362880, "step": 16890 }, { "epoch": 2.1837446698539864, "grad_norm": 0.8972610831260681, "learning_rate": 8.59039850704455e-06, "loss": 0.9106, "num_input_tokens_seen": 138444800, "step": 16900 }, { "epoch": 2.1850368264633673, "grad_norm": 0.9138893485069275, "learning_rate": 8.564892345932899e-06, "loss": 0.9258, "num_input_tokens_seen": 138526720, "step": 16910 }, { "epoch": 2.1863289830727486, "grad_norm": 0.8643404841423035, "learning_rate": 8.539416277443218e-06, "loss": 0.6456, "num_input_tokens_seen": 138608640, "step": 16920 }, { "epoch": 2.1876211396821295, "grad_norm": 0.5902085304260254, "learning_rate": 8.513970348222095e-06, "loss": 0.7899, "num_input_tokens_seen": 138690560, "step": 16930 }, { "epoch": 2.1889132962915103, "grad_norm": 0.7431287169456482, "learning_rate": 8.488554604860947e-06, "loss": 1.0066, "num_input_tokens_seen": 138772480, "step": 16940 }, { "epoch": 2.1902054529008916, "grad_norm": 0.330143541097641, "learning_rate": 8.463169093895887e-06, "loss": 0.5788, "num_input_tokens_seen": 138854400, "step": 16950 }, { "epoch": 2.1914976095102725, "grad_norm": 0.7542316913604736, "learning_rate": 8.437813861807712e-06, "loss": 0.6761, "num_input_tokens_seen": 138936320, "step": 16960 }, { "epoch": 2.192789766119654, "grad_norm": 0.6555983424186707, "learning_rate": 8.412488955021744e-06, "loss": 1.3443, "num_input_tokens_seen": 139018240, "step": 16970 }, { "epoch": 2.1940819227290347, "grad_norm": 0.9010647535324097, "learning_rate": 8.38719441990781e-06, "loss": 0.8272, "num_input_tokens_seen": 139100160, "step": 16980 }, { "epoch": 2.195374079338416, "grad_norm": 0.4530339539051056, "learning_rate": 8.361930302780091e-06, "loss": 0.6814, "num_input_tokens_seen": 139182080, "step": 16990 }, { "epoch": 2.196666235947797, "grad_norm": 0.5172061324119568, "learning_rate": 8.336696649897116e-06, "loss": 0.5844, "num_input_tokens_seen": 139264000, "step": 17000 }, { "epoch": 2.1979583925571777, "grad_norm": 1.0065109729766846, "learning_rate": 8.311493507461593e-06, "loss": 0.6501, "num_input_tokens_seen": 139345920, "step": 17010 }, { "epoch": 2.199250549166559, "grad_norm": 0.3245972990989685, "learning_rate": 8.286320921620394e-06, "loss": 0.6703, "num_input_tokens_seen": 139427840, "step": 17020 }, { "epoch": 2.20054270577594, "grad_norm": 0.8792958855628967, "learning_rate": 8.261178938464422e-06, "loss": 0.83, "num_input_tokens_seen": 139509760, "step": 17030 }, { "epoch": 2.2018348623853212, "grad_norm": 0.7680603265762329, "learning_rate": 8.236067604028563e-06, "loss": 0.6316, "num_input_tokens_seen": 139591680, "step": 17040 }, { "epoch": 2.203127018994702, "grad_norm": 0.8535128831863403, "learning_rate": 8.210986964291587e-06, "loss": 0.7002, "num_input_tokens_seen": 139673600, "step": 17050 }, { "epoch": 2.2044191756040834, "grad_norm": 0.7545835375785828, "learning_rate": 8.185937065176033e-06, "loss": 0.9392, "num_input_tokens_seen": 139755520, "step": 17060 }, { "epoch": 2.2057113322134643, "grad_norm": 0.5624322891235352, "learning_rate": 8.160917952548197e-06, "loss": 0.828, "num_input_tokens_seen": 139837440, "step": 17070 }, { "epoch": 2.207003488822845, "grad_norm": 1.1098763942718506, "learning_rate": 8.13592967221796e-06, "loss": 0.9871, "num_input_tokens_seen": 139919360, "step": 17080 }, { "epoch": 2.2082956454322265, "grad_norm": 0.5870392322540283, "learning_rate": 8.110972269938793e-06, "loss": 1.0047, "num_input_tokens_seen": 140001280, "step": 17090 }, { "epoch": 2.2095878020416073, "grad_norm": 0.8012586236000061, "learning_rate": 8.08604579140759e-06, "loss": 1.1041, "num_input_tokens_seen": 140083200, "step": 17100 }, { "epoch": 2.2108799586509886, "grad_norm": 0.7019762992858887, "learning_rate": 8.06115028226466e-06, "loss": 1.116, "num_input_tokens_seen": 140165120, "step": 17110 }, { "epoch": 2.2121721152603695, "grad_norm": 0.646838366985321, "learning_rate": 8.036285788093578e-06, "loss": 0.82, "num_input_tokens_seen": 140247040, "step": 17120 }, { "epoch": 2.213464271869751, "grad_norm": 0.8307018876075745, "learning_rate": 8.011452354421136e-06, "loss": 0.6641, "num_input_tokens_seen": 140328960, "step": 17130 }, { "epoch": 2.2147564284791317, "grad_norm": 0.39130905270576477, "learning_rate": 7.986650026717277e-06, "loss": 0.5812, "num_input_tokens_seen": 140410880, "step": 17140 }, { "epoch": 2.2160485850885125, "grad_norm": 0.7408241033554077, "learning_rate": 7.961878850394952e-06, "loss": 0.8512, "num_input_tokens_seen": 140492800, "step": 17150 }, { "epoch": 2.217340741697894, "grad_norm": 0.6536303758621216, "learning_rate": 7.937138870810115e-06, "loss": 0.7622, "num_input_tokens_seen": 140574720, "step": 17160 }, { "epoch": 2.2186328983072747, "grad_norm": 0.9935400485992432, "learning_rate": 7.912430133261562e-06, "loss": 0.7604, "num_input_tokens_seen": 140656640, "step": 17170 }, { "epoch": 2.219925054916656, "grad_norm": 0.6916456818580627, "learning_rate": 7.887752682990903e-06, "loss": 0.7567, "num_input_tokens_seen": 140738560, "step": 17180 }, { "epoch": 2.221217211526037, "grad_norm": 0.7032532095909119, "learning_rate": 7.863106565182474e-06, "loss": 1.0241, "num_input_tokens_seen": 140820480, "step": 17190 }, { "epoch": 2.222509368135418, "grad_norm": 0.6725844740867615, "learning_rate": 7.838491824963207e-06, "loss": 0.8592, "num_input_tokens_seen": 140902400, "step": 17200 }, { "epoch": 2.223801524744799, "grad_norm": 0.851714551448822, "learning_rate": 7.81390850740262e-06, "loss": 0.9155, "num_input_tokens_seen": 140984320, "step": 17210 }, { "epoch": 2.22509368135418, "grad_norm": 0.7544275522232056, "learning_rate": 7.78935665751266e-06, "loss": 1.1792, "num_input_tokens_seen": 141066240, "step": 17220 }, { "epoch": 2.2263858379635613, "grad_norm": 1.2108078002929688, "learning_rate": 7.764836320247686e-06, "loss": 0.7382, "num_input_tokens_seen": 141148160, "step": 17230 }, { "epoch": 2.227677994572942, "grad_norm": 0.5292445421218872, "learning_rate": 7.740347540504336e-06, "loss": 0.7866, "num_input_tokens_seen": 141230080, "step": 17240 }, { "epoch": 2.2289701511823234, "grad_norm": 0.5671383738517761, "learning_rate": 7.715890363121484e-06, "loss": 0.5538, "num_input_tokens_seen": 141312000, "step": 17250 }, { "epoch": 2.2302623077917043, "grad_norm": 1.1066720485687256, "learning_rate": 7.691464832880135e-06, "loss": 0.7333, "num_input_tokens_seen": 141393920, "step": 17260 }, { "epoch": 2.2315544644010856, "grad_norm": 0.3376465439796448, "learning_rate": 7.667070994503334e-06, "loss": 0.7558, "num_input_tokens_seen": 141475840, "step": 17270 }, { "epoch": 2.2328466210104665, "grad_norm": 0.6462758779525757, "learning_rate": 7.642708892656125e-06, "loss": 0.4585, "num_input_tokens_seen": 141557760, "step": 17280 }, { "epoch": 2.2341387776198474, "grad_norm": 0.7883977890014648, "learning_rate": 7.618378571945417e-06, "loss": 0.602, "num_input_tokens_seen": 141639680, "step": 17290 }, { "epoch": 2.2354309342292287, "grad_norm": 0.7147501707077026, "learning_rate": 7.5940800769199345e-06, "loss": 0.7789, "num_input_tokens_seen": 141721600, "step": 17300 }, { "epoch": 2.2367230908386095, "grad_norm": 0.7497975826263428, "learning_rate": 7.569813452070146e-06, "loss": 0.5047, "num_input_tokens_seen": 141803520, "step": 17310 }, { "epoch": 2.238015247447991, "grad_norm": 0.6466274261474609, "learning_rate": 7.545578741828136e-06, "loss": 0.6986, "num_input_tokens_seen": 141885440, "step": 17320 }, { "epoch": 2.2393074040573717, "grad_norm": 0.7558254599571228, "learning_rate": 7.521375990567589e-06, "loss": 0.8236, "num_input_tokens_seen": 141967360, "step": 17330 }, { "epoch": 2.2405995606667526, "grad_norm": 0.7305789589881897, "learning_rate": 7.497205242603636e-06, "loss": 0.469, "num_input_tokens_seen": 142049280, "step": 17340 }, { "epoch": 2.241891717276134, "grad_norm": 0.29660215973854065, "learning_rate": 7.4730665421928445e-06, "loss": 0.7266, "num_input_tokens_seen": 142131200, "step": 17350 }, { "epoch": 2.2431838738855148, "grad_norm": 0.26684778928756714, "learning_rate": 7.4489599335330704e-06, "loss": 1.0225, "num_input_tokens_seen": 142213120, "step": 17360 }, { "epoch": 2.244476030494896, "grad_norm": 0.6525532007217407, "learning_rate": 7.424885460763442e-06, "loss": 0.5151, "num_input_tokens_seen": 142295040, "step": 17370 }, { "epoch": 2.245768187104277, "grad_norm": 0.7799077033996582, "learning_rate": 7.4008431679642165e-06, "loss": 0.5883, "num_input_tokens_seen": 142376960, "step": 17380 }, { "epoch": 2.2470603437136583, "grad_norm": 0.9852105975151062, "learning_rate": 7.3768330991567495e-06, "loss": 0.6615, "num_input_tokens_seen": 142458880, "step": 17390 }, { "epoch": 2.248352500323039, "grad_norm": 0.7310599684715271, "learning_rate": 7.3528552983033985e-06, "loss": 0.7732, "num_input_tokens_seen": 142540800, "step": 17400 }, { "epoch": 2.2496446569324204, "grad_norm": 0.7955625653266907, "learning_rate": 7.328909809307413e-06, "loss": 0.954, "num_input_tokens_seen": 142622720, "step": 17410 }, { "epoch": 2.2509368135418013, "grad_norm": 0.8421788215637207, "learning_rate": 7.304996676012913e-06, "loss": 0.5088, "num_input_tokens_seen": 142704640, "step": 17420 }, { "epoch": 2.252228970151182, "grad_norm": 0.772693932056427, "learning_rate": 7.281115942204739e-06, "loss": 0.812, "num_input_tokens_seen": 142786560, "step": 17430 }, { "epoch": 2.2535211267605635, "grad_norm": 0.9953277707099915, "learning_rate": 7.257267651608446e-06, "loss": 0.9194, "num_input_tokens_seen": 142868480, "step": 17440 }, { "epoch": 2.2548132833699444, "grad_norm": 0.6572209000587463, "learning_rate": 7.233451847890149e-06, "loss": 0.6278, "num_input_tokens_seen": 142950400, "step": 17450 }, { "epoch": 2.2561054399793257, "grad_norm": 0.3987307846546173, "learning_rate": 7.209668574656514e-06, "loss": 0.7189, "num_input_tokens_seen": 143032320, "step": 17460 }, { "epoch": 2.2573975965887065, "grad_norm": 0.6912767291069031, "learning_rate": 7.185917875454615e-06, "loss": 0.8608, "num_input_tokens_seen": 143114240, "step": 17470 }, { "epoch": 2.2586897531980874, "grad_norm": 0.6624774932861328, "learning_rate": 7.162199793771904e-06, "loss": 0.8747, "num_input_tokens_seen": 143196160, "step": 17480 }, { "epoch": 2.2599819098074687, "grad_norm": 0.3295046091079712, "learning_rate": 7.138514373036098e-06, "loss": 0.8996, "num_input_tokens_seen": 143278080, "step": 17490 }, { "epoch": 2.2612740664168496, "grad_norm": 0.7381219267845154, "learning_rate": 7.11486165661511e-06, "loss": 0.6536, "num_input_tokens_seen": 143360000, "step": 17500 }, { "epoch": 2.262566223026231, "grad_norm": 0.9237964749336243, "learning_rate": 7.091241687816988e-06, "loss": 0.9105, "num_input_tokens_seen": 143441920, "step": 17510 }, { "epoch": 2.2638583796356118, "grad_norm": 0.6693175435066223, "learning_rate": 7.0676545098897956e-06, "loss": 0.7303, "num_input_tokens_seen": 143523840, "step": 17520 }, { "epoch": 2.265150536244993, "grad_norm": 0.803263247013092, "learning_rate": 7.044100166021583e-06, "loss": 0.6937, "num_input_tokens_seen": 143605760, "step": 17530 }, { "epoch": 2.266442692854374, "grad_norm": 1.2088638544082642, "learning_rate": 7.020578699340255e-06, "loss": 0.7094, "num_input_tokens_seen": 143687680, "step": 17540 }, { "epoch": 2.2677348494637553, "grad_norm": 0.6141706109046936, "learning_rate": 6.997090152913535e-06, "loss": 1.0294, "num_input_tokens_seen": 143769600, "step": 17550 }, { "epoch": 2.269027006073136, "grad_norm": 0.8863146305084229, "learning_rate": 6.97363456974888e-06, "loss": 0.5169, "num_input_tokens_seen": 143851520, "step": 17560 }, { "epoch": 2.270319162682517, "grad_norm": 0.5809889435768127, "learning_rate": 6.950211992793354e-06, "loss": 0.8054, "num_input_tokens_seen": 143933440, "step": 17570 }, { "epoch": 2.2716113192918983, "grad_norm": 0.6235867142677307, "learning_rate": 6.92682246493363e-06, "loss": 1.089, "num_input_tokens_seen": 144015360, "step": 17580 }, { "epoch": 2.272903475901279, "grad_norm": 0.7494005560874939, "learning_rate": 6.903466028995828e-06, "loss": 0.5911, "num_input_tokens_seen": 144097280, "step": 17590 }, { "epoch": 2.2741956325106605, "grad_norm": 0.3730751574039459, "learning_rate": 6.880142727745517e-06, "loss": 0.4274, "num_input_tokens_seen": 144179200, "step": 17600 }, { "epoch": 2.2754877891200413, "grad_norm": 0.6000313758850098, "learning_rate": 6.856852603887556e-06, "loss": 0.5875, "num_input_tokens_seen": 144261120, "step": 17610 }, { "epoch": 2.276779945729422, "grad_norm": 0.8631287217140198, "learning_rate": 6.8335957000660925e-06, "loss": 1.0119, "num_input_tokens_seen": 144343040, "step": 17620 }, { "epoch": 2.2780721023388035, "grad_norm": 0.606353223323822, "learning_rate": 6.810372058864429e-06, "loss": 0.8559, "num_input_tokens_seen": 144424960, "step": 17630 }, { "epoch": 2.2793642589481844, "grad_norm": 0.7653380036354065, "learning_rate": 6.787181722804959e-06, "loss": 0.9657, "num_input_tokens_seen": 144506880, "step": 17640 }, { "epoch": 2.2806564155575657, "grad_norm": 0.706365168094635, "learning_rate": 6.764024734349117e-06, "loss": 0.9287, "num_input_tokens_seen": 144588800, "step": 17650 }, { "epoch": 2.2819485721669466, "grad_norm": 1.115876317024231, "learning_rate": 6.740901135897257e-06, "loss": 0.8438, "num_input_tokens_seen": 144670720, "step": 17660 }, { "epoch": 2.2832407287763274, "grad_norm": 0.9500836133956909, "learning_rate": 6.717810969788596e-06, "loss": 0.8497, "num_input_tokens_seen": 144752640, "step": 17670 }, { "epoch": 2.2845328853857088, "grad_norm": 0.8140472769737244, "learning_rate": 6.694754278301154e-06, "loss": 0.8295, "num_input_tokens_seen": 144834560, "step": 17680 }, { "epoch": 2.2858250419950896, "grad_norm": 0.36906084418296814, "learning_rate": 6.671731103651641e-06, "loss": 0.6225, "num_input_tokens_seen": 144916480, "step": 17690 }, { "epoch": 2.287117198604471, "grad_norm": 0.9518002271652222, "learning_rate": 6.648741487995416e-06, "loss": 0.7609, "num_input_tokens_seen": 144998400, "step": 17700 }, { "epoch": 2.288409355213852, "grad_norm": 0.6692580580711365, "learning_rate": 6.625785473426369e-06, "loss": 0.9989, "num_input_tokens_seen": 145080320, "step": 17710 }, { "epoch": 2.289701511823233, "grad_norm": 1.0394073724746704, "learning_rate": 6.602863101976886e-06, "loss": 0.9415, "num_input_tokens_seen": 145162240, "step": 17720 }, { "epoch": 2.290993668432614, "grad_norm": 0.7470195889472961, "learning_rate": 6.57997441561774e-06, "loss": 0.8093, "num_input_tokens_seen": 145244160, "step": 17730 }, { "epoch": 2.2922858250419953, "grad_norm": 0.5699132084846497, "learning_rate": 6.557119456258043e-06, "loss": 0.6653, "num_input_tokens_seen": 145326080, "step": 17740 }, { "epoch": 2.293577981651376, "grad_norm": 0.5270111560821533, "learning_rate": 6.534298265745128e-06, "loss": 0.4557, "num_input_tokens_seen": 145408000, "step": 17750 }, { "epoch": 2.294870138260757, "grad_norm": 1.350411057472229, "learning_rate": 6.511510885864516e-06, "loss": 0.4692, "num_input_tokens_seen": 145489920, "step": 17760 }, { "epoch": 2.2961622948701383, "grad_norm": 0.2834679186344147, "learning_rate": 6.4887573583398255e-06, "loss": 0.6354, "num_input_tokens_seen": 145571840, "step": 17770 }, { "epoch": 2.297454451479519, "grad_norm": 0.27980419993400574, "learning_rate": 6.466037724832666e-06, "loss": 0.4408, "num_input_tokens_seen": 145653760, "step": 17780 }, { "epoch": 2.2987466080889005, "grad_norm": 0.7254844307899475, "learning_rate": 6.44335202694262e-06, "loss": 0.9355, "num_input_tokens_seen": 145735680, "step": 17790 }, { "epoch": 2.3000387646982814, "grad_norm": 0.7839716076850891, "learning_rate": 6.420700306207103e-06, "loss": 1.1966, "num_input_tokens_seen": 145817600, "step": 17800 }, { "epoch": 2.3013309213076623, "grad_norm": 0.8338444232940674, "learning_rate": 6.3980826041013464e-06, "loss": 0.3919, "num_input_tokens_seen": 145899520, "step": 17810 }, { "epoch": 2.3026230779170436, "grad_norm": 0.608752429485321, "learning_rate": 6.375498962038265e-06, "loss": 0.9223, "num_input_tokens_seen": 145981440, "step": 17820 }, { "epoch": 2.3039152345264244, "grad_norm": 0.5790471434593201, "learning_rate": 6.35294942136844e-06, "loss": 0.5995, "num_input_tokens_seen": 146063360, "step": 17830 }, { "epoch": 2.3052073911358058, "grad_norm": 0.9189697504043579, "learning_rate": 6.3304340233799805e-06, "loss": 0.6963, "num_input_tokens_seen": 146145280, "step": 17840 }, { "epoch": 2.3064995477451866, "grad_norm": 0.6007638573646545, "learning_rate": 6.307952809298517e-06, "loss": 0.6846, "num_input_tokens_seen": 146227200, "step": 17850 }, { "epoch": 2.307791704354568, "grad_norm": 0.8957340717315674, "learning_rate": 6.28550582028706e-06, "loss": 0.542, "num_input_tokens_seen": 146309120, "step": 17860 }, { "epoch": 2.309083860963949, "grad_norm": 0.8787885904312134, "learning_rate": 6.263093097445957e-06, "loss": 0.9602, "num_input_tokens_seen": 146391040, "step": 17870 }, { "epoch": 2.31037601757333, "grad_norm": 0.655099630355835, "learning_rate": 6.240714681812837e-06, "loss": 0.8196, "num_input_tokens_seen": 146472960, "step": 17880 }, { "epoch": 2.311668174182711, "grad_norm": 1.1180391311645508, "learning_rate": 6.218370614362484e-06, "loss": 0.6883, "num_input_tokens_seen": 146554880, "step": 17890 }, { "epoch": 2.312960330792092, "grad_norm": 0.5654224753379822, "learning_rate": 6.196060936006817e-06, "loss": 0.5604, "num_input_tokens_seen": 146636800, "step": 17900 }, { "epoch": 2.314252487401473, "grad_norm": 0.617138147354126, "learning_rate": 6.173785687594761e-06, "loss": 1.045, "num_input_tokens_seen": 146718720, "step": 17910 }, { "epoch": 2.315544644010854, "grad_norm": 0.2932003140449524, "learning_rate": 6.1515449099122185e-06, "loss": 0.8279, "num_input_tokens_seen": 146800640, "step": 17920 }, { "epoch": 2.3168368006202353, "grad_norm": 0.7684862017631531, "learning_rate": 6.129338643681984e-06, "loss": 1.0301, "num_input_tokens_seen": 146882560, "step": 17930 }, { "epoch": 2.318128957229616, "grad_norm": 0.6417585015296936, "learning_rate": 6.107166929563629e-06, "loss": 0.6595, "num_input_tokens_seen": 146964480, "step": 17940 }, { "epoch": 2.319421113838997, "grad_norm": 0.4829157292842865, "learning_rate": 6.085029808153503e-06, "loss": 0.672, "num_input_tokens_seen": 147046400, "step": 17950 }, { "epoch": 2.3207132704483784, "grad_norm": 0.6301985383033752, "learning_rate": 6.062927319984576e-06, "loss": 0.7306, "num_input_tokens_seen": 147128320, "step": 17960 }, { "epoch": 2.3220054270577593, "grad_norm": 0.5591002106666565, "learning_rate": 6.040859505526439e-06, "loss": 1.0703, "num_input_tokens_seen": 147210240, "step": 17970 }, { "epoch": 2.3232975836671406, "grad_norm": 0.6914545893669128, "learning_rate": 6.018826405185163e-06, "loss": 0.9107, "num_input_tokens_seen": 147292160, "step": 17980 }, { "epoch": 2.3245897402765214, "grad_norm": 0.6147554516792297, "learning_rate": 5.99682805930328e-06, "loss": 0.9921, "num_input_tokens_seen": 147374080, "step": 17990 }, { "epoch": 2.3258818968859027, "grad_norm": 0.8620073795318604, "learning_rate": 5.974864508159692e-06, "loss": 1.121, "num_input_tokens_seen": 147456000, "step": 18000 }, { "epoch": 2.3271740534952836, "grad_norm": 0.7225034236907959, "learning_rate": 5.952935791969574e-06, "loss": 0.7416, "num_input_tokens_seen": 147537920, "step": 18010 }, { "epoch": 2.328466210104665, "grad_norm": 0.9885997772216797, "learning_rate": 5.931041950884314e-06, "loss": 0.8224, "num_input_tokens_seen": 147619840, "step": 18020 }, { "epoch": 2.329758366714046, "grad_norm": 0.8367599844932556, "learning_rate": 5.9091830249914685e-06, "loss": 0.4575, "num_input_tokens_seen": 147701760, "step": 18030 }, { "epoch": 2.3310505233234267, "grad_norm": 0.685955286026001, "learning_rate": 5.887359054314648e-06, "loss": 0.8888, "num_input_tokens_seen": 147783680, "step": 18040 }, { "epoch": 2.332342679932808, "grad_norm": 0.8157021403312683, "learning_rate": 5.8655700788134535e-06, "loss": 1.0576, "num_input_tokens_seen": 147865600, "step": 18050 }, { "epoch": 2.333634836542189, "grad_norm": 0.7296263575553894, "learning_rate": 5.843816138383429e-06, "loss": 0.967, "num_input_tokens_seen": 147947520, "step": 18060 }, { "epoch": 2.33492699315157, "grad_norm": 0.6871107816696167, "learning_rate": 5.822097272855964e-06, "loss": 0.5175, "num_input_tokens_seen": 148029440, "step": 18070 }, { "epoch": 2.336219149760951, "grad_norm": 0.9326339364051819, "learning_rate": 5.800413521998208e-06, "loss": 0.8177, "num_input_tokens_seen": 148111360, "step": 18080 }, { "epoch": 2.337511306370332, "grad_norm": 0.6087706089019775, "learning_rate": 5.778764925513045e-06, "loss": 0.9179, "num_input_tokens_seen": 148193280, "step": 18090 }, { "epoch": 2.338803462979713, "grad_norm": 1.1223865747451782, "learning_rate": 5.7571515230389586e-06, "loss": 0.9355, "num_input_tokens_seen": 148275200, "step": 18100 }, { "epoch": 2.340095619589094, "grad_norm": 0.6350956559181213, "learning_rate": 5.7355733541500285e-06, "loss": 0.9177, "num_input_tokens_seen": 148357120, "step": 18110 }, { "epoch": 2.3413877761984754, "grad_norm": 0.543329656124115, "learning_rate": 5.714030458355784e-06, "loss": 0.8673, "num_input_tokens_seen": 148439040, "step": 18120 }, { "epoch": 2.3426799328078562, "grad_norm": 0.8380277752876282, "learning_rate": 5.692522875101203e-06, "loss": 0.8191, "num_input_tokens_seen": 148520960, "step": 18130 }, { "epoch": 2.3439720894172376, "grad_norm": 0.6148279905319214, "learning_rate": 5.67105064376659e-06, "loss": 0.4773, "num_input_tokens_seen": 148602880, "step": 18140 }, { "epoch": 2.3452642460266184, "grad_norm": 0.766385555267334, "learning_rate": 5.649613803667511e-06, "loss": 0.8423, "num_input_tokens_seen": 148684800, "step": 18150 }, { "epoch": 2.3465564026359993, "grad_norm": 0.24485957622528076, "learning_rate": 5.628212394054758e-06, "loss": 0.7039, "num_input_tokens_seen": 148766720, "step": 18160 }, { "epoch": 2.3478485592453806, "grad_norm": 0.30911335349082947, "learning_rate": 5.606846454114218e-06, "loss": 0.4886, "num_input_tokens_seen": 148848640, "step": 18170 }, { "epoch": 2.3491407158547615, "grad_norm": 1.1630913019180298, "learning_rate": 5.5855160229668636e-06, "loss": 0.8072, "num_input_tokens_seen": 148930560, "step": 18180 }, { "epoch": 2.350432872464143, "grad_norm": 0.8715269565582275, "learning_rate": 5.564221139668621e-06, "loss": 0.7343, "num_input_tokens_seen": 149012480, "step": 18190 }, { "epoch": 2.3517250290735237, "grad_norm": 0.7479017376899719, "learning_rate": 5.542961843210359e-06, "loss": 1.1034, "num_input_tokens_seen": 149094400, "step": 18200 }, { "epoch": 2.353017185682905, "grad_norm": 0.23909921944141388, "learning_rate": 5.5217381725177624e-06, "loss": 0.7438, "num_input_tokens_seen": 149176320, "step": 18210 }, { "epoch": 2.354309342292286, "grad_norm": 0.6988628506660461, "learning_rate": 5.50055016645129e-06, "loss": 0.6506, "num_input_tokens_seen": 149258240, "step": 18220 }, { "epoch": 2.3556014989016667, "grad_norm": 0.9495237469673157, "learning_rate": 5.479397863806115e-06, "loss": 0.7191, "num_input_tokens_seen": 149340160, "step": 18230 }, { "epoch": 2.356893655511048, "grad_norm": 0.9350152015686035, "learning_rate": 5.458281303312016e-06, "loss": 0.8379, "num_input_tokens_seen": 149422080, "step": 18240 }, { "epoch": 2.358185812120429, "grad_norm": 0.8469577431678772, "learning_rate": 5.437200523633348e-06, "loss": 0.388, "num_input_tokens_seen": 149504000, "step": 18250 }, { "epoch": 2.35947796872981, "grad_norm": 0.6918286085128784, "learning_rate": 5.41615556336893e-06, "loss": 1.0104, "num_input_tokens_seen": 149585920, "step": 18260 }, { "epoch": 2.360770125339191, "grad_norm": 0.8202119469642639, "learning_rate": 5.39514646105202e-06, "loss": 0.7642, "num_input_tokens_seen": 149667840, "step": 18270 }, { "epoch": 2.362062281948572, "grad_norm": 0.20039023458957672, "learning_rate": 5.374173255150194e-06, "loss": 0.6885, "num_input_tokens_seen": 149749760, "step": 18280 }, { "epoch": 2.3633544385579532, "grad_norm": 44.923805236816406, "learning_rate": 5.353235984065321e-06, "loss": 0.6808, "num_input_tokens_seen": 149831680, "step": 18290 }, { "epoch": 2.364646595167334, "grad_norm": 0.7638149261474609, "learning_rate": 5.332334686133475e-06, "loss": 0.7185, "num_input_tokens_seen": 149913600, "step": 18300 }, { "epoch": 2.3659387517767154, "grad_norm": 0.9998169541358948, "learning_rate": 5.311469399624844e-06, "loss": 0.8391, "num_input_tokens_seen": 149995520, "step": 18310 }, { "epoch": 2.3672309083860963, "grad_norm": 0.6908437609672546, "learning_rate": 5.290640162743704e-06, "loss": 0.6135, "num_input_tokens_seen": 150077440, "step": 18320 }, { "epoch": 2.3685230649954776, "grad_norm": 1.3081281185150146, "learning_rate": 5.269847013628299e-06, "loss": 0.9426, "num_input_tokens_seen": 150159360, "step": 18330 }, { "epoch": 2.3698152216048585, "grad_norm": 0.7654116749763489, "learning_rate": 5.24908999035082e-06, "loss": 0.9153, "num_input_tokens_seen": 150241280, "step": 18340 }, { "epoch": 2.37110737821424, "grad_norm": 1.00165855884552, "learning_rate": 5.228369130917288e-06, "loss": 0.4039, "num_input_tokens_seen": 150323200, "step": 18350 }, { "epoch": 2.3723995348236206, "grad_norm": 0.593526303768158, "learning_rate": 5.207684473267527e-06, "loss": 0.573, "num_input_tokens_seen": 150405120, "step": 18360 }, { "epoch": 2.3736916914330015, "grad_norm": 0.5847054719924927, "learning_rate": 5.187036055275077e-06, "loss": 1.009, "num_input_tokens_seen": 150487040, "step": 18370 }, { "epoch": 2.374983848042383, "grad_norm": 0.7985702157020569, "learning_rate": 5.16642391474711e-06, "loss": 0.6435, "num_input_tokens_seen": 150568960, "step": 18380 }, { "epoch": 2.3762760046517637, "grad_norm": 0.8118183612823486, "learning_rate": 5.145848089424374e-06, "loss": 0.884, "num_input_tokens_seen": 150650880, "step": 18390 }, { "epoch": 2.377568161261145, "grad_norm": 0.7361833453178406, "learning_rate": 5.125308616981139e-06, "loss": 0.7895, "num_input_tokens_seen": 150732800, "step": 18400 }, { "epoch": 2.378860317870526, "grad_norm": 0.6324991583824158, "learning_rate": 5.1048055350251e-06, "loss": 0.971, "num_input_tokens_seen": 150814720, "step": 18410 }, { "epoch": 2.3801524744799067, "grad_norm": 0.899131178855896, "learning_rate": 5.0843388810973195e-06, "loss": 0.5018, "num_input_tokens_seen": 150896640, "step": 18420 }, { "epoch": 2.381444631089288, "grad_norm": 0.6457487344741821, "learning_rate": 5.06390869267217e-06, "loss": 0.953, "num_input_tokens_seen": 150978560, "step": 18430 }, { "epoch": 2.382736787698669, "grad_norm": 0.8277502655982971, "learning_rate": 5.043515007157263e-06, "loss": 1.2336, "num_input_tokens_seen": 151060480, "step": 18440 }, { "epoch": 2.3840289443080502, "grad_norm": 0.7911684513092041, "learning_rate": 5.02315786189334e-06, "loss": 0.6531, "num_input_tokens_seen": 151142400, "step": 18450 }, { "epoch": 2.385321100917431, "grad_norm": 0.945289671421051, "learning_rate": 5.002837294154283e-06, "loss": 0.7825, "num_input_tokens_seen": 151224320, "step": 18460 }, { "epoch": 2.3866132575268124, "grad_norm": 0.6046878695487976, "learning_rate": 4.982553341146956e-06, "loss": 0.7779, "num_input_tokens_seen": 151306240, "step": 18470 }, { "epoch": 2.3879054141361933, "grad_norm": 0.8228399753570557, "learning_rate": 4.962306040011222e-06, "loss": 0.9538, "num_input_tokens_seen": 151388160, "step": 18480 }, { "epoch": 2.3891975707455746, "grad_norm": 0.7043998837471008, "learning_rate": 4.942095427819796e-06, "loss": 1.0866, "num_input_tokens_seen": 151470080, "step": 18490 }, { "epoch": 2.3904897273549555, "grad_norm": 0.5962804555892944, "learning_rate": 4.921921541578248e-06, "loss": 0.8085, "num_input_tokens_seen": 151552000, "step": 18500 }, { "epoch": 2.3917818839643363, "grad_norm": 0.6313290596008301, "learning_rate": 4.901784418224892e-06, "loss": 0.8541, "num_input_tokens_seen": 151633920, "step": 18510 }, { "epoch": 2.3930740405737176, "grad_norm": 0.6041284203529358, "learning_rate": 4.881684094630712e-06, "loss": 0.6467, "num_input_tokens_seen": 151715840, "step": 18520 }, { "epoch": 2.3943661971830985, "grad_norm": 0.7616141438484192, "learning_rate": 4.861620607599346e-06, "loss": 0.7586, "num_input_tokens_seen": 151797760, "step": 18530 }, { "epoch": 2.39565835379248, "grad_norm": 0.3103398084640503, "learning_rate": 4.841593993866949e-06, "loss": 0.853, "num_input_tokens_seen": 151879680, "step": 18540 }, { "epoch": 2.3969505104018607, "grad_norm": 0.7408877015113831, "learning_rate": 4.821604290102191e-06, "loss": 1.02, "num_input_tokens_seen": 151961600, "step": 18550 }, { "epoch": 2.3982426670112416, "grad_norm": 0.24368135631084442, "learning_rate": 4.801651532906135e-06, "loss": 0.8047, "num_input_tokens_seen": 152043520, "step": 18560 }, { "epoch": 2.399534823620623, "grad_norm": 0.7442583441734314, "learning_rate": 4.781735758812217e-06, "loss": 0.9561, "num_input_tokens_seen": 152125440, "step": 18570 }, { "epoch": 2.4008269802300037, "grad_norm": 0.6959922313690186, "learning_rate": 4.761857004286141e-06, "loss": 0.6804, "num_input_tokens_seen": 152207360, "step": 18580 }, { "epoch": 2.402119136839385, "grad_norm": 1.0393434762954712, "learning_rate": 4.742015305725828e-06, "loss": 0.9148, "num_input_tokens_seen": 152289280, "step": 18590 }, { "epoch": 2.403411293448766, "grad_norm": 0.9378628134727478, "learning_rate": 4.7222106994613655e-06, "loss": 0.6088, "num_input_tokens_seen": 152371200, "step": 18600 }, { "epoch": 2.4047034500581472, "grad_norm": 0.8462372422218323, "learning_rate": 4.702443221754904e-06, "loss": 0.7719, "num_input_tokens_seen": 152453120, "step": 18610 }, { "epoch": 2.405995606667528, "grad_norm": 0.6147511005401611, "learning_rate": 4.6827129088006375e-06, "loss": 0.8772, "num_input_tokens_seen": 152535040, "step": 18620 }, { "epoch": 2.4072877632769094, "grad_norm": 0.8971914052963257, "learning_rate": 4.663019796724685e-06, "loss": 0.7122, "num_input_tokens_seen": 152616960, "step": 18630 }, { "epoch": 2.4085799198862903, "grad_norm": 0.7405256628990173, "learning_rate": 4.6433639215850696e-06, "loss": 1.0226, "num_input_tokens_seen": 152698880, "step": 18640 }, { "epoch": 2.409872076495671, "grad_norm": 0.3762902021408081, "learning_rate": 4.623745319371617e-06, "loss": 0.6679, "num_input_tokens_seen": 152780800, "step": 18650 }, { "epoch": 2.4111642331050525, "grad_norm": 0.9347968697547913, "learning_rate": 4.604164026005925e-06, "loss": 1.0588, "num_input_tokens_seen": 152862720, "step": 18660 }, { "epoch": 2.4124563897144333, "grad_norm": 0.7165658473968506, "learning_rate": 4.584620077341273e-06, "loss": 0.8622, "num_input_tokens_seen": 152944640, "step": 18670 }, { "epoch": 2.4137485463238146, "grad_norm": 0.715330958366394, "learning_rate": 4.565113509162547e-06, "loss": 0.9585, "num_input_tokens_seen": 153026560, "step": 18680 }, { "epoch": 2.4150407029331955, "grad_norm": 0.5120941996574402, "learning_rate": 4.5456443571862185e-06, "loss": 0.5105, "num_input_tokens_seen": 153108480, "step": 18690 }, { "epoch": 2.4163328595425764, "grad_norm": 0.6818029284477234, "learning_rate": 4.5262126570602135e-06, "loss": 0.7058, "num_input_tokens_seen": 153190400, "step": 18700 }, { "epoch": 2.4176250161519577, "grad_norm": 0.3870543837547302, "learning_rate": 4.506818444363925e-06, "loss": 0.7815, "num_input_tokens_seen": 153272320, "step": 18710 }, { "epoch": 2.4189171727613386, "grad_norm": 0.23925046622753143, "learning_rate": 4.487461754608066e-06, "loss": 0.6579, "num_input_tokens_seen": 153354240, "step": 18720 }, { "epoch": 2.42020932937072, "grad_norm": 0.25111180543899536, "learning_rate": 4.468142623234678e-06, "loss": 0.5656, "num_input_tokens_seen": 153436160, "step": 18730 }, { "epoch": 2.4215014859801007, "grad_norm": 0.6497703790664673, "learning_rate": 4.448861085617018e-06, "loss": 0.9916, "num_input_tokens_seen": 153518080, "step": 18740 }, { "epoch": 2.4227936425894816, "grad_norm": 0.30985990166664124, "learning_rate": 4.429617177059508e-06, "loss": 0.3525, "num_input_tokens_seen": 153600000, "step": 18750 }, { "epoch": 2.424085799198863, "grad_norm": 0.848656177520752, "learning_rate": 4.410410932797671e-06, "loss": 0.912, "num_input_tokens_seen": 153681920, "step": 18760 }, { "epoch": 2.425377955808244, "grad_norm": 0.6000288724899292, "learning_rate": 4.391242387998079e-06, "loss": 0.8084, "num_input_tokens_seen": 153763840, "step": 18770 }, { "epoch": 2.426670112417625, "grad_norm": 0.6745223999023438, "learning_rate": 4.372111577758261e-06, "loss": 0.8788, "num_input_tokens_seen": 153845760, "step": 18780 }, { "epoch": 2.427962269027006, "grad_norm": 0.9669370651245117, "learning_rate": 4.353018537106657e-06, "loss": 0.8239, "num_input_tokens_seen": 153927680, "step": 18790 }, { "epoch": 2.4292544256363873, "grad_norm": 0.9055847525596619, "learning_rate": 4.333963301002558e-06, "loss": 0.6755, "num_input_tokens_seen": 154009600, "step": 18800 }, { "epoch": 2.430546582245768, "grad_norm": 0.5974034070968628, "learning_rate": 4.314945904336037e-06, "loss": 0.8883, "num_input_tokens_seen": 154091520, "step": 18810 }, { "epoch": 2.4318387388551495, "grad_norm": 0.6422154307365417, "learning_rate": 4.295966381927871e-06, "loss": 0.9761, "num_input_tokens_seen": 154173440, "step": 18820 }, { "epoch": 2.4331308954645303, "grad_norm": 1.6391959190368652, "learning_rate": 4.2770247685295e-06, "loss": 0.7079, "num_input_tokens_seen": 154255360, "step": 18830 }, { "epoch": 2.434423052073911, "grad_norm": 1.442500114440918, "learning_rate": 4.258121098822945e-06, "loss": 0.8145, "num_input_tokens_seen": 154337280, "step": 18840 }, { "epoch": 2.4357152086832925, "grad_norm": 0.6912276744842529, "learning_rate": 4.239255407420764e-06, "loss": 0.668, "num_input_tokens_seen": 154419200, "step": 18850 }, { "epoch": 2.4370073652926734, "grad_norm": 0.5514190196990967, "learning_rate": 4.220427728865956e-06, "loss": 0.635, "num_input_tokens_seen": 154501120, "step": 18860 }, { "epoch": 2.4382995219020547, "grad_norm": 0.8538394570350647, "learning_rate": 4.201638097631938e-06, "loss": 0.8883, "num_input_tokens_seen": 154583040, "step": 18870 }, { "epoch": 2.4395916785114355, "grad_norm": 1.320961356163025, "learning_rate": 4.182886548122464e-06, "loss": 0.6052, "num_input_tokens_seen": 154664960, "step": 18880 }, { "epoch": 2.4408838351208164, "grad_norm": 0.70893394947052, "learning_rate": 4.164173114671538e-06, "loss": 0.5808, "num_input_tokens_seen": 154746880, "step": 18890 }, { "epoch": 2.4421759917301977, "grad_norm": 0.386322021484375, "learning_rate": 4.145497831543402e-06, "loss": 1.0335, "num_input_tokens_seen": 154828800, "step": 18900 }, { "epoch": 2.4434681483395786, "grad_norm": 0.5800808668136597, "learning_rate": 4.1268607329324195e-06, "loss": 0.8685, "num_input_tokens_seen": 154910720, "step": 18910 }, { "epoch": 2.44476030494896, "grad_norm": 1.0976125001907349, "learning_rate": 4.108261852963061e-06, "loss": 0.9031, "num_input_tokens_seen": 154992640, "step": 18920 }, { "epoch": 2.4460524615583408, "grad_norm": 0.9657114148139954, "learning_rate": 4.089701225689793e-06, "loss": 0.6177, "num_input_tokens_seen": 155074560, "step": 18930 }, { "epoch": 2.447344618167722, "grad_norm": 0.718108594417572, "learning_rate": 4.071178885097074e-06, "loss": 0.6072, "num_input_tokens_seen": 155156480, "step": 18940 }, { "epoch": 2.448636774777103, "grad_norm": 0.8545438051223755, "learning_rate": 4.052694865099232e-06, "loss": 0.8248, "num_input_tokens_seen": 155238400, "step": 18950 }, { "epoch": 2.4499289313864843, "grad_norm": 0.739714503288269, "learning_rate": 4.034249199540432e-06, "loss": 0.8951, "num_input_tokens_seen": 155320320, "step": 18960 }, { "epoch": 2.451221087995865, "grad_norm": 0.3881072998046875, "learning_rate": 4.015841922194638e-06, "loss": 0.6641, "num_input_tokens_seen": 155402240, "step": 18970 }, { "epoch": 2.452513244605246, "grad_norm": 0.2270699441432953, "learning_rate": 3.997473066765489e-06, "loss": 0.3194, "num_input_tokens_seen": 155484160, "step": 18980 }, { "epoch": 2.4538054012146273, "grad_norm": 0.6020457744598389, "learning_rate": 3.97914266688631e-06, "loss": 0.8105, "num_input_tokens_seen": 155566080, "step": 18990 }, { "epoch": 2.455097557824008, "grad_norm": 1.502171277999878, "learning_rate": 3.96085075611998e-06, "loss": 0.7238, "num_input_tokens_seen": 155648000, "step": 19000 }, { "epoch": 2.4563897144333895, "grad_norm": 0.7338537573814392, "learning_rate": 3.942597367958928e-06, "loss": 0.9272, "num_input_tokens_seen": 155729920, "step": 19010 }, { "epoch": 2.4576818710427704, "grad_norm": 0.7727378010749817, "learning_rate": 3.924382535825047e-06, "loss": 0.5264, "num_input_tokens_seen": 155811840, "step": 19020 }, { "epoch": 2.4589740276521512, "grad_norm": 0.7787820100784302, "learning_rate": 3.906206293069617e-06, "loss": 0.8021, "num_input_tokens_seen": 155893760, "step": 19030 }, { "epoch": 2.4602661842615325, "grad_norm": 0.7348833084106445, "learning_rate": 3.88806867297328e-06, "loss": 0.9558, "num_input_tokens_seen": 155975680, "step": 19040 }, { "epoch": 2.4615583408709134, "grad_norm": 0.7787769436836243, "learning_rate": 3.869969708745946e-06, "loss": 1.0861, "num_input_tokens_seen": 156057600, "step": 19050 }, { "epoch": 2.4628504974802947, "grad_norm": 0.7689657807350159, "learning_rate": 3.85190943352676e-06, "loss": 0.9696, "num_input_tokens_seen": 156139520, "step": 19060 }, { "epoch": 2.4641426540896756, "grad_norm": 1.0891441106796265, "learning_rate": 3.833887880384007e-06, "loss": 0.7405, "num_input_tokens_seen": 156221440, "step": 19070 }, { "epoch": 2.465434810699057, "grad_norm": 0.4140090048313141, "learning_rate": 3.815905082315102e-06, "loss": 0.6941, "num_input_tokens_seen": 156303360, "step": 19080 }, { "epoch": 2.4667269673084378, "grad_norm": 0.9087291359901428, "learning_rate": 3.7979610722464643e-06, "loss": 0.472, "num_input_tokens_seen": 156385280, "step": 19090 }, { "epoch": 2.468019123917819, "grad_norm": 0.9746045470237732, "learning_rate": 3.780055883033523e-06, "loss": 1.1217, "num_input_tokens_seen": 156467200, "step": 19100 }, { "epoch": 2.4693112805272, "grad_norm": 0.43733227252960205, "learning_rate": 3.762189547460615e-06, "loss": 0.7157, "num_input_tokens_seen": 156549120, "step": 19110 }, { "epoch": 2.470603437136581, "grad_norm": 0.6600936055183411, "learning_rate": 3.7443620982409305e-06, "loss": 0.9332, "num_input_tokens_seen": 156631040, "step": 19120 }, { "epoch": 2.471895593745962, "grad_norm": 0.22970102727413177, "learning_rate": 3.7265735680164615e-06, "loss": 0.5993, "num_input_tokens_seen": 156712960, "step": 19130 }, { "epoch": 2.473187750355343, "grad_norm": 0.37218940258026123, "learning_rate": 3.7088239893579456e-06, "loss": 0.8641, "num_input_tokens_seen": 156794880, "step": 19140 }, { "epoch": 2.4744799069647243, "grad_norm": 0.8411920666694641, "learning_rate": 3.6911133947648002e-06, "loss": 0.5555, "num_input_tokens_seen": 156876800, "step": 19150 }, { "epoch": 2.475772063574105, "grad_norm": 0.7639785408973694, "learning_rate": 3.6734418166650436e-06, "loss": 0.7846, "num_input_tokens_seen": 156958720, "step": 19160 }, { "epoch": 2.477064220183486, "grad_norm": 0.5358415842056274, "learning_rate": 3.655809287415285e-06, "loss": 0.7447, "num_input_tokens_seen": 157040640, "step": 19170 }, { "epoch": 2.4783563767928674, "grad_norm": 0.6833941340446472, "learning_rate": 3.638215839300624e-06, "loss": 0.8456, "num_input_tokens_seen": 157122560, "step": 19180 }, { "epoch": 2.4796485334022482, "grad_norm": 0.5907963514328003, "learning_rate": 3.6206615045345837e-06, "loss": 0.9318, "num_input_tokens_seen": 157204480, "step": 19190 }, { "epoch": 2.4809406900116295, "grad_norm": 0.9480105638504028, "learning_rate": 3.603146315259104e-06, "loss": 1.074, "num_input_tokens_seen": 157286400, "step": 19200 }, { "epoch": 2.4822328466210104, "grad_norm": 0.9866535067558289, "learning_rate": 3.5856703035444196e-06, "loss": 0.6913, "num_input_tokens_seen": 157368320, "step": 19210 }, { "epoch": 2.4835250032303917, "grad_norm": 0.965872585773468, "learning_rate": 3.568233501389054e-06, "loss": 0.7036, "num_input_tokens_seen": 157450240, "step": 19220 }, { "epoch": 2.4848171598397726, "grad_norm": 0.752018928527832, "learning_rate": 3.5508359407197157e-06, "loss": 0.7991, "num_input_tokens_seen": 157532160, "step": 19230 }, { "epoch": 2.4861093164491535, "grad_norm": 0.6639631986618042, "learning_rate": 3.5334776533912846e-06, "loss": 0.6216, "num_input_tokens_seen": 157614080, "step": 19240 }, { "epoch": 2.4874014730585348, "grad_norm": 0.7294473052024841, "learning_rate": 3.516158671186723e-06, "loss": 0.9294, "num_input_tokens_seen": 157696000, "step": 19250 }, { "epoch": 2.4886936296679156, "grad_norm": 0.920391321182251, "learning_rate": 3.4988790258170146e-06, "loss": 0.8402, "num_input_tokens_seen": 157777920, "step": 19260 }, { "epoch": 2.489985786277297, "grad_norm": 1.0001167058944702, "learning_rate": 3.481638748921137e-06, "loss": 0.8772, "num_input_tokens_seen": 157859840, "step": 19270 }, { "epoch": 2.491277942886678, "grad_norm": 1.0365166664123535, "learning_rate": 3.4644378720659648e-06, "loss": 0.5995, "num_input_tokens_seen": 157941760, "step": 19280 }, { "epoch": 2.492570099496059, "grad_norm": 0.8598686456680298, "learning_rate": 3.4472764267462486e-06, "loss": 0.8775, "num_input_tokens_seen": 158023680, "step": 19290 }, { "epoch": 2.49386225610544, "grad_norm": 0.7252724170684814, "learning_rate": 3.430154444384523e-06, "loss": 0.827, "num_input_tokens_seen": 158105600, "step": 19300 }, { "epoch": 2.495154412714821, "grad_norm": 0.9044560194015503, "learning_rate": 3.4130719563310877e-06, "loss": 0.6568, "num_input_tokens_seen": 158187520, "step": 19310 }, { "epoch": 2.496446569324202, "grad_norm": 0.3147662281990051, "learning_rate": 3.396028993863906e-06, "loss": 0.8331, "num_input_tokens_seen": 158269440, "step": 19320 }, { "epoch": 2.497738725933583, "grad_norm": 0.6290098428726196, "learning_rate": 3.379025588188578e-06, "loss": 0.6592, "num_input_tokens_seen": 158351360, "step": 19330 }, { "epoch": 2.4990308825429643, "grad_norm": 0.49024534225463867, "learning_rate": 3.362061770438285e-06, "loss": 0.9447, "num_input_tokens_seen": 158433280, "step": 19340 }, { "epoch": 2.500323039152345, "grad_norm": 1.2774690389633179, "learning_rate": 3.3451375716737067e-06, "loss": 0.4547, "num_input_tokens_seen": 158515200, "step": 19350 }, { "epoch": 2.501615195761726, "grad_norm": 0.6173495650291443, "learning_rate": 3.328253022883002e-06, "loss": 0.8312, "num_input_tokens_seen": 158597120, "step": 19360 }, { "epoch": 2.5029073523711074, "grad_norm": 0.7313811779022217, "learning_rate": 3.3114081549817018e-06, "loss": 0.744, "num_input_tokens_seen": 158679040, "step": 19370 }, { "epoch": 2.5041995089804887, "grad_norm": 0.28148242831230164, "learning_rate": 3.2946029988127068e-06, "loss": 0.6551, "num_input_tokens_seen": 158760960, "step": 19380 }, { "epoch": 2.5054916655898696, "grad_norm": 0.8187404870986938, "learning_rate": 3.2778375851462013e-06, "loss": 1.1236, "num_input_tokens_seen": 158842880, "step": 19390 }, { "epoch": 2.5067838221992504, "grad_norm": 0.7850127220153809, "learning_rate": 3.2611119446795844e-06, "loss": 0.7408, "num_input_tokens_seen": 158924800, "step": 19400 }, { "epoch": 2.5080759788086318, "grad_norm": 0.9370409250259399, "learning_rate": 3.2444261080374546e-06, "loss": 0.8042, "num_input_tokens_seen": 159006720, "step": 19410 }, { "epoch": 2.5093681354180126, "grad_norm": 0.6807863116264343, "learning_rate": 3.227780105771505e-06, "loss": 0.7672, "num_input_tokens_seen": 159088640, "step": 19420 }, { "epoch": 2.510660292027394, "grad_norm": 0.713300347328186, "learning_rate": 3.2111739683605204e-06, "loss": 0.9146, "num_input_tokens_seen": 159170560, "step": 19430 }, { "epoch": 2.511952448636775, "grad_norm": 0.7816091179847717, "learning_rate": 3.194607726210261e-06, "loss": 0.8129, "num_input_tokens_seen": 159252480, "step": 19440 }, { "epoch": 2.5132446052461557, "grad_norm": 0.1920579969882965, "learning_rate": 3.178081409653469e-06, "loss": 0.6187, "num_input_tokens_seen": 159334400, "step": 19450 }, { "epoch": 2.514536761855537, "grad_norm": 0.6369209885597229, "learning_rate": 3.1615950489497587e-06, "loss": 0.7696, "num_input_tokens_seen": 159416320, "step": 19460 }, { "epoch": 2.515828918464918, "grad_norm": 0.4802658259868622, "learning_rate": 3.1451486742856055e-06, "loss": 0.8106, "num_input_tokens_seen": 159498240, "step": 19470 }, { "epoch": 2.517121075074299, "grad_norm": 0.8319321274757385, "learning_rate": 3.128742315774255e-06, "loss": 0.8289, "num_input_tokens_seen": 159580160, "step": 19480 }, { "epoch": 2.51841323168368, "grad_norm": 0.8333976864814758, "learning_rate": 3.1123760034556943e-06, "loss": 0.4341, "num_input_tokens_seen": 159662080, "step": 19490 }, { "epoch": 2.519705388293061, "grad_norm": 1.237228274345398, "learning_rate": 3.0960497672965825e-06, "loss": 0.7386, "num_input_tokens_seen": 159744000, "step": 19500 }, { "epoch": 2.520997544902442, "grad_norm": 0.8341740369796753, "learning_rate": 3.0797636371901863e-06, "loss": 0.6727, "num_input_tokens_seen": 159825920, "step": 19510 }, { "epoch": 2.522289701511823, "grad_norm": 0.6844837069511414, "learning_rate": 3.063517642956365e-06, "loss": 0.8113, "num_input_tokens_seen": 159907840, "step": 19520 }, { "epoch": 2.5235818581212044, "grad_norm": 0.8005863428115845, "learning_rate": 3.0473118143414634e-06, "loss": 0.793, "num_input_tokens_seen": 159989760, "step": 19530 }, { "epoch": 2.5248740147305853, "grad_norm": 0.6440595984458923, "learning_rate": 3.031146181018299e-06, "loss": 0.8666, "num_input_tokens_seen": 160071680, "step": 19540 }, { "epoch": 2.526166171339966, "grad_norm": 0.6987878680229187, "learning_rate": 3.0150207725860912e-06, "loss": 0.6603, "num_input_tokens_seen": 160153600, "step": 19550 }, { "epoch": 2.5274583279493474, "grad_norm": 0.9995496273040771, "learning_rate": 2.9989356185703975e-06, "loss": 0.6602, "num_input_tokens_seen": 160235520, "step": 19560 }, { "epoch": 2.5287504845587288, "grad_norm": 0.8544566035270691, "learning_rate": 2.982890748423084e-06, "loss": 0.4753, "num_input_tokens_seen": 160317440, "step": 19570 }, { "epoch": 2.5300426411681096, "grad_norm": 0.9914432764053345, "learning_rate": 2.9668861915222364e-06, "loss": 0.9147, "num_input_tokens_seen": 160399360, "step": 19580 }, { "epoch": 2.5313347977774905, "grad_norm": 0.5936645865440369, "learning_rate": 2.950921977172155e-06, "loss": 0.9882, "num_input_tokens_seen": 160481280, "step": 19590 }, { "epoch": 2.532626954386872, "grad_norm": 0.45081570744514465, "learning_rate": 2.934998134603245e-06, "loss": 0.5187, "num_input_tokens_seen": 160563200, "step": 19600 }, { "epoch": 2.5339191109962527, "grad_norm": 0.7718636393547058, "learning_rate": 2.919114692972008e-06, "loss": 0.7276, "num_input_tokens_seen": 160645120, "step": 19610 }, { "epoch": 2.535211267605634, "grad_norm": 0.371920108795166, "learning_rate": 2.9032716813609723e-06, "loss": 0.6929, "num_input_tokens_seen": 160727040, "step": 19620 }, { "epoch": 2.536503424215015, "grad_norm": 0.7801430225372314, "learning_rate": 2.8874691287786275e-06, "loss": 0.5275, "num_input_tokens_seen": 160808960, "step": 19630 }, { "epoch": 2.5377955808243957, "grad_norm": 0.8307329416275024, "learning_rate": 2.8717070641593987e-06, "loss": 0.6655, "num_input_tokens_seen": 160890880, "step": 19640 }, { "epoch": 2.539087737433777, "grad_norm": 0.44690385460853577, "learning_rate": 2.8559855163635544e-06, "loss": 0.6951, "num_input_tokens_seen": 160972800, "step": 19650 }, { "epoch": 2.540379894043158, "grad_norm": 1.1183629035949707, "learning_rate": 2.8403045141772054e-06, "loss": 0.6766, "num_input_tokens_seen": 161054720, "step": 19660 }, { "epoch": 2.541672050652539, "grad_norm": 0.7904671430587769, "learning_rate": 2.824664086312204e-06, "loss": 0.7144, "num_input_tokens_seen": 161136640, "step": 19670 }, { "epoch": 2.54296420726192, "grad_norm": 0.6050812602043152, "learning_rate": 2.809064261406111e-06, "loss": 0.8016, "num_input_tokens_seen": 161218560, "step": 19680 }, { "epoch": 2.544256363871301, "grad_norm": 0.7455185651779175, "learning_rate": 2.7935050680221565e-06, "loss": 0.808, "num_input_tokens_seen": 161300480, "step": 19690 }, { "epoch": 2.5455485204806823, "grad_norm": 0.6713374853134155, "learning_rate": 2.7779865346491576e-06, "loss": 0.6117, "num_input_tokens_seen": 161382400, "step": 19700 }, { "epoch": 2.5468406770900636, "grad_norm": 0.7479690909385681, "learning_rate": 2.762508689701504e-06, "loss": 0.8894, "num_input_tokens_seen": 161464320, "step": 19710 }, { "epoch": 2.5481328336994444, "grad_norm": 0.8708047270774841, "learning_rate": 2.74707156151906e-06, "loss": 0.9919, "num_input_tokens_seen": 161546240, "step": 19720 }, { "epoch": 2.5494249903088253, "grad_norm": 0.33944186568260193, "learning_rate": 2.7316751783671655e-06, "loss": 1.0815, "num_input_tokens_seen": 161628160, "step": 19730 }, { "epoch": 2.5507171469182066, "grad_norm": 0.6622774004936218, "learning_rate": 2.716319568436529e-06, "loss": 0.9753, "num_input_tokens_seen": 161710080, "step": 19740 }, { "epoch": 2.5520093035275875, "grad_norm": 1.0370920896530151, "learning_rate": 2.7010047598432205e-06, "loss": 1.0534, "num_input_tokens_seen": 161792000, "step": 19750 }, { "epoch": 2.553301460136969, "grad_norm": 0.35142797231674194, "learning_rate": 2.6857307806286037e-06, "loss": 0.5268, "num_input_tokens_seen": 161873920, "step": 19760 }, { "epoch": 2.5545936167463497, "grad_norm": 1.3889226913452148, "learning_rate": 2.6704976587592688e-06, "loss": 0.7309, "num_input_tokens_seen": 161955840, "step": 19770 }, { "epoch": 2.5558857733557305, "grad_norm": 0.6227067708969116, "learning_rate": 2.655305422127016e-06, "loss": 0.9389, "num_input_tokens_seen": 162037760, "step": 19780 }, { "epoch": 2.557177929965112, "grad_norm": 0.35299497842788696, "learning_rate": 2.6401540985487667e-06, "loss": 0.6891, "num_input_tokens_seen": 162119680, "step": 19790 }, { "epoch": 2.5584700865744927, "grad_norm": 1.1497657299041748, "learning_rate": 2.6250437157665455e-06, "loss": 0.6537, "num_input_tokens_seen": 162201600, "step": 19800 }, { "epoch": 2.559762243183874, "grad_norm": 0.7763955593109131, "learning_rate": 2.6099743014474014e-06, "loss": 0.7283, "num_input_tokens_seen": 162283520, "step": 19810 }, { "epoch": 2.561054399793255, "grad_norm": 0.6793556809425354, "learning_rate": 2.594945883183386e-06, "loss": 0.6683, "num_input_tokens_seen": 162365440, "step": 19820 }, { "epoch": 2.5623465564026358, "grad_norm": 0.6524314880371094, "learning_rate": 2.5799584884914685e-06, "loss": 0.855, "num_input_tokens_seen": 162447360, "step": 19830 }, { "epoch": 2.563638713012017, "grad_norm": 1.6065878868103027, "learning_rate": 2.5650121448135222e-06, "loss": 0.9368, "num_input_tokens_seen": 162529280, "step": 19840 }, { "epoch": 2.5649308696213984, "grad_norm": 0.7918826937675476, "learning_rate": 2.550106879516237e-06, "loss": 0.5496, "num_input_tokens_seen": 162611200, "step": 19850 }, { "epoch": 2.5662230262307792, "grad_norm": 0.7481831312179565, "learning_rate": 2.535242719891112e-06, "loss": 1.0861, "num_input_tokens_seen": 162693120, "step": 19860 }, { "epoch": 2.56751518284016, "grad_norm": 0.519053041934967, "learning_rate": 2.5204196931543635e-06, "loss": 0.4623, "num_input_tokens_seen": 162775040, "step": 19870 }, { "epoch": 2.5688073394495414, "grad_norm": 0.7055248618125916, "learning_rate": 2.505637826446891e-06, "loss": 1.0828, "num_input_tokens_seen": 162856960, "step": 19880 }, { "epoch": 2.5700994960589223, "grad_norm": 0.2592199444770813, "learning_rate": 2.4908971468342535e-06, "loss": 0.8837, "num_input_tokens_seen": 162938880, "step": 19890 }, { "epoch": 2.5713916526683036, "grad_norm": 0.6449967622756958, "learning_rate": 2.4761976813065663e-06, "loss": 0.986, "num_input_tokens_seen": 163020800, "step": 19900 }, { "epoch": 2.5726838092776845, "grad_norm": 0.601736307144165, "learning_rate": 2.4615394567785055e-06, "loss": 0.9548, "num_input_tokens_seen": 163102720, "step": 19910 }, { "epoch": 2.5739759658870653, "grad_norm": 0.7824547290802002, "learning_rate": 2.44692250008923e-06, "loss": 1.0377, "num_input_tokens_seen": 163184640, "step": 19920 }, { "epoch": 2.5752681224964467, "grad_norm": 0.4208737015724182, "learning_rate": 2.432346838002325e-06, "loss": 0.8889, "num_input_tokens_seen": 163266560, "step": 19930 }, { "epoch": 2.5765602791058275, "grad_norm": 0.7913038730621338, "learning_rate": 2.417812497205782e-06, "loss": 0.7366, "num_input_tokens_seen": 163348480, "step": 19940 }, { "epoch": 2.577852435715209, "grad_norm": 1.0949358940124512, "learning_rate": 2.403319504311921e-06, "loss": 0.9359, "num_input_tokens_seen": 163430400, "step": 19950 }, { "epoch": 2.5791445923245897, "grad_norm": 0.877521276473999, "learning_rate": 2.3888678858573625e-06, "loss": 1.1218, "num_input_tokens_seen": 163512320, "step": 19960 }, { "epoch": 2.5804367489339706, "grad_norm": 0.8615255951881409, "learning_rate": 2.374457668302962e-06, "loss": 0.8245, "num_input_tokens_seen": 163594240, "step": 19970 }, { "epoch": 2.581728905543352, "grad_norm": 0.5929073691368103, "learning_rate": 2.360088878033778e-06, "loss": 1.0923, "num_input_tokens_seen": 163676160, "step": 19980 }, { "epoch": 2.5830210621527327, "grad_norm": 0.7284789681434631, "learning_rate": 2.3457615413590177e-06, "loss": 1.161, "num_input_tokens_seen": 163758080, "step": 19990 }, { "epoch": 2.584313218762114, "grad_norm": 1.2667911052703857, "learning_rate": 2.3314756845119746e-06, "loss": 0.9189, "num_input_tokens_seen": 163840000, "step": 20000 }, { "epoch": 2.585605375371495, "grad_norm": 0.672854483127594, "learning_rate": 2.317231333650005e-06, "loss": 0.8559, "num_input_tokens_seen": 163921920, "step": 20010 }, { "epoch": 2.5868975319808762, "grad_norm": 0.6795851588249207, "learning_rate": 2.3030285148544577e-06, "loss": 0.9338, "num_input_tokens_seen": 164003840, "step": 20020 }, { "epoch": 2.588189688590257, "grad_norm": 0.6645368933677673, "learning_rate": 2.2888672541306525e-06, "loss": 0.7432, "num_input_tokens_seen": 164085760, "step": 20030 }, { "epoch": 2.5894818451996384, "grad_norm": 0.7248007655143738, "learning_rate": 2.2747475774077986e-06, "loss": 0.7769, "num_input_tokens_seen": 164167680, "step": 20040 }, { "epoch": 2.5907740018090193, "grad_norm": 0.7931993007659912, "learning_rate": 2.2606695105389653e-06, "loss": 0.8439, "num_input_tokens_seen": 164249600, "step": 20050 }, { "epoch": 2.5920661584184, "grad_norm": 0.8612631559371948, "learning_rate": 2.2466330793010555e-06, "loss": 0.772, "num_input_tokens_seen": 164331520, "step": 20060 }, { "epoch": 2.5933583150277815, "grad_norm": 0.7526144981384277, "learning_rate": 2.2326383093947135e-06, "loss": 0.7069, "num_input_tokens_seen": 164413440, "step": 20070 }, { "epoch": 2.5946504716371623, "grad_norm": 1.1278603076934814, "learning_rate": 2.2186852264443196e-06, "loss": 0.9164, "num_input_tokens_seen": 164495360, "step": 20080 }, { "epoch": 2.5959426282465436, "grad_norm": 0.7620711326599121, "learning_rate": 2.2047738559979104e-06, "loss": 0.8105, "num_input_tokens_seen": 164577280, "step": 20090 }, { "epoch": 2.5972347848559245, "grad_norm": 0.6490961909294128, "learning_rate": 2.1909042235271597e-06, "loss": 1.0952, "num_input_tokens_seen": 164659200, "step": 20100 }, { "epoch": 2.5985269414653054, "grad_norm": 1.2163642644882202, "learning_rate": 2.1770763544273098e-06, "loss": 0.9695, "num_input_tokens_seen": 164741120, "step": 20110 }, { "epoch": 2.5998190980746867, "grad_norm": 0.4267684519290924, "learning_rate": 2.1632902740171378e-06, "loss": 0.8496, "num_input_tokens_seen": 164823040, "step": 20120 }, { "epoch": 2.6011112546840676, "grad_norm": 0.7075666785240173, "learning_rate": 2.1495460075389133e-06, "loss": 0.7883, "num_input_tokens_seen": 164904960, "step": 20130 }, { "epoch": 2.602403411293449, "grad_norm": 0.6190396547317505, "learning_rate": 2.1358435801583283e-06, "loss": 0.6911, "num_input_tokens_seen": 164986880, "step": 20140 }, { "epoch": 2.6036955679028297, "grad_norm": 0.3564266264438629, "learning_rate": 2.122183016964488e-06, "loss": 0.5446, "num_input_tokens_seen": 165068800, "step": 20150 }, { "epoch": 2.6049877245122106, "grad_norm": 0.7317832112312317, "learning_rate": 2.1085643429698236e-06, "loss": 0.7104, "num_input_tokens_seen": 165150720, "step": 20160 }, { "epoch": 2.606279881121592, "grad_norm": 0.6708076000213623, "learning_rate": 2.094987583110086e-06, "loss": 0.9044, "num_input_tokens_seen": 165232640, "step": 20170 }, { "epoch": 2.6075720377309732, "grad_norm": 0.6246494650840759, "learning_rate": 2.0814527622442626e-06, "loss": 0.687, "num_input_tokens_seen": 165314560, "step": 20180 }, { "epoch": 2.608864194340354, "grad_norm": 0.7877633571624756, "learning_rate": 2.067959905154568e-06, "loss": 0.791, "num_input_tokens_seen": 165396480, "step": 20190 }, { "epoch": 2.610156350949735, "grad_norm": 0.33362698554992676, "learning_rate": 2.0545090365463788e-06, "loss": 0.508, "num_input_tokens_seen": 165478400, "step": 20200 }, { "epoch": 2.6114485075591163, "grad_norm": 0.5828604102134705, "learning_rate": 2.041100181048178e-06, "loss": 0.9302, "num_input_tokens_seen": 165560320, "step": 20210 }, { "epoch": 2.612740664168497, "grad_norm": 0.7072322964668274, "learning_rate": 2.0277333632115288e-06, "loss": 0.7966, "num_input_tokens_seen": 165642240, "step": 20220 }, { "epoch": 2.6140328207778785, "grad_norm": 0.760685920715332, "learning_rate": 2.0144086075110367e-06, "loss": 1.1115, "num_input_tokens_seen": 165724160, "step": 20230 }, { "epoch": 2.6153249773872593, "grad_norm": 1.190176248550415, "learning_rate": 2.001125938344273e-06, "loss": 0.6785, "num_input_tokens_seen": 165806080, "step": 20240 }, { "epoch": 2.61661713399664, "grad_norm": 0.2874329686164856, "learning_rate": 1.9878853800317535e-06, "loss": 0.8553, "num_input_tokens_seen": 165888000, "step": 20250 }, { "epoch": 2.6179092906060215, "grad_norm": 1.3199117183685303, "learning_rate": 1.9746869568168985e-06, "loss": 0.7277, "num_input_tokens_seen": 165969920, "step": 20260 }, { "epoch": 2.6192014472154024, "grad_norm": 0.7847501039505005, "learning_rate": 1.9615306928659677e-06, "loss": 0.823, "num_input_tokens_seen": 166051840, "step": 20270 }, { "epoch": 2.6204936038247837, "grad_norm": 0.6932337284088135, "learning_rate": 1.948416612268034e-06, "loss": 0.9189, "num_input_tokens_seen": 166133760, "step": 20280 }, { "epoch": 2.6217857604341646, "grad_norm": 0.6914857029914856, "learning_rate": 1.935344739034936e-06, "loss": 0.8662, "num_input_tokens_seen": 166215680, "step": 20290 }, { "epoch": 2.6230779170435454, "grad_norm": 0.6377202868461609, "learning_rate": 1.922315097101218e-06, "loss": 0.9959, "num_input_tokens_seen": 166297600, "step": 20300 }, { "epoch": 2.6243700736529267, "grad_norm": 0.9114018082618713, "learning_rate": 1.909327710324116e-06, "loss": 0.4612, "num_input_tokens_seen": 166379520, "step": 20310 }, { "epoch": 2.625662230262308, "grad_norm": 0.7677037715911865, "learning_rate": 1.8963826024834734e-06, "loss": 0.9159, "num_input_tokens_seen": 166461440, "step": 20320 }, { "epoch": 2.626954386871689, "grad_norm": 0.4625994563102722, "learning_rate": 1.8834797972817508e-06, "loss": 0.6083, "num_input_tokens_seen": 166543360, "step": 20330 }, { "epoch": 2.62824654348107, "grad_norm": 0.6611701250076294, "learning_rate": 1.8706193183439247e-06, "loss": 0.8399, "num_input_tokens_seen": 166625280, "step": 20340 }, { "epoch": 2.629538700090451, "grad_norm": 0.528550386428833, "learning_rate": 1.8578011892174924e-06, "loss": 0.4694, "num_input_tokens_seen": 166707200, "step": 20350 }, { "epoch": 2.630830856699832, "grad_norm": 0.7156257033348083, "learning_rate": 1.845025433372402e-06, "loss": 1.0965, "num_input_tokens_seen": 166789120, "step": 20360 }, { "epoch": 2.6321230133092133, "grad_norm": 0.6360397934913635, "learning_rate": 1.8322920742010086e-06, "loss": 0.9704, "num_input_tokens_seen": 166871040, "step": 20370 }, { "epoch": 2.633415169918594, "grad_norm": 1.0048847198486328, "learning_rate": 1.8196011350180563e-06, "loss": 1.1178, "num_input_tokens_seen": 166952960, "step": 20380 }, { "epoch": 2.634707326527975, "grad_norm": 0.7727967500686646, "learning_rate": 1.8069526390605968e-06, "loss": 0.6866, "num_input_tokens_seen": 167034880, "step": 20390 }, { "epoch": 2.6359994831373563, "grad_norm": 0.8432693481445312, "learning_rate": 1.7943466094879902e-06, "loss": 0.8549, "num_input_tokens_seen": 167116800, "step": 20400 }, { "epoch": 2.637291639746737, "grad_norm": 0.6722807884216309, "learning_rate": 1.7817830693818288e-06, "loss": 0.7212, "num_input_tokens_seen": 167198720, "step": 20410 }, { "epoch": 2.6385837963561185, "grad_norm": 0.6678622961044312, "learning_rate": 1.7692620417459004e-06, "loss": 0.3765, "num_input_tokens_seen": 167280640, "step": 20420 }, { "epoch": 2.6398759529654994, "grad_norm": 0.7946398854255676, "learning_rate": 1.7567835495061718e-06, "loss": 0.9786, "num_input_tokens_seen": 167362560, "step": 20430 }, { "epoch": 2.6411681095748802, "grad_norm": 0.26235970854759216, "learning_rate": 1.7443476155107052e-06, "loss": 0.8443, "num_input_tokens_seen": 167444480, "step": 20440 }, { "epoch": 2.6424602661842616, "grad_norm": 0.9268401861190796, "learning_rate": 1.7319542625296613e-06, "loss": 0.9905, "num_input_tokens_seen": 167526400, "step": 20450 }, { "epoch": 2.643752422793643, "grad_norm": 0.7280152440071106, "learning_rate": 1.7196035132552135e-06, "loss": 0.7714, "num_input_tokens_seen": 167608320, "step": 20460 }, { "epoch": 2.6450445794030237, "grad_norm": 0.6951790452003479, "learning_rate": 1.7072953903015498e-06, "loss": 0.9703, "num_input_tokens_seen": 167690240, "step": 20470 }, { "epoch": 2.6463367360124046, "grad_norm": 0.5963300466537476, "learning_rate": 1.6950299162047878e-06, "loss": 0.7194, "num_input_tokens_seen": 167772160, "step": 20480 }, { "epoch": 2.647628892621786, "grad_norm": 1.0526518821716309, "learning_rate": 1.682807113422971e-06, "loss": 0.7812, "num_input_tokens_seen": 167854080, "step": 20490 }, { "epoch": 2.648921049231167, "grad_norm": 0.33847522735595703, "learning_rate": 1.6706270043360117e-06, "loss": 0.6726, "num_input_tokens_seen": 167936000, "step": 20500 }, { "epoch": 2.650213205840548, "grad_norm": 0.7132168412208557, "learning_rate": 1.6584896112456338e-06, "loss": 0.921, "num_input_tokens_seen": 168017920, "step": 20510 }, { "epoch": 2.651505362449929, "grad_norm": 0.6660868525505066, "learning_rate": 1.646394956375369e-06, "loss": 0.8371, "num_input_tokens_seen": 168099840, "step": 20520 }, { "epoch": 2.65279751905931, "grad_norm": 0.7083340287208557, "learning_rate": 1.6343430618704775e-06, "loss": 0.6175, "num_input_tokens_seen": 168181760, "step": 20530 }, { "epoch": 2.654089675668691, "grad_norm": 0.35995742678642273, "learning_rate": 1.622333949797944e-06, "loss": 0.6474, "num_input_tokens_seen": 168263680, "step": 20540 }, { "epoch": 2.655381832278072, "grad_norm": 0.6416080594062805, "learning_rate": 1.6103676421463986e-06, "loss": 0.6756, "num_input_tokens_seen": 168345600, "step": 20550 }, { "epoch": 2.6566739888874533, "grad_norm": 0.6493408679962158, "learning_rate": 1.5984441608261152e-06, "loss": 0.8549, "num_input_tokens_seen": 168427520, "step": 20560 }, { "epoch": 2.657966145496834, "grad_norm": 1.0145015716552734, "learning_rate": 1.5865635276689412e-06, "loss": 0.6964, "num_input_tokens_seen": 168509440, "step": 20570 }, { "epoch": 2.659258302106215, "grad_norm": 0.8252952098846436, "learning_rate": 1.5747257644282726e-06, "loss": 0.8505, "num_input_tokens_seen": 168591360, "step": 20580 }, { "epoch": 2.6605504587155964, "grad_norm": 1.0364081859588623, "learning_rate": 1.5629308927790077e-06, "loss": 0.4486, "num_input_tokens_seen": 168673280, "step": 20590 }, { "epoch": 2.6618426153249772, "grad_norm": 0.7188685536384583, "learning_rate": 1.551178934317521e-06, "loss": 0.7925, "num_input_tokens_seen": 168755200, "step": 20600 }, { "epoch": 2.6631347719343585, "grad_norm": 0.6135556101799011, "learning_rate": 1.5394699105616002e-06, "loss": 0.9941, "num_input_tokens_seen": 168837120, "step": 20610 }, { "epoch": 2.6644269285437394, "grad_norm": 0.39184391498565674, "learning_rate": 1.5278038429504177e-06, "loss": 0.8127, "num_input_tokens_seen": 168919040, "step": 20620 }, { "epoch": 2.6657190851531203, "grad_norm": 0.3787175416946411, "learning_rate": 1.516180752844515e-06, "loss": 0.3516, "num_input_tokens_seen": 169000960, "step": 20630 }, { "epoch": 2.6670112417625016, "grad_norm": 0.7270440459251404, "learning_rate": 1.504600661525718e-06, "loss": 0.7978, "num_input_tokens_seen": 169082880, "step": 20640 }, { "epoch": 2.668303398371883, "grad_norm": 0.8899626731872559, "learning_rate": 1.493063590197133e-06, "loss": 0.6851, "num_input_tokens_seen": 169164800, "step": 20650 }, { "epoch": 2.6695955549812638, "grad_norm": 0.7514258027076721, "learning_rate": 1.4815695599830981e-06, "loss": 1.0456, "num_input_tokens_seen": 169246720, "step": 20660 }, { "epoch": 2.6708877115906446, "grad_norm": 1.2644627094268799, "learning_rate": 1.4701185919291372e-06, "loss": 0.7482, "num_input_tokens_seen": 169328640, "step": 20670 }, { "epoch": 2.672179868200026, "grad_norm": 0.5645007491111755, "learning_rate": 1.4587107070019368e-06, "loss": 0.8197, "num_input_tokens_seen": 169410560, "step": 20680 }, { "epoch": 2.673472024809407, "grad_norm": 0.5567523241043091, "learning_rate": 1.447345926089283e-06, "loss": 0.8722, "num_input_tokens_seen": 169492480, "step": 20690 }, { "epoch": 2.674764181418788, "grad_norm": 1.2603198289871216, "learning_rate": 1.436024270000058e-06, "loss": 0.9786, "num_input_tokens_seen": 169574400, "step": 20700 }, { "epoch": 2.676056338028169, "grad_norm": 1.105191946029663, "learning_rate": 1.4247457594641662e-06, "loss": 0.8513, "num_input_tokens_seen": 169656320, "step": 20710 }, { "epoch": 2.67734849463755, "grad_norm": 0.5803696513175964, "learning_rate": 1.4135104151325184e-06, "loss": 0.8101, "num_input_tokens_seen": 169738240, "step": 20720 }, { "epoch": 2.678640651246931, "grad_norm": 0.6467001438140869, "learning_rate": 1.4023182575769956e-06, "loss": 0.5225, "num_input_tokens_seen": 169820160, "step": 20730 }, { "epoch": 2.679932807856312, "grad_norm": 1.2854204177856445, "learning_rate": 1.391169307290391e-06, "loss": 0.7088, "num_input_tokens_seen": 169902080, "step": 20740 }, { "epoch": 2.6812249644656934, "grad_norm": 0.6026389002799988, "learning_rate": 1.3800635846863973e-06, "loss": 0.9884, "num_input_tokens_seen": 169984000, "step": 20750 }, { "epoch": 2.6825171210750742, "grad_norm": 0.8329545259475708, "learning_rate": 1.3690011100995437e-06, "loss": 1.097, "num_input_tokens_seen": 170065920, "step": 20760 }, { "epoch": 2.683809277684455, "grad_norm": 1.0132107734680176, "learning_rate": 1.357981903785191e-06, "loss": 0.7501, "num_input_tokens_seen": 170147840, "step": 20770 }, { "epoch": 2.6851014342938364, "grad_norm": 0.5987882614135742, "learning_rate": 1.3470059859194583e-06, "loss": 0.9277, "num_input_tokens_seen": 170229760, "step": 20780 }, { "epoch": 2.6863935909032177, "grad_norm": 1.0641794204711914, "learning_rate": 1.3360733765992116e-06, "loss": 0.6113, "num_input_tokens_seen": 170311680, "step": 20790 }, { "epoch": 2.6876857475125986, "grad_norm": 0.6288622617721558, "learning_rate": 1.325184095842022e-06, "loss": 0.7514, "num_input_tokens_seen": 170393600, "step": 20800 }, { "epoch": 2.6889779041219795, "grad_norm": 1.762484073638916, "learning_rate": 1.3143381635861207e-06, "loss": 0.7529, "num_input_tokens_seen": 170475520, "step": 20810 }, { "epoch": 2.6902700607313608, "grad_norm": 0.8595255017280579, "learning_rate": 1.3035355996903697e-06, "loss": 0.8549, "num_input_tokens_seen": 170557440, "step": 20820 }, { "epoch": 2.6915622173407416, "grad_norm": 0.663703441619873, "learning_rate": 1.2927764239342221e-06, "loss": 0.7501, "num_input_tokens_seen": 170639360, "step": 20830 }, { "epoch": 2.692854373950123, "grad_norm": 0.9071587920188904, "learning_rate": 1.2820606560176945e-06, "loss": 0.9612, "num_input_tokens_seen": 170721280, "step": 20840 }, { "epoch": 2.694146530559504, "grad_norm": 1.5256946086883545, "learning_rate": 1.2713883155613144e-06, "loss": 0.8317, "num_input_tokens_seen": 170803200, "step": 20850 }, { "epoch": 2.6954386871688847, "grad_norm": 1.048254370689392, "learning_rate": 1.2607594221060975e-06, "loss": 0.6523, "num_input_tokens_seen": 170885120, "step": 20860 }, { "epoch": 2.696730843778266, "grad_norm": 0.8906717896461487, "learning_rate": 1.2501739951135155e-06, "loss": 0.8824, "num_input_tokens_seen": 170967040, "step": 20870 }, { "epoch": 2.698023000387647, "grad_norm": 0.6903151869773865, "learning_rate": 1.2396320539654366e-06, "loss": 0.8567, "num_input_tokens_seen": 171048960, "step": 20880 }, { "epoch": 2.699315156997028, "grad_norm": 1.00175940990448, "learning_rate": 1.229133617964126e-06, "loss": 1.1474, "num_input_tokens_seen": 171130880, "step": 20890 }, { "epoch": 2.700607313606409, "grad_norm": 0.7980899810791016, "learning_rate": 1.2186787063321743e-06, "loss": 0.5778, "num_input_tokens_seen": 171212800, "step": 20900 }, { "epoch": 2.70189947021579, "grad_norm": 0.848283052444458, "learning_rate": 1.208267338212493e-06, "loss": 0.9957, "num_input_tokens_seen": 171294720, "step": 20910 }, { "epoch": 2.7031916268251712, "grad_norm": 0.7135230898857117, "learning_rate": 1.1978995326682535e-06, "loss": 0.8154, "num_input_tokens_seen": 171376640, "step": 20920 }, { "epoch": 2.7044837834345525, "grad_norm": 0.7431100010871887, "learning_rate": 1.1875753086828727e-06, "loss": 0.3983, "num_input_tokens_seen": 171458560, "step": 20930 }, { "epoch": 2.7057759400439334, "grad_norm": 0.3723915219306946, "learning_rate": 1.177294685159963e-06, "loss": 0.8132, "num_input_tokens_seen": 171540480, "step": 20940 }, { "epoch": 2.7070680966533143, "grad_norm": 0.5851907134056091, "learning_rate": 1.167057680923317e-06, "loss": 0.6962, "num_input_tokens_seen": 171622400, "step": 20950 }, { "epoch": 2.7083602532626956, "grad_norm": 0.5370262861251831, "learning_rate": 1.1568643147168434e-06, "loss": 0.5466, "num_input_tokens_seen": 171704320, "step": 20960 }, { "epoch": 2.7096524098720765, "grad_norm": 0.6362653374671936, "learning_rate": 1.1467146052045603e-06, "loss": 0.9384, "num_input_tokens_seen": 171786240, "step": 20970 }, { "epoch": 2.7109445664814578, "grad_norm": 1.7740426063537598, "learning_rate": 1.1366085709705515e-06, "loss": 0.3847, "num_input_tokens_seen": 171868160, "step": 20980 }, { "epoch": 2.7122367230908386, "grad_norm": 0.7024763226509094, "learning_rate": 1.1265462305189268e-06, "loss": 0.7502, "num_input_tokens_seen": 171950080, "step": 20990 }, { "epoch": 2.7135288797002195, "grad_norm": 0.7445507645606995, "learning_rate": 1.1165276022737926e-06, "loss": 0.8216, "num_input_tokens_seen": 172032000, "step": 21000 }, { "epoch": 2.714821036309601, "grad_norm": 0.6791986227035522, "learning_rate": 1.1065527045792251e-06, "loss": 0.6687, "num_input_tokens_seen": 172113920, "step": 21010 }, { "epoch": 2.7161131929189817, "grad_norm": 0.999146580696106, "learning_rate": 1.0966215556992231e-06, "loss": 1.0147, "num_input_tokens_seen": 172195840, "step": 21020 }, { "epoch": 2.717405349528363, "grad_norm": 1.1012611389160156, "learning_rate": 1.0867341738176857e-06, "loss": 0.9388, "num_input_tokens_seen": 172277760, "step": 21030 }, { "epoch": 2.718697506137744, "grad_norm": 0.8878210783004761, "learning_rate": 1.076890577038367e-06, "loss": 0.9003, "num_input_tokens_seen": 172359680, "step": 21040 }, { "epoch": 2.7199896627471247, "grad_norm": 0.6756826043128967, "learning_rate": 1.0670907833848664e-06, "loss": 0.6774, "num_input_tokens_seen": 172441600, "step": 21050 }, { "epoch": 2.721281819356506, "grad_norm": 0.6432533860206604, "learning_rate": 1.0573348108005614e-06, "loss": 0.9375, "num_input_tokens_seen": 172523520, "step": 21060 }, { "epoch": 2.722573975965887, "grad_norm": 0.5785542726516724, "learning_rate": 1.0476226771486074e-06, "loss": 0.5949, "num_input_tokens_seen": 172605440, "step": 21070 }, { "epoch": 2.723866132575268, "grad_norm": 0.6814249753952026, "learning_rate": 1.0379544002118824e-06, "loss": 0.5954, "num_input_tokens_seen": 172687360, "step": 21080 }, { "epoch": 2.725158289184649, "grad_norm": 0.6625814437866211, "learning_rate": 1.0283299976929672e-06, "loss": 0.4849, "num_input_tokens_seen": 172769280, "step": 21090 }, { "epoch": 2.7264504457940304, "grad_norm": 0.4581983983516693, "learning_rate": 1.0187494872141102e-06, "loss": 0.6476, "num_input_tokens_seen": 172851200, "step": 21100 }, { "epoch": 2.7277426024034113, "grad_norm": 0.617347240447998, "learning_rate": 1.0092128863171846e-06, "loss": 1.1527, "num_input_tokens_seen": 172933120, "step": 21110 }, { "epoch": 2.7290347590127926, "grad_norm": 0.6780828833580017, "learning_rate": 9.997202124636785e-07, "loss": 0.7512, "num_input_tokens_seen": 173015040, "step": 21120 }, { "epoch": 2.7303269156221734, "grad_norm": 1.0733816623687744, "learning_rate": 9.902714830346437e-07, "loss": 0.6027, "num_input_tokens_seen": 173096960, "step": 21130 }, { "epoch": 2.7316190722315543, "grad_norm": 0.6169953942298889, "learning_rate": 9.808667153306612e-07, "loss": 0.5431, "num_input_tokens_seen": 173178880, "step": 21140 }, { "epoch": 2.7329112288409356, "grad_norm": 0.3251383602619171, "learning_rate": 9.715059265718335e-07, "loss": 0.7888, "num_input_tokens_seen": 173260800, "step": 21150 }, { "epoch": 2.7342033854503165, "grad_norm": 0.7879831194877625, "learning_rate": 9.62189133897723e-07, "loss": 0.9271, "num_input_tokens_seen": 173342720, "step": 21160 }, { "epoch": 2.735495542059698, "grad_norm": 0.988198459148407, "learning_rate": 9.52916354367353e-07, "loss": 0.8073, "num_input_tokens_seen": 173424640, "step": 21170 }, { "epoch": 2.7367876986690787, "grad_norm": 0.5871654152870178, "learning_rate": 9.436876049591398e-07, "loss": 1.2963, "num_input_tokens_seen": 173506560, "step": 21180 }, { "epoch": 2.7380798552784595, "grad_norm": 0.9331020712852478, "learning_rate": 9.345029025708995e-07, "loss": 0.6766, "num_input_tokens_seen": 173588480, "step": 21190 }, { "epoch": 2.739372011887841, "grad_norm": 1.031977891921997, "learning_rate": 9.253622640197773e-07, "loss": 0.5872, "num_input_tokens_seen": 173670400, "step": 21200 }, { "epoch": 2.7406641684972217, "grad_norm": 0.28784510493278503, "learning_rate": 9.162657060422574e-07, "loss": 0.5351, "num_input_tokens_seen": 173752320, "step": 21210 }, { "epoch": 2.741956325106603, "grad_norm": 0.7928471565246582, "learning_rate": 9.072132452941002e-07, "loss": 0.6825, "num_input_tokens_seen": 173834240, "step": 21220 }, { "epoch": 2.743248481715984, "grad_norm": 0.6947720646858215, "learning_rate": 8.982048983503271e-07, "loss": 0.84, "num_input_tokens_seen": 173916160, "step": 21230 }, { "epoch": 2.7445406383253648, "grad_norm": 0.6338273882865906, "learning_rate": 8.892406817051946e-07, "loss": 0.4665, "num_input_tokens_seen": 173998080, "step": 21240 }, { "epoch": 2.745832794934746, "grad_norm": 0.6995043754577637, "learning_rate": 8.803206117721424e-07, "loss": 0.8926, "num_input_tokens_seen": 174080000, "step": 21250 }, { "epoch": 2.7471249515441274, "grad_norm": 0.9222922325134277, "learning_rate": 8.714447048837948e-07, "loss": 0.8874, "num_input_tokens_seen": 174161920, "step": 21260 }, { "epoch": 2.7484171081535083, "grad_norm": 1.1156824827194214, "learning_rate": 8.626129772918962e-07, "loss": 0.9445, "num_input_tokens_seen": 174243840, "step": 21270 }, { "epoch": 2.749709264762889, "grad_norm": 0.2606189250946045, "learning_rate": 8.538254451673138e-07, "loss": 0.7208, "num_input_tokens_seen": 174325760, "step": 21280 }, { "epoch": 2.7510014213722704, "grad_norm": 1.007570743560791, "learning_rate": 8.450821245999829e-07, "loss": 0.6216, "num_input_tokens_seen": 174407680, "step": 21290 }, { "epoch": 2.7522935779816513, "grad_norm": 1.0008625984191895, "learning_rate": 8.363830315988947e-07, "loss": 0.9531, "num_input_tokens_seen": 174489600, "step": 21300 }, { "epoch": 2.7535857345910326, "grad_norm": 0.3577563166618347, "learning_rate": 8.277281820920523e-07, "loss": 0.7485, "num_input_tokens_seen": 174571520, "step": 21310 }, { "epoch": 2.7548778912004135, "grad_norm": 0.6377021074295044, "learning_rate": 8.191175919264604e-07, "loss": 1.0028, "num_input_tokens_seen": 174653440, "step": 21320 }, { "epoch": 2.7561700478097944, "grad_norm": 1.360044002532959, "learning_rate": 8.105512768680712e-07, "loss": 0.7594, "num_input_tokens_seen": 174735360, "step": 21330 }, { "epoch": 2.7574622044191757, "grad_norm": 0.5991158485412598, "learning_rate": 8.02029252601777e-07, "loss": 0.7408, "num_input_tokens_seen": 174817280, "step": 21340 }, { "epoch": 2.7587543610285565, "grad_norm": 0.593151330947876, "learning_rate": 7.935515347313793e-07, "loss": 0.845, "num_input_tokens_seen": 174899200, "step": 21350 }, { "epoch": 2.760046517637938, "grad_norm": 0.6272279620170593, "learning_rate": 7.851181387795392e-07, "loss": 0.9223, "num_input_tokens_seen": 174981120, "step": 21360 }, { "epoch": 2.7613386742473187, "grad_norm": 0.7071083188056946, "learning_rate": 7.767290801877796e-07, "loss": 0.5397, "num_input_tokens_seen": 175063040, "step": 21370 }, { "epoch": 2.7626308308566996, "grad_norm": 0.23855863511562347, "learning_rate": 7.683843743164359e-07, "loss": 0.7537, "num_input_tokens_seen": 175144960, "step": 21380 }, { "epoch": 2.763922987466081, "grad_norm": 0.9484770894050598, "learning_rate": 7.600840364446333e-07, "loss": 0.7232, "num_input_tokens_seen": 175226880, "step": 21390 }, { "epoch": 2.765215144075462, "grad_norm": 0.7923175096511841, "learning_rate": 7.518280817702616e-07, "loss": 0.683, "num_input_tokens_seen": 175308800, "step": 21400 }, { "epoch": 2.766507300684843, "grad_norm": 1.019843578338623, "learning_rate": 7.436165254099376e-07, "loss": 0.9568, "num_input_tokens_seen": 175390720, "step": 21410 }, { "epoch": 2.767799457294224, "grad_norm": 0.6530530452728271, "learning_rate": 7.354493823990006e-07, "loss": 0.8146, "num_input_tokens_seen": 175472640, "step": 21420 }, { "epoch": 2.7690916139036053, "grad_norm": 0.8259232044219971, "learning_rate": 7.273266676914498e-07, "loss": 0.8714, "num_input_tokens_seen": 175554560, "step": 21430 }, { "epoch": 2.770383770512986, "grad_norm": 0.8644856214523315, "learning_rate": 7.19248396159955e-07, "loss": 0.621, "num_input_tokens_seen": 175636480, "step": 21440 }, { "epoch": 2.7716759271223674, "grad_norm": 0.7736506462097168, "learning_rate": 7.112145825957927e-07, "loss": 0.8045, "num_input_tokens_seen": 175718400, "step": 21450 }, { "epoch": 2.7729680837317483, "grad_norm": 0.886471152305603, "learning_rate": 7.03225241708852e-07, "loss": 1.0378, "num_input_tokens_seen": 175800320, "step": 21460 }, { "epoch": 2.774260240341129, "grad_norm": 0.7383142113685608, "learning_rate": 6.952803881275894e-07, "loss": 0.5995, "num_input_tokens_seen": 175882240, "step": 21470 }, { "epoch": 2.7755523969505105, "grad_norm": 1.1481050252914429, "learning_rate": 6.873800363989935e-07, "loss": 0.9361, "num_input_tokens_seen": 175964160, "step": 21480 }, { "epoch": 2.7768445535598913, "grad_norm": 1.066712737083435, "learning_rate": 6.795242009885905e-07, "loss": 0.7132, "num_input_tokens_seen": 176046080, "step": 21490 }, { "epoch": 2.7781367101692727, "grad_norm": 0.7255296111106873, "learning_rate": 6.717128962803798e-07, "loss": 0.9569, "num_input_tokens_seen": 176128000, "step": 21500 }, { "epoch": 2.7794288667786535, "grad_norm": 0.3221868574619293, "learning_rate": 6.63946136576829e-07, "loss": 0.5613, "num_input_tokens_seen": 176209920, "step": 21510 }, { "epoch": 2.7807210233880344, "grad_norm": 0.8177877068519592, "learning_rate": 6.562239360988542e-07, "loss": 0.8761, "num_input_tokens_seen": 176291840, "step": 21520 }, { "epoch": 2.7820131799974157, "grad_norm": 0.910959780216217, "learning_rate": 6.485463089857674e-07, "loss": 0.5046, "num_input_tokens_seen": 176373760, "step": 21530 }, { "epoch": 2.783305336606797, "grad_norm": 1.2782275676727295, "learning_rate": 6.409132692952874e-07, "loss": 0.8931, "num_input_tokens_seen": 176455680, "step": 21540 }, { "epoch": 2.784597493216178, "grad_norm": 0.6053476333618164, "learning_rate": 6.333248310034706e-07, "loss": 0.9367, "num_input_tokens_seen": 176537600, "step": 21550 }, { "epoch": 2.7858896498255588, "grad_norm": 0.9305321574211121, "learning_rate": 6.257810080047249e-07, "loss": 1.0246, "num_input_tokens_seen": 176619520, "step": 21560 }, { "epoch": 2.78718180643494, "grad_norm": 1.467142105102539, "learning_rate": 6.182818141117625e-07, "loss": 0.7575, "num_input_tokens_seen": 176701440, "step": 21570 }, { "epoch": 2.788473963044321, "grad_norm": 0.3563095033168793, "learning_rate": 6.1082726305558e-07, "loss": 0.8363, "num_input_tokens_seen": 176783360, "step": 21580 }, { "epoch": 2.7897661196537022, "grad_norm": 0.7321064472198486, "learning_rate": 6.034173684854316e-07, "loss": 0.8788, "num_input_tokens_seen": 176865280, "step": 21590 }, { "epoch": 2.791058276263083, "grad_norm": 0.5982012152671814, "learning_rate": 5.960521439688088e-07, "loss": 0.7845, "num_input_tokens_seen": 176947200, "step": 21600 }, { "epoch": 2.792350432872464, "grad_norm": 0.965398371219635, "learning_rate": 5.88731602991413e-07, "loss": 0.8485, "num_input_tokens_seen": 177029120, "step": 21610 }, { "epoch": 2.7936425894818453, "grad_norm": 0.33368125557899475, "learning_rate": 5.814557589571223e-07, "loss": 0.9114, "num_input_tokens_seen": 177111040, "step": 21620 }, { "epoch": 2.794934746091226, "grad_norm": 1.154457688331604, "learning_rate": 5.742246251879829e-07, "loss": 0.6277, "num_input_tokens_seen": 177192960, "step": 21630 }, { "epoch": 2.7962269027006075, "grad_norm": 0.728986382484436, "learning_rate": 5.67038214924176e-07, "loss": 0.8553, "num_input_tokens_seen": 177274880, "step": 21640 }, { "epoch": 2.7975190593099883, "grad_norm": 0.7881826758384705, "learning_rate": 5.598965413239926e-07, "loss": 0.7787, "num_input_tokens_seen": 177356800, "step": 21650 }, { "epoch": 2.798811215919369, "grad_norm": 0.42167970538139343, "learning_rate": 5.527996174638061e-07, "loss": 0.5017, "num_input_tokens_seen": 177438720, "step": 21660 }, { "epoch": 2.8001033725287505, "grad_norm": 0.8099974393844604, "learning_rate": 5.457474563380638e-07, "loss": 1.0671, "num_input_tokens_seen": 177520640, "step": 21670 }, { "epoch": 2.8013955291381314, "grad_norm": 0.619404673576355, "learning_rate": 5.387400708592422e-07, "loss": 1.0223, "num_input_tokens_seen": 177602560, "step": 21680 }, { "epoch": 2.8026876857475127, "grad_norm": 1.066259741783142, "learning_rate": 5.317774738578446e-07, "loss": 0.7952, "num_input_tokens_seen": 177684480, "step": 21690 }, { "epoch": 2.8039798423568936, "grad_norm": 0.8455848097801208, "learning_rate": 5.248596780823567e-07, "loss": 0.5789, "num_input_tokens_seen": 177766400, "step": 21700 }, { "epoch": 2.8052719989662744, "grad_norm": 0.40524500608444214, "learning_rate": 5.179866961992353e-07, "loss": 0.9732, "num_input_tokens_seen": 177848320, "step": 21710 }, { "epoch": 2.8065641555756558, "grad_norm": 0.7112755179405212, "learning_rate": 5.111585407928887e-07, "loss": 0.6634, "num_input_tokens_seen": 177930240, "step": 21720 }, { "epoch": 2.807856312185037, "grad_norm": 0.7933624386787415, "learning_rate": 5.043752243656414e-07, "loss": 0.6454, "num_input_tokens_seen": 178012160, "step": 21730 }, { "epoch": 2.809148468794418, "grad_norm": 0.6080639362335205, "learning_rate": 4.976367593377218e-07, "loss": 0.8702, "num_input_tokens_seen": 178094080, "step": 21740 }, { "epoch": 2.810440625403799, "grad_norm": 0.7494640946388245, "learning_rate": 4.909431580472385e-07, "loss": 1.0812, "num_input_tokens_seen": 178176000, "step": 21750 }, { "epoch": 2.81173278201318, "grad_norm": 0.49116799235343933, "learning_rate": 4.842944327501458e-07, "loss": 0.9232, "num_input_tokens_seen": 178257920, "step": 21760 }, { "epoch": 2.813024938622561, "grad_norm": 0.8744192719459534, "learning_rate": 4.776905956202393e-07, "loss": 0.7583, "num_input_tokens_seen": 178339840, "step": 21770 }, { "epoch": 2.8143170952319423, "grad_norm": 0.6495558619499207, "learning_rate": 4.711316587491188e-07, "loss": 0.626, "num_input_tokens_seen": 178421760, "step": 21780 }, { "epoch": 2.815609251841323, "grad_norm": 0.5544187426567078, "learning_rate": 4.646176341461722e-07, "loss": 0.7818, "num_input_tokens_seen": 178503680, "step": 21790 }, { "epoch": 2.816901408450704, "grad_norm": 0.26983362436294556, "learning_rate": 4.581485337385588e-07, "loss": 0.7148, "num_input_tokens_seen": 178585600, "step": 21800 }, { "epoch": 2.8181935650600853, "grad_norm": 0.6384357213973999, "learning_rate": 4.5172436937117036e-07, "loss": 0.8498, "num_input_tokens_seen": 178667520, "step": 21810 }, { "epoch": 2.819485721669466, "grad_norm": 1.1162294149398804, "learning_rate": 4.4534515280663937e-07, "loss": 1.0099, "num_input_tokens_seen": 178749440, "step": 21820 }, { "epoch": 2.8207778782788475, "grad_norm": 1.0189695358276367, "learning_rate": 4.390108957252781e-07, "loss": 0.878, "num_input_tokens_seen": 178831360, "step": 21830 }, { "epoch": 2.8220700348882284, "grad_norm": 0.7732197046279907, "learning_rate": 4.3272160972509524e-07, "loss": 0.7988, "num_input_tokens_seen": 178913280, "step": 21840 }, { "epoch": 2.8233621914976093, "grad_norm": 0.9585608839988708, "learning_rate": 4.264773063217431e-07, "loss": 1.0454, "num_input_tokens_seen": 178995200, "step": 21850 }, { "epoch": 2.8246543481069906, "grad_norm": 0.694817841053009, "learning_rate": 4.20277996948526e-07, "loss": 0.9275, "num_input_tokens_seen": 179077120, "step": 21860 }, { "epoch": 2.825946504716372, "grad_norm": 0.8328717350959778, "learning_rate": 4.1412369295635023e-07, "loss": 0.95, "num_input_tokens_seen": 179159040, "step": 21870 }, { "epoch": 2.8272386613257527, "grad_norm": 0.9796121716499329, "learning_rate": 4.0801440561372694e-07, "loss": 0.8007, "num_input_tokens_seen": 179240960, "step": 21880 }, { "epoch": 2.8285308179351336, "grad_norm": 1.0329798460006714, "learning_rate": 4.0195014610674153e-07, "loss": 0.8786, "num_input_tokens_seen": 179322880, "step": 21890 }, { "epoch": 2.829822974544515, "grad_norm": 0.4001299738883972, "learning_rate": 3.9593092553902587e-07, "loss": 0.8457, "num_input_tokens_seen": 179404800, "step": 21900 }, { "epoch": 2.831115131153896, "grad_norm": 0.5981205701828003, "learning_rate": 3.899567549317529e-07, "loss": 0.7406, "num_input_tokens_seen": 179486720, "step": 21910 }, { "epoch": 2.832407287763277, "grad_norm": 0.5832979679107666, "learning_rate": 3.840276452236058e-07, "loss": 0.7223, "num_input_tokens_seen": 179568640, "step": 21920 }, { "epoch": 2.833699444372658, "grad_norm": 0.6982936263084412, "learning_rate": 3.7814360727076724e-07, "loss": 0.8734, "num_input_tokens_seen": 179650560, "step": 21930 }, { "epoch": 2.834991600982039, "grad_norm": 0.81357342004776, "learning_rate": 3.723046518468859e-07, "loss": 0.5382, "num_input_tokens_seen": 179732480, "step": 21940 }, { "epoch": 2.83628375759142, "grad_norm": 0.6205422282218933, "learning_rate": 3.6651078964306807e-07, "loss": 1.0019, "num_input_tokens_seen": 179814400, "step": 21950 }, { "epoch": 2.837575914200801, "grad_norm": 0.670599102973938, "learning_rate": 3.607620312678528e-07, "loss": 0.7448, "num_input_tokens_seen": 179896320, "step": 21960 }, { "epoch": 2.8388680708101823, "grad_norm": 1.0795679092407227, "learning_rate": 3.550583872471952e-07, "loss": 0.553, "num_input_tokens_seen": 179978240, "step": 21970 }, { "epoch": 2.840160227419563, "grad_norm": 0.7728472352027893, "learning_rate": 3.4939986802445256e-07, "loss": 0.9766, "num_input_tokens_seen": 180060160, "step": 21980 }, { "epoch": 2.841452384028944, "grad_norm": 0.24065038561820984, "learning_rate": 3.437864839603455e-07, "loss": 0.3731, "num_input_tokens_seen": 180142080, "step": 21990 }, { "epoch": 2.8427445406383254, "grad_norm": 1.1644612550735474, "learning_rate": 3.3821824533296633e-07, "loss": 0.5934, "num_input_tokens_seen": 180224000, "step": 22000 }, { "epoch": 2.8440366972477067, "grad_norm": 0.8106747269630432, "learning_rate": 3.3269516233773446e-07, "loss": 0.7189, "num_input_tokens_seen": 180305920, "step": 22010 }, { "epoch": 2.8453288538570876, "grad_norm": 0.24490030109882355, "learning_rate": 3.272172450873967e-07, "loss": 0.6045, "num_input_tokens_seen": 180387840, "step": 22020 }, { "epoch": 2.8466210104664684, "grad_norm": 1.151792287826538, "learning_rate": 3.217845036119993e-07, "loss": 0.4563, "num_input_tokens_seen": 180469760, "step": 22030 }, { "epoch": 2.8479131670758497, "grad_norm": 0.4980817139148712, "learning_rate": 3.163969478588713e-07, "loss": 0.7337, "num_input_tokens_seen": 180551680, "step": 22040 }, { "epoch": 2.8492053236852306, "grad_norm": 0.7074184417724609, "learning_rate": 3.11054587692608e-07, "loss": 0.8102, "num_input_tokens_seen": 180633600, "step": 22050 }, { "epoch": 2.850497480294612, "grad_norm": 0.7132773995399475, "learning_rate": 3.057574328950541e-07, "loss": 0.841, "num_input_tokens_seen": 180715520, "step": 22060 }, { "epoch": 2.851789636903993, "grad_norm": 1.5704045295715332, "learning_rate": 3.005054931652762e-07, "loss": 0.5297, "num_input_tokens_seen": 180797440, "step": 22070 }, { "epoch": 2.8530817935133737, "grad_norm": 1.3629149198532104, "learning_rate": 2.952987781195599e-07, "loss": 0.7995, "num_input_tokens_seen": 180879360, "step": 22080 }, { "epoch": 2.854373950122755, "grad_norm": 0.6526079773902893, "learning_rate": 2.901372972913791e-07, "loss": 0.8583, "num_input_tokens_seen": 180961280, "step": 22090 }, { "epoch": 2.855666106732136, "grad_norm": 0.8587889075279236, "learning_rate": 2.8502106013138516e-07, "loss": 0.8178, "num_input_tokens_seen": 181043200, "step": 22100 }, { "epoch": 2.856958263341517, "grad_norm": 0.6336101293563843, "learning_rate": 2.799500760073931e-07, "loss": 0.7179, "num_input_tokens_seen": 181125120, "step": 22110 }, { "epoch": 2.858250419950898, "grad_norm": 0.9315862059593201, "learning_rate": 2.749243542043561e-07, "loss": 0.4239, "num_input_tokens_seen": 181207040, "step": 22120 }, { "epoch": 2.859542576560279, "grad_norm": 0.7689865827560425, "learning_rate": 2.699439039243523e-07, "loss": 0.88, "num_input_tokens_seen": 181288960, "step": 22130 }, { "epoch": 2.86083473316966, "grad_norm": 0.37839198112487793, "learning_rate": 2.6500873428656483e-07, "loss": 0.4962, "num_input_tokens_seen": 181370880, "step": 22140 }, { "epoch": 2.862126889779041, "grad_norm": 0.6896770596504211, "learning_rate": 2.601188543272737e-07, "loss": 0.7783, "num_input_tokens_seen": 181452800, "step": 22150 }, { "epoch": 2.8634190463884224, "grad_norm": 0.639472484588623, "learning_rate": 2.552742729998309e-07, "loss": 1.1341, "num_input_tokens_seen": 181534720, "step": 22160 }, { "epoch": 2.8647112029978032, "grad_norm": 0.6958195567131042, "learning_rate": 2.5047499917464636e-07, "loss": 0.6448, "num_input_tokens_seen": 181616640, "step": 22170 }, { "epoch": 2.8660033596071846, "grad_norm": 0.9285911321640015, "learning_rate": 2.457210416391742e-07, "loss": 0.7003, "num_input_tokens_seen": 181698560, "step": 22180 }, { "epoch": 2.8672955162165654, "grad_norm": 1.0297704935073853, "learning_rate": 2.4101240909789325e-07, "loss": 0.9887, "num_input_tokens_seen": 181780480, "step": 22190 }, { "epoch": 2.8685876728259467, "grad_norm": 0.7845831513404846, "learning_rate": 2.3634911017229034e-07, "loss": 0.6259, "num_input_tokens_seen": 181862400, "step": 22200 }, { "epoch": 2.8698798294353276, "grad_norm": 0.7428514361381531, "learning_rate": 2.3173115340085204e-07, "loss": 0.9638, "num_input_tokens_seen": 181944320, "step": 22210 }, { "epoch": 2.8711719860447085, "grad_norm": 0.7519323825836182, "learning_rate": 2.2715854723903974e-07, "loss": 0.9015, "num_input_tokens_seen": 182026240, "step": 22220 }, { "epoch": 2.87246414265409, "grad_norm": 1.0879383087158203, "learning_rate": 2.2263130005927558e-07, "loss": 0.7179, "num_input_tokens_seen": 182108160, "step": 22230 }, { "epoch": 2.8737562992634706, "grad_norm": 0.7938571572303772, "learning_rate": 2.181494201509343e-07, "loss": 0.936, "num_input_tokens_seen": 182190080, "step": 22240 }, { "epoch": 2.875048455872852, "grad_norm": 0.4322587549686432, "learning_rate": 2.1371291572032382e-07, "loss": 0.5952, "num_input_tokens_seen": 182272000, "step": 22250 }, { "epoch": 2.876340612482233, "grad_norm": 0.47111037373542786, "learning_rate": 2.0932179489066006e-07, "loss": 0.7432, "num_input_tokens_seen": 182353920, "step": 22260 }, { "epoch": 2.8776327690916137, "grad_norm": 0.7983747720718384, "learning_rate": 2.0497606570207829e-07, "loss": 0.8684, "num_input_tokens_seen": 182435840, "step": 22270 }, { "epoch": 2.878924925700995, "grad_norm": 0.3726508915424347, "learning_rate": 2.0067573611158853e-07, "loss": 0.6558, "num_input_tokens_seen": 182517760, "step": 22280 }, { "epoch": 2.880217082310376, "grad_norm": 1.232807993888855, "learning_rate": 1.9642081399307844e-07, "loss": 0.562, "num_input_tokens_seen": 182599680, "step": 22290 }, { "epoch": 2.881509238919757, "grad_norm": 1.0021343231201172, "learning_rate": 1.9221130713729663e-07, "loss": 0.7792, "num_input_tokens_seen": 182681600, "step": 22300 }, { "epoch": 2.882801395529138, "grad_norm": 0.8979313969612122, "learning_rate": 1.8804722325183044e-07, "loss": 0.7825, "num_input_tokens_seen": 182763520, "step": 22310 }, { "epoch": 2.884093552138519, "grad_norm": 0.4306122660636902, "learning_rate": 1.8392856996110875e-07, "loss": 0.6558, "num_input_tokens_seen": 182845440, "step": 22320 }, { "epoch": 2.8853857087479002, "grad_norm": 0.7078767418861389, "learning_rate": 1.7985535480636584e-07, "loss": 0.6038, "num_input_tokens_seen": 182927360, "step": 22330 }, { "epoch": 2.8866778653572815, "grad_norm": 0.7182630300521851, "learning_rate": 1.7582758524564425e-07, "loss": 0.787, "num_input_tokens_seen": 183009280, "step": 22340 }, { "epoch": 2.8879700219666624, "grad_norm": 0.6843809485435486, "learning_rate": 1.7184526865377805e-07, "loss": 0.56, "num_input_tokens_seen": 183091200, "step": 22350 }, { "epoch": 2.8892621785760433, "grad_norm": 0.5071055293083191, "learning_rate": 1.6790841232237064e-07, "loss": 0.494, "num_input_tokens_seen": 183173120, "step": 22360 }, { "epoch": 2.8905543351854246, "grad_norm": 0.43069300055503845, "learning_rate": 1.6401702345979485e-07, "loss": 1.0883, "num_input_tokens_seen": 183255040, "step": 22370 }, { "epoch": 2.8918464917948055, "grad_norm": 0.9334409832954407, "learning_rate": 1.6017110919116786e-07, "loss": 0.7701, "num_input_tokens_seen": 183336960, "step": 22380 }, { "epoch": 2.8931386484041868, "grad_norm": 0.9663013815879822, "learning_rate": 1.5637067655834282e-07, "loss": 0.7901, "num_input_tokens_seen": 183418880, "step": 22390 }, { "epoch": 2.8944308050135676, "grad_norm": 0.45232853293418884, "learning_rate": 1.526157325199007e-07, "loss": 0.8333, "num_input_tokens_seen": 183500800, "step": 22400 }, { "epoch": 2.8957229616229485, "grad_norm": 1.2626268863677979, "learning_rate": 1.4890628395113072e-07, "loss": 0.3667, "num_input_tokens_seen": 183582720, "step": 22410 }, { "epoch": 2.89701511823233, "grad_norm": 1.018869400024414, "learning_rate": 1.452423376440193e-07, "loss": 1.0644, "num_input_tokens_seen": 183664640, "step": 22420 }, { "epoch": 2.8983072748417107, "grad_norm": 0.7966929078102112, "learning_rate": 1.4162390030723617e-07, "loss": 0.6229, "num_input_tokens_seen": 183746560, "step": 22430 }, { "epoch": 2.899599431451092, "grad_norm": 1.1223193407058716, "learning_rate": 1.380509785661288e-07, "loss": 0.8879, "num_input_tokens_seen": 183828480, "step": 22440 }, { "epoch": 2.900891588060473, "grad_norm": 0.751388430595398, "learning_rate": 1.3452357896270308e-07, "loss": 0.7747, "num_input_tokens_seen": 183910400, "step": 22450 }, { "epoch": 2.9021837446698537, "grad_norm": 0.7575998902320862, "learning_rate": 1.3104170795561477e-07, "loss": 0.7773, "num_input_tokens_seen": 183992320, "step": 22460 }, { "epoch": 2.903475901279235, "grad_norm": 0.6274828314781189, "learning_rate": 1.2760537192015866e-07, "loss": 0.7762, "num_input_tokens_seen": 184074240, "step": 22470 }, { "epoch": 2.9047680578886164, "grad_norm": 0.9277952909469604, "learning_rate": 1.242145771482489e-07, "loss": 0.7858, "num_input_tokens_seen": 184156160, "step": 22480 }, { "epoch": 2.9060602144979972, "grad_norm": 1.2103768587112427, "learning_rate": 1.2086932984842758e-07, "loss": 0.9757, "num_input_tokens_seen": 184238080, "step": 22490 }, { "epoch": 2.907352371107378, "grad_norm": 0.7629841566085815, "learning_rate": 1.1756963614582006e-07, "loss": 0.8254, "num_input_tokens_seen": 184320000, "step": 22500 }, { "epoch": 2.9086445277167594, "grad_norm": 0.32851213216781616, "learning_rate": 1.1431550208215736e-07, "loss": 0.2809, "num_input_tokens_seen": 184401920, "step": 22510 }, { "epoch": 2.9099366843261403, "grad_norm": 0.37616801261901855, "learning_rate": 1.1110693361574831e-07, "loss": 0.779, "num_input_tokens_seen": 184483840, "step": 22520 }, { "epoch": 2.9112288409355216, "grad_norm": 1.0593737363815308, "learning_rate": 1.0794393662147129e-07, "loss": 0.9263, "num_input_tokens_seen": 184565760, "step": 22530 }, { "epoch": 2.9125209975449025, "grad_norm": 0.390419602394104, "learning_rate": 1.0482651689075751e-07, "loss": 0.6628, "num_input_tokens_seen": 184647680, "step": 22540 }, { "epoch": 2.9138131541542833, "grad_norm": 0.5607829689979553, "learning_rate": 1.0175468013159384e-07, "loss": 0.7321, "num_input_tokens_seen": 184729600, "step": 22550 }, { "epoch": 2.9151053107636646, "grad_norm": 0.9104856252670288, "learning_rate": 9.872843196850057e-08, "loss": 0.8234, "num_input_tokens_seen": 184811520, "step": 22560 }, { "epoch": 2.9163974673730455, "grad_norm": 1.4063791036605835, "learning_rate": 9.574777794253143e-08, "loss": 0.5345, "num_input_tokens_seen": 184893440, "step": 22570 }, { "epoch": 2.917689623982427, "grad_norm": 2.190293073654175, "learning_rate": 9.281272351124859e-08, "loss": 0.9455, "num_input_tokens_seen": 184975360, "step": 22580 }, { "epoch": 2.9189817805918077, "grad_norm": 0.6696395874023438, "learning_rate": 8.992327404872825e-08, "loss": 0.3409, "num_input_tokens_seen": 185057280, "step": 22590 }, { "epoch": 2.9202739372011886, "grad_norm": 1.176025629043579, "learning_rate": 8.707943484553838e-08, "loss": 0.5035, "num_input_tokens_seen": 185139200, "step": 22600 }, { "epoch": 2.92156609381057, "grad_norm": 0.6432631015777588, "learning_rate": 8.428121110874154e-08, "loss": 0.7041, "num_input_tokens_seen": 185221120, "step": 22610 }, { "epoch": 2.9228582504199507, "grad_norm": 0.8045111298561096, "learning_rate": 8.152860796187545e-08, "loss": 0.8394, "num_input_tokens_seen": 185303040, "step": 22620 }, { "epoch": 2.924150407029332, "grad_norm": 0.2948504686355591, "learning_rate": 7.882163044494462e-08, "loss": 0.3898, "num_input_tokens_seen": 185384960, "step": 22630 }, { "epoch": 2.925442563638713, "grad_norm": 1.1298701763153076, "learning_rate": 7.616028351441484e-08, "loss": 0.7981, "num_input_tokens_seen": 185466880, "step": 22640 }, { "epoch": 2.9267347202480942, "grad_norm": 0.6935545802116394, "learning_rate": 7.354457204320486e-08, "loss": 0.8605, "num_input_tokens_seen": 185548800, "step": 22650 }, { "epoch": 2.928026876857475, "grad_norm": 0.624178409576416, "learning_rate": 7.097450082066969e-08, "loss": 0.687, "num_input_tokens_seen": 185630720, "step": 22660 }, { "epoch": 2.9293190334668564, "grad_norm": 0.7468099594116211, "learning_rate": 6.845007455260343e-08, "loss": 0.6511, "num_input_tokens_seen": 185712640, "step": 22670 }, { "epoch": 2.9306111900762373, "grad_norm": 0.20860843360424042, "learning_rate": 6.59712978612198e-08, "loss": 0.4383, "num_input_tokens_seen": 185794560, "step": 22680 }, { "epoch": 2.931903346685618, "grad_norm": 0.3760750889778137, "learning_rate": 6.353817528514938e-08, "loss": 0.4383, "num_input_tokens_seen": 185876480, "step": 22690 }, { "epoch": 2.9331955032949995, "grad_norm": 0.42980486154556274, "learning_rate": 6.11507112794285e-08, "loss": 0.6782, "num_input_tokens_seen": 185958400, "step": 22700 }, { "epoch": 2.9344876599043803, "grad_norm": 1.127774715423584, "learning_rate": 5.880891021549928e-08, "loss": 0.676, "num_input_tokens_seen": 186040320, "step": 22710 }, { "epoch": 2.9357798165137616, "grad_norm": 0.8025173544883728, "learning_rate": 5.6512776381192903e-08, "loss": 0.7285, "num_input_tokens_seen": 186122240, "step": 22720 }, { "epoch": 2.9370719731231425, "grad_norm": 0.6310427784919739, "learning_rate": 5.426231398071302e-08, "loss": 0.6244, "num_input_tokens_seen": 186204160, "step": 22730 }, { "epoch": 2.9383641297325234, "grad_norm": 0.8183490037918091, "learning_rate": 5.205752713465794e-08, "loss": 0.8171, "num_input_tokens_seen": 186286080, "step": 22740 }, { "epoch": 2.9396562863419047, "grad_norm": 1.129209041595459, "learning_rate": 4.989841987997901e-08, "loss": 0.9641, "num_input_tokens_seen": 186368000, "step": 22750 }, { "epoch": 2.9409484429512855, "grad_norm": 0.7792519927024841, "learning_rate": 4.778499616999166e-08, "loss": 0.7594, "num_input_tokens_seen": 186449920, "step": 22760 }, { "epoch": 2.942240599560667, "grad_norm": 0.7968661189079285, "learning_rate": 4.57172598743727e-08, "loss": 0.9113, "num_input_tokens_seen": 186531840, "step": 22770 }, { "epoch": 2.9435327561700477, "grad_norm": 0.6901931166648865, "learning_rate": 4.369521477913529e-08, "loss": 0.8022, "num_input_tokens_seen": 186613760, "step": 22780 }, { "epoch": 2.9448249127794286, "grad_norm": 0.7363210916519165, "learning_rate": 4.171886458664009e-08, "loss": 0.7282, "num_input_tokens_seen": 186695680, "step": 22790 }, { "epoch": 2.94611706938881, "grad_norm": 0.35474592447280884, "learning_rate": 3.9788212915573e-08, "loss": 0.6295, "num_input_tokens_seen": 186777600, "step": 22800 }, { "epoch": 2.947409225998191, "grad_norm": 0.9408590197563171, "learning_rate": 3.7903263300956285e-08, "loss": 0.6325, "num_input_tokens_seen": 186859520, "step": 22810 }, { "epoch": 2.948701382607572, "grad_norm": 0.9621270895004272, "learning_rate": 3.606401919411806e-08, "loss": 0.8424, "num_input_tokens_seen": 186941440, "step": 22820 }, { "epoch": 2.949993539216953, "grad_norm": 0.752377450466156, "learning_rate": 3.427048396271171e-08, "loss": 1.1133, "num_input_tokens_seen": 187023360, "step": 22830 }, { "epoch": 2.9512856958263343, "grad_norm": 0.8131755590438843, "learning_rate": 3.252266089069367e-08, "loss": 0.6982, "num_input_tokens_seen": 187105280, "step": 22840 }, { "epoch": 2.952577852435715, "grad_norm": 0.7799323797225952, "learning_rate": 3.0820553178320667e-08, "loss": 0.9222, "num_input_tokens_seen": 187187200, "step": 22850 }, { "epoch": 2.9538700090450964, "grad_norm": 0.4065955877304077, "learning_rate": 2.9164163942146937e-08, "loss": 0.5809, "num_input_tokens_seen": 187269120, "step": 22860 }, { "epoch": 2.9551621656544773, "grad_norm": 0.6465580463409424, "learning_rate": 2.7553496215015907e-08, "loss": 0.7016, "num_input_tokens_seen": 187351040, "step": 22870 }, { "epoch": 2.956454322263858, "grad_norm": 0.608161985874176, "learning_rate": 2.5988552946051848e-08, "loss": 0.5702, "num_input_tokens_seen": 187432960, "step": 22880 }, { "epoch": 2.9577464788732395, "grad_norm": 0.7066859602928162, "learning_rate": 2.44693370006599e-08, "loss": 0.6657, "num_input_tokens_seen": 187514880, "step": 22890 }, { "epoch": 2.9590386354826204, "grad_norm": 0.31225040555000305, "learning_rate": 2.2995851160520498e-08, "loss": 0.6522, "num_input_tokens_seen": 187596800, "step": 22900 }, { "epoch": 2.9603307920920017, "grad_norm": 0.7561403512954712, "learning_rate": 2.156809812358107e-08, "loss": 0.8856, "num_input_tokens_seen": 187678720, "step": 22910 }, { "epoch": 2.9616229487013825, "grad_norm": 1.5660463571548462, "learning_rate": 2.0186080504050466e-08, "loss": 0.718, "num_input_tokens_seen": 187760640, "step": 22920 }, { "epoch": 2.9629151053107634, "grad_norm": 0.6079100370407104, "learning_rate": 1.8849800832401733e-08, "loss": 0.7686, "num_input_tokens_seen": 187842560, "step": 22930 }, { "epoch": 2.9642072619201447, "grad_norm": 0.6655999422073364, "learning_rate": 1.75592615553527e-08, "loss": 0.8922, "num_input_tokens_seen": 187924480, "step": 22940 }, { "epoch": 2.965499418529526, "grad_norm": 0.7710443139076233, "learning_rate": 1.6314465035879855e-08, "loss": 0.5413, "num_input_tokens_seen": 188006400, "step": 22950 }, { "epoch": 2.966791575138907, "grad_norm": 1.0660786628723145, "learning_rate": 1.5115413553201674e-08, "loss": 0.4049, "num_input_tokens_seen": 188088320, "step": 22960 }, { "epoch": 2.9680837317482878, "grad_norm": 0.6799715161323547, "learning_rate": 1.3962109302773085e-08, "loss": 1.021, "num_input_tokens_seen": 188170240, "step": 22970 }, { "epoch": 2.969375888357669, "grad_norm": 1.0657424926757812, "learning_rate": 1.2854554396291018e-08, "loss": 0.4626, "num_input_tokens_seen": 188252160, "step": 22980 }, { "epoch": 2.97066804496705, "grad_norm": 1.7538788318634033, "learning_rate": 1.1792750861686074e-08, "loss": 0.9079, "num_input_tokens_seen": 188334080, "step": 22990 }, { "epoch": 2.9719602015764313, "grad_norm": 0.40748029947280884, "learning_rate": 1.0776700643116976e-08, "loss": 0.8018, "num_input_tokens_seen": 188416000, "step": 23000 }, { "epoch": 2.973252358185812, "grad_norm": 0.6334804892539978, "learning_rate": 9.806405600967794e-09, "loss": 0.6943, "num_input_tokens_seen": 188497920, "step": 23010 }, { "epoch": 2.974544514795193, "grad_norm": 0.8970416188240051, "learning_rate": 8.881867511845166e-09, "loss": 1.0121, "num_input_tokens_seen": 188579840, "step": 23020 }, { "epoch": 2.9758366714045743, "grad_norm": 0.32017982006073, "learning_rate": 8.00308806857275e-09, "loss": 1.0772, "num_input_tokens_seen": 188661760, "step": 23030 }, { "epoch": 2.977128828013955, "grad_norm": 0.7373923659324646, "learning_rate": 7.1700688801940034e-09, "loss": 0.7207, "num_input_tokens_seen": 188743680, "step": 23040 }, { "epoch": 2.9784209846233365, "grad_norm": 0.6554174423217773, "learning_rate": 6.382811471963846e-09, "loss": 0.589, "num_input_tokens_seen": 188825600, "step": 23050 }, { "epoch": 2.9797131412327174, "grad_norm": 0.23197783529758453, "learning_rate": 5.6413172853486685e-09, "loss": 0.5539, "num_input_tokens_seen": 188907520, "step": 23060 }, { "epoch": 2.9810052978420982, "grad_norm": 0.8123525977134705, "learning_rate": 4.94558767802078e-09, "loss": 1.0715, "num_input_tokens_seen": 188989440, "step": 23070 }, { "epoch": 2.9822974544514795, "grad_norm": 0.8512336015701294, "learning_rate": 4.295623923858405e-09, "loss": 0.9075, "num_input_tokens_seen": 189071360, "step": 23080 }, { "epoch": 2.983589611060861, "grad_norm": 0.5837238430976868, "learning_rate": 3.6914272129429106e-09, "loss": 0.9787, "num_input_tokens_seen": 189153280, "step": 23090 }, { "epoch": 2.9848817676702417, "grad_norm": 0.7648143172264099, "learning_rate": 3.1329986515560295e-09, "loss": 0.397, "num_input_tokens_seen": 189235200, "step": 23100 }, { "epoch": 2.9861739242796226, "grad_norm": 0.7528153657913208, "learning_rate": 2.6203392621798605e-09, "loss": 0.8219, "num_input_tokens_seen": 189317120, "step": 23110 }, { "epoch": 2.987466080889004, "grad_norm": 0.9112277626991272, "learning_rate": 2.153449983491318e-09, "loss": 0.93, "num_input_tokens_seen": 189399040, "step": 23120 }, { "epoch": 2.9887582374983848, "grad_norm": 0.6468605995178223, "learning_rate": 1.7323316703621305e-09, "loss": 1.1683, "num_input_tokens_seen": 189480960, "step": 23130 }, { "epoch": 2.990050394107766, "grad_norm": 0.8173196315765381, "learning_rate": 1.356985093856067e-09, "loss": 0.5163, "num_input_tokens_seen": 189562880, "step": 23140 }, { "epoch": 2.991342550717147, "grad_norm": 0.9355853796005249, "learning_rate": 1.0274109412372613e-09, "loss": 1.0414, "num_input_tokens_seen": 189644800, "step": 23150 }, { "epoch": 2.992634707326528, "grad_norm": 0.6136227250099182, "learning_rate": 7.436098159480099e-10, "loss": 0.743, "num_input_tokens_seen": 189726720, "step": 23160 }, { "epoch": 2.993926863935909, "grad_norm": 0.31119829416275024, "learning_rate": 5.055822376337505e-10, "loss": 0.941, "num_input_tokens_seen": 189808640, "step": 23170 }, { "epoch": 2.99521902054529, "grad_norm": 0.6250201463699341, "learning_rate": 3.1332864211808254e-10, "loss": 0.6753, "num_input_tokens_seen": 189890560, "step": 23180 }, { "epoch": 2.9965111771546713, "grad_norm": 0.5410284996032715, "learning_rate": 1.6684938141664498e-10, "loss": 0.8739, "num_input_tokens_seen": 189972480, "step": 23190 }, { "epoch": 2.997803333764052, "grad_norm": 0.7503390908241272, "learning_rate": 6.614472373434044e-11, "loss": 0.851, "num_input_tokens_seen": 190054400, "step": 23200 }, { "epoch": 2.999095490373433, "grad_norm": 0.6340755224227905, "learning_rate": 1.1214853459784457e-11, "loss": 0.7338, "num_input_tokens_seen": 190136320, "step": 23210 } ], "logging_steps": 10, "max_steps": 23217, "num_input_tokens_seen": 190193664, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.660062333952852e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }