{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.991735537190083, "eval_steps": 500, "global_step": 980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02542911633820725, "grad_norm": 0.7163803577423096, "learning_rate": 4.9996788644998285e-05, "loss": 1.6242, "step": 5 }, { "epoch": 0.0508582326764145, "grad_norm": 0.5585092306137085, "learning_rate": 4.99871554050172e-05, "loss": 1.4834, "step": 10 }, { "epoch": 0.07628734901462174, "grad_norm": 0.3108583688735962, "learning_rate": 4.997110275491702e-05, "loss": 1.3901, "step": 15 }, { "epoch": 0.101716465352829, "grad_norm": 0.20925107598304749, "learning_rate": 4.994863481875841e-05, "loss": 1.2635, "step": 20 }, { "epoch": 0.12714558169103624, "grad_norm": 0.2436874657869339, "learning_rate": 4.991975736874289e-05, "loss": 1.2457, "step": 25 }, { "epoch": 0.15257469802924348, "grad_norm": 0.19612200558185577, "learning_rate": 4.9884477823729956e-05, "loss": 1.2602, "step": 30 }, { "epoch": 0.17800381436745072, "grad_norm": 0.1591530740261078, "learning_rate": 4.984280524733107e-05, "loss": 1.2238, "step": 35 }, { "epoch": 0.203432930705658, "grad_norm": 0.15396293997764587, "learning_rate": 4.979475034558115e-05, "loss": 1.2456, "step": 40 }, { "epoch": 0.22886204704386523, "grad_norm": 0.17088189721107483, "learning_rate": 4.9740325464188154e-05, "loss": 1.2554, "step": 45 }, { "epoch": 0.25429116338207247, "grad_norm": 0.14276106655597687, "learning_rate": 4.967954458536126e-05, "loss": 1.248, "step": 50 }, { "epoch": 0.27972027972027974, "grad_norm": 0.13971547782421112, "learning_rate": 4.961242332421882e-05, "loss": 1.1816, "step": 55 }, { "epoch": 0.30514939605848695, "grad_norm": 0.14659512042999268, "learning_rate": 4.9538978924776634e-05, "loss": 1.1962, "step": 60 }, { "epoch": 0.3305785123966942, "grad_norm": 0.1615353226661682, "learning_rate": 4.945923025551789e-05, "loss": 1.1989, "step": 65 }, { "epoch": 0.35600762873490144, "grad_norm": 0.1626625955104828, "learning_rate": 4.937319780454559e-05, "loss": 1.2023, "step": 70 }, { "epoch": 0.3814367450731087, "grad_norm": 0.1588200479745865, "learning_rate": 4.92809036743191e-05, "loss": 1.2328, "step": 75 }, { "epoch": 0.406865861411316, "grad_norm": 0.1648840457201004, "learning_rate": 4.9182371575975736e-05, "loss": 1.1974, "step": 80 }, { "epoch": 0.4322949777495232, "grad_norm": 0.16755953431129456, "learning_rate": 4.907762682323927e-05, "loss": 1.1876, "step": 85 }, { "epoch": 0.45772409408773046, "grad_norm": 0.19065892696380615, "learning_rate": 4.8966696325916515e-05, "loss": 1.2396, "step": 90 }, { "epoch": 0.4831532104259377, "grad_norm": 0.18359169363975525, "learning_rate": 4.8849608582984096e-05, "loss": 1.1743, "step": 95 }, { "epoch": 0.5085823267641449, "grad_norm": 0.18455757200717926, "learning_rate": 4.8726393675266716e-05, "loss": 1.1566, "step": 100 }, { "epoch": 0.5340114431023522, "grad_norm": 0.18412873148918152, "learning_rate": 4.8597083257709194e-05, "loss": 1.1998, "step": 105 }, { "epoch": 0.5594405594405595, "grad_norm": 0.2001655399799347, "learning_rate": 4.846171055124401e-05, "loss": 1.172, "step": 110 }, { "epoch": 0.5848696757787667, "grad_norm": 0.1914481222629547, "learning_rate": 4.832031033425662e-05, "loss": 1.1514, "step": 115 }, { "epoch": 0.6102987921169739, "grad_norm": 0.19641566276550293, "learning_rate": 4.817291893365055e-05, "loss": 1.1887, "step": 120 }, { "epoch": 0.6357279084551812, "grad_norm": 0.20107164978981018, "learning_rate": 4.8019574215514705e-05, "loss": 1.1932, "step": 125 }, { "epoch": 0.6611570247933884, "grad_norm": 0.21638870239257812, "learning_rate": 4.7860315575395316e-05, "loss": 1.166, "step": 130 }, { "epoch": 0.6865861411315957, "grad_norm": 0.22959065437316895, "learning_rate": 4.7695183928174804e-05, "loss": 1.1645, "step": 135 }, { "epoch": 0.7120152574698029, "grad_norm": 0.2240159660577774, "learning_rate": 4.752422169756048e-05, "loss": 1.167, "step": 140 }, { "epoch": 0.7374443738080102, "grad_norm": 0.22989672422409058, "learning_rate": 4.734747280518549e-05, "loss": 1.187, "step": 145 }, { "epoch": 0.7628734901462174, "grad_norm": 0.22513297200202942, "learning_rate": 4.716498265932501e-05, "loss": 1.18, "step": 150 }, { "epoch": 0.7883026064844246, "grad_norm": 0.23522530496120453, "learning_rate": 4.6976798143230434e-05, "loss": 1.1532, "step": 155 }, { "epoch": 0.813731722822632, "grad_norm": 0.2377459853887558, "learning_rate": 4.678296760308474e-05, "loss": 1.1726, "step": 160 }, { "epoch": 0.8391608391608392, "grad_norm": 0.25356563925743103, "learning_rate": 4.658354083558188e-05, "loss": 1.2446, "step": 165 }, { "epoch": 0.8645899554990464, "grad_norm": 0.24188080430030823, "learning_rate": 4.637856907513366e-05, "loss": 1.1905, "step": 170 }, { "epoch": 0.8900190718372537, "grad_norm": 0.2587025761604309, "learning_rate": 4.6168104980707107e-05, "loss": 1.1594, "step": 175 }, { "epoch": 0.9154481881754609, "grad_norm": 0.25419944524765015, "learning_rate": 4.595220262229601e-05, "loss": 1.1709, "step": 180 }, { "epoch": 0.9408773045136681, "grad_norm": 0.2684890031814575, "learning_rate": 4.573091746702988e-05, "loss": 1.1936, "step": 185 }, { "epoch": 0.9663064208518753, "grad_norm": 0.2676820755004883, "learning_rate": 4.55043063649239e-05, "loss": 1.1692, "step": 190 }, { "epoch": 0.9917355371900827, "grad_norm": 0.27281874418258667, "learning_rate": 4.5272427534273774e-05, "loss": 1.1812, "step": 195 }, { "epoch": 1.0190718372536554, "grad_norm": 0.26799142360687256, "learning_rate": 4.503534054669892e-05, "loss": 1.3447, "step": 200 }, { "epoch": 1.0445009535918626, "grad_norm": 0.29342585802078247, "learning_rate": 4.4793106311838e-05, "loss": 1.1098, "step": 205 }, { "epoch": 1.06993006993007, "grad_norm": 0.29103586077690125, "learning_rate": 4.454578706170075e-05, "loss": 1.1913, "step": 210 }, { "epoch": 1.0953591862682772, "grad_norm": 0.28619349002838135, "learning_rate": 4.429344633468004e-05, "loss": 1.1936, "step": 215 }, { "epoch": 1.1207883026064844, "grad_norm": 0.3098839521408081, "learning_rate": 4.4036148959228365e-05, "loss": 1.1187, "step": 220 }, { "epoch": 1.1462174189446916, "grad_norm": 0.30148303508758545, "learning_rate": 4.377396103720278e-05, "loss": 1.157, "step": 225 }, { "epoch": 1.1716465352828989, "grad_norm": 0.31909579038619995, "learning_rate": 4.350694992688289e-05, "loss": 1.1618, "step": 230 }, { "epoch": 1.197075651621106, "grad_norm": 0.327538400888443, "learning_rate": 4.323518422566586e-05, "loss": 1.1245, "step": 235 }, { "epoch": 1.2225047679593135, "grad_norm": 0.334259033203125, "learning_rate": 4.2958733752443195e-05, "loss": 1.1108, "step": 240 }, { "epoch": 1.2479338842975207, "grad_norm": 0.3590494692325592, "learning_rate": 4.267766952966369e-05, "loss": 1.1006, "step": 245 }, { "epoch": 1.273363000635728, "grad_norm": 0.32646894454956055, "learning_rate": 4.239206376508717e-05, "loss": 1.1424, "step": 250 }, { "epoch": 1.2987921169739352, "grad_norm": 0.3532828688621521, "learning_rate": 4.210198983323366e-05, "loss": 1.141, "step": 255 }, { "epoch": 1.3242212333121424, "grad_norm": 0.33540236949920654, "learning_rate": 4.180752225653292e-05, "loss": 1.0976, "step": 260 }, { "epoch": 1.3496503496503496, "grad_norm": 0.3587055504322052, "learning_rate": 4.150873668617898e-05, "loss": 1.1019, "step": 265 }, { "epoch": 1.375079465988557, "grad_norm": 0.35621777176856995, "learning_rate": 4.120570988269472e-05, "loss": 1.1075, "step": 270 }, { "epoch": 1.400508582326764, "grad_norm": 0.3648779094219208, "learning_rate": 4.089851969621138e-05, "loss": 1.1422, "step": 275 }, { "epoch": 1.4259376986649714, "grad_norm": 0.3523823916912079, "learning_rate": 4.058724504646834e-05, "loss": 1.115, "step": 280 }, { "epoch": 1.4513668150031787, "grad_norm": 0.36515510082244873, "learning_rate": 4.027196590253786e-05, "loss": 1.1293, "step": 285 }, { "epoch": 1.4767959313413859, "grad_norm": 0.37401697039604187, "learning_rate": 3.9952763262280405e-05, "loss": 1.1186, "step": 290 }, { "epoch": 1.502225047679593, "grad_norm": 0.3709016442298889, "learning_rate": 3.9629719131535594e-05, "loss": 1.149, "step": 295 }, { "epoch": 1.5276541640178003, "grad_norm": 0.38189002871513367, "learning_rate": 3.9302916503054246e-05, "loss": 1.1267, "step": 300 }, { "epoch": 1.5530832803560077, "grad_norm": 0.36769306659698486, "learning_rate": 3.897243933517679e-05, "loss": 1.108, "step": 305 }, { "epoch": 1.5785123966942147, "grad_norm": 0.398843377828598, "learning_rate": 3.8638372530263715e-05, "loss": 1.1324, "step": 310 }, { "epoch": 1.6039415130324222, "grad_norm": 0.37085267901420593, "learning_rate": 3.830080191288342e-05, "loss": 1.1051, "step": 315 }, { "epoch": 1.6293706293706294, "grad_norm": 0.39182424545288086, "learning_rate": 3.7959814207763135e-05, "loss": 1.1622, "step": 320 }, { "epoch": 1.6547997457088366, "grad_norm": 0.3733777105808258, "learning_rate": 3.761549701750865e-05, "loss": 1.1146, "step": 325 }, { "epoch": 1.680228862047044, "grad_norm": 0.3874206840991974, "learning_rate": 3.726793880009845e-05, "loss": 1.0812, "step": 330 }, { "epoch": 1.705657978385251, "grad_norm": 0.4049334228038788, "learning_rate": 3.6917228846158134e-05, "loss": 1.1379, "step": 335 }, { "epoch": 1.7310870947234585, "grad_norm": 0.4189164340496063, "learning_rate": 3.656345725602089e-05, "loss": 1.1183, "step": 340 }, { "epoch": 1.7565162110616654, "grad_norm": 0.38738593459129333, "learning_rate": 3.620671491657993e-05, "loss": 1.1094, "step": 345 }, { "epoch": 1.7819453273998729, "grad_norm": 0.40121838450431824, "learning_rate": 3.5847093477938956e-05, "loss": 1.1048, "step": 350 }, { "epoch": 1.80737444373808, "grad_norm": 0.42995157837867737, "learning_rate": 3.5484685329866425e-05, "loss": 1.1209, "step": 355 }, { "epoch": 1.8328035600762873, "grad_norm": 0.4180549085140228, "learning_rate": 3.5119583578059846e-05, "loss": 1.1284, "step": 360 }, { "epoch": 1.8582326764144947, "grad_norm": 0.40642648935317993, "learning_rate": 3.475188202022617e-05, "loss": 1.1239, "step": 365 }, { "epoch": 1.8836617927527017, "grad_norm": 0.4026438593864441, "learning_rate": 3.438167512198436e-05, "loss": 1.1068, "step": 370 }, { "epoch": 1.9090909090909092, "grad_norm": 0.4229359030723572, "learning_rate": 3.400905799259634e-05, "loss": 1.1046, "step": 375 }, { "epoch": 1.9345200254291164, "grad_norm": 0.4220713973045349, "learning_rate": 3.363412636053269e-05, "loss": 1.1127, "step": 380 }, { "epoch": 1.9599491417673236, "grad_norm": 0.40782222151756287, "learning_rate": 3.3256976548879184e-05, "loss": 1.117, "step": 385 }, { "epoch": 1.9853782581055308, "grad_norm": 0.43011417984962463, "learning_rate": 3.2877705450590526e-05, "loss": 1.1105, "step": 390 }, { "epoch": 2.0127145581691037, "grad_norm": 0.4370057284832001, "learning_rate": 3.249641050359779e-05, "loss": 1.2868, "step": 395 }, { "epoch": 2.0381436745073107, "grad_norm": 0.42571040987968445, "learning_rate": 3.211318966577581e-05, "loss": 1.082, "step": 400 }, { "epoch": 2.063572790845518, "grad_norm": 0.45124417543411255, "learning_rate": 3.172814138977692e-05, "loss": 1.0788, "step": 405 }, { "epoch": 2.089001907183725, "grad_norm": 0.4748118817806244, "learning_rate": 3.1341364597737686e-05, "loss": 1.0658, "step": 410 }, { "epoch": 2.1144310235219326, "grad_norm": 0.44493603706359863, "learning_rate": 3.0952958655864955e-05, "loss": 1.0914, "step": 415 }, { "epoch": 2.13986013986014, "grad_norm": 0.4867158532142639, "learning_rate": 3.056302334890786e-05, "loss": 1.0719, "step": 420 }, { "epoch": 2.165289256198347, "grad_norm": 0.494890034198761, "learning_rate": 3.0171658854522273e-05, "loss": 1.0929, "step": 425 }, { "epoch": 2.1907183725365544, "grad_norm": 0.489145964384079, "learning_rate": 2.9778965717534313e-05, "loss": 1.0833, "step": 430 }, { "epoch": 2.2161474888747614, "grad_norm": 0.4823625087738037, "learning_rate": 2.9385044824109543e-05, "loss": 1.0836, "step": 435 }, { "epoch": 2.241576605212969, "grad_norm": 0.5059676170349121, "learning_rate": 2.8989997375834482e-05, "loss": 1.0492, "step": 440 }, { "epoch": 2.2670057215511763, "grad_norm": 0.5108452439308167, "learning_rate": 2.8593924863717046e-05, "loss": 1.0746, "step": 445 }, { "epoch": 2.2924348378893833, "grad_norm": 0.516611635684967, "learning_rate": 2.8196929042112652e-05, "loss": 1.0972, "step": 450 }, { "epoch": 2.3178639542275907, "grad_norm": 0.5169530510902405, "learning_rate": 2.7799111902582696e-05, "loss": 1.0524, "step": 455 }, { "epoch": 2.3432930705657977, "grad_norm": 0.5156260132789612, "learning_rate": 2.7400575647692046e-05, "loss": 1.0402, "step": 460 }, { "epoch": 2.368722186904005, "grad_norm": 0.5396901965141296, "learning_rate": 2.7001422664752333e-05, "loss": 1.0956, "step": 465 }, { "epoch": 2.394151303242212, "grad_norm": 0.5307148694992065, "learning_rate": 2.6601755499517826e-05, "loss": 1.0803, "step": 470 }, { "epoch": 2.4195804195804196, "grad_norm": 0.5289092063903809, "learning_rate": 2.620167682984052e-05, "loss": 1.0657, "step": 475 }, { "epoch": 2.445009535918627, "grad_norm": 0.5314033031463623, "learning_rate": 2.5801289439291388e-05, "loss": 1.07, "step": 480 }, { "epoch": 2.470438652256834, "grad_norm": 0.5466395616531372, "learning_rate": 2.540069619075435e-05, "loss": 1.0835, "step": 485 }, { "epoch": 2.4958677685950414, "grad_norm": 0.5496529340744019, "learning_rate": 2.5e-05, "loss": 1.0571, "step": 490 }, { "epoch": 2.5212968849332484, "grad_norm": 0.5266883373260498, "learning_rate": 2.459930380924566e-05, "loss": 1.0984, "step": 495 }, { "epoch": 2.546726001271456, "grad_norm": 0.5321890711784363, "learning_rate": 2.419871056070862e-05, "loss": 1.0453, "step": 500 }, { "epoch": 2.572155117609663, "grad_norm": 0.5523152947425842, "learning_rate": 2.3798323170159488e-05, "loss": 1.0697, "step": 505 }, { "epoch": 2.5975842339478703, "grad_norm": 0.5818379521369934, "learning_rate": 2.339824450048218e-05, "loss": 1.1192, "step": 510 }, { "epoch": 2.6230133502860777, "grad_norm": 0.5280163884162903, "learning_rate": 2.299857733524767e-05, "loss": 1.0263, "step": 515 }, { "epoch": 2.6484424666242847, "grad_norm": 0.5316183567047119, "learning_rate": 2.2599424352307957e-05, "loss": 1.0851, "step": 520 }, { "epoch": 2.673871582962492, "grad_norm": 0.5420896410942078, "learning_rate": 2.2200888097417307e-05, "loss": 1.0691, "step": 525 }, { "epoch": 2.699300699300699, "grad_norm": 0.5518523454666138, "learning_rate": 2.1803070957887347e-05, "loss": 1.0735, "step": 530 }, { "epoch": 2.7247298156389066, "grad_norm": 0.5464677810668945, "learning_rate": 2.140607513628296e-05, "loss": 1.0905, "step": 535 }, { "epoch": 2.750158931977114, "grad_norm": 0.5611386895179749, "learning_rate": 2.1010002624165527e-05, "loss": 1.0789, "step": 540 }, { "epoch": 2.775588048315321, "grad_norm": 0.576288104057312, "learning_rate": 2.0614955175890463e-05, "loss": 1.0551, "step": 545 }, { "epoch": 2.801017164653528, "grad_norm": 0.5907012820243835, "learning_rate": 2.02210342824657e-05, "loss": 1.0641, "step": 550 }, { "epoch": 2.8264462809917354, "grad_norm": 0.5511234998703003, "learning_rate": 1.982834114547773e-05, "loss": 1.0781, "step": 555 }, { "epoch": 2.851875397329943, "grad_norm": 0.5715182423591614, "learning_rate": 1.9436976651092144e-05, "loss": 1.0377, "step": 560 }, { "epoch": 2.87730451366815, "grad_norm": 0.5606087446212769, "learning_rate": 1.9047041344135044e-05, "loss": 1.037, "step": 565 }, { "epoch": 2.9027336300063573, "grad_norm": 0.5559452176094055, "learning_rate": 1.865863540226232e-05, "loss": 1.0786, "step": 570 }, { "epoch": 2.9281627463445643, "grad_norm": 0.5585038065910339, "learning_rate": 1.827185861022308e-05, "loss": 1.058, "step": 575 }, { "epoch": 2.9535918626827717, "grad_norm": 0.6056733727455139, "learning_rate": 1.7886810334224192e-05, "loss": 1.0654, "step": 580 }, { "epoch": 2.979020979020979, "grad_norm": 0.5935595035552979, "learning_rate": 1.7503589496402208e-05, "loss": 1.0603, "step": 585 }, { "epoch": 3.006357279084552, "grad_norm": 0.5643672943115234, "learning_rate": 1.7122294549409484e-05, "loss": 1.2515, "step": 590 }, { "epoch": 3.031786395422759, "grad_norm": 0.5927174687385559, "learning_rate": 1.6743023451120832e-05, "loss": 1.0486, "step": 595 }, { "epoch": 3.0572155117609663, "grad_norm": 0.6135289072990417, "learning_rate": 1.6365873639467315e-05, "loss": 1.0647, "step": 600 }, { "epoch": 3.0826446280991737, "grad_norm": 0.6193404197692871, "learning_rate": 1.599094200740367e-05, "loss": 1.0201, "step": 605 }, { "epoch": 3.1080737444373807, "grad_norm": 0.5914512872695923, "learning_rate": 1.561832487801565e-05, "loss": 1.0361, "step": 610 }, { "epoch": 3.133502860775588, "grad_norm": 0.6324409246444702, "learning_rate": 1.5248117979773829e-05, "loss": 1.0334, "step": 615 }, { "epoch": 3.158931977113795, "grad_norm": 0.6424704790115356, "learning_rate": 1.4880416421940155e-05, "loss": 0.9946, "step": 620 }, { "epoch": 3.1843610934520026, "grad_norm": 0.6515686511993408, "learning_rate": 1.4515314670133582e-05, "loss": 1.0242, "step": 625 }, { "epoch": 3.20979020979021, "grad_norm": 0.6387981176376343, "learning_rate": 1.4152906522061048e-05, "loss": 1.051, "step": 630 }, { "epoch": 3.235219326128417, "grad_norm": 0.6447208523750305, "learning_rate": 1.3793285083420077e-05, "loss": 1.0407, "step": 635 }, { "epoch": 3.2606484424666244, "grad_norm": 0.6469278335571289, "learning_rate": 1.3436542743979125e-05, "loss": 1.0262, "step": 640 }, { "epoch": 3.2860775588048314, "grad_norm": 0.6370428204536438, "learning_rate": 1.3082771153841871e-05, "loss": 1.0428, "step": 645 }, { "epoch": 3.311506675143039, "grad_norm": 0.6550537943840027, "learning_rate": 1.2732061199901562e-05, "loss": 1.0236, "step": 650 }, { "epoch": 3.336935791481246, "grad_norm": 0.6451073288917542, "learning_rate": 1.2384502982491358e-05, "loss": 1.0346, "step": 655 }, { "epoch": 3.3623649078194533, "grad_norm": 0.6514309048652649, "learning_rate": 1.2040185792236874e-05, "loss": 1.048, "step": 660 }, { "epoch": 3.3877940241576603, "grad_norm": 0.6752684116363525, "learning_rate": 1.1699198087116589e-05, "loss": 1.0338, "step": 665 }, { "epoch": 3.4132231404958677, "grad_norm": 0.6573219895362854, "learning_rate": 1.1361627469736285e-05, "loss": 1.0606, "step": 670 }, { "epoch": 3.438652256834075, "grad_norm": 0.6186379194259644, "learning_rate": 1.1027560664823209e-05, "loss": 1.0397, "step": 675 }, { "epoch": 3.464081373172282, "grad_norm": 0.6716841459274292, "learning_rate": 1.0697083496945765e-05, "loss": 1.0613, "step": 680 }, { "epoch": 3.4895104895104896, "grad_norm": 0.6756547093391418, "learning_rate": 1.0370280868464405e-05, "loss": 1.0409, "step": 685 }, { "epoch": 3.5149396058486966, "grad_norm": 0.6823243498802185, "learning_rate": 1.0047236737719601e-05, "loss": 1.0539, "step": 690 }, { "epoch": 3.540368722186904, "grad_norm": 0.6413143277168274, "learning_rate": 9.728034097462144e-06, "loss": 1.0292, "step": 695 }, { "epoch": 3.5657978385251115, "grad_norm": 0.6690115928649902, "learning_rate": 9.412754953531663e-06, "loss": 1.0183, "step": 700 }, { "epoch": 3.5912269548633184, "grad_norm": 0.684471070766449, "learning_rate": 9.101480303788623e-06, "loss": 1.022, "step": 705 }, { "epoch": 3.616656071201526, "grad_norm": 0.6697286367416382, "learning_rate": 8.794290117305296e-06, "loss": 1.0101, "step": 710 }, { "epoch": 3.642085187539733, "grad_norm": 0.6681500673294067, "learning_rate": 8.491263313821021e-06, "loss": 1.0113, "step": 715 }, { "epoch": 3.6675143038779403, "grad_norm": 0.6634730100631714, "learning_rate": 8.192477743467078e-06, "loss": 1.0016, "step": 720 }, { "epoch": 3.6929434202161477, "grad_norm": 0.645428478717804, "learning_rate": 7.898010166766347e-06, "loss": 1.0227, "step": 725 }, { "epoch": 3.7183725365543547, "grad_norm": 0.678112804889679, "learning_rate": 7.607936234912841e-06, "loss": 1.0339, "step": 730 }, { "epoch": 3.7438016528925617, "grad_norm": 0.691081166267395, "learning_rate": 7.3223304703363135e-06, "loss": 0.9891, "step": 735 }, { "epoch": 3.769230769230769, "grad_norm": 0.6777130961418152, "learning_rate": 7.041266247556813e-06, "loss": 1.0427, "step": 740 }, { "epoch": 3.7946598855689766, "grad_norm": 0.6708715558052063, "learning_rate": 6.764815774334149e-06, "loss": 1.0068, "step": 745 }, { "epoch": 3.8200890019071836, "grad_norm": 0.685946524143219, "learning_rate": 6.493050073117116e-06, "loss": 1.0673, "step": 750 }, { "epoch": 3.845518118245391, "grad_norm": 0.654752790927887, "learning_rate": 6.226038962797218e-06, "loss": 1.015, "step": 755 }, { "epoch": 3.870947234583598, "grad_norm": 0.679111897945404, "learning_rate": 5.9638510407716394e-06, "loss": 1.0068, "step": 760 }, { "epoch": 3.8963763509218055, "grad_norm": 0.6768970489501953, "learning_rate": 5.706553665319955e-06, "loss": 1.0424, "step": 765 }, { "epoch": 3.921805467260013, "grad_norm": 0.6532475352287292, "learning_rate": 5.454212938299255e-06, "loss": 1.0668, "step": 770 }, { "epoch": 3.94723458359822, "grad_norm": 0.646823525428772, "learning_rate": 5.2068936881620095e-06, "loss": 1.0185, "step": 775 }, { "epoch": 3.9726636999364273, "grad_norm": 0.6898627281188965, "learning_rate": 4.9646594533010875e-06, "loss": 1.036, "step": 780 }, { "epoch": 3.9980928162746343, "grad_norm": 1.4582291841506958, "learning_rate": 4.72757246572623e-06, "loss": 1.1749, "step": 785 }, { "epoch": 4.025429116338207, "grad_norm": 0.6636221408843994, "learning_rate": 4.495693635076101e-06, "loss": 1.0003, "step": 790 }, { "epoch": 4.050858232676415, "grad_norm": 0.679002046585083, "learning_rate": 4.2690825329701315e-06, "loss": 0.9937, "step": 795 }, { "epoch": 4.076287349014621, "grad_norm": 0.6657451391220093, "learning_rate": 4.047797377703985e-06, "loss": 1.0328, "step": 800 }, { "epoch": 4.101716465352829, "grad_norm": 0.6936158537864685, "learning_rate": 3.831895019292897e-06, "loss": 1.0453, "step": 805 }, { "epoch": 4.127145581691036, "grad_norm": 0.7010984420776367, "learning_rate": 3.621430924866348e-06, "loss": 0.9894, "step": 810 }, { "epoch": 4.152574698029244, "grad_norm": 0.7202966809272766, "learning_rate": 3.4164591644181233e-06, "loss": 1.0028, "step": 815 }, { "epoch": 4.17800381436745, "grad_norm": 0.7169246077537537, "learning_rate": 3.217032396915265e-06, "loss": 1.0268, "step": 820 }, { "epoch": 4.203432930705658, "grad_norm": 0.68609219789505, "learning_rate": 3.0232018567695697e-06, "loss": 1.008, "step": 825 }, { "epoch": 4.228862047043865, "grad_norm": 0.6944385170936584, "learning_rate": 2.8350173406749973e-06, "loss": 0.9933, "step": 830 }, { "epoch": 4.254291163382073, "grad_norm": 0.6846181750297546, "learning_rate": 2.652527194814511e-06, "loss": 1.0147, "step": 835 }, { "epoch": 4.27972027972028, "grad_norm": 0.6891025304794312, "learning_rate": 2.475778302439524e-06, "loss": 1.0619, "step": 840 }, { "epoch": 4.305149396058487, "grad_norm": 0.7004478573799133, "learning_rate": 2.3048160718251997e-06, "loss": 0.9766, "step": 845 }, { "epoch": 4.330578512396694, "grad_norm": 0.681630551815033, "learning_rate": 2.1396844246046903e-06, "loss": 1.0194, "step": 850 }, { "epoch": 4.356007628734901, "grad_norm": 0.7060221433639526, "learning_rate": 1.980425784485293e-06, "loss": 1.0079, "step": 855 }, { "epoch": 4.381436745073109, "grad_norm": 0.7196823954582214, "learning_rate": 1.827081066349459e-06, "loss": 1.0269, "step": 860 }, { "epoch": 4.406865861411316, "grad_norm": 0.7184461951255798, "learning_rate": 1.6796896657433808e-06, "loss": 0.9593, "step": 865 }, { "epoch": 4.432294977749523, "grad_norm": 0.7016611099243164, "learning_rate": 1.538289448755989e-06, "loss": 1.005, "step": 870 }, { "epoch": 4.45772409408773, "grad_norm": 0.7026543021202087, "learning_rate": 1.4029167422908107e-06, "loss": 1.0268, "step": 875 }, { "epoch": 4.483153210425938, "grad_norm": 0.722529411315918, "learning_rate": 1.273606324733284e-06, "loss": 1.0258, "step": 880 }, { "epoch": 4.508582326764145, "grad_norm": 0.6864616274833679, "learning_rate": 1.1503914170159057e-06, "loss": 1.0159, "step": 885 }, { "epoch": 4.534011443102353, "grad_norm": 0.6917942762374878, "learning_rate": 1.0333036740834856e-06, "loss": 1.0288, "step": 890 }, { "epoch": 4.559440559440559, "grad_norm": 0.7076994180679321, "learning_rate": 9.223731767607435e-07, "loss": 1.0102, "step": 895 }, { "epoch": 4.584869675778767, "grad_norm": 0.6951266527175903, "learning_rate": 8.176284240242638e-07, "loss": 1.0253, "step": 900 }, { "epoch": 4.610298792116974, "grad_norm": 0.6959212422370911, "learning_rate": 7.19096325680907e-07, "loss": 1.0014, "step": 905 }, { "epoch": 4.6357279084551815, "grad_norm": 0.7114453911781311, "learning_rate": 6.268021954544096e-07, "loss": 0.9987, "step": 910 }, { "epoch": 4.661157024793388, "grad_norm": 0.7204643487930298, "learning_rate": 5.407697444821169e-07, "loss": 0.9987, "step": 915 }, { "epoch": 4.686586141131595, "grad_norm": 0.7097218632698059, "learning_rate": 4.6102107522336403e-07, "loss": 1.0043, "step": 920 }, { "epoch": 4.712015257469803, "grad_norm": 0.7494398951530457, "learning_rate": 3.8757667578118994e-07, "loss": 1.0321, "step": 925 }, { "epoch": 4.73744437380801, "grad_norm": 0.6792721748352051, "learning_rate": 3.204554146387456e-07, "loss": 1.0747, "step": 930 }, { "epoch": 4.762873490146218, "grad_norm": 0.6858909130096436, "learning_rate": 2.5967453581185184e-07, "loss": 0.9982, "step": 935 }, { "epoch": 4.788302606484424, "grad_norm": 0.7001191973686218, "learning_rate": 2.052496544188487e-07, "loss": 1.0616, "step": 940 }, { "epoch": 4.813731722822632, "grad_norm": 0.7212966084480286, "learning_rate": 1.571947526689349e-07, "loss": 1.0264, "step": 945 }, { "epoch": 4.839160839160839, "grad_norm": 0.670038104057312, "learning_rate": 1.1552217627004425e-07, "loss": 1.0237, "step": 950 }, { "epoch": 4.864589955499047, "grad_norm": 0.7079971432685852, "learning_rate": 8.024263125710752e-08, "loss": 0.992, "step": 955 }, { "epoch": 4.890019071837254, "grad_norm": 0.7235581874847412, "learning_rate": 5.136518124159162e-08, "loss": 0.9947, "step": 960 }, { "epoch": 4.915448188175461, "grad_norm": 0.6986010074615479, "learning_rate": 2.8897245082978864e-08, "loss": 0.9907, "step": 965 }, { "epoch": 4.940877304513668, "grad_norm": 0.7248331904411316, "learning_rate": 1.284459498280266e-08, "loss": 0.9958, "step": 970 }, { "epoch": 4.9663064208518755, "grad_norm": 0.7134162783622742, "learning_rate": 3.2113550017198735e-09, "loss": 1.0067, "step": 975 }, { "epoch": 4.991735537190083, "grad_norm": 0.7377886176109314, "learning_rate": 0.0, "loss": 0.9834, "step": 980 }, { "epoch": 4.991735537190083, "step": 980, "total_flos": 1.2928446514593792e+18, "train_loss": 1.0959847966018987, "train_runtime": 6632.7746, "train_samples_per_second": 2.372, "train_steps_per_second": 0.148 } ], "logging_steps": 5, "max_steps": 980, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2928446514593792e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }