{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995213020584012, "eval_steps": 500, "global_step": 1044, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004786979415988511, "grad_norm": 0.24613235890865326, "learning_rate": 0.0002, "loss": 1.2771, "step": 5 }, { "epoch": 0.009573958831977022, "grad_norm": 0.16961899399757385, "learning_rate": 0.0002, "loss": 1.0713, "step": 10 }, { "epoch": 0.014360938247965534, "grad_norm": 0.15322890877723694, "learning_rate": 0.0002, "loss": 0.9605, "step": 15 }, { "epoch": 0.019147917663954045, "grad_norm": 0.1613943725824356, "learning_rate": 0.0002, "loss": 0.7417, "step": 20 }, { "epoch": 0.023934897079942556, "grad_norm": 0.15506355464458466, "learning_rate": 0.0002, "loss": 0.6595, "step": 25 }, { "epoch": 0.028721876495931067, "grad_norm": 0.1182127594947815, "learning_rate": 0.0002, "loss": 0.6764, "step": 30 }, { "epoch": 0.03350885591191958, "grad_norm": 0.10777570307254791, "learning_rate": 0.0002, "loss": 0.6284, "step": 35 }, { "epoch": 0.03829583532790809, "grad_norm": 0.08354483544826508, "learning_rate": 0.0002, "loss": 0.6242, "step": 40 }, { "epoch": 0.0430828147438966, "grad_norm": 0.09534800797700882, "learning_rate": 0.0002, "loss": 0.6214, "step": 45 }, { "epoch": 0.04786979415988511, "grad_norm": 0.10349296033382416, "learning_rate": 0.0002, "loss": 0.6159, "step": 50 }, { "epoch": 0.052656773575873624, "grad_norm": 0.08007140457630157, "learning_rate": 0.0002, "loss": 0.5848, "step": 55 }, { "epoch": 0.057443752991862135, "grad_norm": 0.07333367317914963, "learning_rate": 0.0002, "loss": 0.6728, "step": 60 }, { "epoch": 0.062230732407850646, "grad_norm": 0.07483084499835968, "learning_rate": 0.0002, "loss": 0.5873, "step": 65 }, { "epoch": 0.06701771182383916, "grad_norm": 0.07768324017524719, "learning_rate": 0.0002, "loss": 0.6175, "step": 70 }, { "epoch": 0.07180469123982768, "grad_norm": 0.07919920980930328, "learning_rate": 0.0002, "loss": 0.5256, "step": 75 }, { "epoch": 0.07659167065581618, "grad_norm": 0.09047867357730865, "learning_rate": 0.0002, "loss": 0.5968, "step": 80 }, { "epoch": 0.0813786500718047, "grad_norm": 0.08032748848199844, "learning_rate": 0.0002, "loss": 0.6136, "step": 85 }, { "epoch": 0.0861656294877932, "grad_norm": 0.0846623107790947, "learning_rate": 0.0002, "loss": 0.5491, "step": 90 }, { "epoch": 0.09095260890378172, "grad_norm": 0.0998520776629448, "learning_rate": 0.0002, "loss": 0.5526, "step": 95 }, { "epoch": 0.09573958831977022, "grad_norm": 0.08374989777803421, "learning_rate": 0.0002, "loss": 0.5713, "step": 100 }, { "epoch": 0.10052656773575874, "grad_norm": 0.07846508920192719, "learning_rate": 0.0002, "loss": 0.5801, "step": 105 }, { "epoch": 0.10531354715174725, "grad_norm": 0.08031659573316574, "learning_rate": 0.0002, "loss": 0.6147, "step": 110 }, { "epoch": 0.11010052656773577, "grad_norm": 0.08738347887992859, "learning_rate": 0.0002, "loss": 0.5208, "step": 115 }, { "epoch": 0.11488750598372427, "grad_norm": 0.0916195958852768, "learning_rate": 0.0002, "loss": 0.5689, "step": 120 }, { "epoch": 0.11967448539971279, "grad_norm": 0.08580000698566437, "learning_rate": 0.0002, "loss": 0.6158, "step": 125 }, { "epoch": 0.12446146481570129, "grad_norm": 0.08231440931558609, "learning_rate": 0.0002, "loss": 0.5989, "step": 130 }, { "epoch": 0.1292484442316898, "grad_norm": 0.09185798466205597, "learning_rate": 0.0002, "loss": 0.5117, "step": 135 }, { "epoch": 0.13403542364767831, "grad_norm": 0.0896279439330101, "learning_rate": 0.0002, "loss": 0.4921, "step": 140 }, { "epoch": 0.13882240306366683, "grad_norm": 0.08800848573446274, "learning_rate": 0.0002, "loss": 0.5737, "step": 145 }, { "epoch": 0.14360938247965535, "grad_norm": 0.08985792100429535, "learning_rate": 0.0002, "loss": 0.5814, "step": 150 }, { "epoch": 0.14839636189564384, "grad_norm": 0.096456378698349, "learning_rate": 0.0002, "loss": 0.5483, "step": 155 }, { "epoch": 0.15318334131163236, "grad_norm": 0.08564233034849167, "learning_rate": 0.0002, "loss": 0.5258, "step": 160 }, { "epoch": 0.15797032072762088, "grad_norm": 0.08352309465408325, "learning_rate": 0.0002, "loss": 0.516, "step": 165 }, { "epoch": 0.1627573001436094, "grad_norm": 0.08917209506034851, "learning_rate": 0.0002, "loss": 0.5133, "step": 170 }, { "epoch": 0.1675442795595979, "grad_norm": 0.08113600313663483, "learning_rate": 0.0002, "loss": 0.556, "step": 175 }, { "epoch": 0.1723312589755864, "grad_norm": 0.09329506009817123, "learning_rate": 0.0002, "loss": 0.5211, "step": 180 }, { "epoch": 0.17711823839157492, "grad_norm": 0.08815263211727142, "learning_rate": 0.0002, "loss": 0.5338, "step": 185 }, { "epoch": 0.18190521780756344, "grad_norm": 0.08324268460273743, "learning_rate": 0.0002, "loss": 0.5066, "step": 190 }, { "epoch": 0.18669219722355193, "grad_norm": 0.0860678032040596, "learning_rate": 0.0002, "loss": 0.5745, "step": 195 }, { "epoch": 0.19147917663954045, "grad_norm": 0.07750646024942398, "learning_rate": 0.0002, "loss": 0.4632, "step": 200 }, { "epoch": 0.19626615605552897, "grad_norm": 0.09053143113851547, "learning_rate": 0.0002, "loss": 0.5797, "step": 205 }, { "epoch": 0.20105313547151749, "grad_norm": 0.07899998128414154, "learning_rate": 0.0002, "loss": 0.6043, "step": 210 }, { "epoch": 0.20584011488750598, "grad_norm": 0.09660762548446655, "learning_rate": 0.0002, "loss": 0.5559, "step": 215 }, { "epoch": 0.2106270943034945, "grad_norm": 0.0966796949505806, "learning_rate": 0.0002, "loss": 0.5965, "step": 220 }, { "epoch": 0.215414073719483, "grad_norm": 0.10608462989330292, "learning_rate": 0.0002, "loss": 0.4921, "step": 225 }, { "epoch": 0.22020105313547153, "grad_norm": 0.07869511842727661, "learning_rate": 0.0002, "loss": 0.5416, "step": 230 }, { "epoch": 0.22498803255146002, "grad_norm": 0.10257625579833984, "learning_rate": 0.0002, "loss": 0.5703, "step": 235 }, { "epoch": 0.22977501196744854, "grad_norm": 0.09301017224788666, "learning_rate": 0.0002, "loss": 0.5955, "step": 240 }, { "epoch": 0.23456199138343706, "grad_norm": 0.08770053088665009, "learning_rate": 0.0002, "loss": 0.4946, "step": 245 }, { "epoch": 0.23934897079942558, "grad_norm": 0.09587664902210236, "learning_rate": 0.0002, "loss": 0.5322, "step": 250 }, { "epoch": 0.24413595021541407, "grad_norm": 0.082343190908432, "learning_rate": 0.0002, "loss": 0.6184, "step": 255 }, { "epoch": 0.24892292963140258, "grad_norm": 0.09046710282564163, "learning_rate": 0.0002, "loss": 0.5508, "step": 260 }, { "epoch": 0.2537099090473911, "grad_norm": 0.09608398377895355, "learning_rate": 0.0002, "loss": 0.5634, "step": 265 }, { "epoch": 0.2584968884633796, "grad_norm": 0.08927994221448898, "learning_rate": 0.0002, "loss": 0.5631, "step": 270 }, { "epoch": 0.26328386787936814, "grad_norm": 0.115423783659935, "learning_rate": 0.0002, "loss": 0.5898, "step": 275 }, { "epoch": 0.26807084729535663, "grad_norm": 0.0849870815873146, "learning_rate": 0.0002, "loss": 0.5095, "step": 280 }, { "epoch": 0.2728578267113451, "grad_norm": 0.09704048186540604, "learning_rate": 0.0002, "loss": 0.553, "step": 285 }, { "epoch": 0.27764480612733367, "grad_norm": 0.0753026083111763, "learning_rate": 0.0002, "loss": 0.496, "step": 290 }, { "epoch": 0.28243178554332216, "grad_norm": 0.09067820757627487, "learning_rate": 0.0002, "loss": 0.5093, "step": 295 }, { "epoch": 0.2872187649593107, "grad_norm": 0.09334460645914078, "learning_rate": 0.0002, "loss": 0.5467, "step": 300 }, { "epoch": 0.2920057443752992, "grad_norm": 0.09724689275026321, "learning_rate": 0.0002, "loss": 0.5533, "step": 305 }, { "epoch": 0.2967927237912877, "grad_norm": 0.09164885431528091, "learning_rate": 0.0002, "loss": 0.5436, "step": 310 }, { "epoch": 0.30157970320727623, "grad_norm": 0.09583573043346405, "learning_rate": 0.0002, "loss": 0.5408, "step": 315 }, { "epoch": 0.3063666826232647, "grad_norm": 0.0860954225063324, "learning_rate": 0.0002, "loss": 0.4828, "step": 320 }, { "epoch": 0.3111536620392532, "grad_norm": 0.08259189128875732, "learning_rate": 0.0002, "loss": 0.582, "step": 325 }, { "epoch": 0.31594064145524176, "grad_norm": 0.10501275211572647, "learning_rate": 0.0002, "loss": 0.5176, "step": 330 }, { "epoch": 0.32072762087123025, "grad_norm": 0.09174875169992447, "learning_rate": 0.0002, "loss": 0.5393, "step": 335 }, { "epoch": 0.3255146002872188, "grad_norm": 0.09675736725330353, "learning_rate": 0.0002, "loss": 0.5495, "step": 340 }, { "epoch": 0.3303015797032073, "grad_norm": 0.08207903057336807, "learning_rate": 0.0002, "loss": 0.5252, "step": 345 }, { "epoch": 0.3350885591191958, "grad_norm": 0.08642390370368958, "learning_rate": 0.0002, "loss": 0.5688, "step": 350 }, { "epoch": 0.3398755385351843, "grad_norm": 0.0861140564084053, "learning_rate": 0.0002, "loss": 0.4866, "step": 355 }, { "epoch": 0.3446625179511728, "grad_norm": 0.08826491981744766, "learning_rate": 0.0002, "loss": 0.5392, "step": 360 }, { "epoch": 0.3494494973671613, "grad_norm": 0.09024737030267715, "learning_rate": 0.0002, "loss": 0.5554, "step": 365 }, { "epoch": 0.35423647678314985, "grad_norm": 0.09096304327249527, "learning_rate": 0.0002, "loss": 0.516, "step": 370 }, { "epoch": 0.35902345619913834, "grad_norm": 0.0845038965344429, "learning_rate": 0.0002, "loss": 0.5301, "step": 375 }, { "epoch": 0.3638104356151269, "grad_norm": 0.08174905180931091, "learning_rate": 0.0002, "loss": 0.5472, "step": 380 }, { "epoch": 0.36859741503111537, "grad_norm": 0.08673607558012009, "learning_rate": 0.0002, "loss": 0.5648, "step": 385 }, { "epoch": 0.37338439444710386, "grad_norm": 0.08147840946912766, "learning_rate": 0.0002, "loss": 0.5317, "step": 390 }, { "epoch": 0.3781713738630924, "grad_norm": 0.08197998255491257, "learning_rate": 0.0002, "loss": 0.5085, "step": 395 }, { "epoch": 0.3829583532790809, "grad_norm": 0.09027797728776932, "learning_rate": 0.0002, "loss": 0.5488, "step": 400 }, { "epoch": 0.3877453326950694, "grad_norm": 0.08635086566209793, "learning_rate": 0.0002, "loss": 0.5182, "step": 405 }, { "epoch": 0.39253231211105793, "grad_norm": 0.09970038384199142, "learning_rate": 0.0002, "loss": 0.5852, "step": 410 }, { "epoch": 0.3973192915270464, "grad_norm": 0.08561892062425613, "learning_rate": 0.0002, "loss": 0.5144, "step": 415 }, { "epoch": 0.40210627094303497, "grad_norm": 0.08953725546598434, "learning_rate": 0.0002, "loss": 0.5064, "step": 420 }, { "epoch": 0.40689325035902346, "grad_norm": 0.09641014784574509, "learning_rate": 0.0002, "loss": 0.4849, "step": 425 }, { "epoch": 0.41168022977501195, "grad_norm": 0.09051619470119476, "learning_rate": 0.0002, "loss": 0.5777, "step": 430 }, { "epoch": 0.4164672091910005, "grad_norm": 0.08543870598077774, "learning_rate": 0.0002, "loss": 0.5299, "step": 435 }, { "epoch": 0.421254188606989, "grad_norm": 0.08574735373258591, "learning_rate": 0.0002, "loss": 0.5573, "step": 440 }, { "epoch": 0.4260411680229775, "grad_norm": 0.09401609748601913, "learning_rate": 0.0002, "loss": 0.5643, "step": 445 }, { "epoch": 0.430828147438966, "grad_norm": 0.10760053247213364, "learning_rate": 0.0002, "loss": 0.516, "step": 450 }, { "epoch": 0.4356151268549545, "grad_norm": 0.09510120749473572, "learning_rate": 0.0002, "loss": 0.505, "step": 455 }, { "epoch": 0.44040210627094306, "grad_norm": 0.09105115383863449, "learning_rate": 0.0002, "loss": 0.5717, "step": 460 }, { "epoch": 0.44518908568693155, "grad_norm": 0.0891876295208931, "learning_rate": 0.0002, "loss": 0.5258, "step": 465 }, { "epoch": 0.44997606510292004, "grad_norm": 0.08933177590370178, "learning_rate": 0.0002, "loss": 0.4951, "step": 470 }, { "epoch": 0.4547630445189086, "grad_norm": 0.09821013361215591, "learning_rate": 0.0002, "loss": 0.5422, "step": 475 }, { "epoch": 0.4595500239348971, "grad_norm": 0.090922050178051, "learning_rate": 0.0002, "loss": 0.5286, "step": 480 }, { "epoch": 0.46433700335088557, "grad_norm": 0.09325899183750153, "learning_rate": 0.0002, "loss": 0.596, "step": 485 }, { "epoch": 0.4691239827668741, "grad_norm": 0.09565772861242294, "learning_rate": 0.0002, "loss": 0.4855, "step": 490 }, { "epoch": 0.4739109621828626, "grad_norm": 0.08238258212804794, "learning_rate": 0.0002, "loss": 0.534, "step": 495 }, { "epoch": 0.47869794159885115, "grad_norm": 0.10455012321472168, "learning_rate": 0.0002, "loss": 0.5615, "step": 500 }, { "epoch": 0.48348492101483964, "grad_norm": 0.07809582352638245, "learning_rate": 0.0002, "loss": 0.5319, "step": 505 }, { "epoch": 0.48827190043082813, "grad_norm": 0.09158290922641754, "learning_rate": 0.0002, "loss": 0.5149, "step": 510 }, { "epoch": 0.4930588798468167, "grad_norm": 0.09475893527269363, "learning_rate": 0.0002, "loss": 0.5548, "step": 515 }, { "epoch": 0.49784585926280517, "grad_norm": 0.08862445503473282, "learning_rate": 0.0002, "loss": 0.551, "step": 520 }, { "epoch": 0.5026328386787937, "grad_norm": 0.08608075976371765, "learning_rate": 0.0002, "loss": 0.5032, "step": 525 }, { "epoch": 0.5074198180947822, "grad_norm": 0.09171325713396072, "learning_rate": 0.0002, "loss": 0.4872, "step": 530 }, { "epoch": 0.5122067975107707, "grad_norm": 0.08891316503286362, "learning_rate": 0.0002, "loss": 0.5381, "step": 535 }, { "epoch": 0.5169937769267592, "grad_norm": 0.09202417731285095, "learning_rate": 0.0002, "loss": 0.5642, "step": 540 }, { "epoch": 0.5217807563427477, "grad_norm": 0.09024330973625183, "learning_rate": 0.0002, "loss": 0.4638, "step": 545 }, { "epoch": 0.5265677357587363, "grad_norm": 0.08484344184398651, "learning_rate": 0.0002, "loss": 0.5171, "step": 550 }, { "epoch": 0.5313547151747248, "grad_norm": 0.09126883000135422, "learning_rate": 0.0002, "loss": 0.4778, "step": 555 }, { "epoch": 0.5361416945907133, "grad_norm": 0.08565142005681992, "learning_rate": 0.0002, "loss": 0.5264, "step": 560 }, { "epoch": 0.5409286740067017, "grad_norm": 0.09363921731710434, "learning_rate": 0.0002, "loss": 0.5261, "step": 565 }, { "epoch": 0.5457156534226902, "grad_norm": 0.08321545273065567, "learning_rate": 0.0002, "loss": 0.4724, "step": 570 }, { "epoch": 0.5505026328386788, "grad_norm": 0.08636103570461273, "learning_rate": 0.0002, "loss": 0.5383, "step": 575 }, { "epoch": 0.5552896122546673, "grad_norm": 0.0867634192109108, "learning_rate": 0.0002, "loss": 0.571, "step": 580 }, { "epoch": 0.5600765916706558, "grad_norm": 0.09202156215906143, "learning_rate": 0.0002, "loss": 0.4925, "step": 585 }, { "epoch": 0.5648635710866443, "grad_norm": 0.08338255435228348, "learning_rate": 0.0002, "loss": 0.4724, "step": 590 }, { "epoch": 0.5696505505026328, "grad_norm": 0.09248416125774384, "learning_rate": 0.0002, "loss": 0.5339, "step": 595 }, { "epoch": 0.5744375299186214, "grad_norm": 0.08971364796161652, "learning_rate": 0.0002, "loss": 0.5467, "step": 600 }, { "epoch": 0.5792245093346099, "grad_norm": 0.10297700017690659, "learning_rate": 0.0002, "loss": 0.5269, "step": 605 }, { "epoch": 0.5840114887505984, "grad_norm": 0.09885570406913757, "learning_rate": 0.0002, "loss": 0.5741, "step": 610 }, { "epoch": 0.5887984681665869, "grad_norm": 0.0943949893116951, "learning_rate": 0.0002, "loss": 0.5107, "step": 615 }, { "epoch": 0.5935854475825754, "grad_norm": 0.09385235607624054, "learning_rate": 0.0002, "loss": 0.5522, "step": 620 }, { "epoch": 0.5983724269985639, "grad_norm": 0.0906907171010971, "learning_rate": 0.0002, "loss": 0.4684, "step": 625 }, { "epoch": 0.6031594064145525, "grad_norm": 0.08867505192756653, "learning_rate": 0.0002, "loss": 0.4637, "step": 630 }, { "epoch": 0.607946385830541, "grad_norm": 0.0929451733827591, "learning_rate": 0.0002, "loss": 0.5462, "step": 635 }, { "epoch": 0.6127333652465294, "grad_norm": 0.08720085769891739, "learning_rate": 0.0002, "loss": 0.497, "step": 640 }, { "epoch": 0.6175203446625179, "grad_norm": 0.10713039338588715, "learning_rate": 0.0002, "loss": 0.5448, "step": 645 }, { "epoch": 0.6223073240785064, "grad_norm": 0.08213481307029724, "learning_rate": 0.0002, "loss": 0.4816, "step": 650 }, { "epoch": 0.627094303494495, "grad_norm": 0.08939921110868454, "learning_rate": 0.0002, "loss": 0.4883, "step": 655 }, { "epoch": 0.6318812829104835, "grad_norm": 0.09071970731019974, "learning_rate": 0.0002, "loss": 0.5411, "step": 660 }, { "epoch": 0.636668262326472, "grad_norm": 0.09525053203105927, "learning_rate": 0.0002, "loss": 0.4966, "step": 665 }, { "epoch": 0.6414552417424605, "grad_norm": 0.08770790696144104, "learning_rate": 0.0002, "loss": 0.4786, "step": 670 }, { "epoch": 0.646242221158449, "grad_norm": 0.08054076880216599, "learning_rate": 0.0002, "loss": 0.5051, "step": 675 }, { "epoch": 0.6510292005744376, "grad_norm": 0.08313776552677155, "learning_rate": 0.0002, "loss": 0.5547, "step": 680 }, { "epoch": 0.6558161799904261, "grad_norm": 0.0805881917476654, "learning_rate": 0.0002, "loss": 0.495, "step": 685 }, { "epoch": 0.6606031594064146, "grad_norm": 0.10019008070230484, "learning_rate": 0.0002, "loss": 0.4683, "step": 690 }, { "epoch": 0.665390138822403, "grad_norm": 0.08097992837429047, "learning_rate": 0.0002, "loss": 0.5511, "step": 695 }, { "epoch": 0.6701771182383915, "grad_norm": 0.08138570934534073, "learning_rate": 0.0002, "loss": 0.5638, "step": 700 }, { "epoch": 0.67496409765438, "grad_norm": 0.09005066752433777, "learning_rate": 0.0002, "loss": 0.4591, "step": 705 }, { "epoch": 0.6797510770703686, "grad_norm": 0.09737958759069443, "learning_rate": 0.0002, "loss": 0.5003, "step": 710 }, { "epoch": 0.6845380564863571, "grad_norm": 0.0959305465221405, "learning_rate": 0.0002, "loss": 0.5645, "step": 715 }, { "epoch": 0.6893250359023456, "grad_norm": 0.0876409187912941, "learning_rate": 0.0002, "loss": 0.531, "step": 720 }, { "epoch": 0.6941120153183341, "grad_norm": 0.09579559415578842, "learning_rate": 0.0002, "loss": 0.5717, "step": 725 }, { "epoch": 0.6988989947343226, "grad_norm": 0.08657323569059372, "learning_rate": 0.0002, "loss": 0.4846, "step": 730 }, { "epoch": 0.7036859741503112, "grad_norm": 0.08424372225999832, "learning_rate": 0.0002, "loss": 0.5175, "step": 735 }, { "epoch": 0.7084729535662997, "grad_norm": 0.0895078107714653, "learning_rate": 0.0002, "loss": 0.518, "step": 740 }, { "epoch": 0.7132599329822882, "grad_norm": 0.08580939471721649, "learning_rate": 0.0002, "loss": 0.5601, "step": 745 }, { "epoch": 0.7180469123982767, "grad_norm": 0.0797315239906311, "learning_rate": 0.0002, "loss": 0.5354, "step": 750 }, { "epoch": 0.7228338918142652, "grad_norm": 0.08981385827064514, "learning_rate": 0.0002, "loss": 0.5638, "step": 755 }, { "epoch": 0.7276208712302538, "grad_norm": 0.09025374054908752, "learning_rate": 0.0002, "loss": 0.5028, "step": 760 }, { "epoch": 0.7324078506462423, "grad_norm": 0.09753820300102234, "learning_rate": 0.0002, "loss": 0.5237, "step": 765 }, { "epoch": 0.7371948300622307, "grad_norm": 0.08967633545398712, "learning_rate": 0.0002, "loss": 0.4636, "step": 770 }, { "epoch": 0.7419818094782192, "grad_norm": 0.10074934363365173, "learning_rate": 0.0002, "loss": 0.5276, "step": 775 }, { "epoch": 0.7467687888942077, "grad_norm": 0.0874541625380516, "learning_rate": 0.0002, "loss": 0.5085, "step": 780 }, { "epoch": 0.7515557683101962, "grad_norm": 0.084027960896492, "learning_rate": 0.0002, "loss": 0.5062, "step": 785 }, { "epoch": 0.7563427477261848, "grad_norm": 0.08965150266885757, "learning_rate": 0.0002, "loss": 0.5214, "step": 790 }, { "epoch": 0.7611297271421733, "grad_norm": 0.08234406262636185, "learning_rate": 0.0002, "loss": 0.4863, "step": 795 }, { "epoch": 0.7659167065581618, "grad_norm": 0.08266417682170868, "learning_rate": 0.0002, "loss": 0.5149, "step": 800 }, { "epoch": 0.7707036859741503, "grad_norm": 0.08559945225715637, "learning_rate": 0.0002, "loss": 0.528, "step": 805 }, { "epoch": 0.7754906653901388, "grad_norm": 0.08358705043792725, "learning_rate": 0.0002, "loss": 0.5283, "step": 810 }, { "epoch": 0.7802776448061274, "grad_norm": 0.08530480414628983, "learning_rate": 0.0002, "loss": 0.4868, "step": 815 }, { "epoch": 0.7850646242221159, "grad_norm": 0.08576823025941849, "learning_rate": 0.0002, "loss": 0.5277, "step": 820 }, { "epoch": 0.7898516036381044, "grad_norm": 0.10152282565832138, "learning_rate": 0.0002, "loss": 0.4652, "step": 825 }, { "epoch": 0.7946385830540929, "grad_norm": 0.08843079209327698, "learning_rate": 0.0002, "loss": 0.5194, "step": 830 }, { "epoch": 0.7994255624700813, "grad_norm": 0.08835287392139435, "learning_rate": 0.0002, "loss": 0.5352, "step": 835 }, { "epoch": 0.8042125418860699, "grad_norm": 0.08630600571632385, "learning_rate": 0.0002, "loss": 0.4869, "step": 840 }, { "epoch": 0.8089995213020584, "grad_norm": 0.08701962232589722, "learning_rate": 0.0002, "loss": 0.5048, "step": 845 }, { "epoch": 0.8137865007180469, "grad_norm": 0.09896954894065857, "learning_rate": 0.0002, "loss": 0.5398, "step": 850 }, { "epoch": 0.8185734801340354, "grad_norm": 0.0876292958855629, "learning_rate": 0.0002, "loss": 0.5801, "step": 855 }, { "epoch": 0.8233604595500239, "grad_norm": 0.08727893233299255, "learning_rate": 0.0002, "loss": 0.5378, "step": 860 }, { "epoch": 0.8281474389660124, "grad_norm": 0.08662202209234238, "learning_rate": 0.0002, "loss": 0.5804, "step": 865 }, { "epoch": 0.832934418382001, "grad_norm": 0.08253654092550278, "learning_rate": 0.0002, "loss": 0.4701, "step": 870 }, { "epoch": 0.8377213977979895, "grad_norm": 0.08907407522201538, "learning_rate": 0.0002, "loss": 0.4918, "step": 875 }, { "epoch": 0.842508377213978, "grad_norm": 0.09331085532903671, "learning_rate": 0.0002, "loss": 0.5438, "step": 880 }, { "epoch": 0.8472953566299665, "grad_norm": 0.09129630029201508, "learning_rate": 0.0002, "loss": 0.5327, "step": 885 }, { "epoch": 0.852082336045955, "grad_norm": 0.09735500812530518, "learning_rate": 0.0002, "loss": 0.5281, "step": 890 }, { "epoch": 0.8568693154619436, "grad_norm": 0.0904528871178627, "learning_rate": 0.0002, "loss": 0.4964, "step": 895 }, { "epoch": 0.861656294877932, "grad_norm": 0.08291352540254593, "learning_rate": 0.0002, "loss": 0.5192, "step": 900 }, { "epoch": 0.8664432742939205, "grad_norm": 0.09108038246631622, "learning_rate": 0.0002, "loss": 0.501, "step": 905 }, { "epoch": 0.871230253709909, "grad_norm": 0.09137269854545593, "learning_rate": 0.0002, "loss": 0.5012, "step": 910 }, { "epoch": 0.8760172331258975, "grad_norm": 0.08169892430305481, "learning_rate": 0.0002, "loss": 0.4994, "step": 915 }, { "epoch": 0.8808042125418861, "grad_norm": 0.08467283844947815, "learning_rate": 0.0002, "loss": 0.4502, "step": 920 }, { "epoch": 0.8855911919578746, "grad_norm": 0.08680226653814316, "learning_rate": 0.0002, "loss": 0.5508, "step": 925 }, { "epoch": 0.8903781713738631, "grad_norm": 0.08897334337234497, "learning_rate": 0.0002, "loss": 0.5281, "step": 930 }, { "epoch": 0.8951651507898516, "grad_norm": 0.09124335646629333, "learning_rate": 0.0002, "loss": 0.5173, "step": 935 }, { "epoch": 0.8999521302058401, "grad_norm": 0.08976174145936966, "learning_rate": 0.0002, "loss": 0.5519, "step": 940 }, { "epoch": 0.9047391096218286, "grad_norm": 0.07799748331308365, "learning_rate": 0.0002, "loss": 0.5495, "step": 945 }, { "epoch": 0.9095260890378172, "grad_norm": 0.08304045349359512, "learning_rate": 0.0002, "loss": 0.5552, "step": 950 }, { "epoch": 0.9143130684538057, "grad_norm": 0.08134391158819199, "learning_rate": 0.0002, "loss": 0.4953, "step": 955 }, { "epoch": 0.9191000478697942, "grad_norm": 0.102556973695755, "learning_rate": 0.0002, "loss": 0.5841, "step": 960 }, { "epoch": 0.9238870272857826, "grad_norm": 0.09310037642717361, "learning_rate": 0.0002, "loss": 0.4977, "step": 965 }, { "epoch": 0.9286740067017711, "grad_norm": 0.08947998285293579, "learning_rate": 0.0002, "loss": 0.5147, "step": 970 }, { "epoch": 0.9334609861177597, "grad_norm": 0.0801323875784874, "learning_rate": 0.0002, "loss": 0.5379, "step": 975 }, { "epoch": 0.9382479655337482, "grad_norm": 0.09458567947149277, "learning_rate": 0.0002, "loss": 0.463, "step": 980 }, { "epoch": 0.9430349449497367, "grad_norm": 0.08248139917850494, "learning_rate": 0.0002, "loss": 0.4899, "step": 985 }, { "epoch": 0.9478219243657252, "grad_norm": 0.08913381397724152, "learning_rate": 0.0002, "loss": 0.5455, "step": 990 }, { "epoch": 0.9526089037817137, "grad_norm": 0.09054595977067947, "learning_rate": 0.0002, "loss": 0.528, "step": 995 }, { "epoch": 0.9573958831977023, "grad_norm": 0.0929536446928978, "learning_rate": 0.0002, "loss": 0.5595, "step": 1000 }, { "epoch": 0.9621828626136908, "grad_norm": 0.09117671847343445, "learning_rate": 0.0002, "loss": 0.5212, "step": 1005 }, { "epoch": 0.9669698420296793, "grad_norm": 0.09163827449083328, "learning_rate": 0.0002, "loss": 0.4587, "step": 1010 }, { "epoch": 0.9717568214456678, "grad_norm": 0.09541551768779755, "learning_rate": 0.0002, "loss": 0.5352, "step": 1015 }, { "epoch": 0.9765438008616563, "grad_norm": 0.09220823645591736, "learning_rate": 0.0002, "loss": 0.5599, "step": 1020 }, { "epoch": 0.9813307802776448, "grad_norm": 0.09834371507167816, "learning_rate": 0.0002, "loss": 0.5605, "step": 1025 }, { "epoch": 0.9861177596936334, "grad_norm": 0.08727829903364182, "learning_rate": 0.0002, "loss": 0.5427, "step": 1030 }, { "epoch": 0.9909047391096218, "grad_norm": 0.09128595143556595, "learning_rate": 0.0002, "loss": 0.574, "step": 1035 }, { "epoch": 0.9956917185256103, "grad_norm": 0.09627512097358704, "learning_rate": 0.0002, "loss": 0.4791, "step": 1040 }, { "epoch": 0.9995213020584012, "step": 1044, "total_flos": 2.304533815525294e+17, "train_loss": 0.5449674188862359, "train_runtime": 2251.699, "train_samples_per_second": 7.421, "train_steps_per_second": 0.464 } ], "logging_steps": 5, "max_steps": 1044, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.304533815525294e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }