{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999043062200957, "eval_steps": 500, "global_step": 174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011483253588516746, "grad_norm": 2.804490089416504, "learning_rate": 1.111111111111111e-06, "loss": 0.7736, "step": 1 }, { "epoch": 0.022966507177033493, "grad_norm": 2.74699068069458, "learning_rate": 2.222222222222222e-06, "loss": 0.8086, "step": 2 }, { "epoch": 0.03444976076555024, "grad_norm": 2.9466397762298584, "learning_rate": 3.3333333333333333e-06, "loss": 0.8392, "step": 3 }, { "epoch": 0.045933014354066985, "grad_norm": 2.555311679840088, "learning_rate": 4.444444444444444e-06, "loss": 0.8138, "step": 4 }, { "epoch": 0.05741626794258373, "grad_norm": 1.9726232290267944, "learning_rate": 5.555555555555557e-06, "loss": 0.7843, "step": 5 }, { "epoch": 0.06889952153110047, "grad_norm": 3.302605628967285, "learning_rate": 6.666666666666667e-06, "loss": 0.7329, "step": 6 }, { "epoch": 0.08038277511961722, "grad_norm": 1.8911763429641724, "learning_rate": 7.77777777777778e-06, "loss": 0.7359, "step": 7 }, { "epoch": 0.09186602870813397, "grad_norm": 1.9695310592651367, "learning_rate": 8.888888888888888e-06, "loss": 0.7293, "step": 8 }, { "epoch": 0.10334928229665072, "grad_norm": 1.4005483388900757, "learning_rate": 1e-05, "loss": 0.7275, "step": 9 }, { "epoch": 0.11483253588516747, "grad_norm": 1.3015406131744385, "learning_rate": 1.1111111111111113e-05, "loss": 0.6375, "step": 10 }, { "epoch": 0.12631578947368421, "grad_norm": 1.205918788909912, "learning_rate": 1.2222222222222224e-05, "loss": 0.6625, "step": 11 }, { "epoch": 0.13779904306220095, "grad_norm": 0.8213034868240356, "learning_rate": 1.3333333333333333e-05, "loss": 0.6525, "step": 12 }, { "epoch": 0.1492822966507177, "grad_norm": 1.1025549173355103, "learning_rate": 1.4444444444444446e-05, "loss": 0.6317, "step": 13 }, { "epoch": 0.16076555023923444, "grad_norm": 0.956466794013977, "learning_rate": 1.555555555555556e-05, "loss": 0.6169, "step": 14 }, { "epoch": 0.1722488038277512, "grad_norm": 1.0524718761444092, "learning_rate": 1.6666666666666667e-05, "loss": 0.634, "step": 15 }, { "epoch": 0.18373205741626794, "grad_norm": 0.7834458351135254, "learning_rate": 1.7777777777777777e-05, "loss": 0.6116, "step": 16 }, { "epoch": 0.19521531100478468, "grad_norm": 0.7513645887374878, "learning_rate": 1.888888888888889e-05, "loss": 0.5749, "step": 17 }, { "epoch": 0.20669856459330144, "grad_norm": 0.7897821664810181, "learning_rate": 2e-05, "loss": 0.6072, "step": 18 }, { "epoch": 0.21818181818181817, "grad_norm": 0.5807076096534729, "learning_rate": 1.9997972289848505e-05, "loss": 0.5816, "step": 19 }, { "epoch": 0.22966507177033493, "grad_norm": 0.663065493106842, "learning_rate": 1.9991889981715696e-05, "loss": 0.5943, "step": 20 }, { "epoch": 0.24114832535885167, "grad_norm": 0.5980852246284485, "learning_rate": 1.9981755542233175e-05, "loss": 0.575, "step": 21 }, { "epoch": 0.25263157894736843, "grad_norm": 0.6174298524856567, "learning_rate": 1.9967573081342103e-05, "loss": 0.5681, "step": 22 }, { "epoch": 0.2641148325358852, "grad_norm": 0.519263744354248, "learning_rate": 1.9949348350626456e-05, "loss": 0.5618, "step": 23 }, { "epoch": 0.2755980861244019, "grad_norm": 0.5192844867706299, "learning_rate": 1.992708874098054e-05, "loss": 0.5716, "step": 24 }, { "epoch": 0.28708133971291866, "grad_norm": 0.5204177498817444, "learning_rate": 1.9900803279611643e-05, "loss": 0.5297, "step": 25 }, { "epoch": 0.2985645933014354, "grad_norm": 0.43173807859420776, "learning_rate": 1.9870502626379127e-05, "loss": 0.5666, "step": 26 }, { "epoch": 0.31004784688995213, "grad_norm": 0.47530868649482727, "learning_rate": 1.983619906947144e-05, "loss": 0.566, "step": 27 }, { "epoch": 0.3215311004784689, "grad_norm": 0.43519994616508484, "learning_rate": 1.979790652042268e-05, "loss": 0.5577, "step": 28 }, { "epoch": 0.33301435406698565, "grad_norm": 0.43898147344589233, "learning_rate": 1.975564050847094e-05, "loss": 0.5458, "step": 29 }, { "epoch": 0.3444976076555024, "grad_norm": 0.38831689953804016, "learning_rate": 1.9709418174260523e-05, "loss": 0.5787, "step": 30 }, { "epoch": 0.3559808612440191, "grad_norm": 0.38646361231803894, "learning_rate": 1.9659258262890683e-05, "loss": 0.5403, "step": 31 }, { "epoch": 0.3674641148325359, "grad_norm": 0.3714185655117035, "learning_rate": 1.9605181116313725e-05, "loss": 0.5492, "step": 32 }, { "epoch": 0.37894736842105264, "grad_norm": 0.3823264539241791, "learning_rate": 1.954720866508546e-05, "loss": 0.5476, "step": 33 }, { "epoch": 0.39043062200956935, "grad_norm": 0.33007943630218506, "learning_rate": 1.9485364419471454e-05, "loss": 0.5408, "step": 34 }, { "epoch": 0.4019138755980861, "grad_norm": 0.39259204268455505, "learning_rate": 1.9419673459912652e-05, "loss": 0.5641, "step": 35 }, { "epoch": 0.4133971291866029, "grad_norm": 0.3294496536254883, "learning_rate": 1.9350162426854152e-05, "loss": 0.5344, "step": 36 }, { "epoch": 0.42488038277511964, "grad_norm": 0.3135583698749542, "learning_rate": 1.927685950994143e-05, "loss": 0.5271, "step": 37 }, { "epoch": 0.43636363636363634, "grad_norm": 0.33793437480926514, "learning_rate": 1.9199794436588244e-05, "loss": 0.5352, "step": 38 }, { "epoch": 0.4478468899521531, "grad_norm": 0.3562823534011841, "learning_rate": 1.91189984599209e-05, "loss": 0.5478, "step": 39 }, { "epoch": 0.45933014354066987, "grad_norm": 0.3568083643913269, "learning_rate": 1.9034504346103825e-05, "loss": 0.5072, "step": 40 }, { "epoch": 0.47081339712918663, "grad_norm": 0.34692370891571045, "learning_rate": 1.8946346361051474e-05, "loss": 0.504, "step": 41 }, { "epoch": 0.48229665071770333, "grad_norm": 0.35962024331092834, "learning_rate": 1.8854560256532098e-05, "loss": 0.5447, "step": 42 }, { "epoch": 0.4937799043062201, "grad_norm": 0.37867891788482666, "learning_rate": 1.875918325566888e-05, "loss": 0.5217, "step": 43 }, { "epoch": 0.5052631578947369, "grad_norm": 0.3821343779563904, "learning_rate": 1.866025403784439e-05, "loss": 0.5311, "step": 44 }, { "epoch": 0.5167464114832536, "grad_norm": 0.36335083842277527, "learning_rate": 1.8557812723014476e-05, "loss": 0.5305, "step": 45 }, { "epoch": 0.5282296650717704, "grad_norm": 0.33280235528945923, "learning_rate": 1.845190085543795e-05, "loss": 0.5425, "step": 46 }, { "epoch": 0.539712918660287, "grad_norm": 0.36008763313293457, "learning_rate": 1.8342561386828613e-05, "loss": 0.5122, "step": 47 }, { "epoch": 0.5511961722488038, "grad_norm": 0.36620059609413147, "learning_rate": 1.8229838658936566e-05, "loss": 0.4986, "step": 48 }, { "epoch": 0.5626794258373206, "grad_norm": 9.159019470214844, "learning_rate": 1.811377838556573e-05, "loss": 0.5448, "step": 49 }, { "epoch": 0.5741626794258373, "grad_norm": 0.47111213207244873, "learning_rate": 1.7994427634035016e-05, "loss": 0.522, "step": 50 }, { "epoch": 0.5856459330143541, "grad_norm": 0.34213587641716003, "learning_rate": 1.7871834806090502e-05, "loss": 0.5182, "step": 51 }, { "epoch": 0.5971291866028708, "grad_norm": 0.3735521137714386, "learning_rate": 1.7746049618276545e-05, "loss": 0.5173, "step": 52 }, { "epoch": 0.6086124401913876, "grad_norm": 0.3792261481285095, "learning_rate": 1.761712308177359e-05, "loss": 0.5267, "step": 53 }, { "epoch": 0.6200956937799043, "grad_norm": 0.31414318084716797, "learning_rate": 1.7485107481711014e-05, "loss": 0.5243, "step": 54 }, { "epoch": 0.631578947368421, "grad_norm": 0.3608068525791168, "learning_rate": 1.7350056355963287e-05, "loss": 0.5193, "step": 55 }, { "epoch": 0.6430622009569378, "grad_norm": 0.31694212555885315, "learning_rate": 1.7212024473438145e-05, "loss": 0.5029, "step": 56 }, { "epoch": 0.6545454545454545, "grad_norm": 0.2968662679195404, "learning_rate": 1.7071067811865477e-05, "loss": 0.5131, "step": 57 }, { "epoch": 0.6660287081339713, "grad_norm": 0.3195832073688507, "learning_rate": 1.6927243535095995e-05, "loss": 0.4912, "step": 58 }, { "epoch": 0.6775119617224881, "grad_norm": 0.34518277645111084, "learning_rate": 1.678060996991891e-05, "loss": 0.5124, "step": 59 }, { "epoch": 0.6889952153110048, "grad_norm": 0.285602867603302, "learning_rate": 1.6631226582407954e-05, "loss": 0.5132, "step": 60 }, { "epoch": 0.7004784688995215, "grad_norm": 0.3201611638069153, "learning_rate": 1.647915395380539e-05, "loss": 0.5119, "step": 61 }, { "epoch": 0.7119617224880382, "grad_norm": 0.3024705946445465, "learning_rate": 1.6324453755953772e-05, "loss": 0.5055, "step": 62 }, { "epoch": 0.723444976076555, "grad_norm": 0.32308679819107056, "learning_rate": 1.6167188726285433e-05, "loss": 0.5216, "step": 63 }, { "epoch": 0.7349282296650718, "grad_norm": 0.278341680765152, "learning_rate": 1.600742264237979e-05, "loss": 0.5087, "step": 64 }, { "epoch": 0.7464114832535885, "grad_norm": 0.43393227458000183, "learning_rate": 1.584522029609889e-05, "loss": 0.5093, "step": 65 }, { "epoch": 0.7578947368421053, "grad_norm": 0.2864857316017151, "learning_rate": 1.568064746731156e-05, "loss": 0.4954, "step": 66 }, { "epoch": 0.769377990430622, "grad_norm": 0.2618401348590851, "learning_rate": 1.551377089721692e-05, "loss": 0.4881, "step": 67 }, { "epoch": 0.7808612440191387, "grad_norm": 0.2827475666999817, "learning_rate": 1.5344658261278013e-05, "loss": 0.4741, "step": 68 }, { "epoch": 0.7923444976076555, "grad_norm": 0.28884321451187134, "learning_rate": 1.5173378141776569e-05, "loss": 0.5133, "step": 69 }, { "epoch": 0.8038277511961722, "grad_norm": 0.2944667935371399, "learning_rate": 1.5000000000000002e-05, "loss": 0.4843, "step": 70 }, { "epoch": 0.815311004784689, "grad_norm": 0.301776647567749, "learning_rate": 1.4824594148071936e-05, "loss": 0.4956, "step": 71 }, { "epoch": 0.8267942583732057, "grad_norm": 0.31137344241142273, "learning_rate": 1.4647231720437687e-05, "loss": 0.5161, "step": 72 }, { "epoch": 0.8382775119617225, "grad_norm": 0.28323987126350403, "learning_rate": 1.4467984645016259e-05, "loss": 0.4991, "step": 73 }, { "epoch": 0.8497607655502393, "grad_norm": 0.31255266070365906, "learning_rate": 1.4286925614030542e-05, "loss": 0.4991, "step": 74 }, { "epoch": 0.861244019138756, "grad_norm": 0.31073659658432007, "learning_rate": 1.410412805452757e-05, "loss": 0.4778, "step": 75 }, { "epoch": 0.8727272727272727, "grad_norm": 0.30535194277763367, "learning_rate": 1.3919666098600753e-05, "loss": 0.5201, "step": 76 }, { "epoch": 0.8842105263157894, "grad_norm": 0.3717443346977234, "learning_rate": 1.3733614553326211e-05, "loss": 0.4943, "step": 77 }, { "epoch": 0.8956937799043062, "grad_norm": 0.26282674074172974, "learning_rate": 1.3546048870425356e-05, "loss": 0.5022, "step": 78 }, { "epoch": 0.907177033492823, "grad_norm": 0.2955797612667084, "learning_rate": 1.335704511566605e-05, "loss": 0.482, "step": 79 }, { "epoch": 0.9186602870813397, "grad_norm": 0.30926328897476196, "learning_rate": 1.3166679938014728e-05, "loss": 0.4803, "step": 80 }, { "epoch": 0.9301435406698565, "grad_norm": 0.2495397925376892, "learning_rate": 1.297503053855203e-05, "loss": 0.4895, "step": 81 }, { "epoch": 0.9416267942583733, "grad_norm": 0.299214631319046, "learning_rate": 1.2782174639164528e-05, "loss": 0.5031, "step": 82 }, { "epoch": 0.9531100478468899, "grad_norm": 0.26720190048217773, "learning_rate": 1.2588190451025209e-05, "loss": 0.4931, "step": 83 }, { "epoch": 0.9645933014354067, "grad_norm": 0.28564536571502686, "learning_rate": 1.2393156642875579e-05, "loss": 0.4923, "step": 84 }, { "epoch": 0.9760765550239234, "grad_norm": 0.26890474557876587, "learning_rate": 1.2197152309122173e-05, "loss": 0.5252, "step": 85 }, { "epoch": 0.9875598086124402, "grad_norm": 0.28297609090805054, "learning_rate": 1.2000256937760446e-05, "loss": 0.4903, "step": 86 }, { "epoch": 0.999043062200957, "grad_norm": 0.26182886958122253, "learning_rate": 1.180255037813906e-05, "loss": 0.4879, "step": 87 }, { "epoch": 1.0105263157894737, "grad_norm": 0.7859248518943787, "learning_rate": 1.1604112808577603e-05, "loss": 0.7722, "step": 88 }, { "epoch": 1.0220095693779905, "grad_norm": 0.3331013023853302, "learning_rate": 1.1405024703850929e-05, "loss": 0.4322, "step": 89 }, { "epoch": 1.0334928229665072, "grad_norm": 0.7180531620979309, "learning_rate": 1.1205366802553231e-05, "loss": 0.4452, "step": 90 }, { "epoch": 1.044976076555024, "grad_norm": 0.3440372049808502, "learning_rate": 1.1005220074355172e-05, "loss": 0.4676, "step": 91 }, { "epoch": 1.0564593301435408, "grad_norm": 0.31776347756385803, "learning_rate": 1.0804665687167262e-05, "loss": 0.4573, "step": 92 }, { "epoch": 1.0679425837320573, "grad_norm": 0.39098435640335083, "learning_rate": 1.0603784974222862e-05, "loss": 0.4608, "step": 93 }, { "epoch": 1.079425837320574, "grad_norm": 0.29146501421928406, "learning_rate": 1.0402659401094154e-05, "loss": 0.4511, "step": 94 }, { "epoch": 1.0909090909090908, "grad_norm": 0.3079903721809387, "learning_rate": 1.0201370532654404e-05, "loss": 0.4757, "step": 95 }, { "epoch": 1.1023923444976076, "grad_norm": 0.32171630859375, "learning_rate": 1e-05, "loss": 0.4764, "step": 96 }, { "epoch": 1.1138755980861244, "grad_norm": 0.2739886939525604, "learning_rate": 9.7986294673456e-06, "loss": 0.4183, "step": 97 }, { "epoch": 1.1253588516746411, "grad_norm": 0.31871333718299866, "learning_rate": 9.597340598905851e-06, "loss": 0.4677, "step": 98 }, { "epoch": 1.1368421052631579, "grad_norm": 0.28314825892448425, "learning_rate": 9.39621502577714e-06, "loss": 0.4345, "step": 99 }, { "epoch": 1.1483253588516746, "grad_norm": 0.2956909239292145, "learning_rate": 9.195334312832742e-06, "loss": 0.456, "step": 100 }, { "epoch": 1.1607655502392344, "grad_norm": 0.3102877140045166, "learning_rate": 8.994779925644832e-06, "loss": 0.4653, "step": 101 }, { "epoch": 1.1722488038277512, "grad_norm": 0.2631514072418213, "learning_rate": 8.79463319744677e-06, "loss": 0.4663, "step": 102 }, { "epoch": 1.183732057416268, "grad_norm": 0.2780362665653229, "learning_rate": 8.594975296149076e-06, "loss": 0.4598, "step": 103 }, { "epoch": 1.1952153110047847, "grad_norm": 0.30441659688949585, "learning_rate": 8.395887191422397e-06, "loss": 0.4603, "step": 104 }, { "epoch": 1.2066985645933015, "grad_norm": 0.31006890535354614, "learning_rate": 8.197449621860944e-06, "loss": 0.4691, "step": 105 }, { "epoch": 1.2181818181818183, "grad_norm": 0.24229736626148224, "learning_rate": 7.999743062239557e-06, "loss": 0.4363, "step": 106 }, { "epoch": 1.229665071770335, "grad_norm": 0.28089576959609985, "learning_rate": 7.802847690877832e-06, "loss": 0.4507, "step": 107 }, { "epoch": 1.2411483253588518, "grad_norm": 0.2989681661128998, "learning_rate": 7.606843357124426e-06, "loss": 0.4537, "step": 108 }, { "epoch": 1.2526315789473683, "grad_norm": 0.25250881910324097, "learning_rate": 7.411809548974792e-06, "loss": 0.4358, "step": 109 }, { "epoch": 1.2641148325358853, "grad_norm": 0.2874893546104431, "learning_rate": 7.217825360835475e-06, "loss": 0.4394, "step": 110 }, { "epoch": 1.2755980861244018, "grad_norm": 0.25158512592315674, "learning_rate": 7.024969461447973e-06, "loss": 0.4375, "step": 111 }, { "epoch": 1.2870813397129186, "grad_norm": 0.27635353803634644, "learning_rate": 6.833320061985278e-06, "loss": 0.4508, "step": 112 }, { "epoch": 1.2985645933014354, "grad_norm": 0.2624880373477936, "learning_rate": 6.6429548843339554e-06, "loss": 0.4608, "step": 113 }, { "epoch": 1.3100478468899521, "grad_norm": 0.2889038026332855, "learning_rate": 6.453951129574644e-06, "loss": 0.4451, "step": 114 }, { "epoch": 1.321531100478469, "grad_norm": 0.24713511765003204, "learning_rate": 6.266385446673791e-06, "loss": 0.4378, "step": 115 }, { "epoch": 1.3330143540669857, "grad_norm": 0.26973429322242737, "learning_rate": 6.080333901399252e-06, "loss": 0.4443, "step": 116 }, { "epoch": 1.3444976076555024, "grad_norm": 0.2887195348739624, "learning_rate": 5.895871945472434e-06, "loss": 0.4565, "step": 117 }, { "epoch": 1.3559808612440192, "grad_norm": 0.2864110469818115, "learning_rate": 5.713074385969457e-06, "loss": 0.4305, "step": 118 }, { "epoch": 1.367464114832536, "grad_norm": 0.2510843873023987, "learning_rate": 5.532015354983742e-06, "loss": 0.4472, "step": 119 }, { "epoch": 1.3789473684210527, "grad_norm": 0.27841120958328247, "learning_rate": 5.352768279562315e-06, "loss": 0.4548, "step": 120 }, { "epoch": 1.3904306220095695, "grad_norm": 0.253780722618103, "learning_rate": 5.175405851928068e-06, "loss": 0.4401, "step": 121 }, { "epoch": 1.401913875598086, "grad_norm": 0.2693757116794586, "learning_rate": 5.000000000000003e-06, "loss": 0.4624, "step": 122 }, { "epoch": 1.413397129186603, "grad_norm": 0.2563832104206085, "learning_rate": 4.826621858223431e-06, "loss": 0.4388, "step": 123 }, { "epoch": 1.4248803827751195, "grad_norm": 0.305113285779953, "learning_rate": 4.655341738721989e-06, "loss": 0.4546, "step": 124 }, { "epoch": 1.4363636363636363, "grad_norm": 0.2533874213695526, "learning_rate": 4.486229102783084e-06, "loss": 0.4561, "step": 125 }, { "epoch": 1.447846889952153, "grad_norm": 0.2530078589916229, "learning_rate": 4.319352532688444e-06, "loss": 0.4445, "step": 126 }, { "epoch": 1.4593301435406698, "grad_norm": 0.28587111830711365, "learning_rate": 4.154779703901114e-06, "loss": 0.4821, "step": 127 }, { "epoch": 1.4708133971291866, "grad_norm": 0.26422637701034546, "learning_rate": 3.99257735762021e-06, "loss": 0.4571, "step": 128 }, { "epoch": 1.4822966507177033, "grad_norm": 0.279634028673172, "learning_rate": 3.832811273714569e-06, "loss": 0.4514, "step": 129 }, { "epoch": 1.49377990430622, "grad_norm": 0.24408729374408722, "learning_rate": 3.6755462440462288e-06, "loss": 0.4405, "step": 130 }, { "epoch": 1.5052631578947369, "grad_norm": 0.277403324842453, "learning_rate": 3.5208460461946136e-06, "loss": 0.4676, "step": 131 }, { "epoch": 1.5167464114832536, "grad_norm": 0.2653951048851013, "learning_rate": 3.3687734175920505e-06, "loss": 0.4422, "step": 132 }, { "epoch": 1.5282296650717704, "grad_norm": 0.2684209644794464, "learning_rate": 3.2193900300810908e-06, "loss": 0.4287, "step": 133 }, { "epoch": 1.5397129186602871, "grad_norm": 0.25018298625946045, "learning_rate": 3.0727564649040066e-06, "loss": 0.4107, "step": 134 }, { "epoch": 1.5511961722488037, "grad_norm": 0.25643426179885864, "learning_rate": 2.9289321881345257e-06, "loss": 0.4542, "step": 135 }, { "epoch": 1.5626794258373207, "grad_norm": 0.2723328173160553, "learning_rate": 2.7879755265618558e-06, "loss": 0.4471, "step": 136 }, { "epoch": 1.5741626794258372, "grad_norm": 0.25238141417503357, "learning_rate": 2.6499436440367165e-06, "loss": 0.4452, "step": 137 }, { "epoch": 1.5856459330143542, "grad_norm": 0.25168266892433167, "learning_rate": 2.514892518288988e-06, "loss": 0.4369, "step": 138 }, { "epoch": 1.5971291866028707, "grad_norm": 0.34418055415153503, "learning_rate": 2.382876918226409e-06, "loss": 0.4419, "step": 139 }, { "epoch": 1.6086124401913877, "grad_norm": 0.2500324547290802, "learning_rate": 2.2539503817234553e-06, "loss": 0.4455, "step": 140 }, { "epoch": 1.6200956937799043, "grad_norm": 0.2515903115272522, "learning_rate": 2.1281651939094996e-06, "loss": 0.4511, "step": 141 }, { "epoch": 1.631578947368421, "grad_norm": 0.24545879662036896, "learning_rate": 2.0055723659649907e-06, "loss": 0.4452, "step": 142 }, { "epoch": 1.6430622009569378, "grad_norm": 0.277642160654068, "learning_rate": 1.8862216144342692e-06, "loss": 0.4596, "step": 143 }, { "epoch": 1.6545454545454545, "grad_norm": 0.2690809965133667, "learning_rate": 1.7701613410634367e-06, "loss": 0.4721, "step": 144 }, { "epoch": 1.6660287081339713, "grad_norm": 0.2773820459842682, "learning_rate": 1.6574386131713872e-06, "loss": 0.4669, "step": 145 }, { "epoch": 1.677511961722488, "grad_norm": 0.2979687452316284, "learning_rate": 1.5480991445620541e-06, "loss": 0.484, "step": 146 }, { "epoch": 1.6889952153110048, "grad_norm": 0.2545396685600281, "learning_rate": 1.4421872769855262e-06, "loss": 0.4429, "step": 147 }, { "epoch": 1.7004784688995214, "grad_norm": 0.2929985523223877, "learning_rate": 1.339745962155613e-06, "loss": 0.4636, "step": 148 }, { "epoch": 1.7119617224880384, "grad_norm": 0.27831029891967773, "learning_rate": 1.2408167443311215e-06, "loss": 0.4015, "step": 149 }, { "epoch": 1.723444976076555, "grad_norm": 0.2582859694957733, "learning_rate": 1.1454397434679022e-06, "loss": 0.4508, "step": 150 }, { "epoch": 1.7349282296650719, "grad_norm": 0.27638334035873413, "learning_rate": 1.0536536389485275e-06, "loss": 0.4316, "step": 151 }, { "epoch": 1.7464114832535884, "grad_norm": 0.2702406942844391, "learning_rate": 9.65495653896179e-07, "loss": 0.4347, "step": 152 }, { "epoch": 1.7578947368421054, "grad_norm": 0.286505788564682, "learning_rate": 8.810015400790994e-07, "loss": 0.4579, "step": 153 }, { "epoch": 1.769377990430622, "grad_norm": 0.23764821887016296, "learning_rate": 8.002055634117578e-07, "loss": 0.4148, "step": 154 }, { "epoch": 1.7808612440191387, "grad_norm": 0.2822108268737793, "learning_rate": 7.231404900585714e-07, "loss": 0.4443, "step": 155 }, { "epoch": 1.7923444976076555, "grad_norm": 0.24537336826324463, "learning_rate": 6.498375731458529e-07, "loss": 0.4583, "step": 156 }, { "epoch": 1.8038277511961722, "grad_norm": 0.2529117465019226, "learning_rate": 5.803265400873514e-07, "loss": 0.4448, "step": 157 }, { "epoch": 1.815311004784689, "grad_norm": 0.2595837712287903, "learning_rate": 5.146355805285452e-07, "loss": 0.4488, "step": 158 }, { "epoch": 1.8267942583732057, "grad_norm": 0.27293673157691956, "learning_rate": 4.5279133491454406e-07, "loss": 0.4448, "step": 159 }, { "epoch": 1.8382775119617225, "grad_norm": 0.27972596883773804, "learning_rate": 3.9481888368627764e-07, "loss": 0.4608, "step": 160 }, { "epoch": 1.8497607655502393, "grad_norm": 0.384954571723938, "learning_rate": 3.4074173710931804e-07, "loss": 0.4344, "step": 161 }, { "epoch": 1.861244019138756, "grad_norm": 0.2894757390022278, "learning_rate": 2.905818257394799e-07, "loss": 0.4483, "step": 162 }, { "epoch": 1.8727272727272726, "grad_norm": 0.31389251351356506, "learning_rate": 2.4435949152906144e-07, "loss": 0.4689, "step": 163 }, { "epoch": 1.8842105263157896, "grad_norm": 0.2728366255760193, "learning_rate": 2.0209347957732328e-07, "loss": 0.4479, "step": 164 }, { "epoch": 1.895693779904306, "grad_norm": 0.28522682189941406, "learning_rate": 1.6380093052856482e-07, "loss": 0.4329, "step": 165 }, { "epoch": 1.907177033492823, "grad_norm": 0.31500834226608276, "learning_rate": 1.2949737362087156e-07, "loss": 0.4493, "step": 166 }, { "epoch": 1.9186602870813396, "grad_norm": 0.28287577629089355, "learning_rate": 9.919672038835926e-08, "loss": 0.445, "step": 167 }, { "epoch": 1.9301435406698566, "grad_norm": 0.2714127004146576, "learning_rate": 7.291125901946027e-08, "loss": 0.4704, "step": 168 }, { "epoch": 1.9416267942583731, "grad_norm": 0.2918432950973511, "learning_rate": 5.0651649373544276e-08, "loss": 0.4601, "step": 169 }, { "epoch": 1.95311004784689, "grad_norm": 0.2897633910179138, "learning_rate": 3.242691865790071e-08, "loss": 0.4416, "step": 170 }, { "epoch": 1.9645933014354067, "grad_norm": 0.24492616951465607, "learning_rate": 1.824445776682504e-08, "loss": 0.4389, "step": 171 }, { "epoch": 1.9760765550239234, "grad_norm": 0.3015381693840027, "learning_rate": 8.110018284304132e-09, "loss": 0.4549, "step": 172 }, { "epoch": 1.9875598086124402, "grad_norm": 0.26895269751548767, "learning_rate": 2.0277101514987184e-09, "loss": 0.4451, "step": 173 }, { "epoch": 1.999043062200957, "grad_norm": 0.25817951560020447, "learning_rate": 0.0, "loss": 0.4523, "step": 174 }, { "epoch": 1.999043062200957, "step": 174, "total_flos": 665025606320128.0, "train_loss": 0.19090694274710512, "train_runtime": 9940.2778, "train_samples_per_second": 1.681, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 174, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 665025606320128.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }