diff --git "a/1x_A100_40GB/nohup.out" "b/1x_A100_40GB/nohup.out" --- "a/1x_A100_40GB/nohup.out" +++ "b/1x_A100_40GB/nohup.out" @@ -11654,4 +11654,8446 @@ step 11196/19560 | loss 3.359104 (-1.20z)| norm 0.2613 (-0.91z)| lr 2.47e-04 | 2 step 11197/19560 | loss 3.400577 (-0.31z)| norm 0.2807 (+0.05z)| lr 2.47e-04 | 2532.96 ms | 53.3% bf16 MFU | 206940 tok/s step 11198/19560 | loss 3.369091 (-0.99z)| norm 0.2669 (-0.63z)| lr 2.47e-04 | 2532.65 ms | 53.3% bf16 MFU | 206943 tok/s step 11199/19560 | loss 3.347360 (-1.44z)| norm 0.2744 (-0.26z)| lr 2.47e-04 | 2533.21 ms | 53.3% bf16 MFU | 206945 tok/s -step 11200/195 \ No newline at end of file +step 11200/19560 | loss 3.392591 (-0.46z)| norm 0.3024 (+1.13z)| lr 2.47e-04 | 2532.13 ms | 53.3% bf16 MFU | 206950 tok/s +step 11201/19560 | loss 3.343293 (-1.50z)| norm 0.2738 (-0.28z)| lr 2.47e-04 | 2532.25 ms | 53.3% bf16 MFU | 206955 tok/s +step 11202/19560 | loss 3.471011 (+1.22z)| norm 0.2801 (+0.04z)| lr 2.47e-04 | 2534.74 ms | 53.3% bf16 MFU | 206949 tok/s +step 11203/19560 | loss 3.386312 (-0.58z)| norm 0.2622 (-0.84z)| lr 2.47e-04 | 2534.21 ms | 53.3% bf16 MFU | 206946 tok/s +step 11204/19560 | loss 3.420524 (+0.14z)| norm 0.2831 (+0.20z)| lr 2.47e-04 | 2533.53 ms | 53.3% bf16 MFU | 206946 tok/s +step 11205/19560 | loss 3.402011 (-0.27z)| norm 0.2705 (-0.42z)| lr 2.47e-04 | 2534.79 ms | 53.3% bf16 MFU | 206940 tok/s +step 11206/19560 | loss 3.380336 (-0.72z)| norm 0.2507 (-1.39z)| lr 2.47e-04 | 2534.56 ms | 53.3% bf16 MFU | 206936 tok/s +step 11207/19560 | loss 3.485972 (+1.51z)| norm 0.2689 (-0.49z)| lr 2.47e-04 | 2535.28 ms | 53.3% bf16 MFU | 206929 tok/s +step 11208/19560 | loss 3.337027 (-1.62z)| norm 0.2774 (-0.06z)| lr 2.46e-04 | 2533.27 ms | 53.3% bf16 MFU | 206930 tok/s +step 11209/19560 | loss 3.373604 (-0.84z)| norm 0.2652 (-0.67z)| lr 2.46e-04 | 2534.50 ms | 53.3% bf16 MFU | 206927 tok/s +step 11210/19560 | loss 3.462321 (+1.02z)| norm 0.2978 (+0.93z)| lr 2.46e-04 | 2533.49 ms | 53.3% bf16 MFU | 206928 tok/s +step 11211/19560 | loss 3.462509 (+1.02z)| norm 0.2628 (-0.79z)| lr 2.46e-04 | 2533.34 ms | 53.3% bf16 MFU | 206929 tok/s +step 11212/19560 | loss 3.408409 (-0.11z)| norm 0.2968 (+0.88z)| lr 2.46e-04 | 2533.94 ms | 53.3% bf16 MFU | 206928 tok/s +step 11213/19560 | loss 3.373142 (-0.86z)| norm 0.2745 (-0.22z)| lr 2.46e-04 | 2535.55 ms | 53.2% bf16 MFU | 206920 tok/s +step 11214/19560 | loss 3.372562 (-0.86z)| norm 0.2751 (-0.18z)| lr 2.46e-04 | 2531.69 ms | 53.3% bf16 MFU | 206929 tok/s +step 11215/19560 | loss 3.416002 (+0.05z)| norm 0.2694 (-0.45z)| lr 2.46e-04 | 2534.32 ms | 53.3% bf16 MFU | 206926 tok/s +step 11216/19560 | loss 3.487318 (+1.53z)| norm 0.2706 (-0.38z)| lr 2.46e-04 | 2532.85 ms | 53.3% bf16 MFU | 206930 tok/s +step 11217/19560 | loss 3.412849 (-0.03z)| norm 0.2637 (-0.71z)| lr 2.46e-04 | 2533.84 ms | 53.3% bf16 MFU | 206929 tok/s +step 11218/19560 | loss 3.428332 (+0.30z)| norm 0.2927 (+0.73z)| lr 2.46e-04 | 2534.81 ms | 53.3% bf16 MFU | 206924 tok/s +step 11219/19560 | loss 3.382736 (-0.65z)| norm 0.2552 (-1.12z)| lr 2.46e-04 | 2534.18 ms | 53.3% bf16 MFU | 206922 tok/s +step 11220/19560 | loss 3.423297 (+0.20z)| norm 0.2622 (-0.76z)| lr 2.46e-04 | 2533.80 ms | 53.3% bf16 MFU | 206922 tok/s +step 11221/19560 | loss 3.421196 (+0.15z)| norm 0.2593 (-0.89z)| lr 2.46e-04 | 2533.99 ms | 53.3% bf16 MFU | 206921 tok/s +step 11222/19560 | loss 3.412929 (-0.03z)| norm 0.2672 (-0.49z)| lr 2.46e-04 | 2533.48 ms | 53.3% bf16 MFU | 206922 tok/s +step 11223/19560 | loss 3.363047 (-1.09z)| norm 0.2820 (+0.24z)| lr 2.46e-04 | 2534.66 ms | 53.3% bf16 MFU | 206918 tok/s +step 11224/19560 | loss 3.394569 (-0.41z)| norm 0.2583 (-0.93z)| lr 2.46e-04 | 2535.09 ms | 53.3% bf16 MFU | 206913 tok/s +step 11225/19560 | loss 3.358034 (-1.18z)| norm 0.2736 (-0.18z)| lr 2.46e-04 | 2533.51 ms | 53.3% bf16 MFU | 206915 tok/s +step 11226/19560 | loss 3.400948 (-0.27z)| norm 0.2738 (-0.17z)| lr 2.46e-04 | 2532.93 ms | 53.3% bf16 MFU | 206918 tok/s +step 11227/19560 | loss 3.406020 (-0.16z)| norm 0.2562 (-1.12z)| lr 2.46e-04 | 2533.25 ms | 53.3% bf16 MFU | 206920 tok/s +step 11228/19560 | loss 3.345462 (-1.42z)| norm 0.2873 (+0.63z)| lr 2.45e-04 | 2532.72 ms | 53.3% bf16 MFU | 206925 tok/s +step 11229/19560 | loss 3.351834 (-1.27z)| norm 0.2597 (-0.92z)| lr 2.45e-04 | 2532.12 ms | 53.3% bf16 MFU | 206931 tok/s +step 11230/19560 | loss 3.519990 (+2.18z)| norm 0.2962 (+1.13z)| lr 2.45e-04 | 2532.81 ms | 53.3% bf16 MFU | 206935 tok/s +step 11231/19560 | loss 3.374278 (-0.79z)| norm 0.2467 (-1.62z)| lr 2.45e-04 | 2536.07 ms | 53.2% bf16 MFU | 206924 tok/s +step 11232/19560 | loss 3.390808 (-0.45z)| norm 0.2701 (-0.32z)| lr 2.45e-04 | 2535.02 ms | 53.3% bf16 MFU | 206919 tok/s +step 11233/19560 | loss 3.418746 (+0.13z)| norm 0.2624 (-0.73z)| lr 2.45e-04 | 2534.23 ms | 53.3% bf16 MFU | 206917 tok/s +step 11234/19560 | loss 3.439556 (+0.55z)| norm 0.2638 (-0.64z)| lr 2.45e-04 | 2533.84 ms | 53.3% bf16 MFU | 206917 tok/s +step 11235/19560 | loss 3.370116 (-0.86z)| norm 0.2585 (-0.93z)| lr 2.45e-04 | 2535.14 ms | 53.3% bf16 MFU | 206912 tok/s +step 11236/19560 | loss 3.375669 (-0.76z)| norm 0.2667 (-0.47z)| lr 2.45e-04 | 2533.23 ms | 53.3% bf16 MFU | 206914 tok/s +step 11237/19560 | loss 3.435187 (+0.47z)| norm 0.2604 (-0.81z)| lr 2.45e-04 | 2532.11 ms | 53.3% bf16 MFU | 206921 tok/s +step 11238/19560 | loss 3.391211 (-0.43z)| norm 0.2649 (-0.55z)| lr 2.45e-04 | 2530.95 ms | 53.3% bf16 MFU | 206933 tok/s +step 11239/19560 | loss 3.445644 (+0.71z)| norm 0.3196 (+2.43z)| lr 2.45e-04 | 2533.49 ms | 53.3% bf16 MFU | 206933 tok/s +step 11240/19560 | loss 3.375662 (-0.75z)| norm 0.2681 (-0.39z)| lr 2.45e-04 | 2533.41 ms | 53.3% bf16 MFU | 206934 tok/s +step 11241/19560 | loss 3.392775 (-0.39z)| norm 0.2631 (-0.66z)| lr 2.45e-04 | 2531.13 ms | 53.3% bf16 MFU | 206944 tok/s +step 11242/19560 | loss 3.328936 (-1.68z)| norm 0.2660 (-0.50z)| lr 2.45e-04 | 2532.22 ms | 53.3% bf16 MFU | 206949 tok/s +step 11243/19560 | loss 3.460366 (+1.00z)| norm 0.2768 (+0.08z)| lr 2.45e-04 | 2533.76 ms | 53.3% bf16 MFU | 206948 tok/s +step 11244/19560 | loss 3.380841 (-0.62z)| norm 0.2787 (+0.19z)| lr 2.45e-04 | 2532.71 ms | 53.3% bf16 MFU | 206951 tok/s +step 11245/19560 | loss 3.444508 (+0.74z)| norm 0.2718 (-0.19z)| lr 2.45e-04 | 2532.64 ms | 53.3% bf16 MFU | 206954 tok/s +step 11246/19560 | loss 3.403067 (-0.15z)| norm 0.2748 (-0.03z)| lr 2.45e-04 | 2532.03 ms | 53.3% bf16 MFU | 206959 tok/s +step 11247/19560 | loss 3.448509 (+0.81z)| norm 0.2962 (+1.13z)| lr 2.45e-04 | 2531.51 ms | 53.3% bf16 MFU | 206967 tok/s +step 11248/19560 | loss 3.404295 (-0.13z)| norm 0.2776 (+0.11z)| lr 2.45e-04 | 2532.50 ms | 53.3% bf16 MFU | 206970 tok/s +step 11249/19560 | loss 3.382132 (-0.61z)| norm 0.2692 (-0.35z)| lr 2.44e-04 | 2534.87 ms | 53.3% bf16 MFU | 206963 tok/s +step 11250/19560 | loss 3.408407 (-0.04z)| norm 0.3243 (+2.58z)| lr 2.44e-04 | 2532.40 ms | 53.3% bf16 MFU | 206966 tok/s +val loss 3.381608 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2934/10042 = 0.292173 +step 11251/19560 | loss 3.414132 (+0.08z)| norm 0.3161 (+2.09z)| lr 2.44e-04 | 2533.10 ms | 53.3% bf16 MFU | 206966 tok/s +step 11252/19560 | loss 3.372873 (-0.79z)| norm 0.2770 (+0.03z)| lr 2.44e-04 | 2532.36 ms | 53.3% bf16 MFU | 206970 tok/s +step 11253/19560 | loss 3.507972 (+2.08z)| norm 0.3044 (+1.45z)| lr 2.44e-04 | 2533.30 ms | 53.3% bf16 MFU | 206969 tok/s +step 11254/19560 | loss 3.332907 (-1.64z)| norm 0.2996 (+1.19z)| lr 2.44e-04 | 2533.01 ms | 53.3% bf16 MFU | 206970 tok/s +step 11255/19560 | loss 3.417545 (+0.16z)| norm 0.2781 (+0.07z)| lr 2.44e-04 | 2531.78 ms | 53.3% bf16 MFU | 206976 tok/s +step 11256/19560 | loss 3.364316 (-0.96z)| norm 0.2873 (+0.55z)| lr 2.44e-04 | 2532.85 ms | 53.3% bf16 MFU | 206977 tok/s +step 11257/19560 | loss 3.363840 (-0.96z)| norm 0.2798 (+0.16z)| lr 2.44e-04 | 2532.94 ms | 53.3% bf16 MFU | 206977 tok/s +step 11258/19560 | loss 3.373484 (-0.74z)| norm 0.2531 (-1.21z)| lr 2.44e-04 | 2534.04 ms | 53.3% bf16 MFU | 206973 tok/s +step 11259/19560 | loss 3.373091 (-0.75z)| norm 0.2615 (-0.78z)| lr 2.44e-04 | 2534.98 ms | 53.3% bf16 MFU | 206966 tok/s +step 11260/19560 | loss 3.327714 (-1.67z)| norm 0.2732 (-0.17z)| lr 2.44e-04 | 2534.00 ms | 53.3% bf16 MFU | 206962 tok/s +step 11261/19560 | loss 3.377576 (-0.62z)| norm 0.2402 (-1.86z)| lr 2.44e-04 | 2534.88 ms | 53.3% bf16 MFU | 206956 tok/s +step 11262/19560 | loss 3.412048 (+0.11z)| norm 0.2608 (-0.80z)| lr 2.44e-04 | 2533.70 ms | 53.3% bf16 MFU | 206954 tok/s +step 11263/19560 | loss 3.366421 (-0.83z)| norm 0.2708 (-0.28z)| lr 2.44e-04 | 2533.23 ms | 53.3% bf16 MFU | 206955 tok/s +step 11264/19560 | loss 3.365143 (-0.86z)| norm 0.2617 (-0.75z)| lr 2.44e-04 | 2534.06 ms | 53.3% bf16 MFU | 206952 tok/s +step 11265/19560 | loss 3.373435 (-0.67z)| norm 0.2745 (-0.09z)| lr 2.44e-04 | 2534.54 ms | 53.3% bf16 MFU | 206947 tok/s +step 11266/19560 | loss 3.393282 (-0.25z)| norm 0.2705 (-0.29z)| lr 2.44e-04 | 2535.29 ms | 53.3% bf16 MFU | 206939 tok/s +step 11267/19560 | loss 3.427930 (+0.47z)| norm 0.2765 (+0.01z)| lr 2.44e-04 | 2533.57 ms | 53.3% bf16 MFU | 206939 tok/s +step 11268/19560 | loss 3.378576 (-0.56z)| norm 0.2801 (+0.19z)| lr 2.44e-04 | 2532.41 ms | 53.3% bf16 MFU | 206944 tok/s +step 11269/19560 | loss 3.470917 (+1.37z)| norm 0.2674 (-0.46z)| lr 2.43e-04 | 2535.73 ms | 53.2% bf16 MFU | 206935 tok/s +step 11270/19560 | loss 3.372039 (-0.69z)| norm 0.2482 (-1.44z)| lr 2.43e-04 | 2534.01 ms | 53.3% bf16 MFU | 206933 tok/s +step 11271/19560 | loss 3.354616 (-1.04z)| norm 0.2818 (+0.28z)| lr 2.43e-04 | 2531.87 ms | 53.3% bf16 MFU | 206940 tok/s +step 11272/19560 | loss 3.362202 (-0.87z)| norm 0.2732 (-0.17z)| lr 2.43e-04 | 2531.40 ms | 53.3% bf16 MFU | 206949 tok/s +step 11273/19560 | loss 3.330621 (-1.50z)| norm 0.2658 (-0.55z)| lr 2.43e-04 | 2534.54 ms | 53.3% bf16 MFU | 206944 tok/s +step 11274/19560 | loss 3.444519 (+0.85z)| norm 0.2658 (-0.56z)| lr 2.43e-04 | 2533.25 ms | 53.3% bf16 MFU | 206945 tok/s +step 11275/19560 | loss 3.467237 (+1.30z)| norm 0.2685 (-0.42z)| lr 2.43e-04 | 2532.17 ms | 53.3% bf16 MFU | 206950 tok/s +step 11276/19560 | loss 3.352424 (-1.04z)| norm 0.2479 (-1.48z)| lr 2.43e-04 | 2532.91 ms | 53.3% bf16 MFU | 206952 tok/s +step 11277/19560 | loss 3.358039 (-0.92z)| norm 0.2610 (-0.79z)| lr 2.43e-04 | 2531.73 ms | 53.3% bf16 MFU | 206959 tok/s +step 11278/19560 | loss 3.372966 (-0.60z)| norm 0.2452 (-1.60z)| lr 2.43e-04 | 2533.70 ms | 53.3% bf16 MFU | 206958 tok/s +step 11279/19560 | loss 3.388563 (-0.27z)| norm 0.2631 (-0.67z)| lr 2.43e-04 | 2532.63 ms | 53.3% bf16 MFU | 206960 tok/s +step 11280/19560 | loss 3.404757 (+0.06z)| norm 0.2523 (-1.22z)| lr 2.43e-04 | 2534.06 ms | 53.3% bf16 MFU | 206957 tok/s +step 11281/19560 | loss 3.357634 (-0.89z)| norm 0.2803 (+0.21z)| lr 2.43e-04 | 2532.90 ms | 53.3% bf16 MFU | 206959 tok/s +step 11282/19560 | loss 3.330331 (-1.44z)| norm 0.2656 (-0.54z)| lr 2.43e-04 | 2534.35 ms | 53.3% bf16 MFU | 206954 tok/s +step 11283/19560 | loss 3.378894 (-0.45z)| norm 0.2673 (-0.46z)| lr 2.43e-04 | 2534.63 ms | 53.3% bf16 MFU | 206949 tok/s +step 11284/19560 | loss 3.351783 (-0.98z)| norm 0.2518 (-1.25z)| lr 2.43e-04 | 2533.35 ms | 53.3% bf16 MFU | 206950 tok/s +step 11285/19560 | loss 3.344620 (-1.11z)| norm 0.2932 (+0.86z)| lr 2.43e-04 | 2535.00 ms | 53.3% bf16 MFU | 206943 tok/s +step 11286/19560 | loss 3.397442 (-0.03z)| norm 0.2662 (-0.51z)| lr 2.43e-04 | 2532.71 ms | 53.3% bf16 MFU | 206946 tok/s +step 11287/19560 | loss 3.377085 (-0.44z)| norm 0.2628 (-0.69z)| lr 2.43e-04 | 2535.28 ms | 53.3% bf16 MFU | 206939 tok/s +step 11288/19560 | loss 3.382761 (-0.32z)| norm 0.2630 (-0.69z)| lr 2.43e-04 | 2534.22 ms | 53.3% bf16 MFU | 206936 tok/s +step 11289/19560 | loss 3.424796 (+0.53z)| norm 0.2650 (-0.58z)| lr 2.42e-04 | 2532.26 ms | 53.3% bf16 MFU | 206941 tok/s +step 11290/19560 | loss 3.418512 (+0.40z)| norm 0.2944 (+0.92z)| lr 2.42e-04 | 2534.69 ms | 53.3% bf16 MFU | 206936 tok/s +step 11291/19560 | loss 3.366748 (-0.65z)| norm 0.2853 (+0.45z)| lr 2.42e-04 | 2533.73 ms | 53.3% bf16 MFU | 206936 tok/s +step 11292/19560 | loss 3.393960 (-0.10z)| norm 0.2942 (+0.90z)| lr 2.42e-04 | 2533.58 ms | 53.3% bf16 MFU | 206936 tok/s +step 11293/19560 | loss 3.341628 (-1.16z)| norm 0.2891 (+0.63z)| lr 2.42e-04 | 2533.51 ms | 53.3% bf16 MFU | 206936 tok/s +step 11294/19560 | loss 3.406426 (+0.15z)| norm 0.2681 (-0.44z)| lr 2.42e-04 | 2532.88 ms | 53.3% bf16 MFU | 206939 tok/s +step 11295/19560 | loss 3.399129 (-0.00z)| norm 0.3198 (+2.16z)| lr 2.42e-04 | 2533.13 ms | 53.3% bf16 MFU | 206941 tok/s +step 11296/19560 | loss 3.381479 (-0.36z)| norm 0.2896 (+0.64z)| lr 2.42e-04 | 2532.60 ms | 53.3% bf16 MFU | 206944 tok/s +step 11297/19560 | loss 3.443995 (+0.91z)| norm 0.2597 (-0.86z)| lr 2.42e-04 | 2533.84 ms | 53.3% bf16 MFU | 206943 tok/s +step 11298/19560 | loss 3.369270 (-0.63z)| norm 0.2815 (+0.25z)| lr 2.42e-04 | 2533.43 ms | 53.3% bf16 MFU | 206943 tok/s +step 11299/19560 | loss 3.390873 (-0.17z)| norm 0.2571 (-0.98z)| lr 2.42e-04 | 2533.81 ms | 53.3% bf16 MFU | 206942 tok/s +step 11300/19560 | loss 3.389719 (-0.20z)| norm 0.2625 (-0.69z)| lr 2.42e-04 | 2533.72 ms | 53.3% bf16 MFU | 206941 tok/s +step 11301/19560 | loss 3.417522 (+0.37z)| norm 0.2520 (-1.21z)| lr 2.42e-04 | 2532.66 ms | 53.3% bf16 MFU | 206944 tok/s +step 11302/19560 | loss 3.441434 (+0.86z)| norm 0.2775 (+0.09z)| lr 2.42e-04 | 2533.44 ms | 53.3% bf16 MFU | 206945 tok/s +step 11303/19560 | loss 3.371253 (-0.58z)| norm 0.2876 (+0.60z)| lr 2.42e-04 | 2533.83 ms | 53.3% bf16 MFU | 206943 tok/s +step 11304/19560 | loss 3.379236 (-0.41z)| norm 0.2806 (+0.25z)| lr 2.42e-04 | 2533.60 ms | 53.3% bf16 MFU | 206943 tok/s +step 11305/19560 | loss 3.396494 (-0.05z)| norm 0.2817 (+0.30z)| lr 2.42e-04 | 2533.51 ms | 53.3% bf16 MFU | 206943 tok/s +step 11306/19560 | loss 3.376992 (-0.45z)| norm 0.2588 (-0.87z)| lr 2.42e-04 | 2535.28 ms | 53.3% bf16 MFU | 206935 tok/s +step 11307/19560 | loss 3.439437 (+0.83z)| norm 0.2770 (+0.08z)| lr 2.42e-04 | 2533.42 ms | 53.3% bf16 MFU | 206936 tok/s +step 11308/19560 | loss 3.378910 (-0.43z)| norm 0.2580 (-0.89z)| lr 2.42e-04 | 2533.25 ms | 53.3% bf16 MFU | 206937 tok/s +step 11309/19560 | loss 3.436229 (+0.75z)| norm 0.2774 (+0.11z)| lr 2.42e-04 | 2532.06 ms | 53.3% bf16 MFU | 206943 tok/s +step 11310/19560 | loss 3.459084 (+1.35z)| norm 0.2950 (+1.17z)| lr 2.41e-04 | 2532.68 ms | 53.3% bf16 MFU | 206947 tok/s +step 11311/19560 | loss 3.386151 (-0.29z)| norm 0.2967 (+1.28z)| lr 2.41e-04 | 2534.41 ms | 53.3% bf16 MFU | 206943 tok/s +step 11312/19560 | loss 3.384915 (-0.30z)| norm 0.2559 (-1.14z)| lr 2.41e-04 | 2534.21 ms | 53.3% bf16 MFU | 206940 tok/s +step 11313/19560 | loss 3.374281 (-0.57z)| norm 0.3024 (+1.80z)| lr 2.41e-04 | 2533.37 ms | 53.3% bf16 MFU | 206940 tok/s +step 11314/19560 | loss 3.467201 (+1.82z)| norm 0.2630 (-0.68z)| lr 2.41e-04 | 2533.92 ms | 53.3% bf16 MFU | 206939 tok/s +step 11315/19560 | loss 3.348826 (-1.21z)| norm 0.2842 (+0.67z)| lr 2.41e-04 | 2532.51 ms | 53.3% bf16 MFU | 206943 tok/s +step 11316/19560 | loss 3.377113 (-0.49z)| norm 0.2612 (-0.78z)| lr 2.41e-04 | 2533.71 ms | 53.3% bf16 MFU | 206942 tok/s +step 11317/19560 | loss 3.299222 (-2.42z)| norm 0.2769 (+0.23z)| lr 2.41e-04 | 2535.59 ms | 53.2% bf16 MFU | 206934 tok/s +step 11318/19560 | loss 3.416932 (+0.56z)| norm 0.2844 (+0.69z)| lr 2.41e-04 | 2535.13 ms | 53.3% bf16 MFU | 206927 tok/s +step 11319/19560 | loss 3.376146 (-0.48z)| norm 0.2708 (-0.16z)| lr 2.41e-04 | 2533.04 ms | 53.3% bf16 MFU | 206930 tok/s +step 11320/19560 | loss 3.382205 (-0.32z)| norm 0.2768 (+0.22z)| lr 2.41e-04 | 2531.81 ms | 53.3% bf16 MFU | 206938 tok/s +step 11321/19560 | loss 3.384293 (-0.25z)| norm 0.2868 (+0.85z)| lr 2.41e-04 | 2533.24 ms | 53.3% bf16 MFU | 206939 tok/s +step 11322/19560 | loss 3.394138 (-0.01z)| norm 0.2907 (+1.08z)| lr 2.41e-04 | 2533.50 ms | 53.3% bf16 MFU | 206939 tok/s +step 11323/19560 | loss 3.440481 (+1.15z)| norm 0.2945 (+1.31z)| lr 2.41e-04 | 2535.09 ms | 53.3% bf16 MFU | 206933 tok/s +step 11324/19560 | loss 3.389000 (-0.15z)| norm 0.2511 (-1.42z)| lr 2.41e-04 | 2531.62 ms | 53.3% bf16 MFU | 206941 tok/s +step 11325/19560 | loss 3.391426 (-0.09z)| norm 0.2917 (+1.12z)| lr 2.41e-04 | 2530.84 ms | 53.3% bf16 MFU | 206952 tok/s +step 11326/19560 | loss 3.392052 (-0.08z)| norm 0.2882 (+0.89z)| lr 2.41e-04 | 2533.27 ms | 53.3% bf16 MFU | 206952 tok/s +step 11327/19560 | loss 3.395509 (-0.00z)| norm 0.2657 (-0.51z)| lr 2.41e-04 | 2533.06 ms | 53.3% bf16 MFU | 206954 tok/s +step 11328/19560 | loss 3.396256 (+0.02z)| norm 0.2798 (+0.38z)| lr 2.41e-04 | 2531.65 ms | 53.3% bf16 MFU | 206961 tok/s +step 11329/19560 | loss 3.343205 (-1.34z)| norm 0.2756 (+0.12z)| lr 2.41e-04 | 2533.26 ms | 53.3% bf16 MFU | 206961 tok/s +step 11330/19560 | loss 3.396343 (+0.03z)| norm 0.2664 (-0.46z)| lr 2.40e-04 | 2532.07 ms | 53.3% bf16 MFU | 206966 tok/s +step 11331/19560 | loss 3.386765 (-0.21z)| norm 0.2696 (-0.26z)| lr 2.40e-04 | 2533.83 ms | 53.3% bf16 MFU | 206963 tok/s +step 11332/19560 | loss 3.352375 (-1.09z)| norm 0.2787 (+0.32z)| lr 2.40e-04 | 2532.51 ms | 53.3% bf16 MFU | 206966 tok/s +step 11333/19560 | loss 3.346804 (-1.22z)| norm 0.2632 (-0.65z)| lr 2.40e-04 | 2532.15 ms | 53.3% bf16 MFU | 206970 tok/s +step 11334/19560 | loss 3.423986 (+0.76z)| norm 0.2911 (+1.09z)| lr 2.40e-04 | 2533.55 ms | 53.3% bf16 MFU | 206969 tok/s +step 11335/19560 | loss 3.423721 (+0.78z)| norm 0.2763 (+0.15z)| lr 2.40e-04 | 2531.78 ms | 53.3% bf16 MFU | 206974 tok/s +step 11336/19560 | loss 3.389790 (-0.12z)| norm 0.2520 (-1.36z)| lr 2.40e-04 | 2531.90 ms | 53.3% bf16 MFU | 206979 tok/s +step 11337/19560 | loss 3.467578 (+1.89z)| norm 0.2829 (+0.57z)| lr 2.40e-04 | 2531.61 ms | 53.3% bf16 MFU | 206985 tok/s +step 11338/19560 | loss 3.488882 (+2.42z)| norm 0.2712 (-0.16z)| lr 2.40e-04 | 2532.15 ms | 53.3% bf16 MFU | 206989 tok/s +step 11339/19560 | loss 3.379259 (-0.40z)| norm 0.2615 (-0.77z)| lr 2.40e-04 | 2531.99 ms | 53.3% bf16 MFU | 206992 tok/s +step 11340/19560 | loss 3.403743 (+0.24z)| norm 0.2664 (-0.45z)| lr 2.40e-04 | 2533.12 ms | 53.3% bf16 MFU | 206991 tok/s +step 11341/19560 | loss 3.458200 (+1.63z)| norm 0.2483 (-1.58z)| lr 2.40e-04 | 2532.94 ms | 53.3% bf16 MFU | 206991 tok/s +step 11342/19560 | loss 3.483160 (+2.21z)| norm 0.2628 (-0.65z)| lr 2.40e-04 | 2534.16 ms | 53.3% bf16 MFU | 206986 tok/s +step 11343/19560 | loss 3.359751 (-0.91z)| norm 0.2701 (-0.19z)| lr 2.40e-04 | 2532.79 ms | 53.3% bf16 MFU | 206987 tok/s +step 11344/19560 | loss 3.346419 (-1.24z)| norm 0.2606 (-0.78z)| lr 2.40e-04 | 2534.27 ms | 53.3% bf16 MFU | 206981 tok/s +step 11345/19560 | loss 3.388796 (-0.14z)| norm 0.2654 (-0.48z)| lr 2.40e-04 | 2532.24 ms | 53.3% bf16 MFU | 206985 tok/s +step 11346/19560 | loss 3.400825 (+0.17z)| norm 0.2856 (+0.79z)| lr 2.40e-04 | 2533.96 ms | 53.3% bf16 MFU | 206981 tok/s +step 11347/19560 | loss 3.383753 (-0.27z)| norm 0.2689 (-0.27z)| lr 2.40e-04 | 2532.60 ms | 53.3% bf16 MFU | 206982 tok/s +step 11348/19560 | loss 3.363807 (-0.77z)| norm 0.2829 (+0.61z)| lr 2.40e-04 | 2534.97 ms | 53.3% bf16 MFU | 206974 tok/s +step 11349/19560 | loss 3.382616 (-0.28z)| norm 0.2794 (+0.38z)| lr 2.40e-04 | 2533.45 ms | 53.3% bf16 MFU | 206973 tok/s +step 11350/19560 | loss 3.351404 (-1.07z)| norm 0.2812 (+0.48z)| lr 2.40e-04 | 2533.04 ms | 53.3% bf16 MFU | 206973 tok/s +step 11351/19560 | loss 3.348155 (-1.15z)| norm 0.2775 (+0.26z)| lr 2.39e-04 | 2532.30 ms | 53.3% bf16 MFU | 206977 tok/s +step 11352/19560 | loss 3.339954 (-1.34z)| norm 0.2555 (-1.14z)| lr 2.39e-04 | 2533.31 ms | 53.3% bf16 MFU | 206976 tok/s +step 11353/19560 | loss 3.380716 (-0.30z)| norm 0.2615 (-0.75z)| lr 2.39e-04 | 2533.82 ms | 53.3% bf16 MFU | 206973 tok/s +step 11354/19560 | loss 3.352552 (-1.01z)| norm 0.2568 (-1.04z)| lr 2.39e-04 | 2533.34 ms | 53.3% bf16 MFU | 206972 tok/s +step 11355/19560 | loss 3.341669 (-1.27z)| norm 0.2504 (-1.44z)| lr 2.39e-04 | 2533.04 ms | 53.3% bf16 MFU | 206972 tok/s +step 11356/19560 | loss 3.376441 (-0.39z)| norm 0.2583 (-0.92z)| lr 2.39e-04 | 2532.34 ms | 53.3% bf16 MFU | 206975 tok/s +step 11357/19560 | loss 3.366643 (-0.65z)| norm 0.2659 (-0.45z)| lr 2.39e-04 | 2532.82 ms | 53.3% bf16 MFU | 206977 tok/s +step 11358/19560 | loss 3.426647 (+0.94z)| norm 0.2589 (-0.88z)| lr 2.39e-04 | 2532.17 ms | 53.3% bf16 MFU | 206980 tok/s +step 11359/19560 | loss 3.414688 (+0.61z)| norm 0.2900 (+1.08z)| lr 2.39e-04 | 2534.56 ms | 53.3% bf16 MFU | 206974 tok/s +step 11360/19560 | loss 3.390583 (-0.03z)| norm 0.2650 (-0.51z)| lr 2.39e-04 | 2530.64 ms | 53.4% bf16 MFU | 206984 tok/s +step 11361/19560 | loss 3.427973 (+0.96z)| norm 0.2738 (+0.04z)| lr 2.39e-04 | 2533.94 ms | 53.3% bf16 MFU | 206980 tok/s +step 11362/19560 | loss 3.324684 (-1.74z)| norm 0.2576 (-0.99z)| lr 2.39e-04 | 2532.07 ms | 53.3% bf16 MFU | 206984 tok/s +step 11363/19560 | loss 3.380323 (-0.28z)| norm 0.2590 (-0.90z)| lr 2.39e-04 | 2532.50 ms | 53.3% bf16 MFU | 206986 tok/s +step 11364/19560 | loss 3.365701 (-0.66z)| norm 0.2868 (+0.86z)| lr 2.39e-04 | 2532.54 ms | 53.3% bf16 MFU | 206988 tok/s +step 11365/19560 | loss 3.397345 (+0.18z)| norm 0.2783 (+0.31z)| lr 2.39e-04 | 2532.95 ms | 53.3% bf16 MFU | 206988 tok/s +step 11366/19560 | loss 3.388738 (-0.05z)| norm 0.2791 (+0.36z)| lr 2.39e-04 | 2531.45 ms | 53.3% bf16 MFU | 206994 tok/s +step 11367/19560 | loss 3.488970 (+2.56z)| norm 0.2751 (+0.13z)| lr 2.39e-04 | 2534.23 ms | 53.3% bf16 MFU | 206988 tok/s +step 11368/19560 | loss 3.369657 (-0.55z)| norm 0.2714 (-0.12z)| lr 2.39e-04 | 2531.26 ms | 53.3% bf16 MFU | 206995 tok/s +step 11369/19560 | loss 3.399313 (+0.22z)| norm 0.2926 (+1.26z)| lr 2.39e-04 | 2533.28 ms | 53.3% bf16 MFU | 206993 tok/s +step 11370/19560 | loss 3.370096 (-0.55z)| norm 0.2537 (-1.29z)| lr 2.39e-04 | 2531.45 ms | 53.3% bf16 MFU | 206999 tok/s +step 11371/19560 | loss 3.353102 (-0.99z)| norm 0.2704 (-0.19z)| lr 2.38e-04 | 2532.92 ms | 53.3% bf16 MFU | 206999 tok/s +step 11372/19560 | loss 3.373233 (-0.45z)| norm 0.2866 (+0.86z)| lr 2.38e-04 | 2531.61 ms | 53.3% bf16 MFU | 207004 tok/s +step 11373/19560 | loss 3.367797 (-0.58z)| norm 0.2614 (-0.77z)| lr 2.38e-04 | 2534.11 ms | 53.3% bf16 MFU | 206998 tok/s +step 11374/19560 | loss 3.408432 (+0.50z)| norm 0.2814 (+0.53z)| lr 2.38e-04 | 2532.67 ms | 53.3% bf16 MFU | 206999 tok/s +step 11375/19560 | loss 3.378166 (-0.29z)| norm 0.2986 (+1.64z)| lr 2.38e-04 | 2532.83 ms | 53.3% bf16 MFU | 206999 tok/s +step 11376/19560 | loss 3.362460 (-0.71z)| norm 0.2630 (-0.66z)| lr 2.38e-04 | 2531.97 ms | 53.3% bf16 MFU | 207002 tok/s +step 11377/19560 | loss 3.396008 (+0.19z)| norm 0.2971 (+1.52z)| lr 2.38e-04 | 2532.52 ms | 53.3% bf16 MFU | 207003 tok/s +step 11378/19560 | loss 3.334002 (-1.45z)| norm 0.2604 (-0.84z)| lr 2.38e-04 | 2533.25 ms | 53.3% bf16 MFU | 207001 tok/s +step 11379/19560 | loss 3.391405 (+0.09z)| norm 0.3133 (+2.72z)| lr 2.38e-04 | 2533.34 ms | 53.3% bf16 MFU | 206999 tok/s +step 11380/19560 | loss 3.352154 (-0.95z)| norm 0.2703 (-0.17z)| lr 2.38e-04 | 2532.29 ms | 53.3% bf16 MFU | 207001 tok/s +step 11381/19560 | loss 3.447652 (+1.65z)| norm 0.2806 (+0.54z)| lr 2.38e-04 | 2532.40 ms | 53.3% bf16 MFU | 207002 tok/s +step 11382/19560 | loss 3.397604 (+0.27z)| norm 0.2750 (+0.18z)| lr 2.38e-04 | 2532.46 ms | 53.3% bf16 MFU | 207004 tok/s +step 11383/19560 | loss 3.409363 (+0.59z)| norm 0.2765 (+0.28z)| lr 2.38e-04 | 2533.57 ms | 53.3% bf16 MFU | 207000 tok/s +step 11384/19560 | loss 3.412285 (+0.66z)| norm 0.2915 (+1.31z)| lr 2.38e-04 | 2532.73 ms | 53.3% bf16 MFU | 207001 tok/s +step 11385/19560 | loss 3.381608 (-0.19z)| norm 0.2848 (+0.85z)| lr 2.38e-04 | 2532.41 ms | 53.3% bf16 MFU | 207002 tok/s +step 11386/19560 | loss 3.357012 (-0.87z)| norm 0.2690 (-0.26z)| lr 2.38e-04 | 2533.77 ms | 53.3% bf16 MFU | 206998 tok/s +step 11387/19560 | loss 3.381073 (-0.20z)| norm 0.2741 (+0.09z)| lr 2.38e-04 | 2533.69 ms | 53.3% bf16 MFU | 206994 tok/s +step 11388/19560 | loss 3.527512 (+3.65z)| norm 0.2813 (+0.59z)| lr 2.38e-04 | 2532.18 ms | 53.3% bf16 MFU | 206997 tok/s +step 11389/19560 | loss 3.389163 (-0.02z)| norm 0.2737 (+0.05z)| lr 2.38e-04 | 2531.16 ms | 53.3% bf16 MFU | 207004 tok/s +step 11390/19560 | loss 3.382590 (-0.19z)| norm 0.2775 (+0.30z)| lr 2.38e-04 | 2532.39 ms | 53.3% bf16 MFU | 207005 tok/s +step 11391/19560 | loss 3.385154 (-0.13z)| norm 0.2852 (+0.84z)| lr 2.37e-04 | 2532.87 ms | 53.3% bf16 MFU | 207005 tok/s +step 11392/19560 | loss 3.381814 (-0.22z)| norm 0.2599 (-0.95z)| lr 2.37e-04 | 2533.40 ms | 53.3% bf16 MFU | 207002 tok/s +step 11393/19560 | loss 3.415859 (+0.68z)| norm 0.2710 (-0.16z)| lr 2.37e-04 | 2534.19 ms | 53.3% bf16 MFU | 206996 tok/s +step 11394/19560 | loss 3.438692 (+1.27z)| norm 0.2725 (-0.05z)| lr 2.37e-04 | 2530.94 ms | 53.3% bf16 MFU | 207004 tok/s +step 11395/19560 | loss 3.374087 (-0.43z)| norm 0.2618 (-0.81z)| lr 2.37e-04 | 2534.28 ms | 53.3% bf16 MFU | 206998 tok/s +step 11396/19560 | loss 3.389449 (-0.03z)| norm 0.2747 (+0.11z)| lr 2.37e-04 | 2533.44 ms | 53.3% bf16 MFU | 206995 tok/s +step 11397/19560 | loss 3.425820 (+0.96z)| norm 0.2835 (+0.72z)| lr 2.37e-04 | 2531.68 ms | 53.3% bf16 MFU | 207000 tok/s +step 11398/19560 | loss 3.423759 (+0.89z)| norm 0.2725 (-0.07z)| lr 2.37e-04 | 2531.94 ms | 53.3% bf16 MFU | 207004 tok/s +step 11399/19560 | loss 3.426974 (+0.97z)| norm 0.2736 (+0.02z)| lr 2.37e-04 | 2531.65 ms | 53.3% bf16 MFU | 207008 tok/s +step 11400/19560 | loss 3.494527 (+2.68z)| norm 0.2967 (+1.64z)| lr 2.37e-04 | 2533.75 ms | 53.3% bf16 MFU | 207004 tok/s +step 11401/19560 | loss 3.419399 (+0.70z)| norm 0.3010 (+1.90z)| lr 2.37e-04 | 2531.48 ms | 53.3% bf16 MFU | 207009 tok/s +step 11402/19560 | loss 3.395017 (+0.07z)| norm 0.2696 (-0.30z)| lr 2.37e-04 | 2533.95 ms | 53.3% bf16 MFU | 207004 tok/s +step 11403/19560 | loss 3.344100 (-1.27z)| norm 0.2803 (+0.44z)| lr 2.37e-04 | 2531.61 ms | 53.3% bf16 MFU | 207008 tok/s +step 11404/19560 | loss 3.391195 (-0.02z)| norm 0.2881 (+0.98z)| lr 2.37e-04 | 2533.74 ms | 53.3% bf16 MFU | 207004 tok/s +step 11405/19560 | loss 3.570507 (+4.41z)| norm 0.2742 (-0.01z)| lr 2.37e-04 | 2532.98 ms | 53.3% bf16 MFU | 207003 tok/s +step 11406/19560 | loss 3.432044 (+0.95z)| norm 0.2718 (-0.21z)| lr 2.37e-04 | 2531.58 ms | 53.3% bf16 MFU | 207008 tok/s +step 11407/19560 | loss 3.366267 (-0.68z)| norm 0.2820 (+0.52z)| lr 2.37e-04 | 2533.15 ms | 53.3% bf16 MFU | 207006 tok/s +step 11408/19560 | loss 3.380273 (-0.33z)| norm 0.2762 (+0.09z)| lr 2.37e-04 | 2535.64 ms | 53.2% bf16 MFU | 206994 tok/s +step 11409/19560 | loss 3.347519 (-1.14z)| norm 0.2775 (+0.19z)| lr 2.37e-04 | 2535.51 ms | 53.3% bf16 MFU | 206983 tok/s +step 11410/19560 | loss 3.375278 (-0.46z)| norm 0.2607 (-1.04z)| lr 2.37e-04 | 2533.49 ms | 53.3% bf16 MFU | 206981 tok/s +step 11411/19560 | loss 3.347174 (-1.15z)| norm 0.2859 (+0.79z)| lr 2.37e-04 | 2534.28 ms | 53.3% bf16 MFU | 206976 tok/s +step 11412/19560 | loss 3.332992 (-1.49z)| norm 0.2726 (-0.19z)| lr 2.36e-04 | 2534.32 ms | 53.3% bf16 MFU | 206971 tok/s +step 11413/19560 | loss 3.390362 (-0.08z)| norm 0.2912 (+1.19z)| lr 2.36e-04 | 2532.96 ms | 53.3% bf16 MFU | 206972 tok/s +step 11414/19560 | loss 3.385484 (-0.20z)| norm 0.2565 (-1.36z)| lr 2.36e-04 | 2531.81 ms | 53.3% bf16 MFU | 206977 tok/s +step 11415/19560 | loss 3.413928 (+0.50z)| norm 0.3027 (+1.98z)| lr 2.36e-04 | 2533.07 ms | 53.3% bf16 MFU | 206977 tok/s +step 11416/19560 | loss 3.392963 (-0.03z)| norm 0.2742 (-0.09z)| lr 2.36e-04 | 2536.07 ms | 53.2% bf16 MFU | 206965 tok/s +step 11417/19560 | loss 3.393129 (-0.02z)| norm 0.2863 (+0.78z)| lr 2.36e-04 | 2533.41 ms | 53.3% bf16 MFU | 206964 tok/s +step 11418/19560 | loss 3.359346 (-0.85z)| norm 0.2545 (-1.51z)| lr 2.36e-04 | 2533.53 ms | 53.3% bf16 MFU | 206963 tok/s +step 11419/19560 | loss 3.387956 (-0.14z)| norm 0.2674 (-0.57z)| lr 2.36e-04 | 2533.74 ms | 53.3% bf16 MFU | 206961 tok/s +step 11420/19560 | loss 3.351536 (-1.03z)| norm 0.3002 (+1.80z)| lr 2.36e-04 | 2534.87 ms | 53.3% bf16 MFU | 206954 tok/s +step 11421/19560 | loss 3.344632 (-1.21z)| norm 0.2682 (-0.50z)| lr 2.36e-04 | 2532.99 ms | 53.3% bf16 MFU | 206956 tok/s +step 11422/19560 | loss 3.366028 (-0.67z)| norm 0.2676 (-0.54z)| lr 2.36e-04 | 2533.07 ms | 53.3% bf16 MFU | 206957 tok/s +step 11423/19560 | loss 3.436880 (+1.08z)| norm 0.2848 (+0.75z)| lr 2.36e-04 | 2532.02 ms | 53.3% bf16 MFU | 206962 tok/s +step 11424/19560 | loss 3.396219 (+0.07z)| norm 0.2786 (+0.29z)| lr 2.36e-04 | 2532.05 ms | 53.3% bf16 MFU | 206967 tok/s +step 11425/19560 | loss 3.338355 (-1.34z)| norm 0.2624 (-0.94z)| lr 2.36e-04 | 2532.93 ms | 53.3% bf16 MFU | 206968 tok/s +step 11426/19560 | loss 3.408969 (+0.40z)| norm 0.2701 (-0.35z)| lr 2.36e-04 | 2532.54 ms | 53.3% bf16 MFU | 206971 tok/s +step 11427/19560 | loss 3.421350 (+0.70z)| norm 0.2750 (+0.02z)| lr 2.36e-04 | 2532.29 ms | 53.3% bf16 MFU | 206974 tok/s +step 11428/19560 | loss 3.376762 (-0.40z)| norm 0.2616 (-1.01z)| lr 2.36e-04 | 2533.30 ms | 53.3% bf16 MFU | 206974 tok/s +step 11429/19560 | loss 3.352388 (-0.98z)| norm 0.2904 (+1.18z)| lr 2.36e-04 | 2533.19 ms | 53.3% bf16 MFU | 206973 tok/s +step 11430/19560 | loss 3.328369 (-1.55z)| norm 0.2693 (-0.44z)| lr 2.36e-04 | 2531.91 ms | 53.3% bf16 MFU | 206978 tok/s +step 11431/19560 | loss 3.411018 (+0.47z)| norm 0.2691 (-0.45z)| lr 2.36e-04 | 2531.01 ms | 53.3% bf16 MFU | 206987 tok/s +step 11432/19560 | loss 3.401535 (+0.23z)| norm 0.2805 (+0.44z)| lr 2.35e-04 | 2532.50 ms | 53.3% bf16 MFU | 206989 tok/s +step 11433/19560 | loss 3.404633 (+0.31z)| norm 0.2824 (+0.58z)| lr 2.35e-04 | 2533.52 ms | 53.3% bf16 MFU | 206986 tok/s +step 11434/19560 | loss 3.405223 (+0.32z)| norm 0.2705 (-0.34z)| lr 2.35e-04 | 2532.72 ms | 53.3% bf16 MFU | 206987 tok/s +step 11435/19560 | loss 3.460340 (+1.66z)| norm 0.2780 (+0.23z)| lr 2.35e-04 | 2532.48 ms | 53.3% bf16 MFU | 206989 tok/s +step 11436/19560 | loss 3.363894 (-0.69z)| norm 0.2722 (-0.23z)| lr 2.35e-04 | 2535.05 ms | 53.3% bf16 MFU | 206980 tok/s +step 11437/19560 | loss 3.423194 (+0.76z)| norm 0.2632 (-0.91z)| lr 2.35e-04 | 2534.21 ms | 53.3% bf16 MFU | 206976 tok/s +step 11438/19560 | loss 3.427428 (+0.87z)| norm 0.2679 (-0.54z)| lr 2.35e-04 | 2535.34 ms | 53.3% bf16 MFU | 206966 tok/s +step 11439/19560 | loss 3.411293 (+0.47z)| norm 0.2642 (-0.82z)| lr 2.35e-04 | 2533.46 ms | 53.3% bf16 MFU | 206965 tok/s +step 11440/19560 | loss 3.375673 (-0.40z)| norm 0.2872 (+0.99z)| lr 2.35e-04 | 2533.25 ms | 53.3% bf16 MFU | 206965 tok/s +step 11441/19560 | loss 3.529554 (+3.21z)| norm 0.3002 (+2.03z)| lr 2.35e-04 | 2533.69 ms | 53.3% bf16 MFU | 206963 tok/s +step 11442/19560 | loss 3.403292 (+0.25z)| norm 0.3079 (+2.56z)| lr 2.35e-04 | 2535.13 ms | 53.3% bf16 MFU | 206956 tok/s +step 11443/19560 | loss 3.362561 (-0.72z)| norm 0.3194 (+3.30z)| lr 2.35e-04 | 2532.50 ms | 53.3% bf16 MFU | 206959 tok/s +step 11444/19560 | loss 3.385549 (-0.18z)| norm 0.2734 (-0.15z)| lr 2.35e-04 | 2532.38 ms | 53.3% bf16 MFU | 206963 tok/s +step 11445/19560 | loss 3.437231 (+1.05z)| norm 0.3099 (+2.50z)| lr 2.35e-04 | 2533.40 ms | 53.3% bf16 MFU | 206962 tok/s +step 11446/19560 | loss 3.387725 (-0.15z)| norm 0.2767 (+0.08z)| lr 2.35e-04 | 2531.93 ms | 53.3% bf16 MFU | 206967 tok/s +step 11447/19560 | loss 3.442636 (+1.17z)| norm 0.3162 (+2.86z)| lr 2.35e-04 | 2533.65 ms | 53.3% bf16 MFU | 206966 tok/s +step 11448/19560 | loss 3.382627 (-0.28z)| norm 0.2717 (-0.30z)| lr 2.35e-04 | 2534.16 ms | 53.3% bf16 MFU | 206962 tok/s +step 11449/19560 | loss 3.400523 (+0.15z)| norm 0.3010 (+1.75z)| lr 2.35e-04 | 2533.07 ms | 53.3% bf16 MFU | 206962 tok/s +step 11450/19560 | loss 3.390293 (-0.10z)| norm 0.2890 (+0.91z)| lr 2.35e-04 | 2534.22 ms | 53.3% bf16 MFU | 206958 tok/s +step 11451/19560 | loss 3.378338 (-0.38z)| norm 0.2706 (-0.37z)| lr 2.35e-04 | 2532.31 ms | 53.3% bf16 MFU | 206963 tok/s +step 11452/19560 | loss 3.478418 (+2.00z)| norm 0.2799 (+0.27z)| lr 2.35e-04 | 2532.77 ms | 53.3% bf16 MFU | 206965 tok/s +step 11453/19560 | loss 3.301855 (-2.16z)| norm 0.2903 (+1.02z)| lr 2.34e-04 | 2532.27 ms | 53.3% bf16 MFU | 206968 tok/s +step 11454/19560 | loss 3.415740 (+0.51z)| norm 0.2888 (+0.91z)| lr 2.34e-04 | 2534.46 ms | 53.3% bf16 MFU | 206963 tok/s +step 11455/19560 | loss 3.407779 (+0.32z)| norm 0.2737 (-0.18z)| lr 2.34e-04 | 2533.53 ms | 53.3% bf16 MFU | 206962 tok/s +step 11456/19560 | loss 3.418742 (+0.57z)| norm 0.2690 (-0.51z)| lr 2.34e-04 | 2532.47 ms | 53.3% bf16 MFU | 206965 tok/s +step 11457/19560 | loss 3.373052 (-0.51z)| norm 0.2951 (+1.35z)| lr 2.34e-04 | 2532.80 ms | 53.3% bf16 MFU | 206967 tok/s +step 11458/19560 | loss 3.360199 (-0.80z)| norm 0.3104 (+2.36z)| lr 2.34e-04 | 2532.97 ms | 53.3% bf16 MFU | 206968 tok/s +step 11459/19560 | loss 3.363012 (-0.73z)| norm 0.2837 (+0.49z)| lr 2.34e-04 | 2534.54 ms | 53.3% bf16 MFU | 206962 tok/s +step 11460/19560 | loss 3.370368 (-0.56z)| norm 0.2849 (+0.57z)| lr 2.34e-04 | 2533.71 ms | 53.3% bf16 MFU | 206960 tok/s +step 11461/19560 | loss 3.404178 (+0.22z)| norm 0.2834 (+0.45z)| lr 2.34e-04 | 2534.47 ms | 53.3% bf16 MFU | 206956 tok/s +step 11462/19560 | loss 3.373991 (-0.48z)| norm 0.2901 (+0.93z)| lr 2.34e-04 | 2534.13 ms | 53.3% bf16 MFU | 206952 tok/s +step 11463/19560 | loss 3.413116 (+0.44z)| norm 0.2827 (+0.40z)| lr 2.34e-04 | 2532.54 ms | 53.3% bf16 MFU | 206956 tok/s +step 11464/19560 | loss 3.408652 (+0.34z)| norm 0.2702 (-0.49z)| lr 2.34e-04 | 2532.25 ms | 53.3% bf16 MFU | 206960 tok/s +step 11465/19560 | loss 3.332689 (-1.44z)| norm 0.2557 (-1.49z)| lr 2.34e-04 | 2532.38 ms | 53.3% bf16 MFU | 206964 tok/s +step 11466/19560 | loss 3.425543 (+0.79z)| norm 0.2638 (-0.91z)| lr 2.34e-04 | 2533.28 ms | 53.3% bf16 MFU | 206964 tok/s +step 11467/19560 | loss 3.359966 (-0.79z)| norm 0.2599 (-1.18z)| lr 2.34e-04 | 2533.86 ms | 53.3% bf16 MFU | 206961 tok/s +step 11468/19560 | loss 3.356131 (-0.87z)| norm 0.2428 (-2.32z)| lr 2.34e-04 | 2534.40 ms | 53.3% bf16 MFU | 206956 tok/s +step 11469/19560 | loss 3.393833 (+0.05z)| norm 0.2626 (-0.98z)| lr 2.34e-04 | 2532.40 ms | 53.3% bf16 MFU | 206960 tok/s +step 11470/19560 | loss 3.388954 (-0.05z)| norm 0.2627 (-0.97z)| lr 2.34e-04 | 2533.61 ms | 53.3% bf16 MFU | 206959 tok/s +step 11471/19560 | loss 3.433723 (+1.04z)| norm 0.2521 (-1.68z)| lr 2.34e-04 | 2534.03 ms | 53.3% bf16 MFU | 206956 tok/s +step 11472/19560 | loss 3.400990 (+0.22z)| norm 0.2470 (-2.00z)| lr 2.34e-04 | 2533.67 ms | 53.3% bf16 MFU | 206954 tok/s +step 11473/19560 | loss 3.392410 (+0.01z)| norm 0.2645 (-0.81z)| lr 2.33e-04 | 2534.20 ms | 53.3% bf16 MFU | 206951 tok/s +step 11474/19560 | loss 3.341107 (-1.25z)| norm 0.2554 (-1.40z)| lr 2.33e-04 | 2533.70 ms | 53.3% bf16 MFU | 206950 tok/s +step 11475/19560 | loss 3.361766 (-0.73z)| norm 0.2769 (+0.04z)| lr 2.33e-04 | 2534.73 ms | 53.3% bf16 MFU | 206944 tok/s +step 11476/19560 | loss 3.361538 (-0.74z)| norm 0.2567 (-1.30z)| lr 2.33e-04 | 2534.89 ms | 53.3% bf16 MFU | 206939 tok/s +step 11477/19560 | loss 3.392736 (+0.03z)| norm 0.2705 (-0.37z)| lr 2.33e-04 | 2534.29 ms | 53.3% bf16 MFU | 206936 tok/s +step 11478/19560 | loss 3.382926 (-0.22z)| norm 0.2597 (-1.08z)| lr 2.33e-04 | 2536.17 ms | 53.2% bf16 MFU | 206925 tok/s +step 11479/19560 | loss 3.350920 (-1.01z)| norm 0.2683 (-0.50z)| lr 2.33e-04 | 2533.76 ms | 53.3% bf16 MFU | 206925 tok/s +step 11480/19560 | loss 3.392325 (+0.00z)| norm 0.2640 (-0.80z)| lr 2.33e-04 | 2534.23 ms | 53.3% bf16 MFU | 206923 tok/s +step 11481/19560 | loss 3.350502 (-1.03z)| norm 0.2588 (-1.14z)| lr 2.33e-04 | 2534.44 ms | 53.3% bf16 MFU | 206920 tok/s +step 11482/19560 | loss 3.386999 (-0.13z)| norm 0.2615 (-0.96z)| lr 2.33e-04 | 2535.75 ms | 53.2% bf16 MFU | 206912 tok/s +step 11483/19560 | loss 3.376647 (-0.40z)| norm 0.2736 (-0.17z)| lr 2.33e-04 | 2533.97 ms | 53.3% bf16 MFU | 206911 tok/s +step 11484/19560 | loss 3.409803 (+0.42z)| norm 0.2530 (-1.55z)| lr 2.33e-04 | 2536.05 ms | 53.2% bf16 MFU | 206902 tok/s +step 11485/19560 | loss 3.411926 (+0.47z)| norm 0.2874 (+0.76z)| lr 2.33e-04 | 2534.65 ms | 53.3% bf16 MFU | 206900 tok/s +step 11486/19560 | loss 3.352823 (-0.99z)| norm 0.2770 (+0.05z)| lr 2.33e-04 | 2532.35 ms | 53.3% bf16 MFU | 206907 tok/s +step 11487/19560 | loss 3.399569 (+0.18z)| norm 0.2804 (+0.28z)| lr 2.33e-04 | 2533.03 ms | 53.3% bf16 MFU | 206910 tok/s +step 11488/19560 | loss 3.446751 (+1.34z)| norm 0.2926 (+1.10z)| lr 2.33e-04 | 2532.96 ms | 53.3% bf16 MFU | 206914 tok/s +step 11489/19560 | loss 3.340694 (-1.28z)| norm 0.2947 (+1.22z)| lr 2.33e-04 | 2534.99 ms | 53.3% bf16 MFU | 206909 tok/s +step 11490/19560 | loss 3.365017 (-0.69z)| norm 0.2810 (+0.28z)| lr 2.33e-04 | 2533.81 ms | 53.3% bf16 MFU | 206910 tok/s +step 11491/19560 | loss 3.393160 (+0.01z)| norm 0.2985 (+1.45z)| lr 2.33e-04 | 2534.02 ms | 53.3% bf16 MFU | 206909 tok/s +step 11492/19560 | loss 3.351695 (-1.02z)| norm 0.2684 (-0.58z)| lr 2.33e-04 | 2534.17 ms | 53.3% bf16 MFU | 206908 tok/s +step 11493/19560 | loss 3.367106 (-0.63z)| norm 0.3155 (+2.53z)| lr 2.33e-04 | 2533.61 ms | 53.3% bf16 MFU | 206909 tok/s +step 11494/19560 | loss 3.306153 (-2.09z)| norm 0.2648 (-0.82z)| lr 2.32e-04 | 2535.43 ms | 53.3% bf16 MFU | 206903 tok/s +step 11495/19560 | loss 3.369533 (-0.53z)| norm 0.3247 (+3.00z)| lr 2.32e-04 | 2535.02 ms | 53.3% bf16 MFU | 206899 tok/s +step 11496/19560 | loss 3.398767 (+0.20z)| norm 0.3303 (+3.19z)| lr 2.32e-04 | 2534.72 ms | 53.3% bf16 MFU | 206896 tok/s +step 11497/19560 | loss 3.361458 (-0.73z)| norm 0.3002 (+1.35z)| lr 2.32e-04 | 2534.41 ms | 53.3% bf16 MFU | 206895 tok/s +step 11498/19560 | loss 3.364834 (-0.64z)| norm 0.2940 (+0.95z)| lr 2.32e-04 | 2535.18 ms | 53.3% bf16 MFU | 206890 tok/s +step 11499/19560 | loss 3.406504 (+0.39z)| norm 0.2885 (+0.61z)| lr 2.32e-04 | 2531.33 ms | 53.3% bf16 MFU | 206902 tok/s +step 11500/19560 | loss 3.380309 (-0.27z)| norm 0.2730 (-0.33z)| lr 2.32e-04 | 2533.16 ms | 53.3% bf16 MFU | 206905 tok/s +val loss 3.376118 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2922/10042 = 0.290978 +step 11501/19560 | loss 3.358019 (-0.83z)| norm 0.2849 (+0.39z)| lr 2.32e-04 | 2532.87 ms | 53.3% bf16 MFU | 206910 tok/s +step 11502/19560 | loss 3.391854 (+0.02z)| norm 0.2781 (-0.03z)| lr 2.32e-04 | 2533.43 ms | 53.3% bf16 MFU | 206911 tok/s +step 11503/19560 | loss 3.343143 (-1.18z)| norm 0.2956 (+1.05z)| lr 2.32e-04 | 2533.60 ms | 53.3% bf16 MFU | 206913 tok/s +step 11504/19560 | loss 3.358291 (-0.80z)| norm 0.2911 (+0.76z)| lr 2.32e-04 | 2533.55 ms | 53.3% bf16 MFU | 206914 tok/s +step 11505/19560 | loss 3.442406 (+1.27z)| norm 0.2814 (+0.17z)| lr 2.32e-04 | 2533.32 ms | 53.3% bf16 MFU | 206916 tok/s +step 11506/19560 | loss 3.369523 (-0.54z)| norm 0.2615 (-1.06z)| lr 2.32e-04 | 2533.25 ms | 53.3% bf16 MFU | 206918 tok/s +step 11507/19560 | loss 3.387493 (-0.09z)| norm 0.2509 (-1.70z)| lr 2.32e-04 | 2532.81 ms | 53.3% bf16 MFU | 206922 tok/s +step 11508/19560 | loss 3.366665 (-0.62z)| norm 0.2708 (-0.46z)| lr 2.32e-04 | 2533.79 ms | 53.3% bf16 MFU | 206922 tok/s +step 11509/19560 | loss 3.391501 (+0.02z)| norm 0.2871 (+0.55z)| lr 2.32e-04 | 2534.51 ms | 53.3% bf16 MFU | 206919 tok/s +step 11510/19560 | loss 3.348486 (-1.05z)| norm 0.2978 (+1.20z)| lr 2.32e-04 | 2533.96 ms | 53.3% bf16 MFU | 206918 tok/s +step 11511/19560 | loss 3.391969 (+0.04z)| norm 0.2663 (-0.74z)| lr 2.32e-04 | 2533.28 ms | 53.3% bf16 MFU | 206920 tok/s +step 11512/19560 | loss 3.424292 (+0.85z)| norm 0.2927 (+0.89z)| lr 2.32e-04 | 2534.58 ms | 53.3% bf16 MFU | 206917 tok/s +step 11513/19560 | loss 3.408453 (+0.45z)| norm 0.2767 (-0.10z)| lr 2.32e-04 | 2531.74 ms | 53.3% bf16 MFU | 206926 tok/s +step 11514/19560 | loss 3.402251 (+0.28z)| norm 0.2890 (+0.65z)| lr 2.31e-04 | 2533.10 ms | 53.3% bf16 MFU | 206928 tok/s +step 11515/19560 | loss 3.424142 (+0.82z)| norm 0.2834 (+0.31z)| lr 2.31e-04 | 2533.79 ms | 53.3% bf16 MFU | 206927 tok/s +step 11516/19560 | loss 3.461332 (+1.83z)| norm 0.2958 (+1.06z)| lr 2.31e-04 | 2533.58 ms | 53.3% bf16 MFU | 206928 tok/s +step 11517/19560 | loss 3.409973 (+0.49z)| norm 0.2785 (-0.01z)| lr 2.31e-04 | 2532.94 ms | 53.3% bf16 MFU | 206931 tok/s +step 11518/19560 | loss 3.434642 (+1.11z)| norm 0.2495 (-1.75z)| lr 2.31e-04 | 2533.77 ms | 53.3% bf16 MFU | 206930 tok/s +step 11519/19560 | loss 3.404768 (+0.34z)| norm 0.2925 (+0.85z)| lr 2.31e-04 | 2532.59 ms | 53.3% bf16 MFU | 206935 tok/s +step 11520/19560 | loss 3.402037 (+0.27z)| norm 0.2467 (-1.90z)| lr 2.31e-04 | 2534.92 ms | 53.3% bf16 MFU | 206929 tok/s +step 11521/19560 | loss 3.322993 (-1.73z)| norm 0.2805 (+0.12z)| lr 2.31e-04 | 2534.59 ms | 53.3% bf16 MFU | 206925 tok/s +step 11522/19560 | loss 3.373068 (-0.45z)| norm 0.2620 (-0.98z)| lr 2.31e-04 | 2536.00 ms | 53.2% bf16 MFU | 206916 tok/s +step 11523/19560 | loss 3.395002 (+0.11z)| norm 0.2815 (+0.18z)| lr 2.31e-04 | 2534.93 ms | 53.3% bf16 MFU | 206912 tok/s +step 11524/19560 | loss 3.482524 (+2.29z)| norm 0.2677 (-0.64z)| lr 2.31e-04 | 2535.29 ms | 53.3% bf16 MFU | 206906 tok/s +step 11525/19560 | loss 3.345207 (-1.14z)| norm 0.2633 (-0.90z)| lr 2.31e-04 | 2533.88 ms | 53.3% bf16 MFU | 206906 tok/s +step 11526/19560 | loss 3.387913 (-0.06z)| norm 0.2854 (+0.42z)| lr 2.31e-04 | 2534.14 ms | 53.3% bf16 MFU | 206905 tok/s +step 11527/19560 | loss 3.392236 (+0.05z)| norm 0.2657 (-0.75z)| lr 2.31e-04 | 2535.66 ms | 53.2% bf16 MFU | 206898 tok/s +step 11528/19560 | loss 3.404642 (+0.39z)| norm 0.2672 (-0.66z)| lr 2.31e-04 | 2535.01 ms | 53.3% bf16 MFU | 206894 tok/s +step 11529/19560 | loss 3.359430 (-0.77z)| norm 0.2591 (-1.12z)| lr 2.31e-04 | 2533.47 ms | 53.3% bf16 MFU | 206897 tok/s +step 11530/19560 | loss 3.351368 (-0.96z)| norm 0.2586 (-1.14z)| lr 2.31e-04 | 2535.79 ms | 53.2% bf16 MFU | 206890 tok/s +step 11531/19560 | loss 3.352776 (-0.93z)| norm 0.2758 (-0.11z)| lr 2.31e-04 | 2532.50 ms | 53.3% bf16 MFU | 206896 tok/s +step 11532/19560 | loss 3.415255 (+0.68z)| norm 0.2513 (-1.55z)| lr 2.31e-04 | 2532.70 ms | 53.3% bf16 MFU | 206902 tok/s +step 11533/19560 | loss 3.304209 (-2.30z)| norm 0.2509 (-1.55z)| lr 2.31e-04 | 2532.66 ms | 53.3% bf16 MFU | 206907 tok/s +step 11534/19560 | loss 3.400940 (+0.40z)| norm 0.2582 (-1.11z)| lr 2.31e-04 | 2533.44 ms | 53.3% bf16 MFU | 206909 tok/s +step 11535/19560 | loss 3.341455 (-1.25z)| norm 0.2653 (-0.68z)| lr 2.30e-04 | 2532.78 ms | 53.3% bf16 MFU | 206914 tok/s +step 11536/19560 | loss 3.346420 (-1.10z)| norm 0.2544 (-1.30z)| lr 2.30e-04 | 2533.21 ms | 53.3% bf16 MFU | 206917 tok/s +step 11537/19560 | loss 3.382727 (-0.10z)| norm 0.2843 (+0.43z)| lr 2.30e-04 | 2533.66 ms | 53.3% bf16 MFU | 206917 tok/s +step 11538/19560 | loss 3.374149 (-0.34z)| norm 0.2625 (-0.83z)| lr 2.30e-04 | 2532.08 ms | 53.3% bf16 MFU | 206924 tok/s +step 11539/19560 | loss 3.395720 (+0.25z)| norm 0.2808 (+0.23z)| lr 2.30e-04 | 2532.53 ms | 53.3% bf16 MFU | 206929 tok/s +step 11540/19560 | loss 3.341424 (-1.27z)| norm 0.2682 (-0.50z)| lr 2.30e-04 | 2533.14 ms | 53.3% bf16 MFU | 206931 tok/s +step 11541/19560 | loss 3.407769 (+0.58z)| norm 0.3111 (+1.96z)| lr 2.30e-04 | 2534.78 ms | 53.3% bf16 MFU | 206926 tok/s +step 11542/19560 | loss 3.399901 (+0.36z)| norm 0.3140 (+2.07z)| lr 2.30e-04 | 2534.92 ms | 53.3% bf16 MFU | 206921 tok/s +step 11543/19560 | loss 3.414078 (+0.75z)| norm 0.2827 (+0.31z)| lr 2.30e-04 | 2533.55 ms | 53.3% bf16 MFU | 206922 tok/s +step 11544/19560 | loss 3.409199 (+0.61z)| norm 0.3053 (+1.57z)| lr 2.30e-04 | 2533.58 ms | 53.3% bf16 MFU | 206923 tok/s +step 11545/19560 | loss 3.342975 (-1.22z)| norm 0.2803 (+0.16z)| lr 2.30e-04 | 2532.35 ms | 53.3% bf16 MFU | 206929 tok/s +step 11546/19560 | loss 3.316006 (-1.93z)| norm 0.2942 (+0.94z)| lr 2.30e-04 | 2532.15 ms | 53.3% bf16 MFU | 206935 tok/s +step 11547/19560 | loss 3.344732 (-1.13z)| norm 0.2913 (+0.76z)| lr 2.30e-04 | 2533.67 ms | 53.3% bf16 MFU | 206934 tok/s +step 11548/19560 | loss 3.379683 (-0.18z)| norm 0.2899 (+0.68z)| lr 2.30e-04 | 2533.46 ms | 53.3% bf16 MFU | 206935 tok/s +step 11549/19560 | loss 3.398509 (+0.32z)| norm 0.2867 (+0.49z)| lr 2.30e-04 | 2533.25 ms | 53.3% bf16 MFU | 206936 tok/s +step 11550/19560 | loss 3.410087 (+0.63z)| norm 0.3362 (+3.16z)| lr 2.30e-04 | 2533.70 ms | 53.3% bf16 MFU | 206936 tok/s +step 11551/19560 | loss 3.442681 (+1.52z)| norm 0.3060 (+1.48z)| lr 2.30e-04 | 2534.04 ms | 53.3% bf16 MFU | 206934 tok/s +step 11552/19560 | loss 3.386064 (-0.03z)| norm 0.3015 (+1.22z)| lr 2.30e-04 | 2533.44 ms | 53.3% bf16 MFU | 206935 tok/s +step 11553/19560 | loss 3.340053 (-1.29z)| norm 0.2859 (+0.37z)| lr 2.30e-04 | 2532.13 ms | 53.3% bf16 MFU | 206941 tok/s +step 11554/19560 | loss 3.349931 (-1.01z)| norm 0.2975 (+0.99z)| lr 2.30e-04 | 2533.26 ms | 53.3% bf16 MFU | 206942 tok/s +step 11555/19560 | loss 3.440024 (+1.45z)| norm 0.2869 (+0.41z)| lr 2.30e-04 | 2532.49 ms | 53.3% bf16 MFU | 206946 tok/s +step 11556/19560 | loss 3.380754 (-0.17z)| norm 0.2858 (+0.34z)| lr 2.29e-04 | 2533.89 ms | 53.3% bf16 MFU | 206944 tok/s +step 11557/19560 | loss 3.384581 (-0.07z)| norm 0.2803 (+0.04z)| lr 2.29e-04 | 2530.76 ms | 53.4% bf16 MFU | 206955 tok/s +step 11558/19560 | loss 3.465551 (+2.10z)| norm 0.3510 (+3.64z)| lr 2.29e-04 | 2534.81 ms | 53.3% bf16 MFU | 206949 tok/s +step 11559/19560 | loss 3.360048 (-0.75z)| norm 0.2869 (+0.34z)| lr 2.29e-04 | 2532.92 ms | 53.3% bf16 MFU | 206951 tok/s +step 11560/19560 | loss 3.397539 (+0.27z)| norm 0.2791 (-0.06z)| lr 2.29e-04 | 2533.49 ms | 53.3% bf16 MFU | 206951 tok/s +step 11561/19560 | loss 3.450467 (+1.67z)| norm 0.3839 (+4.79z)| lr 2.29e-04 | 2530.74 ms | 53.4% bf16 MFU | 206962 tok/s +step 11562/19560 | loss 3.335219 (-1.40z)| norm 0.3000 (+0.87z)| lr 2.29e-04 | 2532.18 ms | 53.3% bf16 MFU | 206966 tok/s +step 11563/19560 | loss 3.353449 (-0.90z)| norm 0.2915 (+0.47z)| lr 2.29e-04 | 2534.34 ms | 53.3% bf16 MFU | 206961 tok/s +step 11564/19560 | loss 3.343428 (-1.16z)| norm 0.3416 (+2.69z)| lr 2.29e-04 | 2534.51 ms | 53.3% bf16 MFU | 206956 tok/s +step 11565/19560 | loss 3.360963 (-0.68z)| norm 0.2999 (+0.80z)| lr 2.29e-04 | 2535.51 ms | 53.3% bf16 MFU | 206947 tok/s +step 11566/19560 | loss 3.387991 (+0.06z)| norm 0.2954 (+0.59z)| lr 2.29e-04 | 2533.35 ms | 53.3% bf16 MFU | 206948 tok/s +step 11567/19560 | loss 3.362369 (-0.62z)| norm 0.2878 (+0.24z)| lr 2.29e-04 | 2533.30 ms | 53.3% bf16 MFU | 206948 tok/s +step 11568/19560 | loss 3.355372 (-0.81z)| norm 0.2819 (-0.03z)| lr 2.29e-04 | 2532.60 ms | 53.3% bf16 MFU | 206952 tok/s +step 11569/19560 | loss 3.341340 (-1.21z)| norm 0.2646 (-0.80z)| lr 2.29e-04 | 2534.17 ms | 53.3% bf16 MFU | 206948 tok/s +step 11570/19560 | loss 3.416639 (+0.94z)| norm 0.2858 (+0.17z)| lr 2.29e-04 | 2532.23 ms | 53.3% bf16 MFU | 206953 tok/s +step 11571/19560 | loss 3.342841 (-1.16z)| norm 0.2971 (+0.70z)| lr 2.29e-04 | 2532.91 ms | 53.3% bf16 MFU | 206955 tok/s +step 11572/19560 | loss 3.400503 (+0.47z)| norm 0.2770 (-0.23z)| lr 2.29e-04 | 2533.14 ms | 53.3% bf16 MFU | 206956 tok/s +step 11573/19560 | loss 3.400082 (+0.47z)| norm 0.2744 (-0.34z)| lr 2.29e-04 | 2531.27 ms | 53.3% bf16 MFU | 206964 tok/s +step 11574/19560 | loss 3.385148 (+0.05z)| norm 0.2827 (+0.04z)| lr 2.29e-04 | 2533.12 ms | 53.3% bf16 MFU | 206965 tok/s +step 11575/19560 | loss 3.379774 (-0.09z)| norm 0.2777 (-0.17z)| lr 2.29e-04 | 2532.75 ms | 53.3% bf16 MFU | 206967 tok/s +step 11576/19560 | loss 3.486155 (+2.87z)| norm 0.3248 (+1.98z)| lr 2.28e-04 | 2532.65 ms | 53.3% bf16 MFU | 206969 tok/s +step 11577/19560 | loss 3.320528 (-1.74z)| norm 0.2734 (-0.38z)| lr 2.28e-04 | 2531.78 ms | 53.3% bf16 MFU | 206975 tok/s +step 11578/19560 | loss 3.356929 (-0.72z)| norm 0.2903 (+0.40z)| lr 2.28e-04 | 2533.23 ms | 53.3% bf16 MFU | 206974 tok/s +step 11579/19560 | loss 3.379482 (-0.10z)| norm 0.2602 (-0.98z)| lr 2.28e-04 | 2533.79 ms | 53.3% bf16 MFU | 206971 tok/s +step 11580/19560 | loss 3.347217 (-0.98z)| norm 0.3226 (+1.85z)| lr 2.28e-04 | 2532.42 ms | 53.3% bf16 MFU | 206974 tok/s +step 11581/19560 | loss 3.381961 (-0.02z)| norm 0.2959 (+0.63z)| lr 2.28e-04 | 2533.59 ms | 53.3% bf16 MFU | 206972 tok/s +step 11582/19560 | loss 3.330075 (-1.49z)| norm 0.2566 (-1.13z)| lr 2.28e-04 | 2533.66 ms | 53.3% bf16 MFU | 206970 tok/s +step 11583/19560 | loss 3.371392 (-0.29z)| norm 0.2940 (+0.55z)| lr 2.28e-04 | 2533.15 ms | 53.3% bf16 MFU | 206970 tok/s +step 11584/19560 | loss 3.413234 (+0.92z)| norm 0.2641 (-0.80z)| lr 2.28e-04 | 2532.66 ms | 53.3% bf16 MFU | 206972 tok/s +step 11585/19560 | loss 3.338422 (-1.23z)| norm 0.2866 (+0.22z)| lr 2.28e-04 | 2531.78 ms | 53.3% bf16 MFU | 206978 tok/s +step 11586/19560 | loss 3.342601 (-1.10z)| norm 0.3007 (+0.86z)| lr 2.28e-04 | 2533.96 ms | 53.3% bf16 MFU | 206974 tok/s +step 11587/19560 | loss 3.402715 (+0.61z)| norm 0.2759 (-0.26z)| lr 2.28e-04 | 2532.65 ms | 53.3% bf16 MFU | 206976 tok/s +step 11588/19560 | loss 3.370675 (-0.31z)| norm 0.3133 (+1.41z)| lr 2.28e-04 | 2531.69 ms | 53.3% bf16 MFU | 206982 tok/s +step 11589/19560 | loss 3.353214 (-0.80z)| norm 0.2407 (-1.81z)| lr 2.28e-04 | 2534.75 ms | 53.3% bf16 MFU | 206975 tok/s +step 11590/19560 | loss 3.432166 (+1.44z)| norm 0.3140 (+1.42z)| lr 2.28e-04 | 2533.57 ms | 53.3% bf16 MFU | 206973 tok/s +step 11591/19560 | loss 3.362084 (-0.54z)| norm 0.2664 (-0.67z)| lr 2.28e-04 | 2532.09 ms | 53.3% bf16 MFU | 206977 tok/s +step 11592/19560 | loss 3.361252 (-0.55z)| norm 0.2918 (+0.44z)| lr 2.28e-04 | 2530.93 ms | 53.3% bf16 MFU | 206986 tok/s +step 11593/19560 | loss 3.374926 (-0.18z)| norm 0.2629 (-0.83z)| lr 2.28e-04 | 2532.22 ms | 53.3% bf16 MFU | 206989 tok/s +step 11594/19560 | loss 3.402230 (+0.61z)| norm 0.3269 (+1.94z)| lr 2.28e-04 | 2533.49 ms | 53.3% bf16 MFU | 206986 tok/s +step 11595/19560 | loss 3.402083 (+0.60z)| norm 0.3077 (+1.09z)| lr 2.28e-04 | 2533.74 ms | 53.3% bf16 MFU | 206983 tok/s +step 11596/19560 | loss 3.317213 (-1.81z)| norm 0.3318 (+2.09z)| lr 2.28e-04 | 2532.92 ms | 53.3% bf16 MFU | 206984 tok/s +step 11597/19560 | loss 3.359409 (-0.60z)| norm 0.3042 (+0.89z)| lr 2.27e-04 | 2532.97 ms | 53.3% bf16 MFU | 206984 tok/s +step 11598/19560 | loss 3.450697 (+1.95z)| norm 0.3047 (+0.89z)| lr 2.27e-04 | 2532.31 ms | 53.3% bf16 MFU | 206986 tok/s +step 11599/19560 | loss 3.360959 (-0.55z)| norm 0.3087 (+1.05z)| lr 2.27e-04 | 2535.12 ms | 53.3% bf16 MFU | 206978 tok/s +step 11600/19560 | loss 3.347568 (-0.92z)| norm 0.2953 (+0.46z)| lr 2.27e-04 | 2532.80 ms | 53.3% bf16 MFU | 206979 tok/s +step 11601/19560 | loss 3.437971 (+1.60z)| norm 0.3121 (+1.17z)| lr 2.27e-04 | 2532.55 ms | 53.3% bf16 MFU | 206981 tok/s +step 11602/19560 | loss 3.381681 (+0.02z)| norm 0.2741 (-0.49z)| lr 2.27e-04 | 2533.95 ms | 53.3% bf16 MFU | 206977 tok/s +step 11603/19560 | loss 3.385858 (+0.14z)| norm 0.2830 (-0.11z)| lr 2.27e-04 | 2533.49 ms | 53.3% bf16 MFU | 206975 tok/s +step 11604/19560 | loss 3.341031 (-1.11z)| norm 0.3145 (+1.26z)| lr 2.27e-04 | 2533.60 ms | 53.3% bf16 MFU | 206973 tok/s +step 11605/19560 | loss 3.391962 (+0.31z)| norm 0.2857 (-0.01z)| lr 2.27e-04 | 2532.98 ms | 53.3% bf16 MFU | 206974 tok/s +step 11606/19560 | loss 3.512706 (+3.48z)| norm 0.2792 (-0.31z)| lr 2.27e-04 | 2532.91 ms | 53.3% bf16 MFU | 206975 tok/s +step 11607/19560 | loss 3.390363 (+0.22z)| norm 0.2995 (+0.58z)| lr 2.27e-04 | 2533.59 ms | 53.3% bf16 MFU | 206973 tok/s +step 11608/19560 | loss 3.321671 (-1.58z)| norm 0.3413 (+2.36z)| lr 2.27e-04 | 2531.94 ms | 53.3% bf16 MFU | 206977 tok/s +step 11609/19560 | loss 3.378001 (-0.10z)| norm 0.3076 (+0.88z)| lr 2.27e-04 | 2532.76 ms | 53.3% bf16 MFU | 206979 tok/s +step 11610/19560 | loss 3.390863 (+0.24z)| norm 0.2819 (-0.25z)| lr 2.27e-04 | 2534.08 ms | 53.3% bf16 MFU | 206974 tok/s +step 11611/19560 | loss 3.314347 (-1.75z)| norm 0.2841 (-0.15z)| lr 2.27e-04 | 2532.41 ms | 53.3% bf16 MFU | 206977 tok/s +step 11612/19560 | loss 3.364663 (-0.43z)| norm 0.2773 (-0.46z)| lr 2.27e-04 | 2533.98 ms | 53.3% bf16 MFU | 206974 tok/s +step 11613/19560 | loss 3.348651 (-0.83z)| norm 0.2676 (-0.88z)| lr 2.27e-04 | 2533.93 ms | 53.3% bf16 MFU | 206970 tok/s +step 11614/19560 | loss 3.345548 (-0.91z)| norm 0.2844 (-0.14z)| lr 2.27e-04 | 2532.94 ms | 53.3% bf16 MFU | 206971 tok/s +step 11615/19560 | loss 3.378169 (-0.06z)| norm 0.2833 (-0.19z)| lr 2.27e-04 | 2531.67 ms | 53.3% bf16 MFU | 206977 tok/s +step 11616/19560 | loss 3.407252 (+0.72z)| norm 0.2581 (-1.28z)| lr 2.27e-04 | 2534.71 ms | 53.3% bf16 MFU | 206970 tok/s +step 11617/19560 | loss 3.371896 (-0.22z)| norm 0.2806 (-0.29z)| lr 2.26e-04 | 2536.44 ms | 53.2% bf16 MFU | 206957 tok/s +step 11618/19560 | loss 3.420682 (+1.06z)| norm 0.3105 (+1.00z)| lr 2.26e-04 | 2532.08 ms | 53.3% bf16 MFU | 206962 tok/s +step 11619/19560 | loss 3.514815 (+3.37z)| norm 0.2729 (-0.63z)| lr 2.26e-04 | 2532.47 ms | 53.3% bf16 MFU | 206965 tok/s +step 11620/19560 | loss 3.345699 (-0.91z)| norm 0.2829 (-0.20z)| lr 2.26e-04 | 2533.05 ms | 53.3% bf16 MFU | 206966 tok/s +step 11621/19560 | loss 3.379377 (-0.06z)| norm 0.2670 (-0.88z)| lr 2.26e-04 | 2534.21 ms | 53.3% bf16 MFU | 206962 tok/s +step 11622/19560 | loss 3.478139 (+2.38z)| norm 0.2730 (-0.62z)| lr 2.26e-04 | 2532.03 ms | 53.3% bf16 MFU | 206967 tok/s +step 11623/19560 | loss 3.295773 (-2.13z)| norm 0.3055 (+0.82z)| lr 2.26e-04 | 2532.46 ms | 53.3% bf16 MFU | 206970 tok/s +step 11624/19560 | loss 3.326775 (-1.35z)| norm 0.2591 (-1.22z)| lr 2.26e-04 | 2534.85 ms | 53.3% bf16 MFU | 206963 tok/s +step 11625/19560 | loss 3.371460 (-0.26z)| norm 0.2896 (+0.14z)| lr 2.26e-04 | 2534.27 ms | 53.3% bf16 MFU | 206959 tok/s +step 11626/19560 | loss 3.378748 (-0.08z)| norm 0.2708 (-0.69z)| lr 2.26e-04 | 2532.12 ms | 53.3% bf16 MFU | 206964 tok/s +step 11627/19560 | loss 3.367801 (-0.34z)| norm 0.2656 (-0.91z)| lr 2.26e-04 | 2534.18 ms | 53.3% bf16 MFU | 206960 tok/s +step 11628/19560 | loss 3.373086 (-0.21z)| norm 0.2710 (-0.67z)| lr 2.26e-04 | 2533.97 ms | 53.3% bf16 MFU | 206957 tok/s +step 11629/19560 | loss 3.422070 (+0.97z)| norm 0.2717 (-0.63z)| lr 2.26e-04 | 2532.95 ms | 53.3% bf16 MFU | 206959 tok/s +step 11630/19560 | loss 3.380154 (-0.05z)| norm 0.2997 (+0.61z)| lr 2.26e-04 | 2535.26 ms | 53.3% bf16 MFU | 206950 tok/s +step 11631/19560 | loss 3.359694 (-0.55z)| norm 0.2559 (-1.32z)| lr 2.26e-04 | 2535.25 ms | 53.3% bf16 MFU | 206943 tok/s +step 11632/19560 | loss 3.397181 (+0.36z)| norm 0.2718 (-0.61z)| lr 2.26e-04 | 2534.77 ms | 53.3% bf16 MFU | 206938 tok/s +step 11633/19560 | loss 3.362661 (-0.48z)| norm 0.2768 (-0.39z)| lr 2.26e-04 | 2533.39 ms | 53.3% bf16 MFU | 206938 tok/s +step 11634/19560 | loss 3.368939 (-0.32z)| norm 0.2561 (-1.29z)| lr 2.26e-04 | 2534.95 ms | 53.3% bf16 MFU | 206933 tok/s +step 11635/19560 | loss 3.311836 (-1.70z)| norm 0.2843 (-0.06z)| lr 2.26e-04 | 2532.42 ms | 53.3% bf16 MFU | 206938 tok/s +step 11636/19560 | loss 3.364285 (-0.42z)| norm 0.2704 (-0.68z)| lr 2.26e-04 | 2532.74 ms | 53.3% bf16 MFU | 206941 tok/s +step 11637/19560 | loss 3.369505 (-0.28z)| norm 0.3213 (+1.55z)| lr 2.26e-04 | 2534.42 ms | 53.3% bf16 MFU | 206937 tok/s +step 11638/19560 | loss 3.376332 (-0.12z)| norm 0.2741 (-0.52z)| lr 2.25e-04 | 2533.88 ms | 53.3% bf16 MFU | 206936 tok/s +step 11639/19560 | loss 3.326322 (-1.33z)| norm 0.2938 (+0.34z)| lr 2.25e-04 | 2532.87 ms | 53.3% bf16 MFU | 206939 tok/s +step 11640/19560 | loss 3.367393 (-0.32z)| norm 0.2696 (-0.72z)| lr 2.25e-04 | 2534.59 ms | 53.3% bf16 MFU | 206934 tok/s +step 11641/19560 | loss 3.360625 (-0.47z)| norm 0.2964 (+0.46z)| lr 2.25e-04 | 2534.94 ms | 53.3% bf16 MFU | 206929 tok/s +step 11642/19560 | loss 3.340821 (-0.94z)| norm 0.2605 (-1.11z)| lr 2.25e-04 | 2534.65 ms | 53.3% bf16 MFU | 206925 tok/s +step 11643/19560 | loss 3.336861 (-1.02z)| norm 0.3039 (+0.79z)| lr 2.25e-04 | 2535.49 ms | 53.3% bf16 MFU | 206918 tok/s +step 11644/19560 | loss 3.354063 (-0.59z)| norm 0.2773 (-0.37z)| lr 2.25e-04 | 2531.79 ms | 53.3% bf16 MFU | 206926 tok/s +step 11645/19560 | loss 3.345143 (-0.80z)| norm 0.2858 (-0.00z)| lr 2.25e-04 | 2532.47 ms | 53.3% bf16 MFU | 206931 tok/s +step 11646/19560 | loss 3.353909 (-0.57z)| norm 0.2813 (-0.21z)| lr 2.25e-04 | 2534.30 ms | 53.3% bf16 MFU | 206928 tok/s +step 11647/19560 | loss 3.381050 (+0.11z)| norm 0.2935 (+0.33z)| lr 2.25e-04 | 2533.03 ms | 53.3% bf16 MFU | 206931 tok/s +step 11648/19560 | loss 3.373307 (-0.08z)| norm 0.2516 (-1.53z)| lr 2.25e-04 | 2531.42 ms | 53.3% bf16 MFU | 206940 tok/s +step 11649/19560 | loss 3.350077 (-0.67z)| norm 0.2976 (+0.50z)| lr 2.25e-04 | 2532.13 ms | 53.3% bf16 MFU | 206946 tok/s +step 11650/19560 | loss 3.343174 (-0.83z)| norm 0.2604 (-1.14z)| lr 2.25e-04 | 2533.59 ms | 53.3% bf16 MFU | 206945 tok/s +step 11651/19560 | loss 3.450096 (+1.81z)| norm 0.2815 (-0.21z)| lr 2.25e-04 | 2532.95 ms | 53.3% bf16 MFU | 206947 tok/s +step 11652/19560 | loss 3.302947 (-1.82z)| norm 0.2713 (-0.66z)| lr 2.25e-04 | 2533.95 ms | 53.3% bf16 MFU | 206945 tok/s +step 11653/19560 | loss 3.263732 (-2.72z)| norm 0.2991 (+0.56z)| lr 2.25e-04 | 2532.99 ms | 53.3% bf16 MFU | 206947 tok/s +step 11654/19560 | loss 3.300469 (-1.78z)| norm 0.2601 (-1.16z)| lr 2.25e-04 | 2531.34 ms | 53.3% bf16 MFU | 206956 tok/s +step 11655/19560 | loss 3.312607 (-1.46z)| norm 0.2862 (-0.02z)| lr 2.25e-04 | 2532.84 ms | 53.3% bf16 MFU | 206958 tok/s +step 11656/19560 | loss 3.350842 (-0.53z)| norm 0.2925 (+0.26z)| lr 2.25e-04 | 2531.67 ms | 53.3% bf16 MFU | 206964 tok/s +step 11657/19560 | loss 3.362905 (-0.25z)| norm 0.2741 (-0.57z)| lr 2.25e-04 | 2533.61 ms | 53.3% bf16 MFU | 206963 tok/s +step 11658/19560 | loss 3.376211 (+0.07z)| norm 0.2865 (-0.03z)| lr 2.25e-04 | 2532.48 ms | 53.3% bf16 MFU | 206966 tok/s +step 11659/19560 | loss 3.431957 (+1.38z)| norm 0.2619 (-1.12z)| lr 2.24e-04 | 2534.25 ms | 53.3% bf16 MFU | 206962 tok/s +step 11660/19560 | loss 3.344454 (-0.69z)| norm 0.2998 (+0.56z)| lr 2.24e-04 | 2533.13 ms | 53.3% bf16 MFU | 206962 tok/s +step 11661/19560 | loss 3.323078 (-1.21z)| norm 0.2765 (-0.50z)| lr 2.24e-04 | 2533.55 ms | 53.3% bf16 MFU | 206961 tok/s +step 11662/19560 | loss 3.328475 (-1.07z)| norm 0.2866 (-0.05z)| lr 2.24e-04 | 2533.70 ms | 53.3% bf16 MFU | 206959 tok/s +step 11663/19560 | loss 3.335550 (-0.89z)| norm 0.2646 (-1.06z)| lr 2.24e-04 | 2533.17 ms | 53.3% bf16 MFU | 206960 tok/s +step 11664/19560 | loss 3.321239 (-1.23z)| norm 0.3396 (+2.32z)| lr 2.24e-04 | 2532.47 ms | 53.3% bf16 MFU | 206963 tok/s +step 11665/19560 | loss 3.361745 (-0.26z)| norm 0.3006 (+0.55z)| lr 2.24e-04 | 2532.88 ms | 53.3% bf16 MFU | 206965 tok/s +step 11666/19560 | loss 3.382262 (+0.23z)| norm 0.2942 (+0.25z)| lr 2.24e-04 | 2534.38 ms | 53.3% bf16 MFU | 206960 tok/s +step 11667/19560 | loss 3.344519 (-0.66z)| norm 0.2757 (-0.60z)| lr 2.24e-04 | 2536.78 ms | 53.2% bf16 MFU | 206946 tok/s +step 11668/19560 | loss 3.363125 (-0.22z)| norm 0.3282 (+1.75z)| lr 2.24e-04 | 2534.22 ms | 53.3% bf16 MFU | 206942 tok/s +step 11669/19560 | loss 3.391392 (+0.46z)| norm 0.2682 (-0.93z)| lr 2.24e-04 | 2533.88 ms | 53.3% bf16 MFU | 206941 tok/s +step 11670/19560 | loss 3.372306 (+0.01z)| norm 0.3053 (+0.75z)| lr 2.24e-04 | 2533.11 ms | 53.3% bf16 MFU | 206943 tok/s +step 11671/19560 | loss 3.347635 (-0.57z)| norm 0.2620 (-1.20z)| lr 2.24e-04 | 2534.44 ms | 53.3% bf16 MFU | 206939 tok/s +step 11672/19560 | loss 3.399832 (+0.68z)| norm 0.2895 (+0.04z)| lr 2.24e-04 | 2534.33 ms | 53.3% bf16 MFU | 206935 tok/s +step 11673/19560 | loss 3.349108 (-0.54z)| norm 0.2675 (-0.94z)| lr 2.24e-04 | 2533.69 ms | 53.3% bf16 MFU | 206935 tok/s +step 11674/19560 | loss 3.399192 (+0.65z)| norm 0.2674 (-0.93z)| lr 2.24e-04 | 2535.25 ms | 53.3% bf16 MFU | 206928 tok/s +step 11675/19560 | loss 3.394735 (+0.54z)| norm 0.2598 (-1.25z)| lr 2.24e-04 | 2534.93 ms | 53.3% bf16 MFU | 206923 tok/s +step 11676/19560 | loss 3.384825 (+0.30z)| norm 0.2666 (-0.94z)| lr 2.24e-04 | 2534.87 ms | 53.3% bf16 MFU | 206918 tok/s +step 11677/19560 | loss 3.349535 (-0.55z)| norm 0.2704 (-0.77z)| lr 2.24e-04 | 2533.70 ms | 53.3% bf16 MFU | 206919 tok/s +step 11678/19560 | loss 3.411358 (+0.95z)| norm 0.2612 (-1.16z)| lr 2.24e-04 | 2532.82 ms | 53.3% bf16 MFU | 206923 tok/s +step 11679/19560 | loss 3.357736 (-0.34z)| norm 0.2887 (+0.08z)| lr 2.23e-04 | 2533.28 ms | 53.3% bf16 MFU | 206925 tok/s +step 11680/19560 | loss 3.376702 (+0.13z)| norm 0.2656 (-0.94z)| lr 2.23e-04 | 2536.06 ms | 53.2% bf16 MFU | 206915 tok/s +step 11681/19560 | loss 3.434408 (+1.51z)| norm 0.2765 (-0.45z)| lr 2.23e-04 | 2533.79 ms | 53.3% bf16 MFU | 206915 tok/s +step 11682/19560 | loss 3.319044 (-1.28z)| norm 0.3192 (+1.44z)| lr 2.23e-04 | 2533.01 ms | 53.3% bf16 MFU | 206919 tok/s +step 11683/19560 | loss 3.340306 (-0.75z)| norm 0.2551 (-1.39z)| lr 2.23e-04 | 2534.31 ms | 53.3% bf16 MFU | 206916 tok/s +step 11684/19560 | loss 3.408374 (+0.90z)| norm 0.2866 (+0.00z)| lr 2.23e-04 | 2532.83 ms | 53.3% bf16 MFU | 206920 tok/s +step 11685/19560 | loss 3.431600 (+1.44z)| norm 0.2659 (-0.90z)| lr 2.23e-04 | 2532.28 ms | 53.3% bf16 MFU | 206927 tok/s +step 11686/19560 | loss 3.375078 (+0.10z)| norm 0.2740 (-0.54z)| lr 2.23e-04 | 2533.53 ms | 53.3% bf16 MFU | 206927 tok/s +step 11687/19560 | loss 3.312581 (-1.42z)| norm 0.2769 (-0.40z)| lr 2.23e-04 | 2533.58 ms | 53.3% bf16 MFU | 206928 tok/s +step 11688/19560 | loss 3.368476 (-0.05z)| norm 0.2755 (-0.46z)| lr 2.23e-04 | 2533.84 ms | 53.3% bf16 MFU | 206927 tok/s +step 11689/19560 | loss 3.365264 (-0.11z)| norm 0.2767 (-0.40z)| lr 2.23e-04 | 2534.64 ms | 53.3% bf16 MFU | 206923 tok/s +step 11690/19560 | loss 3.341817 (-0.70z)| norm 0.2658 (-0.93z)| lr 2.23e-04 | 2533.08 ms | 53.3% bf16 MFU | 206926 tok/s +step 11691/19560 | loss 3.352494 (-0.43z)| norm 0.2725 (-0.59z)| lr 2.23e-04 | 2532.54 ms | 53.3% bf16 MFU | 206930 tok/s +step 11692/19560 | loss 3.361691 (-0.21z)| norm 0.2760 (-0.41z)| lr 2.23e-04 | 2531.68 ms | 53.3% bf16 MFU | 206938 tok/s +step 11693/19560 | loss 3.351407 (-0.46z)| norm 0.2469 (-1.84z)| lr 2.23e-04 | 2532.03 ms | 53.3% bf16 MFU | 206945 tok/s +step 11694/19560 | loss 3.372359 (+0.06z)| norm 0.3062 (+1.13z)| lr 2.23e-04 | 2533.96 ms | 53.3% bf16 MFU | 206943 tok/s +step 11695/19560 | loss 3.399682 (+0.74z)| norm 0.2836 (+0.00z)| lr 2.23e-04 | 2533.71 ms | 53.3% bf16 MFU | 206942 tok/s +step 11696/19560 | loss 3.393127 (+0.57z)| norm 0.2840 (+0.02z)| lr 2.23e-04 | 2533.80 ms | 53.3% bf16 MFU | 206941 tok/s +step 11697/19560 | loss 3.413391 (+1.05z)| norm 0.2856 (+0.09z)| lr 2.23e-04 | 2533.52 ms | 53.3% bf16 MFU | 206941 tok/s +step 11698/19560 | loss 3.318055 (-1.29z)| norm 0.2705 (-0.66z)| lr 2.23e-04 | 2531.96 ms | 53.3% bf16 MFU | 206947 tok/s +step 11699/19560 | loss 3.354978 (-0.38z)| norm 0.2804 (-0.16z)| lr 2.23e-04 | 2534.54 ms | 53.3% bf16 MFU | 206942 tok/s +step 11700/19560 | loss 3.331553 (-0.95z)| norm 0.2605 (-1.15z)| lr 2.22e-04 | 2533.05 ms | 53.3% bf16 MFU | 206944 tok/s +step 11701/19560 | loss 3.355890 (-0.34z)| norm 0.2790 (-0.22z)| lr 2.22e-04 | 2531.57 ms | 53.3% bf16 MFU | 206952 tok/s +step 11702/19560 | loss 3.377323 (+0.20z)| norm 0.2564 (-1.34z)| lr 2.22e-04 | 2535.30 ms | 53.3% bf16 MFU | 206944 tok/s +step 11703/19560 | loss 3.367174 (-0.05z)| norm 0.2985 (+0.75z)| lr 2.22e-04 | 2533.09 ms | 53.3% bf16 MFU | 206946 tok/s +step 11704/19560 | loss 3.374495 (+0.16z)| norm 0.2628 (-1.01z)| lr 2.22e-04 | 2532.38 ms | 53.3% bf16 MFU | 206950 tok/s +step 11705/19560 | loss 3.341167 (-0.71z)| norm 0.2936 (+0.53z)| lr 2.22e-04 | 2532.17 ms | 53.3% bf16 MFU | 206955 tok/s +step 11706/19560 | loss 3.377506 (+0.23z)| norm 0.2743 (-0.43z)| lr 2.22e-04 | 2533.35 ms | 53.3% bf16 MFU | 206955 tok/s +step 11707/19560 | loss 3.330486 (-0.97z)| norm 0.2792 (-0.20z)| lr 2.22e-04 | 2533.75 ms | 53.3% bf16 MFU | 206953 tok/s +step 11708/19560 | loss 3.363606 (-0.12z)| norm 0.2698 (-0.66z)| lr 2.22e-04 | 2532.87 ms | 53.3% bf16 MFU | 206955 tok/s +step 11709/19560 | loss 3.369373 (+0.03z)| norm 0.2532 (-1.48z)| lr 2.22e-04 | 2532.13 ms | 53.3% bf16 MFU | 206960 tok/s +step 11710/19560 | loss 3.361081 (-0.19z)| norm 0.2750 (-0.39z)| lr 2.22e-04 | 2534.28 ms | 53.3% bf16 MFU | 206956 tok/s +step 11711/19560 | loss 3.364646 (-0.10z)| norm 0.2667 (-0.80z)| lr 2.22e-04 | 2533.99 ms | 53.3% bf16 MFU | 206954 tok/s +step 11712/19560 | loss 3.348833 (-0.50z)| norm 0.2583 (-1.22z)| lr 2.22e-04 | 2533.60 ms | 53.3% bf16 MFU | 206953 tok/s +step 11713/19560 | loss 3.325849 (-1.09z)| norm 0.2583 (-1.20z)| lr 2.22e-04 | 2533.57 ms | 53.3% bf16 MFU | 206952 tok/s +step 11714/19560 | loss 3.381420 (+0.34z)| norm 0.2599 (-1.11z)| lr 2.22e-04 | 2533.44 ms | 53.3% bf16 MFU | 206952 tok/s +step 11715/19560 | loss 3.363780 (-0.11z)| norm 0.2824 (+0.03z)| lr 2.22e-04 | 2535.16 ms | 53.3% bf16 MFU | 206944 tok/s +step 11716/19560 | loss 3.332679 (-0.91z)| norm 0.2669 (-0.74z)| lr 2.22e-04 | 2533.87 ms | 53.3% bf16 MFU | 206943 tok/s +step 11717/19560 | loss 3.373954 (+0.16z)| norm 0.2649 (-0.86z)| lr 2.22e-04 | 2533.70 ms | 53.3% bf16 MFU | 206942 tok/s +step 11718/19560 | loss 3.351047 (-0.42z)| norm 0.2721 (-0.48z)| lr 2.22e-04 | 2533.67 ms | 53.3% bf16 MFU | 206941 tok/s +step 11719/19560 | loss 3.354168 (-0.34z)| norm 0.2659 (-0.80z)| lr 2.22e-04 | 2532.96 ms | 53.3% bf16 MFU | 206943 tok/s +step 11720/19560 | loss 3.350778 (-0.42z)| norm 0.2662 (-0.77z)| lr 2.22e-04 | 2531.96 ms | 53.3% bf16 MFU | 206950 tok/s +step 11721/19560 | loss 3.370540 (+0.09z)| norm 0.2706 (-0.55z)| lr 2.21e-04 | 2532.15 ms | 53.3% bf16 MFU | 206955 tok/s +step 11722/19560 | loss 3.387010 (+0.53z)| norm 0.2546 (-1.38z)| lr 2.21e-04 | 2532.97 ms | 53.3% bf16 MFU | 206956 tok/s +step 11723/19560 | loss 3.345526 (-0.55z)| norm 0.2933 (+0.69z)| lr 2.21e-04 | 2531.98 ms | 53.3% bf16 MFU | 206962 tok/s +step 11724/19560 | loss 3.368663 (+0.05z)| norm 0.2732 (-0.37z)| lr 2.21e-04 | 2532.68 ms | 53.3% bf16 MFU | 206964 tok/s +step 11725/19560 | loss 3.377892 (+0.29z)| norm 0.2877 (+0.44z)| lr 2.21e-04 | 2532.64 ms | 53.3% bf16 MFU | 206967 tok/s +step 11726/19560 | loss 3.424237 (+1.54z)| norm 0.2812 (+0.09z)| lr 2.21e-04 | 2531.88 ms | 53.3% bf16 MFU | 206972 tok/s +step 11727/19560 | loss 3.359292 (-0.20z)| norm 0.2554 (-1.34z)| lr 2.21e-04 | 2533.87 ms | 53.3% bf16 MFU | 206969 tok/s +step 11728/19560 | loss 3.339025 (-0.74z)| norm 0.3053 (+1.46z)| lr 2.21e-04 | 2531.79 ms | 53.3% bf16 MFU | 206975 tok/s +step 11729/19560 | loss 3.394423 (+0.76z)| norm 0.2615 (-0.99z)| lr 2.21e-04 | 2531.97 ms | 53.3% bf16 MFU | 206979 tok/s +step 11730/19560 | loss 3.426600 (+1.61z)| norm 0.2793 (+0.02z)| lr 2.21e-04 | 2532.85 ms | 53.3% bf16 MFU | 206980 tok/s +step 11731/19560 | loss 3.368478 (+0.05z)| norm 0.2594 (-1.09z)| lr 2.21e-04 | 2533.43 ms | 53.3% bf16 MFU | 206978 tok/s +step 11732/19560 | loss 3.357205 (-0.25z)| norm 0.2674 (-0.63z)| lr 2.21e-04 | 2531.64 ms | 53.3% bf16 MFU | 206984 tok/s +step 11733/19560 | loss 3.416493 (+1.33z)| norm 0.2761 (-0.13z)| lr 2.21e-04 | 2533.81 ms | 53.3% bf16 MFU | 206981 tok/s +step 11734/19560 | loss 3.393413 (+0.78z)| norm 0.2602 (-1.02z)| lr 2.21e-04 | 2532.92 ms | 53.3% bf16 MFU | 206981 tok/s +step 11735/19560 | loss 3.395735 (+0.84z)| norm 0.2697 (-0.47z)| lr 2.21e-04 | 2532.92 ms | 53.3% bf16 MFU | 206982 tok/s +step 11736/19560 | loss 3.356977 (-0.26z)| norm 0.2674 (-0.60z)| lr 2.21e-04 | 2533.96 ms | 53.3% bf16 MFU | 206978 tok/s +step 11737/19560 | loss 3.448512 (+2.29z)| norm 0.2854 (+0.50z)| lr 2.21e-04 | 2533.96 ms | 53.3% bf16 MFU | 206974 tok/s +step 11738/19560 | loss 3.405820 (+1.09z)| norm 0.2693 (-0.47z)| lr 2.21e-04 | 2532.35 ms | 53.3% bf16 MFU | 206977 tok/s +step 11739/19560 | loss 3.363820 (-0.10z)| norm 0.2665 (-0.64z)| lr 2.21e-04 | 2533.59 ms | 53.3% bf16 MFU | 206975 tok/s +step 11740/19560 | loss 3.400251 (+0.92z)| norm 0.2554 (-1.30z)| lr 2.21e-04 | 2533.94 ms | 53.3% bf16 MFU | 206972 tok/s +step 11741/19560 | loss 3.396852 (+0.81z)| norm 0.2689 (-0.48z)| lr 2.21e-04 | 2533.58 ms | 53.3% bf16 MFU | 206970 tok/s +step 11742/19560 | loss 3.370822 (+0.07z)| norm 0.2535 (-1.39z)| lr 2.20e-04 | 2532.87 ms | 53.3% bf16 MFU | 206971 tok/s +step 11743/19560 | loss 3.418251 (+1.39z)| norm 0.2734 (-0.18z)| lr 2.20e-04 | 2535.67 ms | 53.2% bf16 MFU | 206961 tok/s +step 11744/19560 | loss 3.349693 (-0.51z)| norm 0.2607 (-0.95z)| lr 2.20e-04 | 2535.71 ms | 53.2% bf16 MFU | 206951 tok/s +step 11745/19560 | loss 3.368176 (+0.01z)| norm 0.2644 (-0.72z)| lr 2.20e-04 | 2533.84 ms | 53.3% bf16 MFU | 206949 tok/s +step 11746/19560 | loss 3.368519 (+0.03z)| norm 0.2575 (-1.12z)| lr 2.20e-04 | 2532.55 ms | 53.3% bf16 MFU | 206952 tok/s +step 11747/19560 | loss 3.382861 (+0.49z)| norm 0.2695 (-0.39z)| lr 2.20e-04 | 2531.60 ms | 53.3% bf16 MFU | 206960 tok/s +step 11748/19560 | loss 3.320622 (-1.38z)| norm 0.2558 (-1.20z)| lr 2.20e-04 | 2533.26 ms | 53.3% bf16 MFU | 206960 tok/s +step 11749/19560 | loss 3.345131 (-0.63z)| norm 0.2482 (-1.64z)| lr 2.20e-04 | 2531.37 ms | 53.3% bf16 MFU | 206968 tok/s +step 11750/19560 | loss 3.375570 (+0.32z)| norm 0.2598 (-0.94z)| lr 2.20e-04 | 2531.51 ms | 53.3% bf16 MFU | 206975 tok/s +val loss 3.368641 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2954/10042 = 0.294165 +step 11751/19560 | loss 3.364923 (-0.03z)| norm 0.2796 (+0.26z)| lr 2.20e-04 | 2533.38 ms | 53.3% bf16 MFU | 206973 tok/s +step 11752/19560 | loss 3.369507 (+0.11z)| norm 0.2617 (-0.82z)| lr 2.20e-04 | 2532.58 ms | 53.3% bf16 MFU | 206976 tok/s +step 11753/19560 | loss 3.398432 (+1.03z)| norm 0.2719 (-0.20z)| lr 2.20e-04 | 2533.74 ms | 53.3% bf16 MFU | 206973 tok/s +step 11754/19560 | loss 3.301590 (-2.03z)| norm 0.2881 (+0.78z)| lr 2.20e-04 | 2534.29 ms | 53.3% bf16 MFU | 206968 tok/s +step 11755/19560 | loss 3.365095 (-0.02z)| norm 0.2648 (-0.63z)| lr 2.20e-04 | 2533.23 ms | 53.3% bf16 MFU | 206968 tok/s +step 11756/19560 | loss 3.379481 (+0.43z)| norm 0.2970 (+1.30z)| lr 2.20e-04 | 2532.86 ms | 53.3% bf16 MFU | 206969 tok/s +step 11757/19560 | loss 3.365835 (+0.02z)| norm 0.2854 (+0.59z)| lr 2.20e-04 | 2533.84 ms | 53.3% bf16 MFU | 206967 tok/s +step 11758/19560 | loss 3.336526 (-0.91z)| norm 0.2988 (+1.40z)| lr 2.20e-04 | 2532.37 ms | 53.3% bf16 MFU | 206970 tok/s +step 11759/19560 | loss 3.381067 (+0.51z)| norm 0.2809 (+0.31z)| lr 2.20e-04 | 2533.39 ms | 53.3% bf16 MFU | 206969 tok/s +step 11760/19560 | loss 3.306200 (-1.84z)| norm 0.3123 (+2.16z)| lr 2.20e-04 | 2534.25 ms | 53.3% bf16 MFU | 206965 tok/s +step 11761/19560 | loss 3.381111 (+0.52z)| norm 0.2962 (+1.18z)| lr 2.20e-04 | 2535.45 ms | 53.3% bf16 MFU | 206956 tok/s +step 11762/19560 | loss 3.444723 (+2.45z)| norm 0.2931 (+0.98z)| lr 2.19e-04 | 2534.68 ms | 53.3% bf16 MFU | 206950 tok/s +step 11763/19560 | loss 3.364672 (-0.03z)| norm 0.2961 (+1.15z)| lr 2.19e-04 | 2532.67 ms | 53.3% bf16 MFU | 206953 tok/s +step 11764/19560 | loss 3.380144 (+0.45z)| norm 0.2853 (+0.51z)| lr 2.19e-04 | 2534.08 ms | 53.3% bf16 MFU | 206950 tok/s +step 11765/19560 | loss 3.360854 (-0.15z)| norm 0.2880 (+0.70z)| lr 2.19e-04 | 2533.75 ms | 53.3% bf16 MFU | 206949 tok/s +step 11766/19560 | loss 3.390450 (+0.77z)| norm 0.3027 (+1.56z)| lr 2.19e-04 | 2532.72 ms | 53.3% bf16 MFU | 206952 tok/s +step 11767/19560 | loss 3.396660 (+0.95z)| norm 0.2830 (+0.38z)| lr 2.19e-04 | 2533.16 ms | 53.3% bf16 MFU | 206952 tok/s +step 11768/19560 | loss 3.377066 (+0.33z)| norm 0.2976 (+1.24z)| lr 2.19e-04 | 2531.69 ms | 53.3% bf16 MFU | 206959 tok/s +step 11769/19560 | loss 3.390831 (+0.75z)| norm 0.2804 (+0.22z)| lr 2.19e-04 | 2532.84 ms | 53.3% bf16 MFU | 206961 tok/s +step 11770/19560 | loss 3.458041 (+2.74z)| norm 0.2813 (+0.27z)| lr 2.19e-04 | 2531.82 ms | 53.3% bf16 MFU | 206967 tok/s +step 11771/19560 | loss 3.344593 (-0.70z)| norm 0.2984 (+1.31z)| lr 2.19e-04 | 2533.74 ms | 53.3% bf16 MFU | 206965 tok/s +step 11772/19560 | loss 3.372538 (+0.14z)| norm 0.2966 (+1.18z)| lr 2.19e-04 | 2532.22 ms | 53.3% bf16 MFU | 206969 tok/s +step 11773/19560 | loss 3.314506 (-1.59z)| norm 0.2903 (+0.80z)| lr 2.19e-04 | 2533.79 ms | 53.3% bf16 MFU | 206966 tok/s +step 11774/19560 | loss 3.305138 (-1.84z)| norm 0.2809 (+0.24z)| lr 2.19e-04 | 2533.26 ms | 53.3% bf16 MFU | 206966 tok/s +step 11775/19560 | loss 3.380720 (+0.40z)| norm 0.3096 (+1.93z)| lr 2.19e-04 | 2531.46 ms | 53.3% bf16 MFU | 206973 tok/s +step 11776/19560 | loss 3.341390 (-0.76z)| norm 0.2729 (-0.26z)| lr 2.19e-04 | 2533.04 ms | 53.3% bf16 MFU | 206974 tok/s +step 11777/19560 | loss 3.406688 (+1.16z)| norm 0.2822 (+0.30z)| lr 2.19e-04 | 2532.15 ms | 53.3% bf16 MFU | 206978 tok/s +step 11778/19560 | loss 3.360478 (-0.21z)| norm 0.2729 (-0.27z)| lr 2.19e-04 | 2534.18 ms | 53.3% bf16 MFU | 206973 tok/s +step 11779/19560 | loss 3.378667 (+0.36z)| norm 0.3060 (+1.71z)| lr 2.19e-04 | 2532.84 ms | 53.3% bf16 MFU | 206974 tok/s +step 11780/19560 | loss 3.305821 (-1.85z)| norm 0.2787 (+0.07z)| lr 2.19e-04 | 2535.37 ms | 53.3% bf16 MFU | 206965 tok/s +step 11781/19560 | loss 3.482034 (+3.41z)| norm 0.2865 (+0.54z)| lr 2.19e-04 | 2532.96 ms | 53.3% bf16 MFU | 206966 tok/s +step 11782/19560 | loss 3.312336 (-1.70z)| norm 0.3016 (+1.43z)| lr 2.19e-04 | 2533.44 ms | 53.3% bf16 MFU | 206965 tok/s +step 11783/19560 | loss 3.395060 (+0.78z)| norm 0.2693 (-0.50z)| lr 2.18e-04 | 2531.39 ms | 53.3% bf16 MFU | 206973 tok/s +step 11784/19560 | loss 3.312831 (-1.70z)| norm 0.2798 (+0.14z)| lr 2.18e-04 | 2532.91 ms | 53.3% bf16 MFU | 206973 tok/s +step 11785/19560 | loss 3.355612 (-0.40z)| norm 0.3090 (+1.85z)| lr 2.18e-04 | 2530.46 ms | 53.4% bf16 MFU | 206984 tok/s +step 11786/19560 | loss 3.327205 (-1.24z)| norm 0.2638 (-0.82z)| lr 2.18e-04 | 2532.53 ms | 53.3% bf16 MFU | 206986 tok/s +step 11787/19560 | loss 3.424552 (+1.68z)| norm 0.2790 (+0.07z)| lr 2.18e-04 | 2532.49 ms | 53.3% bf16 MFU | 206988 tok/s +step 11788/19560 | loss 3.383569 (+0.44z)| norm 0.2769 (-0.04z)| lr 2.18e-04 | 2532.17 ms | 53.3% bf16 MFU | 206991 tok/s +step 11789/19560 | loss 3.374691 (+0.16z)| norm 0.2885 (+0.65z)| lr 2.18e-04 | 2531.13 ms | 53.3% bf16 MFU | 206998 tok/s +step 11790/19560 | loss 3.447537 (+2.31z)| norm 0.2842 (+0.39z)| lr 2.18e-04 | 2531.88 ms | 53.3% bf16 MFU | 207002 tok/s +step 11791/19560 | loss 3.379452 (+0.27z)| norm 0.2780 (+0.01z)| lr 2.18e-04 | 2533.71 ms | 53.3% bf16 MFU | 206998 tok/s +step 11792/19560 | loss 3.364687 (-0.19z)| norm 0.2729 (-0.27z)| lr 2.18e-04 | 2534.36 ms | 53.3% bf16 MFU | 206992 tok/s +step 11793/19560 | loss 3.355511 (-0.46z)| norm 0.3058 (+1.80z)| lr 2.18e-04 | 2532.45 ms | 53.3% bf16 MFU | 206994 tok/s +step 11794/19560 | loss 3.408572 (+1.13z)| norm 0.3173 (+2.47z)| lr 2.18e-04 | 2532.36 ms | 53.3% bf16 MFU | 206996 tok/s +step 11795/19560 | loss 3.381926 (+0.32z)| norm 0.2905 (+0.80z)| lr 2.18e-04 | 2530.51 ms | 53.4% bf16 MFU | 207005 tok/s +step 11796/19560 | loss 3.359879 (-0.35z)| norm 0.2755 (-0.10z)| lr 2.18e-04 | 2530.68 ms | 53.4% bf16 MFU | 207014 tok/s +step 11797/19560 | loss 3.382468 (+0.34z)| norm 0.2637 (-0.86z)| lr 2.18e-04 | 2530.51 ms | 53.4% bf16 MFU | 207022 tok/s +step 11798/19560 | loss 3.308139 (-1.86z)| norm 0.3948 (+6.31z)| lr 2.18e-04 | 2531.22 ms | 53.3% bf16 MFU | 207028 tok/s +step 11799/19560 | loss 3.380896 (+0.29z)| norm 0.2907 (+0.68z)| lr 2.18e-04 | 2532.57 ms | 53.3% bf16 MFU | 207027 tok/s +step 11800/19560 | loss 3.467409 (+2.78z)| norm 0.3147 (+1.94z)| lr 2.18e-04 | 2532.29 ms | 53.3% bf16 MFU | 207028 tok/s +step 11801/19560 | loss 3.341518 (-0.87z)| norm 0.3008 (+1.18z)| lr 2.18e-04 | 2532.78 ms | 53.3% bf16 MFU | 207027 tok/s +step 11802/19560 | loss 3.318770 (-1.50z)| norm 0.3067 (+1.47z)| lr 2.18e-04 | 2532.34 ms | 53.3% bf16 MFU | 207027 tok/s +step 11803/19560 | loss 3.346562 (-0.69z)| norm 0.2838 (+0.26z)| lr 2.18e-04 | 2532.77 ms | 53.3% bf16 MFU | 207026 tok/s +step 11804/19560 | loss 3.418850 (+1.37z)| norm 0.2917 (+0.66z)| lr 2.17e-04 | 2533.97 ms | 53.3% bf16 MFU | 207020 tok/s +step 11805/19560 | loss 3.355337 (-0.44z)| norm 0.2839 (+0.24z)| lr 2.17e-04 | 2533.95 ms | 53.3% bf16 MFU | 207014 tok/s +step 11806/19560 | loss 3.390857 (+0.58z)| norm 0.2894 (+0.52z)| lr 2.17e-04 | 2533.07 ms | 53.3% bf16 MFU | 207012 tok/s +step 11807/19560 | loss 3.476185 (+2.90z)| norm 0.2772 (-0.12z)| lr 2.17e-04 | 2532.87 ms | 53.3% bf16 MFU | 207011 tok/s +step 11808/19560 | loss 3.328817 (-1.17z)| norm 0.2882 (+0.46z)| lr 2.17e-04 | 2532.77 ms | 53.3% bf16 MFU | 207011 tok/s +step 11809/19560 | loss 3.354491 (-0.45z)| norm 0.2770 (-0.14z)| lr 2.17e-04 | 2530.59 ms | 53.4% bf16 MFU | 207019 tok/s +step 11810/19560 | loss 3.402515 (+0.88z)| norm 0.2767 (-0.14z)| lr 2.17e-04 | 2531.91 ms | 53.3% bf16 MFU | 207022 tok/s +step 11811/19560 | loss 3.341783 (-0.83z)| norm 0.2654 (-0.76z)| lr 2.17e-04 | 2532.26 ms | 53.3% bf16 MFU | 207023 tok/s +step 11812/19560 | loss 3.413891 (+1.19z)| norm 0.2621 (-0.93z)| lr 2.17e-04 | 2532.15 ms | 53.3% bf16 MFU | 207024 tok/s +step 11813/19560 | loss 3.402481 (+0.89z)| norm 0.2648 (-0.78z)| lr 2.17e-04 | 2533.87 ms | 53.3% bf16 MFU | 207019 tok/s +step 11814/19560 | loss 3.349063 (-0.61z)| norm 0.2607 (-0.99z)| lr 2.17e-04 | 2533.60 ms | 53.3% bf16 MFU | 207015 tok/s +step 11815/19560 | loss 3.366685 (-0.13z)| norm 0.2690 (-0.54z)| lr 2.17e-04 | 2536.42 ms | 53.2% bf16 MFU | 206999 tok/s +step 11816/19560 | loss 3.369994 (-0.04z)| norm 0.2517 (-1.45z)| lr 2.17e-04 | 2534.74 ms | 53.3% bf16 MFU | 206991 tok/s +step 11817/19560 | loss 3.314952 (-1.58z)| norm 0.2732 (-0.30z)| lr 2.17e-04 | 2533.83 ms | 53.3% bf16 MFU | 206987 tok/s +step 11818/19560 | loss 3.542897 (+4.44z)| norm 0.2970 (+0.96z)| lr 2.17e-04 | 2534.21 ms | 53.3% bf16 MFU | 206982 tok/s +step 11819/19560 | loss 3.464154 (+2.32z)| norm 0.2884 (+0.49z)| lr 2.17e-04 | 2534.79 ms | 53.3% bf16 MFU | 206975 tok/s +step 11820/19560 | loss 3.319036 (-1.37z)| norm 0.2567 (-1.18z)| lr 2.17e-04 | 2534.30 ms | 53.3% bf16 MFU | 206970 tok/s +step 11821/19560 | loss 3.417282 (+1.10z)| norm 0.2659 (-0.71z)| lr 2.17e-04 | 2533.88 ms | 53.3% bf16 MFU | 206967 tok/s +step 11822/19560 | loss 3.365918 (-0.19z)| norm 0.2609 (-0.97z)| lr 2.17e-04 | 2533.23 ms | 53.3% bf16 MFU | 206967 tok/s +step 11823/19560 | loss 3.321204 (-1.30z)| norm 0.2571 (-1.15z)| lr 2.17e-04 | 2534.57 ms | 53.3% bf16 MFU | 206961 tok/s +step 11824/19560 | loss 3.361557 (-0.28z)| norm 0.2715 (-0.37z)| lr 2.17e-04 | 2534.36 ms | 53.3% bf16 MFU | 206957 tok/s +step 11825/19560 | loss 3.397451 (+0.63z)| norm 0.2583 (-1.06z)| lr 2.16e-04 | 2533.43 ms | 53.3% bf16 MFU | 206956 tok/s +step 11826/19560 | loss 3.337160 (-0.90z)| norm 0.2552 (-1.22z)| lr 2.16e-04 | 2531.64 ms | 53.3% bf16 MFU | 206963 tok/s +step 11827/19560 | loss 3.354225 (-0.47z)| norm 0.2563 (-1.14z)| lr 2.16e-04 | 2531.99 ms | 53.3% bf16 MFU | 206968 tok/s +step 11828/19560 | loss 3.345599 (-0.69z)| norm 0.3982 (+5.51z)| lr 2.16e-04 | 2533.35 ms | 53.3% bf16 MFU | 206968 tok/s +step 11829/19560 | loss 3.348301 (-0.62z)| norm 0.2628 (-0.74z)| lr 2.16e-04 | 2531.54 ms | 53.3% bf16 MFU | 206974 tok/s +step 11830/19560 | loss 3.313495 (-1.47z)| norm 0.2766 (-0.12z)| lr 2.16e-04 | 2534.09 ms | 53.3% bf16 MFU | 206970 tok/s +step 11831/19560 | loss 3.309531 (-1.55z)| norm 0.2676 (-0.52z)| lr 2.16e-04 | 2532.67 ms | 53.3% bf16 MFU | 206972 tok/s +step 11832/19560 | loss 3.412267 (+1.00z)| norm 0.2623 (-0.77z)| lr 2.16e-04 | 2531.60 ms | 53.3% bf16 MFU | 206979 tok/s +step 11833/19560 | loss 3.398155 (+0.64z)| norm 0.2822 (+0.16z)| lr 2.16e-04 | 2533.10 ms | 53.3% bf16 MFU | 206978 tok/s +step 11834/19560 | loss 3.329833 (-1.04z)| norm 0.2617 (-0.78z)| lr 2.16e-04 | 2531.25 ms | 53.3% bf16 MFU | 206986 tok/s +step 11835/19560 | loss 3.345277 (-0.67z)| norm 0.2879 (+0.43z)| lr 2.16e-04 | 2534.68 ms | 53.3% bf16 MFU | 206979 tok/s +step 11836/19560 | loss 3.405900 (+0.82z)| norm 0.2545 (-1.11z)| lr 2.16e-04 | 2532.22 ms | 53.3% bf16 MFU | 206982 tok/s +step 11837/19560 | loss 3.437297 (+1.57z)| norm 0.3078 (+1.33z)| lr 2.16e-04 | 2532.66 ms | 53.3% bf16 MFU | 206984 tok/s +step 11838/19560 | loss 3.386707 (+0.33z)| norm 0.2636 (-0.71z)| lr 2.16e-04 | 2533.62 ms | 53.3% bf16 MFU | 206981 tok/s +step 11839/19560 | loss 3.343692 (-0.72z)| norm 0.2727 (-0.29z)| lr 2.16e-04 | 2534.64 ms | 53.3% bf16 MFU | 206974 tok/s +step 11840/19560 | loss 3.366023 (-0.18z)| norm 0.2803 (+0.05z)| lr 2.16e-04 | 2534.04 ms | 53.3% bf16 MFU | 206971 tok/s +step 11841/19560 | loss 3.356234 (-0.42z)| norm 0.2752 (-0.19z)| lr 2.16e-04 | 2533.19 ms | 53.3% bf16 MFU | 206970 tok/s +step 11842/19560 | loss 3.320597 (-1.28z)| norm 0.2882 (+0.40z)| lr 2.16e-04 | 2533.28 ms | 53.3% bf16 MFU | 206970 tok/s +step 11843/19560 | loss 3.465874 (+2.20z)| norm 0.2899 (+0.48z)| lr 2.16e-04 | 2532.55 ms | 53.3% bf16 MFU | 206972 tok/s +step 11844/19560 | loss 3.371202 (-0.07z)| norm 0.3230 (+1.97z)| lr 2.16e-04 | 2533.49 ms | 53.3% bf16 MFU | 206971 tok/s +step 11845/19560 | loss 3.397128 (+0.55z)| norm 0.2816 (+0.07z)| lr 2.16e-04 | 2532.82 ms | 53.3% bf16 MFU | 206972 tok/s +step 11846/19560 | loss 3.320620 (-1.27z)| norm 0.2942 (+0.64z)| lr 2.15e-04 | 2534.31 ms | 53.3% bf16 MFU | 206967 tok/s +step 11847/19560 | loss 3.390446 (+0.38z)| norm 0.2859 (+0.25z)| lr 2.15e-04 | 2534.51 ms | 53.3% bf16 MFU | 206962 tok/s +step 11848/19560 | loss 3.353839 (-0.49z)| norm 0.2900 (+0.43z)| lr 2.15e-04 | 2534.36 ms | 53.3% bf16 MFU | 206958 tok/s +step 11849/19560 | loss 3.346721 (-0.65z)| norm 0.2978 (+0.78z)| lr 2.15e-04 | 2534.48 ms | 53.3% bf16 MFU | 206953 tok/s +step 11850/19560 | loss 3.335780 (-0.90z)| norm 0.2897 (+0.39z)| lr 2.15e-04 | 2532.24 ms | 53.3% bf16 MFU | 206957 tok/s +step 11851/19560 | loss 3.363506 (-0.25z)| norm 0.2873 (+0.29z)| lr 2.15e-04 | 2532.27 ms | 53.3% bf16 MFU | 206962 tok/s +step 11852/19560 | loss 3.429184 (+1.30z)| norm 0.2812 (+0.00z)| lr 2.15e-04 | 2532.99 ms | 53.3% bf16 MFU | 206963 tok/s +step 11853/19560 | loss 3.529988 (+3.47z)| norm 0.3070 (+1.18z)| lr 2.15e-04 | 2532.23 ms | 53.3% bf16 MFU | 206967 tok/s +step 11854/19560 | loss 3.406970 (+0.71z)| norm 0.2598 (-0.98z)| lr 2.15e-04 | 2532.28 ms | 53.3% bf16 MFU | 206971 tok/s +step 11855/19560 | loss 3.379596 (+0.09z)| norm 0.2822 (+0.04z)| lr 2.15e-04 | 2530.73 ms | 53.4% bf16 MFU | 206981 tok/s +step 11856/19560 | loss 3.296200 (-1.77z)| norm 0.2609 (-0.92z)| lr 2.15e-04 | 2534.43 ms | 53.3% bf16 MFU | 206975 tok/s +step 11857/19560 | loss 3.341952 (-0.73z)| norm 0.2838 (+0.12z)| lr 2.15e-04 | 2533.23 ms | 53.3% bf16 MFU | 206974 tok/s +step 11858/19560 | loss 3.391520 (+0.38z)| norm 0.2817 (+0.03z)| lr 2.15e-04 | 2533.02 ms | 53.3% bf16 MFU | 206975 tok/s +step 11859/19560 | loss 3.423426 (+1.08z)| norm 0.2689 (-0.57z)| lr 2.15e-04 | 2533.05 ms | 53.3% bf16 MFU | 206975 tok/s +step 11860/19560 | loss 3.375794 (+0.01z)| norm 0.2883 (+0.32z)| lr 2.15e-04 | 2534.85 ms | 53.3% bf16 MFU | 206968 tok/s +step 11861/19560 | loss 3.339808 (-0.78z)| norm 0.2715 (-0.46z)| lr 2.15e-04 | 2532.60 ms | 53.3% bf16 MFU | 206970 tok/s +step 11862/19560 | loss 3.487052 (+2.44z)| norm 0.2953 (+0.63z)| lr 2.15e-04 | 2532.19 ms | 53.3% bf16 MFU | 206974 tok/s +step 11863/19560 | loss 3.404212 (+0.63z)| norm 0.2812 (-0.02z)| lr 2.15e-04 | 2531.85 ms | 53.3% bf16 MFU | 206979 tok/s +step 11864/19560 | loss 3.342118 (-0.72z)| norm 0.2960 (+0.65z)| lr 2.15e-04 | 2534.00 ms | 53.3% bf16 MFU | 206975 tok/s +step 11865/19560 | loss 3.368865 (-0.13z)| norm 0.3057 (+1.09z)| lr 2.15e-04 | 2532.46 ms | 53.3% bf16 MFU | 206978 tok/s +step 11866/19560 | loss 3.300494 (-1.60z)| norm 0.2790 (-0.15z)| lr 2.14e-04 | 2534.75 ms | 53.3% bf16 MFU | 206971 tok/s +step 11867/19560 | loss 3.376348 (+0.05z)| norm 0.2799 (-0.11z)| lr 2.14e-04 | 2533.89 ms | 53.3% bf16 MFU | 206968 tok/s +step 11868/19560 | loss 3.440519 (+1.44z)| norm 0.2833 (+0.04z)| lr 2.14e-04 | 2534.12 ms | 53.3% bf16 MFU | 206964 tok/s +step 11869/19560 | loss 3.380083 (+0.13z)| norm 0.2943 (+0.54z)| lr 2.14e-04 | 2533.41 ms | 53.3% bf16 MFU | 206963 tok/s +step 11870/19560 | loss 3.370541 (-0.08z)| norm 0.2673 (-0.73z)| lr 2.14e-04 | 2535.33 ms | 53.3% bf16 MFU | 206955 tok/s +step 11871/19560 | loss 3.378947 (+0.11z)| norm 0.2705 (-0.58z)| lr 2.14e-04 | 2533.35 ms | 53.3% bf16 MFU | 206955 tok/s +step 11872/19560 | loss 3.364495 (-0.20z)| norm 0.2700 (-0.61z)| lr 2.14e-04 | 2533.63 ms | 53.3% bf16 MFU | 206954 tok/s +step 11873/19560 | loss 3.341223 (-0.71z)| norm 0.2574 (-1.20z)| lr 2.14e-04 | 2534.66 ms | 53.3% bf16 MFU | 206948 tok/s +step 11874/19560 | loss 3.302433 (-1.52z)| norm 0.2634 (-0.92z)| lr 2.14e-04 | 2534.11 ms | 53.3% bf16 MFU | 206946 tok/s +step 11875/19560 | loss 3.392376 (+0.41z)| norm 0.2701 (-0.60z)| lr 2.14e-04 | 2533.42 ms | 53.3% bf16 MFU | 206946 tok/s +step 11876/19560 | loss 3.307354 (-1.41z)| norm 0.2553 (-1.30z)| lr 2.14e-04 | 2534.03 ms | 53.3% bf16 MFU | 206943 tok/s +step 11877/19560 | loss 3.351754 (-0.46z)| norm 0.2623 (-0.98z)| lr 2.14e-04 | 2534.16 ms | 53.3% bf16 MFU | 206941 tok/s +step 11878/19560 | loss 3.448454 (+1.59z)| norm 0.2601 (-1.09z)| lr 2.14e-04 | 2532.93 ms | 53.3% bf16 MFU | 206943 tok/s +step 11879/19560 | loss 3.393546 (+0.42z)| norm 0.2492 (-1.58z)| lr 2.14e-04 | 2531.25 ms | 53.3% bf16 MFU | 206952 tok/s +step 11880/19560 | loss 3.425933 (+1.09z)| norm 0.2590 (-1.12z)| lr 2.14e-04 | 2533.66 ms | 53.3% bf16 MFU | 206951 tok/s +step 11881/19560 | loss 3.366640 (-0.16z)| norm 0.2516 (-1.44z)| lr 2.14e-04 | 2532.87 ms | 53.3% bf16 MFU | 206953 tok/s +step 11882/19560 | loss 3.387850 (+0.28z)| norm 0.2854 (+0.13z)| lr 2.14e-04 | 2532.26 ms | 53.3% bf16 MFU | 206958 tok/s +step 11883/19560 | loss 3.312865 (-1.31z)| norm 0.2460 (-1.68z)| lr 2.14e-04 | 2533.95 ms | 53.3% bf16 MFU | 206955 tok/s +step 11884/19560 | loss 3.453753 (+1.66z)| norm 0.2814 (-0.04z)| lr 2.14e-04 | 2533.40 ms | 53.3% bf16 MFU | 206955 tok/s +step 11885/19560 | loss 3.385394 (+0.22z)| norm 0.2736 (-0.40z)| lr 2.14e-04 | 2532.85 ms | 53.3% bf16 MFU | 206957 tok/s +step 11886/19560 | loss 3.351585 (-0.50z)| norm 0.2772 (-0.22z)| lr 2.14e-04 | 2533.67 ms | 53.3% bf16 MFU | 206955 tok/s +step 11887/19560 | loss 3.383842 (+0.18z)| norm 0.2613 (-0.95z)| lr 2.13e-04 | 2533.32 ms | 53.3% bf16 MFU | 206956 tok/s +step 11888/19560 | loss 3.394479 (+0.39z)| norm 0.2715 (-0.47z)| lr 2.13e-04 | 2535.50 ms | 53.3% bf16 MFU | 206947 tok/s +step 11889/19560 | loss 3.384966 (+0.19z)| norm 0.2609 (-0.95z)| lr 2.13e-04 | 2534.24 ms | 53.3% bf16 MFU | 206943 tok/s +step 11890/19560 | loss 3.331530 (-0.93z)| norm 0.2584 (-1.05z)| lr 2.13e-04 | 2533.55 ms | 53.3% bf16 MFU | 206943 tok/s +step 11891/19560 | loss 3.482287 (+2.22z)| norm 0.2718 (-0.42z)| lr 2.13e-04 | 2534.21 ms | 53.3% bf16 MFU | 206940 tok/s +step 11892/19560 | loss 3.345477 (-0.63z)| norm 0.2702 (-0.49z)| lr 2.13e-04 | 2533.79 ms | 53.3% bf16 MFU | 206939 tok/s +step 11893/19560 | loss 3.393322 (+0.36z)| norm 0.2532 (-1.25z)| lr 2.13e-04 | 2535.21 ms | 53.3% bf16 MFU | 206932 tok/s +step 11894/19560 | loss 3.361483 (-0.30z)| norm 0.2733 (-0.32z)| lr 2.13e-04 | 2535.04 ms | 53.3% bf16 MFU | 206926 tok/s +step 11895/19560 | loss 3.305094 (-1.45z)| norm 0.2624 (-0.81z)| lr 2.13e-04 | 2534.47 ms | 53.3% bf16 MFU | 206923 tok/s +step 11896/19560 | loss 3.368281 (-0.14z)| norm 0.2776 (-0.11z)| lr 2.13e-04 | 2534.74 ms | 53.3% bf16 MFU | 206919 tok/s +step 11897/19560 | loss 3.319236 (-1.14z)| norm 0.2586 (-0.97z)| lr 2.13e-04 | 2535.08 ms | 53.3% bf16 MFU | 206914 tok/s +step 11898/19560 | loss 3.399261 (+0.53z)| norm 0.2769 (-0.13z)| lr 2.13e-04 | 2532.46 ms | 53.3% bf16 MFU | 206920 tok/s +step 11899/19560 | loss 3.318997 (-1.14z)| norm 0.2667 (-0.59z)| lr 2.13e-04 | 2533.43 ms | 53.3% bf16 MFU | 206921 tok/s +step 11900/19560 | loss 3.409470 (+0.73z)| norm 0.2677 (-0.53z)| lr 2.13e-04 | 2535.23 ms | 53.3% bf16 MFU | 206915 tok/s +step 11901/19560 | loss 3.295173 (-1.62z)| norm 0.2727 (-0.30z)| lr 2.13e-04 | 2534.36 ms | 53.3% bf16 MFU | 206913 tok/s +step 11902/19560 | loss 3.376930 (+0.05z)| norm 0.2862 (+0.33z)| lr 2.13e-04 | 2533.23 ms | 53.3% bf16 MFU | 206915 tok/s +step 11903/19560 | loss 3.330577 (-0.90z)| norm 0.2597 (-0.88z)| lr 2.13e-04 | 2533.39 ms | 53.3% bf16 MFU | 206917 tok/s +step 11904/19560 | loss 3.413620 (+0.81z)| norm 0.2799 (+0.05z)| lr 2.13e-04 | 2534.44 ms | 53.3% bf16 MFU | 206915 tok/s +step 11905/19560 | loss 3.414180 (+0.82z)| norm 0.2653 (-0.62z)| lr 2.13e-04 | 2533.21 ms | 53.3% bf16 MFU | 206917 tok/s +step 11906/19560 | loss 3.389408 (+0.30z)| norm 0.2783 (-0.02z)| lr 2.13e-04 | 2532.76 ms | 53.3% bf16 MFU | 206921 tok/s +step 11907/19560 | loss 3.356611 (-0.38z)| norm 0.2731 (-0.25z)| lr 2.13e-04 | 2533.46 ms | 53.3% bf16 MFU | 206923 tok/s +step 11908/19560 | loss 3.360456 (-0.31z)| norm 0.2682 (-0.47z)| lr 2.12e-04 | 2534.24 ms | 53.3% bf16 MFU | 206921 tok/s +step 11909/19560 | loss 3.406899 (+0.69z)| norm 0.2538 (-1.13z)| lr 2.12e-04 | 2533.71 ms | 53.3% bf16 MFU | 206921 tok/s +step 11910/19560 | loss 3.499805 (+2.57z)| norm 0.2613 (-0.77z)| lr 2.12e-04 | 2531.14 ms | 53.3% bf16 MFU | 206932 tok/s +step 11911/19560 | loss 3.352125 (-0.49z)| norm 0.2735 (-0.20z)| lr 2.12e-04 | 2534.44 ms | 53.3% bf16 MFU | 206928 tok/s +step 11912/19560 | loss 3.321842 (-1.12z)| norm 0.2694 (-0.38z)| lr 2.12e-04 | 2534.29 ms | 53.3% bf16 MFU | 206926 tok/s +step 11913/19560 | loss 3.343578 (-0.67z)| norm 0.2702 (-0.34z)| lr 2.12e-04 | 2535.96 ms | 53.2% bf16 MFU | 206916 tok/s +step 11914/19560 | loss 3.337346 (-0.80z)| norm 0.3665 (+3.89z)| lr 2.12e-04 | 2534.68 ms | 53.3% bf16 MFU | 206913 tok/s +step 11915/19560 | loss 3.337582 (-0.78z)| norm 0.2844 (+0.27z)| lr 2.12e-04 | 2534.03 ms | 53.3% bf16 MFU | 206912 tok/s +step 11916/19560 | loss 3.416234 (+0.85z)| norm 0.2892 (+0.48z)| lr 2.12e-04 | 2534.38 ms | 53.3% bf16 MFU | 206910 tok/s +step 11917/19560 | loss 3.421954 (+0.96z)| norm 0.2900 (+0.51z)| lr 2.12e-04 | 2535.44 ms | 53.3% bf16 MFU | 206904 tok/s +step 11918/19560 | loss 3.319717 (-1.15z)| norm 0.2857 (+0.32z)| lr 2.12e-04 | 2536.24 ms | 53.2% bf16 MFU | 206895 tok/s +step 11919/19560 | loss 3.415224 (+0.83z)| norm 0.3000 (+0.94z)| lr 2.12e-04 | 2534.93 ms | 53.3% bf16 MFU | 206891 tok/s +step 11920/19560 | loss 3.410761 (+0.73z)| norm 0.2962 (+0.76z)| lr 2.12e-04 | 2536.19 ms | 53.2% bf16 MFU | 206883 tok/s +step 11921/19560 | loss 3.324790 (-1.04z)| norm 0.2842 (+0.25z)| lr 2.12e-04 | 2536.19 ms | 53.2% bf16 MFU | 206875 tok/s +step 11922/19560 | loss 3.390559 (+0.32z)| norm 0.2800 (+0.08z)| lr 2.12e-04 | 2535.49 ms | 53.3% bf16 MFU | 206870 tok/s +step 11923/19560 | loss 3.316583 (-1.19z)| norm 0.2743 (-0.17z)| lr 2.12e-04 | 2536.05 ms | 53.2% bf16 MFU | 206863 tok/s +step 11924/19560 | loss 3.367594 (-0.14z)| norm 0.2931 (+0.66z)| lr 2.12e-04 | 2535.77 ms | 53.2% bf16 MFU | 206858 tok/s +step 11925/19560 | loss 3.350244 (-0.49z)| norm 0.2695 (-0.39z)| lr 2.12e-04 | 2535.34 ms | 53.3% bf16 MFU | 206855 tok/s +step 11926/19560 | loss 3.314945 (-1.22z)| norm 0.2812 (+0.19z)| lr 2.12e-04 | 2535.17 ms | 53.3% bf16 MFU | 206852 tok/s +step 11927/19560 | loss 3.359100 (-0.31z)| norm 0.2803 (+0.15z)| lr 2.12e-04 | 2532.94 ms | 53.3% bf16 MFU | 206859 tok/s +step 11928/19560 | loss 3.392795 (+0.40z)| norm 0.2652 (-0.60z)| lr 2.12e-04 | 2535.00 ms | 53.3% bf16 MFU | 206857 tok/s +step 11929/19560 | loss 3.426355 (+1.08z)| norm 0.2828 (+0.30z)| lr 2.11e-04 | 2533.65 ms | 53.3% bf16 MFU | 206861 tok/s +step 11930/19560 | loss 3.392682 (+0.37z)| norm 0.2761 (-0.03z)| lr 2.11e-04 | 2534.37 ms | 53.3% bf16 MFU | 206861 tok/s +step 11931/19560 | loss 3.422446 (+0.98z)| norm 0.2645 (-0.62z)| lr 2.11e-04 | 2533.89 ms | 53.3% bf16 MFU | 206864 tok/s +step 11932/19560 | loss 3.337389 (-0.78z)| norm 0.2765 (+0.01z)| lr 2.11e-04 | 2534.88 ms | 53.3% bf16 MFU | 206862 tok/s +step 11933/19560 | loss 3.435167 (+1.24z)| norm 0.2668 (-0.49z)| lr 2.11e-04 | 2534.83 ms | 53.3% bf16 MFU | 206860 tok/s +step 11934/19560 | loss 3.456390 (+1.65z)| norm 0.2753 (-0.04z)| lr 2.11e-04 | 2535.01 ms | 53.3% bf16 MFU | 206858 tok/s +step 11935/19560 | loss 3.350747 (-0.51z)| norm 0.2700 (-0.31z)| lr 2.11e-04 | 2534.48 ms | 53.3% bf16 MFU | 206859 tok/s +step 11936/19560 | loss 3.359218 (-0.33z)| norm 0.2757 (-0.01z)| lr 2.11e-04 | 2534.06 ms | 53.3% bf16 MFU | 206860 tok/s +step 11937/19560 | loss 3.442465 (+1.39z)| norm 0.2736 (-0.12z)| lr 2.11e-04 | 2532.75 ms | 53.3% bf16 MFU | 206868 tok/s +step 11938/19560 | loss 3.417393 (+0.86z)| norm 0.2485 (-1.40z)| lr 2.11e-04 | 2534.51 ms | 53.3% bf16 MFU | 206867 tok/s +step 11939/19560 | loss 3.394670 (+0.38z)| norm 0.2768 (+0.05z)| lr 2.11e-04 | 2533.84 ms | 53.3% bf16 MFU | 206870 tok/s +step 11940/19560 | loss 3.357994 (-0.37z)| norm 0.2560 (-1.01z)| lr 2.11e-04 | 2534.86 ms | 53.3% bf16 MFU | 206868 tok/s +step 11941/19560 | loss 3.393567 (+0.37z)| norm 0.2738 (-0.10z)| lr 2.11e-04 | 2534.13 ms | 53.3% bf16 MFU | 206869 tok/s +step 11942/19560 | loss 3.332560 (-0.90z)| norm 0.2634 (-0.64z)| lr 2.11e-04 | 2532.03 ms | 53.3% bf16 MFU | 206878 tok/s +step 11943/19560 | loss 3.377875 (+0.04z)| norm 0.2706 (-0.27z)| lr 2.11e-04 | 2533.01 ms | 53.3% bf16 MFU | 206884 tok/s +step 11944/19560 | loss 3.348651 (-0.56z)| norm 0.2754 (-0.03z)| lr 2.11e-04 | 2532.77 ms | 53.3% bf16 MFU | 206890 tok/s +step 11945/19560 | loss 3.355883 (-0.42z)| norm 0.2727 (-0.17z)| lr 2.11e-04 | 2534.51 ms | 53.3% bf16 MFU | 206888 tok/s +step 11946/19560 | loss 3.373924 (-0.02z)| norm 0.2684 (-0.38z)| lr 2.11e-04 | 2534.10 ms | 53.3% bf16 MFU | 206888 tok/s +step 11947/19560 | loss 3.344126 (-0.66z)| norm 0.2944 (+0.96z)| lr 2.11e-04 | 2533.81 ms | 53.3% bf16 MFU | 206890 tok/s +step 11948/19560 | loss 3.424208 (+1.11z)| norm 0.2558 (-1.04z)| lr 2.11e-04 | 2534.47 ms | 53.3% bf16 MFU | 206888 tok/s +step 11949/19560 | loss 3.411432 (+0.82z)| norm 0.2678 (-0.42z)| lr 2.11e-04 | 2535.91 ms | 53.2% bf16 MFU | 206881 tok/s +step 11950/19560 | loss 3.275980 (-2.15z)| norm 0.2797 (+0.19z)| lr 2.10e-04 | 2533.86 ms | 53.3% bf16 MFU | 206883 tok/s +step 11951/19560 | loss 3.381788 (+0.16z)| norm 0.2908 (+0.76z)| lr 2.10e-04 | 2531.69 ms | 53.3% bf16 MFU | 206893 tok/s +step 11952/19560 | loss 3.359195 (-0.33z)| norm 0.2818 (+0.28z)| lr 2.10e-04 | 2534.02 ms | 53.3% bf16 MFU | 206894 tok/s +step 11953/19560 | loss 3.378173 (+0.09z)| norm 0.2736 (-0.15z)| lr 2.10e-04 | 2534.13 ms | 53.3% bf16 MFU | 206893 tok/s +step 11954/19560 | loss 3.372397 (-0.04z)| norm 0.3160 (+2.02z)| lr 2.10e-04 | 2534.04 ms | 53.3% bf16 MFU | 206894 tok/s +step 11955/19560 | loss 3.342713 (-0.70z)| norm 0.2720 (-0.26z)| lr 2.10e-04 | 2534.63 ms | 53.3% bf16 MFU | 206891 tok/s +step 11956/19560 | loss 3.385206 (+0.23z)| norm 0.2583 (-1.10z)| lr 2.10e-04 | 2533.92 ms | 53.3% bf16 MFU | 206892 tok/s +step 11957/19560 | loss 3.383335 (+0.19z)| norm 0.3305 (+3.23z)| lr 2.10e-04 | 2535.17 ms | 53.3% bf16 MFU | 206888 tok/s +step 11958/19560 | loss 3.380327 (+0.11z)| norm 0.2614 (-0.89z)| lr 2.10e-04 | 2535.06 ms | 53.3% bf16 MFU | 206884 tok/s +step 11959/19560 | loss 3.365952 (-0.22z)| norm 0.2641 (-0.73z)| lr 2.10e-04 | 2532.26 ms | 53.3% bf16 MFU | 206892 tok/s +step 11960/19560 | loss 3.409320 (+0.75z)| norm 0.2723 (-0.24z)| lr 2.10e-04 | 2533.66 ms | 53.3% bf16 MFU | 206894 tok/s +step 11961/19560 | loss 3.382449 (+0.15z)| norm 0.2884 (+0.71z)| lr 2.10e-04 | 2532.88 ms | 53.3% bf16 MFU | 206899 tok/s +step 11962/19560 | loss 3.398601 (+0.50z)| norm 0.2587 (-1.06z)| lr 2.10e-04 | 2534.00 ms | 53.3% bf16 MFU | 206899 tok/s +step 11963/19560 | loss 3.434670 (+1.30z)| norm 0.2821 (+0.34z)| lr 2.10e-04 | 2532.07 ms | 53.3% bf16 MFU | 206907 tok/s +step 11964/19560 | loss 3.321001 (-1.24z)| norm 0.3304 (+3.09z)| lr 2.10e-04 | 2534.79 ms | 53.3% bf16 MFU | 206904 tok/s +step 11965/19560 | loss 3.377223 (+0.03z)| norm 0.2839 (+0.42z)| lr 2.10e-04 | 2533.63 ms | 53.3% bf16 MFU | 206905 tok/s +step 11966/19560 | loss 3.455504 (+1.76z)| norm 0.2889 (+0.70z)| lr 2.10e-04 | 2532.28 ms | 53.3% bf16 MFU | 206912 tok/s +step 11967/19560 | loss 3.386775 (+0.22z)| norm 0.3005 (+1.35z)| lr 2.10e-04 | 2531.31 ms | 53.3% bf16 MFU | 206922 tok/s +step 11968/19560 | loss 3.400588 (+0.53z)| norm 0.2701 (-0.41z)| lr 2.10e-04 | 2533.20 ms | 53.3% bf16 MFU | 206925 tok/s +step 11969/19560 | loss 3.390460 (+0.30z)| norm 0.2767 (-0.03z)| lr 2.10e-04 | 2533.93 ms | 53.3% bf16 MFU | 206924 tok/s +step 11970/19560 | loss 3.345913 (-0.71z)| norm 0.2607 (-0.94z)| lr 2.10e-04 | 2532.69 ms | 53.3% bf16 MFU | 206928 tok/s +step 11971/19560 | loss 3.332347 (-1.00z)| norm 0.2714 (-0.31z)| lr 2.09e-04 | 2531.87 ms | 53.3% bf16 MFU | 206935 tok/s +step 11972/19560 | loss 3.366363 (-0.23z)| norm 0.2942 (+1.05z)| lr 2.09e-04 | 2533.19 ms | 53.3% bf16 MFU | 206937 tok/s +step 11973/19560 | loss 3.342410 (-0.76z)| norm 0.2529 (-1.39z)| lr 2.09e-04 | 2532.79 ms | 53.3% bf16 MFU | 206940 tok/s +step 11974/19560 | loss 3.292128 (-1.88z)| norm 0.2705 (-0.33z)| lr 2.09e-04 | 2535.00 ms | 53.3% bf16 MFU | 206934 tok/s +step 11975/19560 | loss 3.271648 (-2.27z)| norm 0.2648 (-0.66z)| lr 2.09e-04 | 2533.32 ms | 53.3% bf16 MFU | 206935 tok/s +step 11976/19560 | loss 3.388051 (+0.29z)| norm 0.2780 (+0.12z)| lr 2.09e-04 | 2535.54 ms | 53.2% bf16 MFU | 206927 tok/s +step 11977/19560 | loss 3.350788 (-0.54z)| norm 0.2647 (-0.65z)| lr 2.09e-04 | 2533.73 ms | 53.3% bf16 MFU | 206927 tok/s +step 11978/19560 | loss 3.383454 (+0.18z)| norm 0.2586 (-1.00z)| lr 2.09e-04 | 2533.77 ms | 53.3% bf16 MFU | 206927 tok/s +step 11979/19560 | loss 3.344151 (-0.69z)| norm 0.2876 (+0.73z)| lr 2.09e-04 | 2532.30 ms | 53.3% bf16 MFU | 206932 tok/s +step 11980/19560 | loss 3.397832 (+0.51z)| norm 0.2463 (-1.70z)| lr 2.09e-04 | 2534.08 ms | 53.3% bf16 MFU | 206930 tok/s +step 11981/19560 | loss 3.519871 (+3.23z)| norm 0.3027 (+1.64z)| lr 2.09e-04 | 2533.17 ms | 53.3% bf16 MFU | 206932 tok/s +step 11982/19560 | loss 3.376375 (+0.04z)| norm 0.2595 (-0.92z)| lr 2.09e-04 | 2532.12 ms | 53.3% bf16 MFU | 206938 tok/s +step 11983/19560 | loss 3.424925 (+1.11z)| norm 0.2623 (-0.75z)| lr 2.09e-04 | 2534.25 ms | 53.3% bf16 MFU | 206936 tok/s +step 11984/19560 | loss 3.395374 (+0.44z)| norm 0.2681 (-0.41z)| lr 2.09e-04 | 2533.04 ms | 53.3% bf16 MFU | 206938 tok/s +step 11985/19560 | loss 3.362276 (-0.31z)| norm 0.2343 (-2.34z)| lr 2.09e-04 | 2534.46 ms | 53.3% bf16 MFU | 206934 tok/s +step 11986/19560 | loss 3.436676 (+1.35z)| norm 0.2614 (-0.75z)| lr 2.09e-04 | 2533.15 ms | 53.3% bf16 MFU | 206936 tok/s +step 11987/19560 | loss 3.295364 (-1.78z)| norm 0.2400 (-1.95z)| lr 2.09e-04 | 2533.20 ms | 53.3% bf16 MFU | 206938 tok/s +step 11988/19560 | loss 3.416625 (+0.91z)| norm 0.2621 (-0.68z)| lr 2.09e-04 | 2531.52 ms | 53.3% bf16 MFU | 206946 tok/s +step 11989/19560 | loss 3.540600 (+3.45z)| norm 0.2837 (+0.54z)| lr 2.09e-04 | 2532.22 ms | 53.3% bf16 MFU | 206951 tok/s +step 11990/19560 | loss 3.478011 (+2.14z)| norm 0.2761 (+0.12z)| lr 2.09e-04 | 2530.56 ms | 53.4% bf16 MFU | 206962 tok/s +step 11991/19560 | loss 3.358316 (-0.39z)| norm 0.2806 (+0.38z)| lr 2.09e-04 | 2533.62 ms | 53.3% bf16 MFU | 206961 tok/s +step 11992/19560 | loss 3.372628 (-0.09z)| norm 0.2816 (+0.45z)| lr 2.08e-04 | 2532.20 ms | 53.3% bf16 MFU | 206965 tok/s +step 11993/19560 | loss 3.429612 (+1.10z)| norm 0.2769 (+0.19z)| lr 2.08e-04 | 2532.22 ms | 53.3% bf16 MFU | 206969 tok/s +step 11994/19560 | loss 3.419463 (+0.87z)| norm 0.2930 (+1.12z)| lr 2.08e-04 | 2531.37 ms | 53.3% bf16 MFU | 206977 tok/s +step 11995/19560 | loss 3.327668 (-1.07z)| norm 0.2951 (+1.23z)| lr 2.08e-04 | 2532.62 ms | 53.3% bf16 MFU | 206979 tok/s +step 11996/19560 | loss 3.456806 (+1.66z)| norm 0.2839 (+0.58z)| lr 2.08e-04 | 2533.26 ms | 53.3% bf16 MFU | 206978 tok/s +step 11997/19560 | loss 3.352507 (-0.54z)| norm 0.2984 (+1.41z)| lr 2.08e-04 | 2531.71 ms | 53.3% bf16 MFU | 206983 tok/s +step 11998/19560 | loss 3.395722 (+0.37z)| norm 0.2934 (+1.11z)| lr 2.08e-04 | 2534.83 ms | 53.3% bf16 MFU | 206976 tok/s +step 11999/19560 | loss 3.345514 (-0.68z)| norm 0.2780 (+0.22z)| lr 2.08e-04 | 2531.78 ms | 53.3% bf16 MFU | 206981 tok/s +step 12000/19560 | loss 3.414000 (+0.75z)| norm 0.2820 (+0.45z)| lr 2.08e-04 | 2531.77 ms | 53.3% bf16 MFU | 206986 tok/s +val loss 3.365694 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2954/10042 = 0.294165 +step 12001/19560 | loss 3.400091 (+0.45z)| norm 0.2647 (-0.55z)| lr 2.08e-04 | 2534.18 ms | 53.3% bf16 MFU | 206981 tok/s +step 12002/19560 | loss 3.388103 (+0.18z)| norm 0.2966 (+1.26z)| lr 2.08e-04 | 2532.39 ms | 53.3% bf16 MFU | 206984 tok/s +step 12003/19560 | loss 3.373657 (-0.12z)| norm 0.2722 (-0.13z)| lr 2.08e-04 | 2533.31 ms | 53.3% bf16 MFU | 206983 tok/s +step 12004/19560 | loss 3.386541 (+0.14z)| norm 0.3024 (+1.57z)| lr 2.08e-04 | 2533.44 ms | 53.3% bf16 MFU | 206981 tok/s +step 12005/19560 | loss 3.329012 (-1.09z)| norm 0.2981 (+1.30z)| lr 2.08e-04 | 2531.04 ms | 53.3% bf16 MFU | 206989 tok/s +step 12006/19560 | loss 3.389625 (+0.22z)| norm 0.2878 (+0.71z)| lr 2.08e-04 | 2532.35 ms | 53.3% bf16 MFU | 206991 tok/s +step 12007/19560 | loss 3.359181 (-0.43z)| norm 0.2650 (-0.61z)| lr 2.08e-04 | 2532.31 ms | 53.3% bf16 MFU | 206994 tok/s +step 12008/19560 | loss 3.332548 (-0.99z)| norm 0.2738 (-0.11z)| lr 2.08e-04 | 2533.46 ms | 53.3% bf16 MFU | 206991 tok/s +step 12009/19560 | loss 3.352103 (-0.56z)| norm 0.2657 (-0.58z)| lr 2.08e-04 | 2533.45 ms | 53.3% bf16 MFU | 206989 tok/s +step 12010/19560 | loss 3.364752 (-0.29z)| norm 0.2652 (-0.60z)| lr 2.08e-04 | 2533.01 ms | 53.3% bf16 MFU | 206989 tok/s +step 12011/19560 | loss 3.276669 (-2.15z)| norm 0.3415 (+3.61z)| lr 2.08e-04 | 2534.52 ms | 53.3% bf16 MFU | 206982 tok/s +step 12012/19560 | loss 3.406958 (+0.64z)| norm 0.2702 (-0.33z)| lr 2.08e-04 | 2532.69 ms | 53.3% bf16 MFU | 206983 tok/s +step 12013/19560 | loss 3.380195 (+0.06z)| norm 0.2436 (-1.77z)| lr 2.07e-04 | 2533.14 ms | 53.3% bf16 MFU | 206983 tok/s +step 12014/19560 | loss 3.347359 (-0.64z)| norm 0.2727 (-0.18z)| lr 2.07e-04 | 2532.03 ms | 53.3% bf16 MFU | 206987 tok/s +step 12015/19560 | loss 3.336299 (-0.87z)| norm 0.2535 (-1.22z)| lr 2.07e-04 | 2532.20 ms | 53.3% bf16 MFU | 206990 tok/s +step 12016/19560 | loss 3.351502 (-0.54z)| norm 0.2534 (-1.21z)| lr 2.07e-04 | 2531.99 ms | 53.3% bf16 MFU | 206994 tok/s +step 12017/19560 | loss 3.423814 (+1.00z)| norm 0.2621 (-0.74z)| lr 2.07e-04 | 2531.10 ms | 53.3% bf16 MFU | 207001 tok/s +step 12018/19560 | loss 3.356861 (-0.43z)| norm 0.2766 (+0.04z)| lr 2.07e-04 | 2531.38 ms | 53.3% bf16 MFU | 207007 tok/s +step 12019/19560 | loss 3.351299 (-0.54z)| norm 0.2757 (-0.02z)| lr 2.07e-04 | 2531.99 ms | 53.3% bf16 MFU | 207010 tok/s +step 12020/19560 | loss 3.343050 (-0.72z)| norm 0.2759 (-0.01z)| lr 2.07e-04 | 2531.69 ms | 53.3% bf16 MFU | 207014 tok/s +step 12021/19560 | loss 3.349552 (-0.57z)| norm 0.2709 (-0.29z)| lr 2.07e-04 | 2532.70 ms | 53.3% bf16 MFU | 207013 tok/s +step 12022/19560 | loss 3.359358 (-0.36z)| norm 0.2634 (-0.69z)| lr 2.07e-04 | 2532.63 ms | 53.3% bf16 MFU | 207013 tok/s +step 12023/19560 | loss 3.387907 (+0.25z)| norm 0.2879 (+0.63z)| lr 2.07e-04 | 2531.74 ms | 53.3% bf16 MFU | 207017 tok/s +step 12024/19560 | loss 3.382480 (+0.13z)| norm 0.2697 (-0.36z)| lr 2.07e-04 | 2531.82 ms | 53.3% bf16 MFU | 207020 tok/s +step 12025/19560 | loss 3.378196 (+0.03z)| norm 0.2636 (-0.70z)| lr 2.07e-04 | 2533.23 ms | 53.3% bf16 MFU | 207017 tok/s +step 12026/19560 | loss 3.342598 (-0.75z)| norm 0.2807 (+0.24z)| lr 2.07e-04 | 2534.56 ms | 53.3% bf16 MFU | 207009 tok/s +step 12027/19560 | loss 3.332203 (-0.98z)| norm 0.2883 (+0.65z)| lr 2.07e-04 | 2533.16 ms | 53.3% bf16 MFU | 207007 tok/s +step 12028/19560 | loss 3.390105 (+0.30z)| norm 0.2608 (-0.85z)| lr 2.07e-04 | 2533.03 ms | 53.3% bf16 MFU | 207006 tok/s +step 12029/19560 | loss 3.350511 (-0.59z)| norm 0.2790 (+0.14z)| lr 2.07e-04 | 2535.25 ms | 53.3% bf16 MFU | 206996 tok/s +step 12030/19560 | loss 3.357744 (-0.43z)| norm 0.2752 (-0.06z)| lr 2.07e-04 | 2532.25 ms | 53.3% bf16 MFU | 206998 tok/s +step 12031/19560 | loss 3.479911 (+2.25z)| norm 0.2867 (+0.55z)| lr 2.07e-04 | 2532.40 ms | 53.3% bf16 MFU | 207000 tok/s +step 12032/19560 | loss 3.347070 (-0.67z)| norm 0.2751 (-0.08z)| lr 2.07e-04 | 2533.60 ms | 53.3% bf16 MFU | 206996 tok/s +step 12033/19560 | loss 3.420987 (+0.96z)| norm 0.2688 (-0.43z)| lr 2.07e-04 | 2532.71 ms | 53.3% bf16 MFU | 206997 tok/s +step 12034/19560 | loss 3.403445 (+0.57z)| norm 0.2728 (-0.20z)| lr 2.06e-04 | 2533.93 ms | 53.3% bf16 MFU | 206992 tok/s +step 12035/19560 | loss 3.398774 (+0.46z)| norm 0.2762 (-0.02z)| lr 2.06e-04 | 2532.08 ms | 53.3% bf16 MFU | 206996 tok/s +step 12036/19560 | loss 3.359868 (-0.40z)| norm 0.3017 (+1.36z)| lr 2.06e-04 | 2532.98 ms | 53.3% bf16 MFU | 206995 tok/s +step 12037/19560 | loss 3.306262 (-1.55z)| norm 0.2929 (+0.86z)| lr 2.06e-04 | 2532.87 ms | 53.3% bf16 MFU | 206995 tok/s +step 12038/19560 | loss 3.313525 (-1.39z)| norm 0.2697 (-0.41z)| lr 2.06e-04 | 2533.62 ms | 53.3% bf16 MFU | 206992 tok/s +step 12039/19560 | loss 3.421144 (+1.00z)| norm 0.2821 (+0.27z)| lr 2.06e-04 | 2533.85 ms | 53.3% bf16 MFU | 206988 tok/s +step 12040/19560 | loss 3.379060 (+0.06z)| norm 0.2583 (-1.03z)| lr 2.06e-04 | 2531.74 ms | 53.3% bf16 MFU | 206993 tok/s +step 12041/19560 | loss 3.347164 (-0.66z)| norm 0.2903 (+0.71z)| lr 2.06e-04 | 2533.15 ms | 53.3% bf16 MFU | 206992 tok/s +step 12042/19560 | loss 3.374844 (-0.05z)| norm 0.2586 (-1.08z)| lr 2.06e-04 | 2532.04 ms | 53.3% bf16 MFU | 206995 tok/s +step 12043/19560 | loss 3.295585 (-1.80z)| norm 0.2893 (+0.77z)| lr 2.06e-04 | 2533.11 ms | 53.3% bf16 MFU | 206994 tok/s +step 12044/19560 | loss 3.382153 (+0.13z)| norm 0.2739 (-0.15z)| lr 2.06e-04 | 2532.14 ms | 53.3% bf16 MFU | 206997 tok/s +step 12045/19560 | loss 3.377160 (+0.03z)| norm 0.2730 (-0.20z)| lr 2.06e-04 | 2532.02 ms | 53.3% bf16 MFU | 207001 tok/s +step 12046/19560 | loss 3.316541 (-1.33z)| norm 0.2569 (-1.15z)| lr 2.06e-04 | 2532.15 ms | 53.3% bf16 MFU | 207003 tok/s +step 12047/19560 | loss 3.361423 (-0.32z)| norm 0.2749 (-0.06z)| lr 2.06e-04 | 2533.07 ms | 53.3% bf16 MFU | 207002 tok/s +step 12048/19560 | loss 3.341690 (-0.75z)| norm 0.3039 (+1.69z)| lr 2.06e-04 | 2532.34 ms | 53.3% bf16 MFU | 207004 tok/s +step 12049/19560 | loss 3.337399 (-0.85z)| norm 0.2589 (-1.01z)| lr 2.06e-04 | 2533.48 ms | 53.3% bf16 MFU | 207001 tok/s +step 12050/19560 | loss 3.324509 (-1.12z)| norm 0.2699 (-0.34z)| lr 2.06e-04 | 2532.23 ms | 53.3% bf16 MFU | 207003 tok/s +step 12051/19560 | loss 3.340644 (-0.77z)| norm 0.2666 (-0.54z)| lr 2.06e-04 | 2531.85 ms | 53.3% bf16 MFU | 207007 tok/s +step 12052/19560 | loss 3.375906 (+0.03z)| norm 0.2689 (-0.39z)| lr 2.06e-04 | 2531.63 ms | 53.3% bf16 MFU | 207011 tok/s +step 12053/19560 | loss 3.466156 (+2.01z)| norm 0.2712 (-0.26z)| lr 2.06e-04 | 2532.77 ms | 53.3% bf16 MFU | 207011 tok/s +step 12054/19560 | loss 3.402790 (+0.59z)| norm 0.2550 (-1.21z)| lr 2.06e-04 | 2531.97 ms | 53.3% bf16 MFU | 207013 tok/s +step 12055/19560 | loss 3.378669 (+0.05z)| norm 0.2691 (-0.36z)| lr 2.05e-04 | 2532.34 ms | 53.3% bf16 MFU | 207015 tok/s +step 12056/19560 | loss 3.346772 (-0.66z)| norm 0.2706 (-0.27z)| lr 2.05e-04 | 2533.63 ms | 53.3% bf16 MFU | 207010 tok/s +step 12057/19560 | loss 3.391456 (+0.35z)| norm 0.2627 (-0.74z)| lr 2.05e-04 | 2533.36 ms | 53.3% bf16 MFU | 207008 tok/s +step 12058/19560 | loss 3.404782 (+0.64z)| norm 0.2714 (-0.21z)| lr 2.05e-04 | 2532.54 ms | 53.3% bf16 MFU | 207008 tok/s +step 12059/19560 | loss 3.339515 (-0.80z)| norm 0.2687 (-0.38z)| lr 2.05e-04 | 2532.33 ms | 53.3% bf16 MFU | 207010 tok/s +step 12060/19560 | loss 3.380915 (+0.12z)| norm 0.2703 (-0.28z)| lr 2.05e-04 | 2534.71 ms | 53.3% bf16 MFU | 207001 tok/s +step 12061/19560 | loss 3.344408 (-0.69z)| norm 0.2513 (-1.40z)| lr 2.05e-04 | 2533.43 ms | 53.3% bf16 MFU | 206999 tok/s +step 12062/19560 | loss 3.329436 (-1.02z)| norm 0.2722 (-0.15z)| lr 2.05e-04 | 2533.37 ms | 53.3% bf16 MFU | 206996 tok/s +step 12063/19560 | loss 3.396650 (+0.51z)| norm 0.2663 (-0.51z)| lr 2.05e-04 | 2533.29 ms | 53.3% bf16 MFU | 206995 tok/s +step 12064/19560 | loss 3.348029 (-0.60z)| norm 0.2984 (+1.38z)| lr 2.05e-04 | 2533.99 ms | 53.3% bf16 MFU | 206990 tok/s +step 12065/19560 | loss 3.364160 (-0.22z)| norm 0.2838 (+0.52z)| lr 2.05e-04 | 2533.67 ms | 53.3% bf16 MFU | 206987 tok/s +step 12066/19560 | loss 3.369658 (-0.08z)| norm 0.2903 (+0.89z)| lr 2.05e-04 | 2533.05 ms | 53.3% bf16 MFU | 206986 tok/s +step 12067/19560 | loss 3.372014 (-0.02z)| norm 0.2917 (+0.96z)| lr 2.05e-04 | 2533.99 ms | 53.3% bf16 MFU | 206982 tok/s +step 12068/19560 | loss 3.326265 (-1.07z)| norm 0.3162 (+2.34z)| lr 2.05e-04 | 2533.09 ms | 53.3% bf16 MFU | 206982 tok/s +step 12069/19560 | loss 3.352315 (-0.47z)| norm 0.2775 (+0.09z)| lr 2.05e-04 | 2533.93 ms | 53.3% bf16 MFU | 206978 tok/s +step 12070/19560 | loss 3.471128 (+2.21z)| norm 0.3219 (+2.58z)| lr 2.05e-04 | 2532.80 ms | 53.3% bf16 MFU | 206979 tok/s +step 12071/19560 | loss 3.330412 (-0.97z)| norm 0.2815 (+0.28z)| lr 2.05e-04 | 2531.14 ms | 53.3% bf16 MFU | 206987 tok/s +step 12072/19560 | loss 3.389923 (+0.37z)| norm 0.2916 (+0.85z)| lr 2.05e-04 | 2531.31 ms | 53.3% bf16 MFU | 206994 tok/s +step 12073/19560 | loss 3.401143 (+0.62z)| norm 0.3168 (+2.21z)| lr 2.05e-04 | 2532.71 ms | 53.3% bf16 MFU | 206994 tok/s +step 12074/19560 | loss 3.305797 (-1.51z)| norm 0.2883 (+0.62z)| lr 2.05e-04 | 2531.68 ms | 53.3% bf16 MFU | 206999 tok/s +step 12075/19560 | loss 3.339394 (-0.76z)| norm 0.3159 (+2.11z)| lr 2.05e-04 | 2532.65 ms | 53.3% bf16 MFU | 207000 tok/s +step 12076/19560 | loss 3.371568 (-0.03z)| norm 0.2927 (+0.83z)| lr 2.04e-04 | 2532.53 ms | 53.3% bf16 MFU | 207001 tok/s +step 12077/19560 | loss 3.334671 (-0.85z)| norm 0.3078 (+1.63z)| lr 2.04e-04 | 2531.60 ms | 53.3% bf16 MFU | 207006 tok/s +step 12078/19560 | loss 3.371678 (-0.03z)| norm 0.2913 (+0.72z)| lr 2.04e-04 | 2533.14 ms | 53.3% bf16 MFU | 207004 tok/s +step 12079/19560 | loss 3.341213 (-0.72z)| norm 0.2880 (+0.54z)| lr 2.04e-04 | 2531.63 ms | 53.3% bf16 MFU | 207009 tok/s +step 12080/19560 | loss 3.409508 (+0.83z)| norm 0.2749 (-0.16z)| lr 2.04e-04 | 2532.57 ms | 53.3% bf16 MFU | 207009 tok/s +step 12081/19560 | loss 3.310281 (-1.41z)| norm 0.2674 (-0.57z)| lr 2.04e-04 | 2531.86 ms | 53.3% bf16 MFU | 207012 tok/s +step 12082/19560 | loss 3.328057 (-1.00z)| norm 0.2599 (-0.96z)| lr 2.04e-04 | 2533.46 ms | 53.3% bf16 MFU | 207009 tok/s +step 12083/19560 | loss 3.374370 (+0.04z)| norm 0.2916 (+0.77z)| lr 2.04e-04 | 2532.89 ms | 53.3% bf16 MFU | 207008 tok/s +step 12084/19560 | loss 3.369199 (-0.07z)| norm 0.2745 (-0.18z)| lr 2.04e-04 | 2532.30 ms | 53.3% bf16 MFU | 207010 tok/s +step 12085/19560 | loss 3.334237 (-0.85z)| norm 0.2899 (+0.71z)| lr 2.04e-04 | 2533.48 ms | 53.3% bf16 MFU | 207006 tok/s +step 12086/19560 | loss 3.408392 (+0.81z)| norm 0.2678 (-0.55z)| lr 2.04e-04 | 2533.67 ms | 53.3% bf16 MFU | 207003 tok/s +step 12087/19560 | loss 3.328348 (-0.98z)| norm 0.2957 (+1.03z)| lr 2.04e-04 | 2532.59 ms | 53.3% bf16 MFU | 207003 tok/s +step 12088/19560 | loss 3.340317 (-0.70z)| norm 0.2669 (-0.61z)| lr 2.04e-04 | 2532.10 ms | 53.3% bf16 MFU | 207006 tok/s +step 12089/19560 | loss 3.383255 (+0.27z)| norm 0.2831 (+0.31z)| lr 2.04e-04 | 2533.29 ms | 53.3% bf16 MFU | 207004 tok/s +step 12090/19560 | loss 3.356845 (-0.32z)| norm 0.2675 (-0.58z)| lr 2.04e-04 | 2532.66 ms | 53.3% bf16 MFU | 207004 tok/s +step 12091/19560 | loss 3.388482 (+0.40z)| norm 0.2669 (-0.61z)| lr 2.04e-04 | 2532.49 ms | 53.3% bf16 MFU | 207005 tok/s +step 12092/19560 | loss 3.354732 (-0.37z)| norm 0.2883 (+0.66z)| lr 2.04e-04 | 2533.21 ms | 53.3% bf16 MFU | 207003 tok/s +step 12093/19560 | loss 3.400252 (+0.66z)| norm 0.2630 (-0.83z)| lr 2.04e-04 | 2534.88 ms | 53.3% bf16 MFU | 206994 tok/s +step 12094/19560 | loss 3.343781 (-0.61z)| norm 0.2722 (-0.28z)| lr 2.04e-04 | 2532.40 ms | 53.3% bf16 MFU | 206996 tok/s +step 12095/19560 | loss 3.378221 (+0.18z)| norm 0.2861 (+0.55z)| lr 2.04e-04 | 2530.96 ms | 53.3% bf16 MFU | 207004 tok/s +step 12096/19560 | loss 3.306534 (-1.44z)| norm 0.2686 (-0.49z)| lr 2.04e-04 | 2530.21 ms | 53.4% bf16 MFU | 207014 tok/s +step 12097/19560 | loss 3.311671 (-1.30z)| norm 0.2940 (+1.01z)| lr 2.04e-04 | 2531.69 ms | 53.3% bf16 MFU | 207018 tok/s +step 12098/19560 | loss 3.346711 (-0.50z)| norm 0.2994 (+1.30z)| lr 2.03e-04 | 2533.29 ms | 53.3% bf16 MFU | 207015 tok/s +step 12099/19560 | loss 3.364604 (-0.10z)| norm 0.2780 (+0.04z)| lr 2.03e-04 | 2532.13 ms | 53.3% bf16 MFU | 207017 tok/s +step 12100/19560 | loss 3.356432 (-0.29z)| norm 0.2814 (+0.25z)| lr 2.03e-04 | 2532.36 ms | 53.3% bf16 MFU | 207018 tok/s +step 12101/19560 | loss 3.329450 (-0.90z)| norm 0.2833 (+0.35z)| lr 2.03e-04 | 2533.25 ms | 53.3% bf16 MFU | 207015 tok/s +step 12102/19560 | loss 3.331319 (-0.87z)| norm 0.2657 (-0.70z)| lr 2.03e-04 | 2531.90 ms | 53.3% bf16 MFU | 207018 tok/s +step 12103/19560 | loss 3.311454 (-1.35z)| norm 0.3029 (+1.50z)| lr 2.03e-04 | 2531.76 ms | 53.3% bf16 MFU | 207021 tok/s +step 12104/19560 | loss 3.341007 (-0.65z)| norm 0.2585 (-1.13z)| lr 2.03e-04 | 2532.14 ms | 53.3% bf16 MFU | 207023 tok/s +step 12105/19560 | loss 3.395503 (+0.60z)| norm 0.2927 (+0.88z)| lr 2.03e-04 | 2532.66 ms | 53.3% bf16 MFU | 207022 tok/s +step 12106/19560 | loss 3.381609 (+0.28z)| norm 0.2923 (+0.84z)| lr 2.03e-04 | 2531.77 ms | 53.3% bf16 MFU | 207025 tok/s +step 12107/19560 | loss 3.314963 (-1.25z)| norm 0.2590 (-1.11z)| lr 2.03e-04 | 2534.61 ms | 53.3% bf16 MFU | 207017 tok/s +step 12108/19560 | loss 3.342297 (-0.61z)| norm 0.2941 (+0.94z)| lr 2.03e-04 | 2532.51 ms | 53.3% bf16 MFU | 207017 tok/s +step 12109/19560 | loss 3.411202 (+1.04z)| norm 0.2799 (+0.12z)| lr 2.03e-04 | 2532.84 ms | 53.3% bf16 MFU | 207016 tok/s +step 12110/19560 | loss 3.389254 (+0.51z)| norm 0.2889 (+0.64z)| lr 2.03e-04 | 2532.65 ms | 53.3% bf16 MFU | 207016 tok/s +step 12111/19560 | loss 3.327442 (-0.97z)| norm 0.2761 (-0.14z)| lr 2.03e-04 | 2532.80 ms | 53.3% bf16 MFU | 207015 tok/s +step 12112/19560 | loss 3.329693 (-0.90z)| norm 0.2930 (+0.87z)| lr 2.03e-04 | 2531.15 ms | 53.3% bf16 MFU | 207021 tok/s +step 12113/19560 | loss 3.339209 (-0.66z)| norm 0.2746 (-0.26z)| lr 2.03e-04 | 2532.63 ms | 53.3% bf16 MFU | 207020 tok/s +step 12114/19560 | loss 3.353394 (-0.31z)| norm 0.2743 (-0.29z)| lr 2.03e-04 | 2532.37 ms | 53.3% bf16 MFU | 207021 tok/s +step 12115/19560 | loss 3.326625 (-0.98z)| norm 0.2690 (-0.65z)| lr 2.03e-04 | 2532.66 ms | 53.3% bf16 MFU | 207021 tok/s +step 12116/19560 | loss 3.385128 (+0.47z)| norm 0.2840 (+0.30z)| lr 2.03e-04 | 2532.58 ms | 53.3% bf16 MFU | 207020 tok/s +step 12117/19560 | loss 3.356961 (-0.20z)| norm 0.2692 (-0.64z)| lr 2.03e-04 | 2532.32 ms | 53.3% bf16 MFU | 207021 tok/s +step 12118/19560 | loss 3.340066 (-0.65z)| norm 0.2841 (+0.30z)| lr 2.03e-04 | 2534.26 ms | 53.3% bf16 MFU | 207014 tok/s +step 12119/19560 | loss 3.335312 (-0.78z)| norm 0.2937 (+0.91z)| lr 2.02e-04 | 2535.73 ms | 53.2% bf16 MFU | 207002 tok/s +step 12120/19560 | loss 3.320479 (-1.17z)| norm 0.2667 (-0.80z)| lr 2.02e-04 | 2534.61 ms | 53.3% bf16 MFU | 206994 tok/s +step 12121/19560 | loss 3.366935 (+0.13z)| norm 0.2935 (+0.89z)| lr 2.02e-04 | 2533.04 ms | 53.3% bf16 MFU | 206993 tok/s +step 12122/19560 | loss 3.351376 (-0.30z)| norm 0.2627 (-1.04z)| lr 2.02e-04 | 2533.15 ms | 53.3% bf16 MFU | 206992 tok/s +step 12123/19560 | loss 3.268414 (-2.57z)| norm 0.2809 (+0.12z)| lr 2.02e-04 | 2534.09 ms | 53.3% bf16 MFU | 206987 tok/s +step 12124/19560 | loss 3.361169 (+0.01z)| norm 0.2906 (+0.73z)| lr 2.02e-04 | 2534.34 ms | 53.3% bf16 MFU | 206982 tok/s +step 12125/19560 | loss 3.345324 (-0.43z)| norm 0.2676 (-0.72z)| lr 2.02e-04 | 2533.08 ms | 53.3% bf16 MFU | 206981 tok/s +step 12126/19560 | loss 3.406085 (+1.28z)| norm 0.2846 (+0.37z)| lr 2.02e-04 | 2532.76 ms | 53.3% bf16 MFU | 206982 tok/s +step 12127/19560 | loss 3.330453 (-0.85z)| norm 0.2646 (-0.89z)| lr 2.02e-04 | 2531.59 ms | 53.3% bf16 MFU | 206988 tok/s +step 12128/19560 | loss 3.305107 (-1.54z)| norm 0.3056 (+1.68z)| lr 2.02e-04 | 2530.98 ms | 53.3% bf16 MFU | 206996 tok/s +step 12129/19560 | loss 3.336709 (-0.64z)| norm 0.2555 (-1.46z)| lr 2.02e-04 | 2533.27 ms | 53.3% bf16 MFU | 206994 tok/s +step 12130/19560 | loss 3.399246 (+1.13z)| norm 0.2962 (+1.09z)| lr 2.02e-04 | 2531.40 ms | 53.3% bf16 MFU | 207000 tok/s +step 12131/19560 | loss 3.375222 (+0.45z)| norm 0.2584 (-1.26z)| lr 2.02e-04 | 2531.98 ms | 53.3% bf16 MFU | 207004 tok/s +step 12132/19560 | loss 3.335782 (-0.66z)| norm 0.2571 (-1.33z)| lr 2.02e-04 | 2531.35 ms | 53.3% bf16 MFU | 207009 tok/s +step 12133/19560 | loss 3.379079 (+0.56z)| norm 0.2530 (-1.55z)| lr 2.02e-04 | 2532.15 ms | 53.3% bf16 MFU | 207012 tok/s +step 12134/19560 | loss 3.365173 (+0.17z)| norm 0.2564 (-1.32z)| lr 2.02e-04 | 2533.42 ms | 53.3% bf16 MFU | 207008 tok/s +step 12135/19560 | loss 3.388890 (+0.84z)| norm 0.2738 (-0.25z)| lr 2.02e-04 | 2533.72 ms | 53.3% bf16 MFU | 207004 tok/s +step 12136/19560 | loss 3.483195 (+3.33z)| norm 0.2693 (-0.53z)| lr 2.02e-04 | 2531.81 ms | 53.3% bf16 MFU | 207008 tok/s +step 12137/19560 | loss 3.274246 (-2.28z)| norm 0.2799 (+0.12z)| lr 2.02e-04 | 2533.59 ms | 53.3% bf16 MFU | 207004 tok/s +step 12138/19560 | loss 3.316490 (-1.14z)| norm 0.2651 (-0.79z)| lr 2.02e-04 | 2534.75 ms | 53.3% bf16 MFU | 206996 tok/s +step 12139/19560 | loss 3.302088 (-1.54z)| norm 0.2717 (-0.37z)| lr 2.02e-04 | 2533.83 ms | 53.3% bf16 MFU | 206992 tok/s +step 12140/19560 | loss 3.369952 (+0.28z)| norm 0.2745 (-0.19z)| lr 2.01e-04 | 2533.37 ms | 53.3% bf16 MFU | 206990 tok/s +step 12141/19560 | loss 3.301739 (-1.52z)| norm 0.2718 (-0.39z)| lr 2.01e-04 | 2533.72 ms | 53.3% bf16 MFU | 206987 tok/s +step 12142/19560 | loss 3.363752 (+0.13z)| norm 0.2805 (+0.19z)| lr 2.01e-04 | 2534.13 ms | 53.3% bf16 MFU | 206982 tok/s +step 12143/19560 | loss 3.333076 (-0.69z)| norm 0.2699 (-0.54z)| lr 2.01e-04 | 2532.63 ms | 53.3% bf16 MFU | 206984 tok/s +step 12144/19560 | loss 3.362489 (+0.09z)| norm 0.2811 (+0.22z)| lr 2.01e-04 | 2532.35 ms | 53.3% bf16 MFU | 206986 tok/s +step 12145/19560 | loss 3.344031 (-0.39z)| norm 0.2557 (-1.52z)| lr 2.01e-04 | 2534.05 ms | 53.3% bf16 MFU | 206982 tok/s +step 12146/19560 | loss 3.333769 (-0.66z)| norm 0.2867 (+0.59z)| lr 2.01e-04 | 2535.94 ms | 53.2% bf16 MFU | 206970 tok/s +step 12147/19560 | loss 3.340108 (-0.49z)| norm 0.2602 (-1.21z)| lr 2.01e-04 | 2531.66 ms | 53.3% bf16 MFU | 206976 tok/s +step 12148/19560 | loss 3.401746 (+1.15z)| norm 0.2908 (+0.87z)| lr 2.01e-04 | 2532.66 ms | 53.3% bf16 MFU | 206978 tok/s +step 12149/19560 | loss 3.346423 (-0.33z)| norm 0.2763 (-0.12z)| lr 2.01e-04 | 2531.76 ms | 53.3% bf16 MFU | 206983 tok/s +step 12150/19560 | loss 3.322905 (-0.94z)| norm 0.2754 (-0.19z)| lr 2.01e-04 | 2533.31 ms | 53.3% bf16 MFU | 206982 tok/s +step 12151/19560 | loss 3.326924 (-0.82z)| norm 0.2836 (+0.38z)| lr 2.01e-04 | 2532.67 ms | 53.3% bf16 MFU | 206983 tok/s +step 12152/19560 | loss 3.327914 (-0.79z)| norm 0.2777 (-0.03z)| lr 2.01e-04 | 2532.35 ms | 53.3% bf16 MFU | 206986 tok/s +step 12153/19560 | loss 3.366963 (+0.26z)| norm 0.2685 (-0.67z)| lr 2.01e-04 | 2532.23 ms | 53.3% bf16 MFU | 206989 tok/s +step 12154/19560 | loss 3.344212 (-0.35z)| norm 0.3329 (+3.53z)| lr 2.01e-04 | 2531.78 ms | 53.3% bf16 MFU | 206994 tok/s +step 12155/19560 | loss 3.321789 (-0.94z)| norm 0.2875 (+0.58z)| lr 2.01e-04 | 2532.74 ms | 53.3% bf16 MFU | 206994 tok/s +step 12156/19560 | loss 3.297144 (-1.57z)| norm 0.2690 (-0.64z)| lr 2.01e-04 | 2531.49 ms | 53.3% bf16 MFU | 207000 tok/s +step 12157/19560 | loss 3.388748 (+0.84z)| norm 0.2830 (+0.28z)| lr 2.01e-04 | 2532.26 ms | 53.3% bf16 MFU | 207002 tok/s +step 12158/19560 | loss 3.330932 (-0.68z)| norm 0.2587 (-1.29z)| lr 2.01e-04 | 2531.84 ms | 53.3% bf16 MFU | 207006 tok/s +step 12159/19560 | loss 3.324686 (-0.84z)| norm 0.2670 (-0.74z)| lr 2.01e-04 | 2532.56 ms | 53.3% bf16 MFU | 207006 tok/s +step 12160/19560 | loss 3.367823 (+0.34z)| norm 0.2570 (-1.37z)| lr 2.01e-04 | 2532.92 ms | 53.3% bf16 MFU | 207006 tok/s +step 12161/19560 | loss 3.359125 (+0.11z)| norm 0.2702 (-0.52z)| lr 2.00e-04 | 2533.89 ms | 53.3% bf16 MFU | 207001 tok/s +step 12162/19560 | loss 3.369428 (+0.41z)| norm 0.2637 (-0.94z)| lr 2.00e-04 | 2533.70 ms | 53.3% bf16 MFU | 206997 tok/s +step 12163/19560 | loss 3.346552 (-0.22z)| norm 0.2747 (-0.23z)| lr 2.00e-04 | 2532.54 ms | 53.3% bf16 MFU | 206998 tok/s +step 12164/19560 | loss 3.379533 (+0.70z)| norm 0.2725 (-0.36z)| lr 2.00e-04 | 2533.27 ms | 53.3% bf16 MFU | 206996 tok/s +step 12165/19560 | loss 3.389930 (+0.98z)| norm 0.2572 (-1.33z)| lr 2.00e-04 | 2531.78 ms | 53.3% bf16 MFU | 207001 tok/s +step 12166/19560 | loss 3.375724 (+0.57z)| norm 0.3591 (+4.73z)| lr 2.00e-04 | 2533.81 ms | 53.3% bf16 MFU | 206996 tok/s +step 12167/19560 | loss 3.344907 (-0.29z)| norm 0.2749 (-0.20z)| lr 2.00e-04 | 2531.83 ms | 53.3% bf16 MFU | 207001 tok/s +step 12168/19560 | loss 3.349599 (-0.15z)| norm 0.3026 (+1.40z)| lr 2.00e-04 | 2533.99 ms | 53.3% bf16 MFU | 206996 tok/s +step 12169/19560 | loss 3.363324 (+0.24z)| norm 0.2940 (+0.89z)| lr 2.00e-04 | 2532.67 ms | 53.3% bf16 MFU | 206996 tok/s +step 12170/19560 | loss 3.366289 (+0.33z)| norm 0.2782 (-0.04z)| lr 2.00e-04 | 2533.25 ms | 53.3% bf16 MFU | 206995 tok/s +step 12171/19560 | loss 3.334700 (-0.60z)| norm 0.2969 (+1.05z)| lr 2.00e-04 | 2532.52 ms | 53.3% bf16 MFU | 206996 tok/s +step 12172/19560 | loss 3.314755 (-1.16z)| norm 0.2609 (-1.05z)| lr 2.00e-04 | 2533.52 ms | 53.3% bf16 MFU | 206993 tok/s +step 12173/19560 | loss 3.335510 (-0.55z)| norm 0.3024 (+1.35z)| lr 2.00e-04 | 2533.26 ms | 53.3% bf16 MFU | 206992 tok/s +step 12174/19560 | loss 3.306606 (-1.38z)| norm 0.2913 (+0.69z)| lr 2.00e-04 | 2532.55 ms | 53.3% bf16 MFU | 206993 tok/s +step 12175/19560 | loss 3.337376 (-0.48z)| norm 0.2991 (+1.13z)| lr 2.00e-04 | 2533.10 ms | 53.3% bf16 MFU | 206992 tok/s +step 12176/19560 | loss 3.293070 (-1.73z)| norm 0.2657 (-0.79z)| lr 2.00e-04 | 2533.52 ms | 53.3% bf16 MFU | 206990 tok/s +step 12177/19560 | loss 3.329870 (-0.68z)| norm 0.2631 (-0.94z)| lr 2.00e-04 | 2533.14 ms | 53.3% bf16 MFU | 206989 tok/s +step 12178/19560 | loss 3.380669 (+0.76z)| norm 0.2917 (+0.71z)| lr 2.00e-04 | 2530.88 ms | 53.3% bf16 MFU | 206997 tok/s +step 12179/19560 | loss 3.414400 (+1.68z)| norm 0.2678 (-0.68z)| lr 2.00e-04 | 2532.28 ms | 53.3% bf16 MFU | 206999 tok/s +step 12180/19560 | loss 3.431291 (+2.11z)| norm 0.2639 (-0.90z)| lr 2.00e-04 | 2534.15 ms | 53.3% bf16 MFU | 206994 tok/s +step 12181/19560 | loss 3.541018 (+4.83z)| norm 0.2799 (+0.02z)| lr 2.00e-04 | 2532.26 ms | 53.3% bf16 MFU | 206996 tok/s +step 12182/19560 | loss 3.323868 (-0.82z)| norm 0.2817 (+0.12z)| lr 1.99e-04 | 2533.26 ms | 53.3% bf16 MFU | 206995 tok/s +step 12183/19560 | loss 3.402962 (+1.24z)| norm 0.2692 (-0.62z)| lr 1.99e-04 | 2532.59 ms | 53.3% bf16 MFU | 206996 tok/s +step 12184/19560 | loss 3.387637 (+0.83z)| norm 0.2827 (+0.17z)| lr 1.99e-04 | 2532.46 ms | 53.3% bf16 MFU | 206997 tok/s +step 12185/19560 | loss 3.339787 (-0.40z)| norm 0.2784 (-0.09z)| lr 1.99e-04 | 2531.84 ms | 53.3% bf16 MFU | 207001 tok/s +step 12186/19560 | loss 3.315943 (-1.01z)| norm 0.2575 (-1.31z)| lr 1.99e-04 | 2532.06 ms | 53.3% bf16 MFU | 207004 tok/s +step 12187/19560 | loss 3.331430 (-0.60z)| norm 0.2866 (+0.39z)| lr 1.99e-04 | 2532.54 ms | 53.3% bf16 MFU | 207005 tok/s +step 12188/19560 | loss 3.327891 (-0.68z)| norm 0.2572 (-1.32z)| lr 1.99e-04 | 2531.84 ms | 53.3% bf16 MFU | 207009 tok/s +step 12189/19560 | loss 3.335713 (-0.48z)| norm 0.2752 (-0.28z)| lr 1.99e-04 | 2533.79 ms | 53.3% bf16 MFU | 207004 tok/s +step 12190/19560 | loss 3.378273 (+0.62z)| norm 0.2614 (-1.09z)| lr 1.99e-04 | 2531.75 ms | 53.3% bf16 MFU | 207008 tok/s +step 12191/19560 | loss 3.362089 (+0.21z)| norm 0.2691 (-0.64z)| lr 1.99e-04 | 2530.89 ms | 53.3% bf16 MFU | 207016 tok/s +step 12192/19560 | loss 3.329861 (-0.63z)| norm 0.2593 (-1.20z)| lr 1.99e-04 | 2531.37 ms | 53.3% bf16 MFU | 207021 tok/s +step 12193/19560 | loss 3.365282 (+0.30z)| norm 0.2507 (-1.67z)| lr 1.99e-04 | 2530.58 ms | 53.4% bf16 MFU | 207029 tok/s +step 12194/19560 | loss 3.415246 (+1.58z)| norm 0.2631 (-0.93z)| lr 1.99e-04 | 2530.88 ms | 53.3% bf16 MFU | 207035 tok/s +step 12195/19560 | loss 3.430641 (+1.94z)| norm 0.2470 (-1.83z)| lr 1.99e-04 | 2532.51 ms | 53.3% bf16 MFU | 207034 tok/s +step 12196/19560 | loss 3.357971 (+0.08z)| norm 0.2664 (-0.70z)| lr 1.99e-04 | 2531.65 ms | 53.3% bf16 MFU | 207037 tok/s +step 12197/19560 | loss 3.341714 (-0.34z)| norm 0.2625 (-0.92z)| lr 1.99e-04 | 2532.59 ms | 53.3% bf16 MFU | 207036 tok/s +step 12198/19560 | loss 3.347034 (-0.18z)| norm 0.2521 (-1.52z)| lr 1.99e-04 | 2533.55 ms | 53.3% bf16 MFU | 207031 tok/s +step 12199/19560 | loss 3.333876 (-0.53z)| norm 0.2724 (-0.31z)| lr 1.99e-04 | 2529.71 ms | 53.4% bf16 MFU | 207042 tok/s +step 12200/19560 | loss 3.341357 (-0.33z)| norm 0.2525 (-1.47z)| lr 1.99e-04 | 2532.51 ms | 53.3% bf16 MFU | 207041 tok/s +step 12201/19560 | loss 3.326679 (-0.70z)| norm 0.2713 (-0.35z)| lr 1.99e-04 | 2532.30 ms | 53.3% bf16 MFU | 207041 tok/s +step 12202/19560 | loss 3.389717 (+0.97z)| norm 0.2541 (-1.36z)| lr 1.99e-04 | 2531.33 ms | 53.3% bf16 MFU | 207045 tok/s +step 12203/19560 | loss 3.317119 (-0.97z)| norm 0.2614 (-0.91z)| lr 1.99e-04 | 2532.49 ms | 53.3% bf16 MFU | 207044 tok/s +step 12204/19560 | loss 3.353000 (-0.01z)| norm 0.2675 (-0.53z)| lr 1.98e-04 | 2533.49 ms | 53.3% bf16 MFU | 207039 tok/s +step 12205/19560 | loss 3.337976 (-0.41z)| norm 0.2660 (-0.61z)| lr 1.98e-04 | 2533.22 ms | 53.3% bf16 MFU | 207036 tok/s +step 12206/19560 | loss 3.354126 (+0.02z)| norm 0.2622 (-0.83z)| lr 1.98e-04 | 2533.30 ms | 53.3% bf16 MFU | 207032 tok/s +step 12207/19560 | loss 3.320279 (-0.88z)| norm 0.2668 (-0.53z)| lr 1.98e-04 | 2533.17 ms | 53.3% bf16 MFU | 207029 tok/s +step 12208/19560 | loss 3.325050 (-0.74z)| norm 0.2651 (-0.63z)| lr 1.98e-04 | 2532.22 ms | 53.3% bf16 MFU | 207029 tok/s +step 12209/19560 | loss 3.387516 (+0.93z)| norm 0.2693 (-0.38z)| lr 1.98e-04 | 2533.47 ms | 53.3% bf16 MFU | 207025 tok/s +step 12210/19560 | loss 3.354573 (+0.04z)| norm 0.2722 (-0.21z)| lr 1.98e-04 | 2532.20 ms | 53.3% bf16 MFU | 207026 tok/s +step 12211/19560 | loss 3.388716 (+0.95z)| norm 0.2677 (-0.47z)| lr 1.98e-04 | 2533.47 ms | 53.3% bf16 MFU | 207022 tok/s +step 12212/19560 | loss 3.304210 (-1.30z)| norm 0.2761 (+0.05z)| lr 1.98e-04 | 2533.97 ms | 53.3% bf16 MFU | 207016 tok/s +step 12213/19560 | loss 3.357920 (+0.13z)| norm 0.2577 (-1.08z)| lr 1.98e-04 | 2533.90 ms | 53.3% bf16 MFU | 207011 tok/s +step 12214/19560 | loss 3.360803 (+0.22z)| norm 0.2475 (-1.69z)| lr 1.98e-04 | 2533.85 ms | 53.3% bf16 MFU | 207006 tok/s +step 12215/19560 | loss 3.321684 (-0.83z)| norm 0.2790 (+0.26z)| lr 1.98e-04 | 2532.53 ms | 53.3% bf16 MFU | 207007 tok/s +step 12216/19560 | loss 3.370098 (+0.47z)| norm 0.2610 (-0.85z)| lr 1.98e-04 | 2534.87 ms | 53.3% bf16 MFU | 206998 tok/s +step 12217/19560 | loss 3.424008 (+1.89z)| norm 0.2988 (+1.47z)| lr 1.98e-04 | 2534.80 ms | 53.3% bf16 MFU | 206990 tok/s +step 12218/19560 | loss 3.363237 (+0.27z)| norm 0.2639 (-0.67z)| lr 1.98e-04 | 2534.30 ms | 53.3% bf16 MFU | 206984 tok/s +step 12219/19560 | loss 3.354965 (+0.05z)| norm 0.2961 (+1.28z)| lr 1.98e-04 | 2532.90 ms | 53.3% bf16 MFU | 206985 tok/s +step 12220/19560 | loss 3.305372 (-1.25z)| norm 0.2676 (-0.44z)| lr 1.98e-04 | 2532.77 ms | 53.3% bf16 MFU | 206986 tok/s +step 12221/19560 | loss 3.340259 (-0.32z)| norm 0.2427 (-1.93z)| lr 1.98e-04 | 2535.82 ms | 53.2% bf16 MFU | 206974 tok/s +step 12222/19560 | loss 3.323492 (-0.76z)| norm 0.2818 (+0.42z)| lr 1.98e-04 | 2531.83 ms | 53.3% bf16 MFU | 206979 tok/s +step 12223/19560 | loss 3.344155 (-0.20z)| norm 0.2620 (-0.76z)| lr 1.98e-04 | 2532.92 ms | 53.3% bf16 MFU | 206980 tok/s +step 12224/19560 | loss 3.283282 (-1.81z)| norm 0.2643 (-0.62z)| lr 1.98e-04 | 2532.45 ms | 53.3% bf16 MFU | 206982 tok/s +step 12225/19560 | loss 3.464591 (+2.88z)| norm 0.3740 (+5.28z)| lr 1.97e-04 | 2532.78 ms | 53.3% bf16 MFU | 206983 tok/s +step 12226/19560 | loss 3.318972 (-0.86z)| norm 0.2971 (+1.17z)| lr 1.97e-04 | 2531.40 ms | 53.3% bf16 MFU | 206990 tok/s +step 12227/19560 | loss 3.298298 (-1.37z)| norm 0.3140 (+2.03z)| lr 1.97e-04 | 2534.76 ms | 53.3% bf16 MFU | 206982 tok/s +step 12228/19560 | loss 3.325295 (-0.67z)| norm 0.2713 (-0.21z)| lr 1.97e-04 | 2532.73 ms | 53.3% bf16 MFU | 206983 tok/s +step 12229/19560 | loss 3.335622 (-0.41z)| norm 0.2863 (+0.57z)| lr 1.97e-04 | 2533.57 ms | 53.3% bf16 MFU | 206981 tok/s +step 12230/19560 | loss 3.378700 (+0.68z)| norm 0.2804 (+0.26z)| lr 1.97e-04 | 2532.33 ms | 53.3% bf16 MFU | 206984 tok/s +step 12231/19560 | loss 3.347521 (-0.12z)| norm 0.2827 (+0.39z)| lr 1.97e-04 | 2532.56 ms | 53.3% bf16 MFU | 206986 tok/s +step 12232/19560 | loss 3.394334 (+1.06z)| norm 0.2683 (-0.38z)| lr 1.97e-04 | 2532.26 ms | 53.3% bf16 MFU | 206988 tok/s +step 12233/19560 | loss 3.324302 (-0.71z)| norm 0.2778 (+0.13z)| lr 1.97e-04 | 2533.53 ms | 53.3% bf16 MFU | 206986 tok/s +step 12234/19560 | loss 3.293513 (-1.47z)| norm 0.2677 (-0.40z)| lr 1.97e-04 | 2532.24 ms | 53.3% bf16 MFU | 206989 tok/s +step 12235/19560 | loss 3.329999 (-0.55z)| norm 0.2518 (-1.24z)| lr 1.97e-04 | 2532.38 ms | 53.3% bf16 MFU | 206991 tok/s +step 12236/19560 | loss 3.336476 (-0.39z)| norm 0.2690 (-0.32z)| lr 1.97e-04 | 2532.83 ms | 53.3% bf16 MFU | 206991 tok/s +step 12237/19560 | loss 3.295119 (-1.42z)| norm 0.2606 (-0.76z)| lr 1.97e-04 | 2532.06 ms | 53.3% bf16 MFU | 206995 tok/s +step 12238/19560 | loss 3.403705 (+1.34z)| norm 0.2651 (-0.51z)| lr 1.97e-04 | 2533.74 ms | 53.3% bf16 MFU | 206991 tok/s +step 12239/19560 | loss 3.300218 (-1.27z)| norm 0.2814 (+0.37z)| lr 1.97e-04 | 2531.01 ms | 53.3% bf16 MFU | 206999 tok/s +step 12240/19560 | loss 3.370936 (+0.50z)| norm 0.2617 (-0.68z)| lr 1.97e-04 | 2532.16 ms | 53.3% bf16 MFU | 207002 tok/s +step 12241/19560 | loss 3.391033 (+1.00z)| norm 0.2903 (+0.85z)| lr 1.97e-04 | 2534.43 ms | 53.3% bf16 MFU | 206995 tok/s +step 12242/19560 | loss 3.360737 (+0.23z)| norm 0.2579 (-0.88z)| lr 1.97e-04 | 2532.94 ms | 53.3% bf16 MFU | 206995 tok/s +step 12243/19560 | loss 3.280214 (-1.76z)| norm 0.2510 (-1.23z)| lr 1.97e-04 | 2533.91 ms | 53.3% bf16 MFU | 206990 tok/s +step 12244/19560 | loss 3.337811 (-0.32z)| norm 0.2813 (+0.38z)| lr 1.97e-04 | 2532.50 ms | 53.3% bf16 MFU | 206992 tok/s +step 12245/19560 | loss 3.307205 (-1.07z)| norm 0.2833 (+0.48z)| lr 1.97e-04 | 2533.69 ms | 53.3% bf16 MFU | 206989 tok/s +step 12246/19560 | loss 3.391713 (+1.01z)| norm 0.2958 (+1.13z)| lr 1.96e-04 | 2532.20 ms | 53.3% bf16 MFU | 206992 tok/s +step 12247/19560 | loss 3.385839 (+0.86z)| norm 0.2892 (+0.79z)| lr 1.96e-04 | 2533.99 ms | 53.3% bf16 MFU | 206987 tok/s +step 12248/19560 | loss 3.321764 (-0.72z)| norm 0.2675 (-0.36z)| lr 1.96e-04 | 2532.42 ms | 53.3% bf16 MFU | 206989 tok/s +step 12249/19560 | loss 3.364720 (+0.34z)| norm 0.2808 (+0.35z)| lr 1.96e-04 | 2534.15 ms | 53.3% bf16 MFU | 206984 tok/s +step 12250/19560 | loss 3.330678 (-0.50z)| norm 0.2792 (+0.25z)| lr 1.96e-04 | 2534.97 ms | 53.3% bf16 MFU | 206976 tok/s +val loss 3.360524 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2966/10042 = 0.295359 +step 12251/19560 | loss 3.322360 (-0.73z)| norm 0.2729 (-0.08z)| lr 1.96e-04 | 2533.75 ms | 53.3% bf16 MFU | 206973 tok/s +step 12252/19560 | loss 3.360099 (+0.22z)| norm 0.3016 (+1.44z)| lr 1.96e-04 | 2531.75 ms | 53.3% bf16 MFU | 206979 tok/s +step 12253/19560 | loss 3.300444 (-1.26z)| norm 0.2940 (+1.02z)| lr 1.96e-04 | 2534.11 ms | 53.3% bf16 MFU | 206975 tok/s +step 12254/19560 | loss 3.377243 (+0.66z)| norm 0.2756 (+0.06z)| lr 1.96e-04 | 2533.14 ms | 53.3% bf16 MFU | 206975 tok/s +step 12255/19560 | loss 3.333510 (-0.43z)| norm 0.2768 (+0.11z)| lr 1.96e-04 | 2532.02 ms | 53.3% bf16 MFU | 206979 tok/s +step 12256/19560 | loss 3.296038 (-1.36z)| norm 0.2861 (+0.62z)| lr 1.96e-04 | 2532.14 ms | 53.3% bf16 MFU | 206983 tok/s +step 12257/19560 | loss 3.356579 (+0.14z)| norm 0.2965 (+1.15z)| lr 1.96e-04 | 2532.66 ms | 53.3% bf16 MFU | 206984 tok/s +step 12258/19560 | loss 3.340012 (-0.26z)| norm 0.2813 (+0.35z)| lr 1.96e-04 | 2533.22 ms | 53.3% bf16 MFU | 206983 tok/s +step 12259/19560 | loss 3.368036 (+0.44z)| norm 0.2764 (+0.08z)| lr 1.96e-04 | 2535.02 ms | 53.3% bf16 MFU | 206975 tok/s +step 12260/19560 | loss 3.315943 (-0.86z)| norm 0.2836 (+0.46z)| lr 1.96e-04 | 2535.52 ms | 53.3% bf16 MFU | 206965 tok/s +step 12261/19560 | loss 3.340050 (-0.25z)| norm 0.2633 (-0.64z)| lr 1.96e-04 | 2534.13 ms | 53.3% bf16 MFU | 206961 tok/s +step 12262/19560 | loss 3.337080 (-0.32z)| norm 0.2925 (+0.92z)| lr 1.96e-04 | 2534.24 ms | 53.3% bf16 MFU | 206957 tok/s +step 12263/19560 | loss 3.346266 (-0.08z)| norm 0.2641 (-0.61z)| lr 1.96e-04 | 2536.05 ms | 53.2% bf16 MFU | 206946 tok/s +step 12264/19560 | loss 3.358699 (+0.27z)| norm 0.2800 (+0.25z)| lr 1.96e-04 | 2536.49 ms | 53.2% bf16 MFU | 206934 tok/s +step 12265/19560 | loss 3.377887 (+0.77z)| norm 0.2778 (+0.13z)| lr 1.96e-04 | 2534.56 ms | 53.3% bf16 MFU | 206930 tok/s +step 12266/19560 | loss 3.338329 (-0.30z)| norm 0.2653 (-0.55z)| lr 1.96e-04 | 2536.72 ms | 53.2% bf16 MFU | 206917 tok/s +step 12267/19560 | loss 3.364551 (+0.40z)| norm 0.2695 (-0.32z)| lr 1.95e-04 | 2536.03 ms | 53.2% bf16 MFU | 206908 tok/s +step 12268/19560 | loss 3.362288 (+0.34z)| norm 0.2518 (-1.25z)| lr 1.95e-04 | 2536.38 ms | 53.2% bf16 MFU | 206898 tok/s +step 12269/19560 | loss 3.373590 (+0.63z)| norm 0.2900 (+0.78z)| lr 1.95e-04 | 2537.65 ms | 53.2% bf16 MFU | 206883 tok/s +step 12270/19560 | loss 3.344837 (-0.15z)| norm 0.2967 (+1.13z)| lr 1.95e-04 | 2537.79 ms | 53.2% bf16 MFU | 206869 tok/s +step 12271/19560 | loss 3.366579 (+0.44z)| norm 0.2817 (+0.33z)| lr 1.95e-04 | 2536.79 ms | 53.2% bf16 MFU | 206859 tok/s +step 12272/19560 | loss 3.380252 (+0.80z)| norm 0.2889 (+0.70z)| lr 1.95e-04 | 2538.17 ms | 53.2% bf16 MFU | 206844 tok/s +step 12273/19560 | loss 3.341820 (-0.24z)| norm 0.2710 (-0.25z)| lr 1.95e-04 | 2537.65 ms | 53.2% bf16 MFU | 206832 tok/s +step 12274/19560 | loss 3.396430 (+1.22z)| norm 0.2895 (+0.73z)| lr 1.95e-04 | 2538.77 ms | 53.2% bf16 MFU | 206816 tok/s +step 12275/19560 | loss 3.355690 (+0.12z)| norm 0.2913 (+0.82z)| lr 1.95e-04 | 2537.73 ms | 53.2% bf16 MFU | 206805 tok/s +step 12276/19560 | loss 3.383601 (+0.88z)| norm 0.2809 (+0.27z)| lr 1.95e-04 | 2537.05 ms | 53.2% bf16 MFU | 206798 tok/s +step 12277/19560 | loss 3.313843 (-1.00z)| norm 0.2612 (-0.78z)| lr 1.95e-04 | 2539.25 ms | 53.2% bf16 MFU | 206781 tok/s +step 12278/19560 | loss 3.315327 (-0.95z)| norm 0.2905 (+0.78z)| lr 1.95e-04 | 2537.41 ms | 53.2% bf16 MFU | 206774 tok/s +step 12279/19560 | loss 3.317850 (-0.88z)| norm 0.2639 (-0.63z)| lr 1.95e-04 | 2537.32 ms | 53.2% bf16 MFU | 206766 tok/s +step 12280/19560 | loss 3.384880 (+0.91z)| norm 0.2628 (-0.68z)| lr 1.95e-04 | 2537.42 ms | 53.2% bf16 MFU | 206759 tok/s +step 12281/19560 | loss 3.335176 (-0.42z)| norm 0.2723 (-0.18z)| lr 1.95e-04 | 2538.43 ms | 53.2% bf16 MFU | 206748 tok/s +step 12282/19560 | loss 3.340426 (-0.28z)| norm 0.2816 (+0.35z)| lr 1.95e-04 | 2538.75 ms | 53.2% bf16 MFU | 206737 tok/s +step 12283/19560 | loss 3.356426 (+0.14z)| norm 0.2706 (-0.25z)| lr 1.95e-04 | 2536.47 ms | 53.2% bf16 MFU | 206735 tok/s +step 12284/19560 | loss 3.421318 (+1.85z)| norm 0.3139 (+2.08z)| lr 1.95e-04 | 2538.30 ms | 53.2% bf16 MFU | 206726 tok/s +step 12285/19560 | loss 3.315625 (-0.96z)| norm 0.2874 (+0.64z)| lr 1.95e-04 | 2538.13 ms | 53.2% bf16 MFU | 206717 tok/s +step 12286/19560 | loss 3.394198 (+1.13z)| norm 0.2841 (+0.46z)| lr 1.95e-04 | 2537.61 ms | 53.2% bf16 MFU | 206712 tok/s +step 12287/19560 | loss 3.345624 (-0.17z)| norm 0.2839 (+0.44z)| lr 1.95e-04 | 2537.87 ms | 53.2% bf16 MFU | 206706 tok/s +step 12288/19560 | loss 3.319200 (-0.87z)| norm 0.2752 (-0.04z)| lr 1.95e-04 | 2536.55 ms | 53.2% bf16 MFU | 206705 tok/s +step 12289/19560 | loss 3.354999 (+0.09z)| norm 0.2758 (-0.01z)| lr 1.94e-04 | 2536.79 ms | 53.2% bf16 MFU | 206703 tok/s +step 12290/19560 | loss 3.361697 (+0.27z)| norm 0.2741 (-0.11z)| lr 1.94e-04 | 2536.20 ms | 53.2% bf16 MFU | 206704 tok/s +step 12291/19560 | loss 3.381890 (+0.80z)| norm 0.3034 (+1.46z)| lr 1.94e-04 | 2537.14 ms | 53.2% bf16 MFU | 206701 tok/s +step 12292/19560 | loss 3.398077 (+1.22z)| norm 0.2867 (+0.55z)| lr 1.94e-04 | 2537.82 ms | 53.2% bf16 MFU | 206696 tok/s +step 12293/19560 | loss 3.277876 (-1.92z)| norm 0.3008 (+1.29z)| lr 1.94e-04 | 2537.58 ms | 53.2% bf16 MFU | 206691 tok/s +step 12294/19560 | loss 3.348896 (-0.06z)| norm 0.2737 (-0.14z)| lr 1.94e-04 | 2538.49 ms | 53.2% bf16 MFU | 206684 tok/s +step 12295/19560 | loss 3.367453 (+0.43z)| norm 0.2763 (+0.01z)| lr 1.94e-04 | 2536.77 ms | 53.2% bf16 MFU | 206683 tok/s +step 12296/19560 | loss 3.447351 (+2.44z)| norm 0.2665 (-0.56z)| lr 1.94e-04 | 2537.49 ms | 53.2% bf16 MFU | 206680 tok/s +step 12297/19560 | loss 3.342285 (-0.24z)| norm 0.2807 (+0.29z)| lr 1.94e-04 | 2538.14 ms | 53.2% bf16 MFU | 206674 tok/s +step 12298/19560 | loss 3.352555 (+0.02z)| norm 0.2737 (-0.12z)| lr 1.94e-04 | 2537.08 ms | 53.2% bf16 MFU | 206673 tok/s +step 12299/19560 | loss 3.332074 (-0.50z)| norm 0.2760 (+0.02z)| lr 1.94e-04 | 2536.51 ms | 53.2% bf16 MFU | 206674 tok/s +step 12300/19560 | loss 3.378424 (+0.67z)| norm 0.3313 (+3.18z)| lr 1.94e-04 | 2538.39 ms | 53.2% bf16 MFU | 206668 tok/s +step 12301/19560 | loss 3.310827 (-1.05z)| norm 0.3087 (+1.87z)| lr 1.94e-04 | 2538.42 ms | 53.2% bf16 MFU | 206661 tok/s +step 12302/19560 | loss 3.350858 (-0.04z)| norm 0.2680 (-0.46z)| lr 1.94e-04 | 2539.12 ms | 53.2% bf16 MFU | 206652 tok/s +step 12303/19560 | loss 3.337014 (-0.39z)| norm 0.2976 (+1.24z)| lr 1.94e-04 | 2536.80 ms | 53.2% bf16 MFU | 206653 tok/s +step 12304/19560 | loss 3.346432 (-0.16z)| norm 0.2804 (+0.25z)| lr 1.94e-04 | 2536.26 ms | 53.2% bf16 MFU | 206657 tok/s +step 12305/19560 | loss 3.302432 (-1.29z)| norm 0.2690 (-0.41z)| lr 1.94e-04 | 2536.38 ms | 53.2% bf16 MFU | 206659 tok/s +step 12306/19560 | loss 3.349292 (-0.08z)| norm 0.2767 (+0.04z)| lr 1.94e-04 | 2537.02 ms | 53.2% bf16 MFU | 206659 tok/s +step 12307/19560 | loss 3.407806 (+1.44z)| norm 0.2910 (+0.86z)| lr 1.94e-04 | 2536.61 ms | 53.2% bf16 MFU | 206660 tok/s +step 12308/19560 | loss 3.475231 (+3.11z)| norm 0.2988 (+1.28z)| lr 1.94e-04 | 2537.17 ms | 53.2% bf16 MFU | 206660 tok/s +step 12309/19560 | loss 3.419708 (+1.88z)| norm 0.2973 (+1.18z)| lr 1.94e-04 | 2535.70 ms | 53.2% bf16 MFU | 206665 tok/s +step 12310/19560 | loss 3.368173 (+0.45z)| norm 0.2751 (-0.09z)| lr 1.93e-04 | 2536.32 ms | 53.2% bf16 MFU | 206667 tok/s +step 12311/19560 | loss 3.385905 (+0.95z)| norm 0.2641 (-0.71z)| lr 1.93e-04 | 2538.08 ms | 53.2% bf16 MFU | 206662 tok/s +step 12312/19560 | loss 3.389791 (+1.05z)| norm 0.2767 (+0.01z)| lr 1.93e-04 | 2537.19 ms | 53.2% bf16 MFU | 206661 tok/s +step 12313/19560 | loss 3.400847 (+1.34z)| norm 0.2650 (-0.65z)| lr 1.93e-04 | 2538.34 ms | 53.2% bf16 MFU | 206655 tok/s +step 12314/19560 | loss 3.357658 (+0.14z)| norm 0.2705 (-0.34z)| lr 1.93e-04 | 2535.61 ms | 53.2% bf16 MFU | 206661 tok/s +step 12315/19560 | loss 3.361710 (+0.24z)| norm 0.2564 (-1.13z)| lr 1.93e-04 | 2535.61 ms | 53.2% bf16 MFU | 206667 tok/s +step 12316/19560 | loss 3.441351 (+2.38z)| norm 0.2592 (-0.98z)| lr 1.93e-04 | 2535.68 ms | 53.2% bf16 MFU | 206671 tok/s +step 12317/19560 | loss 3.333871 (-0.54z)| norm 0.2475 (-1.61z)| lr 1.93e-04 | 2536.08 ms | 53.2% bf16 MFU | 206674 tok/s +step 12318/19560 | loss 3.360922 (+0.20z)| norm 0.2822 (+0.34z)| lr 1.93e-04 | 2534.82 ms | 53.3% bf16 MFU | 206682 tok/s +step 12319/19560 | loss 3.395970 (+1.14z)| norm 0.2679 (-0.47z)| lr 1.93e-04 | 2534.27 ms | 53.3% bf16 MFU | 206692 tok/s +step 12320/19560 | loss 3.405198 (+1.36z)| norm 0.3461 (+3.72z)| lr 1.93e-04 | 2533.64 ms | 53.3% bf16 MFU | 206704 tok/s +step 12321/19560 | loss 3.374706 (+0.54z)| norm 0.2790 (+0.11z)| lr 1.93e-04 | 2534.68 ms | 53.3% bf16 MFU | 206711 tok/s +step 12322/19560 | loss 3.313261 (-1.09z)| norm 0.3032 (+1.39z)| lr 1.93e-04 | 2534.83 ms | 53.3% bf16 MFU | 206717 tok/s +step 12323/19560 | loss 3.358354 (+0.14z)| norm 0.2891 (+0.62z)| lr 1.93e-04 | 2535.07 ms | 53.3% bf16 MFU | 206722 tok/s +step 12324/19560 | loss 3.362581 (+0.26z)| norm 0.2878 (+0.54z)| lr 1.93e-04 | 2534.85 ms | 53.3% bf16 MFU | 206728 tok/s +step 12325/19560 | loss 3.304364 (-1.33z)| norm 0.3074 (+1.57z)| lr 1.93e-04 | 2534.23 ms | 53.3% bf16 MFU | 206735 tok/s +step 12326/19560 | loss 3.427015 (+1.97z)| norm 0.2690 (-0.51z)| lr 1.93e-04 | 2533.24 ms | 53.3% bf16 MFU | 206747 tok/s +step 12327/19560 | loss 3.374374 (+0.55z)| norm 0.3090 (+1.63z)| lr 1.93e-04 | 2534.91 ms | 53.3% bf16 MFU | 206751 tok/s +step 12328/19560 | loss 3.392231 (+1.02z)| norm 0.3080 (+1.55z)| lr 1.93e-04 | 2534.41 ms | 53.3% bf16 MFU | 206757 tok/s +step 12329/19560 | loss 3.388130 (+0.89z)| norm 0.2760 (-0.17z)| lr 1.93e-04 | 2533.34 ms | 53.3% bf16 MFU | 206767 tok/s +step 12330/19560 | loss 3.396961 (+1.13z)| norm 0.3065 (+1.45z)| lr 1.93e-04 | 2534.83 ms | 53.3% bf16 MFU | 206770 tok/s +step 12331/19560 | loss 3.327533 (-0.73z)| norm 0.2658 (-0.74z)| lr 1.93e-04 | 2534.88 ms | 53.3% bf16 MFU | 206773 tok/s +step 12332/19560 | loss 3.317202 (-1.00z)| norm 0.3022 (+1.19z)| lr 1.92e-04 | 2533.96 ms | 53.3% bf16 MFU | 206780 tok/s +step 12333/19560 | loss 3.352602 (-0.06z)| norm 0.2713 (-0.46z)| lr 1.92e-04 | 2533.66 ms | 53.3% bf16 MFU | 206787 tok/s +step 12334/19560 | loss 3.360443 (+0.15z)| norm 0.2714 (-0.46z)| lr 1.92e-04 | 2533.22 ms | 53.3% bf16 MFU | 206796 tok/s +step 12335/19560 | loss 3.405445 (+1.33z)| norm 0.2694 (-0.57z)| lr 1.92e-04 | 2531.70 ms | 53.3% bf16 MFU | 206811 tok/s +step 12336/19560 | loss 3.349004 (-0.18z)| norm 0.2790 (-0.06z)| lr 1.92e-04 | 2533.15 ms | 53.3% bf16 MFU | 206819 tok/s +step 12337/19560 | loss 3.370720 (+0.41z)| norm 0.2718 (-0.45z)| lr 1.92e-04 | 2532.06 ms | 53.3% bf16 MFU | 206831 tok/s +step 12338/19560 | loss 3.367779 (+0.33z)| norm 0.2676 (-0.67z)| lr 1.92e-04 | 2533.32 ms | 53.3% bf16 MFU | 206837 tok/s +step 12339/19560 | loss 3.353623 (-0.05z)| norm 0.2697 (-0.56z)| lr 1.92e-04 | 2534.55 ms | 53.3% bf16 MFU | 206838 tok/s +step 12340/19560 | loss 3.376999 (+0.57z)| norm 0.2767 (-0.18z)| lr 1.92e-04 | 2532.88 ms | 53.3% bf16 MFU | 206846 tok/s +step 12341/19560 | loss 3.404132 (+1.28z)| norm 0.2741 (-0.33z)| lr 1.92e-04 | 2533.30 ms | 53.3% bf16 MFU | 206851 tok/s +step 12342/19560 | loss 3.393657 (+0.99z)| norm 0.2861 (+0.31z)| lr 1.92e-04 | 2533.49 ms | 53.3% bf16 MFU | 206856 tok/s +step 12343/19560 | loss 3.374231 (+0.46z)| norm 0.2659 (-0.80z)| lr 1.92e-04 | 2533.98 ms | 53.3% bf16 MFU | 206858 tok/s +step 12344/19560 | loss 3.334392 (-0.59z)| norm 0.2601 (-1.11z)| lr 1.92e-04 | 2530.68 ms | 53.4% bf16 MFU | 206874 tok/s +step 12345/19560 | loss 3.397099 (+1.09z)| norm 0.2703 (-0.54z)| lr 1.92e-04 | 2532.05 ms | 53.3% bf16 MFU | 206883 tok/s +step 12346/19560 | loss 3.417106 (+1.61z)| norm 0.2606 (-1.07z)| lr 1.92e-04 | 2532.80 ms | 53.3% bf16 MFU | 206889 tok/s +step 12347/19560 | loss 3.380287 (+0.62z)| norm 0.3194 (+2.11z)| lr 1.92e-04 | 2533.51 ms | 53.3% bf16 MFU | 206892 tok/s +step 12348/19560 | loss 3.398629 (+1.09z)| norm 0.2604 (-1.07z)| lr 1.92e-04 | 2531.50 ms | 53.3% bf16 MFU | 206902 tok/s +step 12349/19560 | loss 3.354065 (-0.10z)| norm 0.3028 (+1.20z)| lr 1.92e-04 | 2533.63 ms | 53.3% bf16 MFU | 206904 tok/s +step 12350/19560 | loss 3.425790 (+1.78z)| norm 0.2860 (+0.29z)| lr 1.92e-04 | 2532.52 ms | 53.3% bf16 MFU | 206910 tok/s +step 12351/19560 | loss 3.371502 (+0.33z)| norm 0.2873 (+0.34z)| lr 1.92e-04 | 2535.30 ms | 53.3% bf16 MFU | 206904 tok/s +step 12352/19560 | loss 3.348616 (-0.29z)| norm 0.2701 (-0.60z)| lr 1.92e-04 | 2532.10 ms | 53.3% bf16 MFU | 206912 tok/s +step 12353/19560 | loss 3.424711 (+1.80z)| norm 0.2847 (+0.27z)| lr 1.91e-04 | 2532.40 ms | 53.3% bf16 MFU | 206918 tok/s +step 12354/19560 | loss 3.340239 (-0.52z)| norm 0.2858 (+0.34z)| lr 1.91e-04 | 2533.04 ms | 53.3% bf16 MFU | 206921 tok/s +step 12355/19560 | loss 3.353063 (-0.18z)| norm 0.2616 (-1.13z)| lr 1.91e-04 | 2533.76 ms | 53.3% bf16 MFU | 206921 tok/s +step 12356/19560 | loss 3.343000 (-0.47z)| norm 0.2699 (-0.62z)| lr 1.91e-04 | 2533.67 ms | 53.3% bf16 MFU | 206921 tok/s +step 12357/19560 | loss 3.366348 (+0.18z)| norm 0.2548 (-1.53z)| lr 1.91e-04 | 2533.57 ms | 53.3% bf16 MFU | 206922 tok/s +step 12358/19560 | loss 3.332868 (-0.75z)| norm 0.2761 (-0.21z)| lr 1.91e-04 | 2531.59 ms | 53.3% bf16 MFU | 206931 tok/s +step 12359/19560 | loss 3.370510 (+0.30z)| norm 0.2798 (+0.02z)| lr 1.91e-04 | 2530.51 ms | 53.4% bf16 MFU | 206944 tok/s +step 12360/19560 | loss 3.395082 (+0.98z)| norm 0.3002 (+1.25z)| lr 1.91e-04 | 2533.40 ms | 53.3% bf16 MFU | 206944 tok/s +step 12361/19560 | loss 3.346081 (-0.39z)| norm 0.2570 (-1.38z)| lr 1.91e-04 | 2533.06 ms | 53.3% bf16 MFU | 206946 tok/s +step 12362/19560 | loss 3.342580 (-0.51z)| norm 0.2892 (+0.58z)| lr 1.91e-04 | 2531.76 ms | 53.3% bf16 MFU | 206952 tok/s +step 12363/19560 | loss 3.331639 (-0.82z)| norm 0.2851 (+0.31z)| lr 1.91e-04 | 2533.19 ms | 53.3% bf16 MFU | 206953 tok/s +step 12364/19560 | loss 3.387838 (+0.76z)| norm 0.3033 (+1.41z)| lr 1.91e-04 | 2531.28 ms | 53.3% bf16 MFU | 206962 tok/s +step 12365/19560 | loss 3.377266 (+0.45z)| norm 0.2980 (+1.07z)| lr 1.91e-04 | 2532.21 ms | 53.3% bf16 MFU | 206966 tok/s +step 12366/19560 | loss 3.351941 (-0.26z)| norm 0.2890 (+0.51z)| lr 1.91e-04 | 2532.38 ms | 53.3% bf16 MFU | 206969 tok/s +step 12367/19560 | loss 3.354217 (-0.21z)| norm 0.2949 (+0.86z)| lr 1.91e-04 | 2533.47 ms | 53.3% bf16 MFU | 206968 tok/s +step 12368/19560 | loss 3.501654 (+3.83z)| norm 0.2944 (+0.82z)| lr 1.91e-04 | 2532.87 ms | 53.3% bf16 MFU | 206969 tok/s +step 12369/19560 | loss 3.348454 (-0.38z)| norm 0.2910 (+0.60z)| lr 1.91e-04 | 2533.19 ms | 53.3% bf16 MFU | 206969 tok/s +step 12370/19560 | loss 3.377540 (+0.42z)| norm 0.2664 (-0.92z)| lr 1.91e-04 | 2533.16 ms | 53.3% bf16 MFU | 206969 tok/s +step 12371/19560 | loss 3.368785 (+0.16z)| norm 0.2711 (-0.65z)| lr 1.91e-04 | 2533.82 ms | 53.3% bf16 MFU | 206967 tok/s +step 12372/19560 | loss 3.351128 (-0.34z)| norm 0.2777 (-0.23z)| lr 1.91e-04 | 2532.93 ms | 53.3% bf16 MFU | 206968 tok/s +step 12373/19560 | loss 3.344228 (-0.55z)| norm 0.2612 (-1.25z)| lr 1.91e-04 | 2533.12 ms | 53.3% bf16 MFU | 206968 tok/s +step 12374/19560 | loss 3.338253 (-0.70z)| norm 0.2648 (-1.00z)| lr 1.91e-04 | 2533.25 ms | 53.3% bf16 MFU | 206968 tok/s +step 12375/19560 | loss 3.402402 (+1.11z)| norm 0.2764 (-0.27z)| lr 1.90e-04 | 2535.59 ms | 53.2% bf16 MFU | 206958 tok/s +step 12376/19560 | loss 3.366970 (+0.10z)| norm 0.2563 (-1.51z)| lr 1.90e-04 | 2533.26 ms | 53.3% bf16 MFU | 206958 tok/s +step 12377/19560 | loss 3.392511 (+0.82z)| norm 0.2822 (+0.09z)| lr 1.90e-04 | 2533.40 ms | 53.3% bf16 MFU | 206958 tok/s +step 12378/19560 | loss 3.509815 (+3.87z)| norm 0.2945 (+0.84z)| lr 1.90e-04 | 2532.29 ms | 53.3% bf16 MFU | 206962 tok/s +step 12379/19560 | loss 3.398198 (+0.87z)| norm 0.2762 (-0.29z)| lr 1.90e-04 | 2532.72 ms | 53.3% bf16 MFU | 206964 tok/s +step 12380/19560 | loss 3.392223 (+0.70z)| norm 0.3244 (+2.62z)| lr 1.90e-04 | 2533.24 ms | 53.3% bf16 MFU | 206964 tok/s +step 12381/19560 | loss 3.356841 (-0.26z)| norm 0.2501 (-1.83z)| lr 1.90e-04 | 2533.04 ms | 53.3% bf16 MFU | 206965 tok/s +step 12382/19560 | loss 3.373671 (+0.20z)| norm 0.2845 (+0.22z)| lr 1.90e-04 | 2533.26 ms | 53.3% bf16 MFU | 206965 tok/s +step 12383/19560 | loss 3.407536 (+1.10z)| norm 0.2706 (-0.61z)| lr 1.90e-04 | 2533.14 ms | 53.3% bf16 MFU | 206965 tok/s +step 12384/19560 | loss 3.357515 (-0.27z)| norm 0.2686 (-0.72z)| lr 1.90e-04 | 2533.34 ms | 53.3% bf16 MFU | 206965 tok/s +step 12385/19560 | loss 3.388419 (+0.57z)| norm 0.2696 (-0.65z)| lr 1.90e-04 | 2532.71 ms | 53.3% bf16 MFU | 206967 tok/s +step 12386/19560 | loss 3.367406 (-0.01z)| norm 0.2634 (-1.00z)| lr 1.90e-04 | 2533.68 ms | 53.3% bf16 MFU | 206965 tok/s +step 12387/19560 | loss 3.355791 (-0.33z)| norm 0.2617 (-1.09z)| lr 1.90e-04 | 2532.54 ms | 53.3% bf16 MFU | 206968 tok/s +step 12388/19560 | loss 3.337141 (-0.85z)| norm 0.2627 (-1.02z)| lr 1.90e-04 | 2531.80 ms | 53.3% bf16 MFU | 206973 tok/s +step 12389/19560 | loss 3.363221 (-0.14z)| norm 0.2650 (-0.88z)| lr 1.90e-04 | 2533.36 ms | 53.3% bf16 MFU | 206972 tok/s +step 12390/19560 | loss 3.323986 (-1.22z)| norm 0.2539 (-1.51z)| lr 1.90e-04 | 2534.06 ms | 53.3% bf16 MFU | 206968 tok/s +step 12391/19560 | loss 3.311172 (-1.55z)| norm 0.2481 (-1.83z)| lr 1.90e-04 | 2533.73 ms | 53.3% bf16 MFU | 206966 tok/s +step 12392/19560 | loss 3.342180 (-0.70z)| norm 0.2526 (-1.54z)| lr 1.90e-04 | 2535.19 ms | 53.3% bf16 MFU | 206958 tok/s +step 12393/19560 | loss 3.364225 (-0.09z)| norm 0.2750 (-0.25z)| lr 1.90e-04 | 2533.68 ms | 53.3% bf16 MFU | 206957 tok/s +step 12394/19560 | loss 3.376600 (+0.24z)| norm 0.2940 (+0.83z)| lr 1.90e-04 | 2534.66 ms | 53.3% bf16 MFU | 206951 tok/s +step 12395/19560 | loss 3.383549 (+0.43z)| norm 0.2585 (-1.20z)| lr 1.90e-04 | 2533.62 ms | 53.3% bf16 MFU | 206950 tok/s +step 12396/19560 | loss 3.398814 (+0.83z)| norm 0.2677 (-0.69z)| lr 1.89e-04 | 2533.58 ms | 53.3% bf16 MFU | 206949 tok/s +step 12397/19560 | loss 3.353863 (-0.39z)| norm 0.2676 (-0.68z)| lr 1.89e-04 | 2533.51 ms | 53.3% bf16 MFU | 206949 tok/s +step 12398/19560 | loss 3.359855 (-0.23z)| norm 0.2528 (-1.51z)| lr 1.89e-04 | 2534.10 ms | 53.3% bf16 MFU | 206946 tok/s +step 12399/19560 | loss 3.360244 (-0.22z)| norm 0.2660 (-0.74z)| lr 1.89e-04 | 2534.11 ms | 53.3% bf16 MFU | 206944 tok/s +step 12400/19560 | loss 3.325972 (-1.14z)| norm 0.2531 (-1.45z)| lr 1.89e-04 | 2533.49 ms | 53.3% bf16 MFU | 206943 tok/s +step 12401/19560 | loss 3.441870 (+1.97z)| norm 0.2791 (+0.02z)| lr 1.89e-04 | 2535.08 ms | 53.3% bf16 MFU | 206937 tok/s +step 12402/19560 | loss 3.366546 (-0.05z)| norm 0.2542 (-1.37z)| lr 1.89e-04 | 2534.40 ms | 53.3% bf16 MFU | 206934 tok/s +step 12403/19560 | loss 3.410069 (+1.11z)| norm 0.2882 (+0.55z)| lr 1.89e-04 | 2534.99 ms | 53.3% bf16 MFU | 206928 tok/s +step 12404/19560 | loss 3.350305 (-0.49z)| norm 0.2544 (-1.34z)| lr 1.89e-04 | 2533.97 ms | 53.3% bf16 MFU | 206927 tok/s +step 12405/19560 | loss 3.409695 (+1.09z)| norm 0.2613 (-0.95z)| lr 1.89e-04 | 2536.20 ms | 53.2% bf16 MFU | 206916 tok/s +step 12406/19560 | loss 3.352077 (-0.47z)| norm 0.2545 (-1.31z)| lr 1.89e-04 | 2534.34 ms | 53.3% bf16 MFU | 206914 tok/s +step 12407/19560 | loss 3.400798 (+0.83z)| norm 0.2660 (-0.67z)| lr 1.89e-04 | 2534.66 ms | 53.3% bf16 MFU | 206911 tok/s +step 12408/19560 | loss 3.350075 (-0.54z)| norm 0.2644 (-0.76z)| lr 1.89e-04 | 2534.27 ms | 53.3% bf16 MFU | 206909 tok/s +step 12409/19560 | loss 3.330075 (-1.08z)| norm 0.2611 (-0.94z)| lr 1.89e-04 | 2533.20 ms | 53.3% bf16 MFU | 206912 tok/s +step 12410/19560 | loss 3.389476 (+0.52z)| norm 0.2777 (-0.01z)| lr 1.89e-04 | 2532.96 ms | 53.3% bf16 MFU | 206916 tok/s +step 12411/19560 | loss 3.356572 (-0.37z)| norm 0.2607 (-0.95z)| lr 1.89e-04 | 2533.64 ms | 53.3% bf16 MFU | 206917 tok/s +step 12412/19560 | loss 3.415853 (+1.24z)| norm 0.2638 (-0.77z)| lr 1.89e-04 | 2534.48 ms | 53.3% bf16 MFU | 206914 tok/s +step 12413/19560 | loss 3.357723 (-0.35z)| norm 0.2666 (-0.60z)| lr 1.89e-04 | 2532.25 ms | 53.3% bf16 MFU | 206920 tok/s +step 12414/19560 | loss 3.511178 (+3.64z)| norm 0.2883 (+0.62z)| lr 1.89e-04 | 2534.42 ms | 53.3% bf16 MFU | 206918 tok/s +step 12415/19560 | loss 3.364320 (-0.19z)| norm 0.2865 (+0.52z)| lr 1.89e-04 | 2532.85 ms | 53.3% bf16 MFU | 206922 tok/s +step 12416/19560 | loss 3.429982 (+1.50z)| norm 0.2799 (+0.14z)| lr 1.89e-04 | 2533.27 ms | 53.3% bf16 MFU | 206924 tok/s +step 12417/19560 | loss 3.487887 (+2.88z)| norm 0.2820 (+0.26z)| lr 1.89e-04 | 2533.02 ms | 53.3% bf16 MFU | 206926 tok/s +step 12418/19560 | loss 3.364329 (-0.23z)| norm 0.2726 (-0.27z)| lr 1.88e-04 | 2532.32 ms | 53.3% bf16 MFU | 206932 tok/s +step 12419/19560 | loss 3.347188 (-0.66z)| norm 0.2694 (-0.43z)| lr 1.88e-04 | 2532.21 ms | 53.3% bf16 MFU | 206938 tok/s +step 12420/19560 | loss 3.313266 (-1.48z)| norm 0.2688 (-0.46z)| lr 1.88e-04 | 2533.68 ms | 53.3% bf16 MFU | 206937 tok/s +step 12421/19560 | loss 3.430307 (+1.43z)| norm 0.2650 (-0.67z)| lr 1.88e-04 | 2532.98 ms | 53.3% bf16 MFU | 206940 tok/s +step 12422/19560 | loss 3.357655 (-0.41z)| norm 0.2739 (-0.16z)| lr 1.88e-04 | 2534.96 ms | 53.3% bf16 MFU | 206934 tok/s +step 12423/19560 | loss 3.328651 (-1.13z)| norm 0.2947 (+1.02z)| lr 1.88e-04 | 2533.90 ms | 53.3% bf16 MFU | 206933 tok/s +step 12424/19560 | loss 3.337421 (-0.90z)| norm 0.2674 (-0.54z)| lr 1.88e-04 | 2534.39 ms | 53.3% bf16 MFU | 206930 tok/s +step 12425/19560 | loss 3.358023 (-0.38z)| norm 0.2971 (+1.14z)| lr 1.88e-04 | 2533.69 ms | 53.3% bf16 MFU | 206929 tok/s +step 12426/19560 | loss 3.333043 (-1.01z)| norm 0.2667 (-0.58z)| lr 1.88e-04 | 2532.92 ms | 53.3% bf16 MFU | 206932 tok/s +step 12427/19560 | loss 3.426952 (+1.36z)| norm 0.2909 (+0.78z)| lr 1.88e-04 | 2534.71 ms | 53.3% bf16 MFU | 206928 tok/s +step 12428/19560 | loss 3.419661 (+1.16z)| norm 0.2817 (+0.30z)| lr 1.88e-04 | 2534.43 ms | 53.3% bf16 MFU | 206925 tok/s +step 12429/19560 | loss 3.380296 (+0.15z)| norm 0.2988 (+1.31z)| lr 1.88e-04 | 2534.07 ms | 53.3% bf16 MFU | 206923 tok/s +step 12430/19560 | loss 3.392581 (+0.46z)| norm 0.2782 (+0.09z)| lr 1.88e-04 | 2531.31 ms | 53.3% bf16 MFU | 206933 tok/s +step 12431/19560 | loss 3.319198 (-1.40z)| norm 0.2751 (-0.08z)| lr 1.88e-04 | 2533.02 ms | 53.3% bf16 MFU | 206936 tok/s +step 12432/19560 | loss 3.358031 (-0.42z)| norm 0.2712 (-0.31z)| lr 1.88e-04 | 2533.19 ms | 53.3% bf16 MFU | 206937 tok/s +step 12433/19560 | loss 3.337382 (-0.96z)| norm 0.2638 (-0.74z)| lr 1.88e-04 | 2532.09 ms | 53.3% bf16 MFU | 206943 tok/s +step 12434/19560 | loss 3.330716 (-1.13z)| norm 0.2648 (-0.68z)| lr 1.88e-04 | 2534.23 ms | 53.3% bf16 MFU | 206940 tok/s +step 12435/19560 | loss 3.322179 (-1.32z)| norm 0.2619 (-0.84z)| lr 1.88e-04 | 2533.18 ms | 53.3% bf16 MFU | 206942 tok/s +step 12436/19560 | loss 3.458348 (+2.18z)| norm 0.2640 (-0.70z)| lr 1.88e-04 | 2532.62 ms | 53.3% bf16 MFU | 206945 tok/s +step 12437/19560 | loss 3.392599 (+0.49z)| norm 0.2538 (-1.29z)| lr 1.88e-04 | 2532.76 ms | 53.3% bf16 MFU | 206948 tok/s +step 12438/19560 | loss 3.305492 (-1.73z)| norm 0.2773 (+0.11z)| lr 1.88e-04 | 2533.80 ms | 53.3% bf16 MFU | 206947 tok/s +step 12439/19560 | loss 3.539209 (+3.95z)| norm 0.2799 (+0.26z)| lr 1.87e-04 | 2533.27 ms | 53.3% bf16 MFU | 206947 tok/s +step 12440/19560 | loss 3.399610 (+0.60z)| norm 0.2787 (+0.19z)| lr 1.87e-04 | 2532.14 ms | 53.3% bf16 MFU | 206953 tok/s +step 12441/19560 | loss 3.366111 (-0.19z)| norm 0.2992 (+1.39z)| lr 1.87e-04 | 2533.55 ms | 53.3% bf16 MFU | 206952 tok/s +step 12442/19560 | loss 3.418493 (+1.05z)| norm 0.2727 (-0.19z)| lr 1.87e-04 | 2533.64 ms | 53.3% bf16 MFU | 206951 tok/s +step 12443/19560 | loss 3.545581 (+3.81z)| norm 0.3067 (+1.79z)| lr 1.87e-04 | 2533.40 ms | 53.3% bf16 MFU | 206951 tok/s +step 12444/19560 | loss 3.369560 (-0.13z)| norm 0.2806 (+0.24z)| lr 1.87e-04 | 2534.34 ms | 53.3% bf16 MFU | 206947 tok/s +step 12445/19560 | loss 3.381711 (+0.13z)| norm 0.2587 (-1.06z)| lr 1.87e-04 | 2534.94 ms | 53.3% bf16 MFU | 206941 tok/s +step 12446/19560 | loss 3.392567 (+0.37z)| norm 0.2839 (+0.44z)| lr 1.87e-04 | 2534.89 ms | 53.3% bf16 MFU | 206935 tok/s +step 12447/19560 | loss 3.371946 (-0.09z)| norm 0.2717 (-0.29z)| lr 1.87e-04 | 2534.54 ms | 53.3% bf16 MFU | 206931 tok/s +step 12448/19560 | loss 3.295340 (-1.79z)| norm 0.2705 (-0.35z)| lr 1.87e-04 | 2535.95 ms | 53.2% bf16 MFU | 206922 tok/s +step 12449/19560 | loss 3.416464 (+0.92z)| norm 0.2922 (+1.03z)| lr 1.87e-04 | 2537.10 ms | 53.2% bf16 MFU | 206908 tok/s +step 12450/19560 | loss 3.324382 (-1.15z)| norm 0.2641 (-0.75z)| lr 1.87e-04 | 2536.43 ms | 53.2% bf16 MFU | 206898 tok/s +step 12451/19560 | loss 3.437900 (+1.38z)| norm 0.2939 (+1.16z)| lr 1.87e-04 | 2535.55 ms | 53.2% bf16 MFU | 206892 tok/s +step 12452/19560 | loss 3.314956 (-1.35z)| norm 0.2591 (-1.05z)| lr 1.87e-04 | 2536.57 ms | 53.2% bf16 MFU | 206882 tok/s +step 12453/19560 | loss 3.378746 (+0.05z)| norm 0.2859 (+0.68z)| lr 1.87e-04 | 2535.61 ms | 53.2% bf16 MFU | 206876 tok/s +step 12454/19560 | loss 3.578822 (+4.21z)| norm 0.4475 (+7.91z)| lr 1.87e-04 | 2533.25 ms | 53.3% bf16 MFU | 206880 tok/s +step 12455/19560 | loss 3.377383 (-0.00z)| norm 0.2865 (+0.46z)| lr 1.87e-04 | 2535.92 ms | 53.2% bf16 MFU | 206874 tok/s +step 12456/19560 | loss 3.362353 (-0.31z)| norm 0.3083 (+1.48z)| lr 1.87e-04 | 2534.54 ms | 53.3% bf16 MFU | 206873 tok/s +step 12457/19560 | loss 3.360925 (-0.34z)| norm 0.3107 (+1.56z)| lr 1.87e-04 | 2533.87 ms | 53.3% bf16 MFU | 206875 tok/s +step 12458/19560 | loss 3.370365 (-0.14z)| norm 0.2641 (-0.58z)| lr 1.87e-04 | 2534.94 ms | 53.3% bf16 MFU | 206872 tok/s +step 12459/19560 | loss 3.348424 (-0.60z)| norm 0.2606 (-0.74z)| lr 1.87e-04 | 2533.48 ms | 53.3% bf16 MFU | 206876 tok/s +step 12460/19560 | loss 3.383376 (+0.12z)| norm 0.2799 (+0.16z)| lr 1.87e-04 | 2533.45 ms | 53.3% bf16 MFU | 206879 tok/s +step 12461/19560 | loss 3.399042 (+0.45z)| norm 0.2760 (-0.02z)| lr 1.86e-04 | 2532.97 ms | 53.3% bf16 MFU | 206885 tok/s +step 12462/19560 | loss 3.307724 (-1.46z)| norm 0.2711 (-0.25z)| lr 1.86e-04 | 2534.52 ms | 53.3% bf16 MFU | 206883 tok/s +step 12463/19560 | loss 3.371980 (-0.11z)| norm 0.2669 (-0.44z)| lr 1.86e-04 | 2533.34 ms | 53.3% bf16 MFU | 206887 tok/s +step 12464/19560 | loss 3.472970 (+1.96z)| norm 0.2956 (+0.89z)| lr 1.86e-04 | 2534.36 ms | 53.3% bf16 MFU | 206886 tok/s +step 12465/19560 | loss 3.353994 (-0.50z)| norm 0.2751 (-0.06z)| lr 1.86e-04 | 2534.11 ms | 53.3% bf16 MFU | 206887 tok/s +step 12466/19560 | loss 3.356101 (-0.45z)| norm 0.2800 (+0.16z)| lr 1.86e-04 | 2533.31 ms | 53.3% bf16 MFU | 206890 tok/s +step 12467/19560 | loss 3.351100 (-0.55z)| norm 0.2726 (-0.19z)| lr 1.86e-04 | 2532.93 ms | 53.3% bf16 MFU | 206895 tok/s +step 12468/19560 | loss 3.383964 (+0.12z)| norm 0.2841 (+0.34z)| lr 1.86e-04 | 2534.94 ms | 53.3% bf16 MFU | 206892 tok/s +step 12469/19560 | loss 3.312943 (-1.32z)| norm 0.2667 (-0.46z)| lr 1.86e-04 | 2533.42 ms | 53.3% bf16 MFU | 206894 tok/s +step 12470/19560 | loss 3.312350 (-1.31z)| norm 0.2748 (-0.08z)| lr 1.86e-04 | 2534.01 ms | 53.3% bf16 MFU | 206895 tok/s +step 12471/19560 | loss 3.359125 (-0.35z)| norm 0.2950 (+0.85z)| lr 1.86e-04 | 2533.44 ms | 53.3% bf16 MFU | 206897 tok/s +step 12472/19560 | loss 3.319025 (-1.17z)| norm 0.2790 (+0.10z)| lr 1.86e-04 | 2535.36 ms | 53.3% bf16 MFU | 206892 tok/s +step 12473/19560 | loss 3.279254 (-1.93z)| norm 0.2830 (+0.28z)| lr 1.86e-04 | 2534.82 ms | 53.3% bf16 MFU | 206889 tok/s +step 12474/19560 | loss 3.325920 (-0.98z)| norm 0.2927 (+0.72z)| lr 1.86e-04 | 2534.37 ms | 53.3% bf16 MFU | 206888 tok/s +step 12475/19560 | loss 3.343934 (-0.61z)| norm 0.2796 (+0.12z)| lr 1.86e-04 | 2532.17 ms | 53.3% bf16 MFU | 206896 tok/s +step 12476/19560 | loss 3.361469 (-0.25z)| norm 0.2914 (+0.67z)| lr 1.86e-04 | 2532.93 ms | 53.3% bf16 MFU | 206901 tok/s +step 12477/19560 | loss 3.344162 (-0.60z)| norm 0.2776 (+0.03z)| lr 1.86e-04 | 2536.76 ms | 53.2% bf16 MFU | 206890 tok/s +step 12478/19560 | loss 3.351772 (-0.44z)| norm 0.2859 (+0.42z)| lr 1.86e-04 | 2533.36 ms | 53.3% bf16 MFU | 206893 tok/s +step 12479/19560 | loss 3.394094 (+0.41z)| norm 0.2698 (-0.34z)| lr 1.86e-04 | 2533.00 ms | 53.3% bf16 MFU | 206897 tok/s +step 12480/19560 | loss 3.352588 (-0.42z)| norm 0.2922 (+0.72z)| lr 1.86e-04 | 2533.92 ms | 53.3% bf16 MFU | 206898 tok/s +step 12481/19560 | loss 3.328902 (-0.89z)| norm 0.2648 (-0.58z)| lr 1.86e-04 | 2532.00 ms | 53.3% bf16 MFU | 206906 tok/s +step 12482/19560 | loss 3.325365 (-0.95z)| norm 0.2891 (+0.58z)| lr 1.85e-04 | 2532.18 ms | 53.3% bf16 MFU | 206913 tok/s +step 12483/19560 | loss 3.328146 (-0.89z)| norm 0.2465 (-1.43z)| lr 1.85e-04 | 2533.49 ms | 53.3% bf16 MFU | 206915 tok/s +step 12484/19560 | loss 3.360865 (-0.24z)| norm 0.2872 (+0.48z)| lr 1.85e-04 | 2532.95 ms | 53.3% bf16 MFU | 206919 tok/s +step 12485/19560 | loss 3.333953 (-0.77z)| norm 0.2586 (-0.87z)| lr 1.85e-04 | 2532.11 ms | 53.3% bf16 MFU | 206925 tok/s +step 12486/19560 | loss 3.335833 (-0.73z)| norm 0.3028 (+1.21z)| lr 1.85e-04 | 2533.29 ms | 53.3% bf16 MFU | 206927 tok/s +step 12487/19560 | loss 3.365182 (-0.15z)| norm 0.2776 (+0.02z)| lr 1.85e-04 | 2533.14 ms | 53.3% bf16 MFU | 206929 tok/s +step 12488/19560 | loss 3.333081 (-0.78z)| norm 0.3001 (+1.08z)| lr 1.85e-04 | 2532.08 ms | 53.3% bf16 MFU | 206936 tok/s +step 12489/19560 | loss 3.358328 (-0.28z)| norm 0.2739 (-0.16z)| lr 1.85e-04 | 2533.97 ms | 53.3% bf16 MFU | 206934 tok/s +step 12490/19560 | loss 3.379009 (+0.13z)| norm 0.2721 (-0.24z)| lr 1.85e-04 | 2533.31 ms | 53.3% bf16 MFU | 206935 tok/s +step 12491/19560 | loss 3.399599 (+0.53z)| norm 0.2611 (-0.75z)| lr 1.85e-04 | 2531.90 ms | 53.3% bf16 MFU | 206942 tok/s +step 12492/19560 | loss 3.340322 (-0.65z)| norm 0.2910 (+0.67z)| lr 1.85e-04 | 2532.44 ms | 53.3% bf16 MFU | 206947 tok/s +step 12493/19560 | loss 3.334594 (-0.75z)| norm 0.2662 (-0.50z)| lr 1.85e-04 | 2533.68 ms | 53.3% bf16 MFU | 206946 tok/s +step 12494/19560 | loss 3.403857 (+0.62z)| norm 0.2778 (+0.06z)| lr 1.85e-04 | 2533.93 ms | 53.3% bf16 MFU | 206944 tok/s +step 12495/19560 | loss 3.314734 (-1.14z)| norm 0.2746 (-0.08z)| lr 1.85e-04 | 2534.97 ms | 53.3% bf16 MFU | 206938 tok/s +step 12496/19560 | loss 3.370903 (-0.01z)| norm 0.2714 (-0.23z)| lr 1.85e-04 | 2535.49 ms | 53.3% bf16 MFU | 206930 tok/s +step 12497/19560 | loss 3.405770 (+0.69z)| norm 0.2910 (+0.71z)| lr 1.85e-04 | 2537.12 ms | 53.2% bf16 MFU | 206916 tok/s +step 12498/19560 | loss 3.415140 (+0.87z)| norm 0.2575 (-0.89z)| lr 1.85e-04 | 2536.81 ms | 53.2% bf16 MFU | 206903 tok/s +step 12499/19560 | loss 3.414938 (+0.86z)| norm 0.2945 (+0.87z)| lr 1.85e-04 | 2535.21 ms | 53.3% bf16 MFU | 206898 tok/s +step 12500/19560 | loss 3.397549 (+0.50z)| norm 0.2628 (-0.63z)| lr 1.85e-04 | 2536.39 ms | 53.2% bf16 MFU | 206889 tok/s +val loss 3.354139 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2956/10042 = 0.294364 +step 12501/19560 | loss 3.396989 (+0.48z)| norm 0.2684 (-0.37z)| lr 1.85e-04 | 2532.08 ms | 53.3% bf16 MFU | 206897 tok/s +step 12502/19560 | loss 3.392394 (+0.38z)| norm 0.2883 (+0.57z)| lr 1.85e-04 | 2534.93 ms | 53.3% bf16 MFU | 206894 tok/s +step 12503/19560 | loss 3.401901 (+0.57z)| norm 0.2573 (-0.90z)| lr 1.85e-04 | 2533.88 ms | 53.3% bf16 MFU | 206894 tok/s +step 12504/19560 | loss 3.410446 (+0.74z)| norm 0.2649 (-0.54z)| lr 1.84e-04 | 2534.00 ms | 53.3% bf16 MFU | 206895 tok/s +step 12505/19560 | loss 3.375008 (+0.02z)| norm 0.2664 (-0.47z)| lr 1.84e-04 | 2533.56 ms | 53.3% bf16 MFU | 206897 tok/s +step 12506/19560 | loss 3.329014 (-0.90z)| norm 0.2562 (-0.94z)| lr 1.84e-04 | 2534.94 ms | 53.3% bf16 MFU | 206893 tok/s +step 12507/19560 | loss 3.380877 (+0.18z)| norm 0.2548 (-0.99z)| lr 1.84e-04 | 2532.54 ms | 53.3% bf16 MFU | 206900 tok/s +step 12508/19560 | loss 3.362761 (-0.19z)| norm 0.2460 (-1.40z)| lr 1.84e-04 | 2535.30 ms | 53.3% bf16 MFU | 206894 tok/s +step 12509/19560 | loss 3.422127 (+1.03z)| norm 0.2835 (+0.39z)| lr 1.84e-04 | 2533.10 ms | 53.3% bf16 MFU | 206898 tok/s +step 12510/19560 | loss 3.374912 (+0.05z)| norm 0.2493 (-1.24z)| lr 1.84e-04 | 2533.97 ms | 53.3% bf16 MFU | 206899 tok/s +step 12511/19560 | loss 3.412574 (+0.83z)| norm 0.2837 (+0.40z)| lr 1.84e-04 | 2534.63 ms | 53.3% bf16 MFU | 206896 tok/s +step 12512/19560 | loss 3.402978 (+0.62z)| norm 0.3024 (+1.28z)| lr 1.84e-04 | 2534.69 ms | 53.3% bf16 MFU | 206894 tok/s +step 12513/19560 | loss 3.389638 (+0.34z)| norm 0.2966 (+0.99z)| lr 1.84e-04 | 2535.11 ms | 53.3% bf16 MFU | 206890 tok/s +step 12514/19560 | loss 3.331107 (-0.86z)| norm 0.2867 (+0.51z)| lr 1.84e-04 | 2536.33 ms | 53.2% bf16 MFU | 206881 tok/s +step 12515/19560 | loss 3.441320 (+1.39z)| norm 0.2765 (+0.02z)| lr 1.84e-04 | 2535.61 ms | 53.2% bf16 MFU | 206875 tok/s +step 12516/19560 | loss 3.376113 (+0.05z)| norm 0.2929 (+0.79z)| lr 1.84e-04 | 2536.59 ms | 53.2% bf16 MFU | 206866 tok/s +step 12517/19560 | loss 3.358209 (-0.32z)| norm 0.2924 (+0.76z)| lr 1.84e-04 | 2534.70 ms | 53.3% bf16 MFU | 206865 tok/s +step 12518/19560 | loss 3.357420 (-0.34z)| norm 0.2990 (+1.05z)| lr 1.84e-04 | 2535.83 ms | 53.2% bf16 MFU | 206859 tok/s +step 12519/19560 | loss 3.379079 (+0.10z)| norm 0.2729 (-0.20z)| lr 1.84e-04 | 2535.72 ms | 53.2% bf16 MFU | 206854 tok/s +step 12520/19560 | loss 3.417925 (+0.89z)| norm 0.2708 (-0.31z)| lr 1.84e-04 | 2534.00 ms | 53.3% bf16 MFU | 206857 tok/s +step 12521/19560 | loss 3.339491 (-0.73z)| norm 0.2841 (+0.33z)| lr 1.84e-04 | 2533.01 ms | 53.3% bf16 MFU | 206863 tok/s +step 12522/19560 | loss 3.411309 (+0.75z)| norm 0.2734 (-0.18z)| lr 1.84e-04 | 2535.33 ms | 53.3% bf16 MFU | 206859 tok/s +step 12523/19560 | loss 3.361312 (-0.28z)| norm 0.3321 (+2.56z)| lr 1.84e-04 | 2533.44 ms | 53.3% bf16 MFU | 206864 tok/s +step 12524/19560 | loss 3.362478 (-0.25z)| norm 0.3005 (+1.06z)| lr 1.84e-04 | 2534.55 ms | 53.3% bf16 MFU | 206863 tok/s +step 12525/19560 | loss 3.458047 (+1.69z)| norm 0.2935 (+0.72z)| lr 1.84e-04 | 2533.62 ms | 53.3% bf16 MFU | 206867 tok/s +step 12526/19560 | loss 3.415439 (+0.80z)| norm 0.3223 (+2.02z)| lr 1.83e-04 | 2532.81 ms | 53.3% bf16 MFU | 206873 tok/s +step 12527/19560 | loss 3.380350 (+0.09z)| norm 0.2804 (+0.08z)| lr 1.83e-04 | 2533.48 ms | 53.3% bf16 MFU | 206877 tok/s +step 12528/19560 | loss 3.396891 (+0.41z)| norm 0.2975 (+0.86z)| lr 1.83e-04 | 2533.80 ms | 53.3% bf16 MFU | 206879 tok/s +step 12529/19560 | loss 3.368073 (-0.16z)| norm 0.2788 (-0.01z)| lr 1.83e-04 | 2533.92 ms | 53.3% bf16 MFU | 206880 tok/s +step 12530/19560 | loss 3.379260 (+0.06z)| norm 0.2787 (-0.03z)| lr 1.83e-04 | 2532.68 ms | 53.3% bf16 MFU | 206887 tok/s +step 12531/19560 | loss 3.371449 (-0.09z)| norm 0.2770 (-0.10z)| lr 1.83e-04 | 2534.28 ms | 53.3% bf16 MFU | 206886 tok/s +step 12532/19560 | loss 3.321447 (-1.11z)| norm 0.2722 (-0.33z)| lr 1.83e-04 | 2534.23 ms | 53.3% bf16 MFU | 206886 tok/s +step 12533/19560 | loss 3.381052 (+0.12z)| norm 0.2864 (+0.33z)| lr 1.83e-04 | 2532.80 ms | 53.3% bf16 MFU | 206892 tok/s +step 12534/19560 | loss 3.387060 (+0.23z)| norm 0.2896 (+0.47z)| lr 1.83e-04 | 2534.70 ms | 53.3% bf16 MFU | 206890 tok/s +step 12535/19560 | loss 3.449558 (+1.50z)| norm 0.2797 (-0.01z)| lr 1.83e-04 | 2533.62 ms | 53.3% bf16 MFU | 206892 tok/s +step 12536/19560 | loss 3.356183 (-0.41z)| norm 0.2923 (+0.58z)| lr 1.83e-04 | 2533.19 ms | 53.3% bf16 MFU | 206895 tok/s +step 12537/19560 | loss 3.321009 (-1.12z)| norm 0.2782 (-0.10z)| lr 1.83e-04 | 2532.98 ms | 53.3% bf16 MFU | 206900 tok/s +step 12538/19560 | loss 3.385683 (+0.20z)| norm 0.2906 (+0.49z)| lr 1.83e-04 | 2534.47 ms | 53.3% bf16 MFU | 206898 tok/s +step 12539/19560 | loss 3.392138 (+0.32z)| norm 0.3110 (+1.44z)| lr 1.83e-04 | 2533.94 ms | 53.3% bf16 MFU | 206898 tok/s +step 12540/19560 | loss 3.351785 (-0.49z)| norm 0.2648 (-0.76z)| lr 1.83e-04 | 2532.65 ms | 53.3% bf16 MFU | 206904 tok/s +step 12541/19560 | loss 3.526711 (+2.96z)| norm 0.3127 (+1.49z)| lr 1.83e-04 | 2532.69 ms | 53.3% bf16 MFU | 206909 tok/s +step 12542/19560 | loss 3.351859 (-0.49z)| norm 0.2801 (-0.05z)| lr 1.83e-04 | 2533.67 ms | 53.3% bf16 MFU | 206910 tok/s +step 12543/19560 | loss 3.366354 (-0.19z)| norm 0.2772 (-0.18z)| lr 1.83e-04 | 2533.32 ms | 53.3% bf16 MFU | 206913 tok/s +step 12544/19560 | loss 3.386868 (+0.23z)| norm 0.2838 (+0.13z)| lr 1.83e-04 | 2533.63 ms | 53.3% bf16 MFU | 206914 tok/s +step 12545/19560 | loss 3.493095 (+2.39z)| norm 0.2794 (-0.07z)| lr 1.83e-04 | 2531.79 ms | 53.3% bf16 MFU | 206922 tok/s +step 12546/19560 | loss 3.361516 (-0.29z)| norm 0.2814 (+0.01z)| lr 1.83e-04 | 2535.38 ms | 53.3% bf16 MFU | 206915 tok/s +step 12547/19560 | loss 3.321263 (-1.10z)| norm 0.2798 (-0.06z)| lr 1.82e-04 | 2534.07 ms | 53.3% bf16 MFU | 206914 tok/s +step 12548/19560 | loss 3.382994 (+0.14z)| norm 0.2683 (-0.61z)| lr 1.82e-04 | 2534.43 ms | 53.3% bf16 MFU | 206912 tok/s +step 12549/19560 | loss 3.393826 (+0.37z)| norm 0.2622 (-0.89z)| lr 1.82e-04 | 2535.95 ms | 53.2% bf16 MFU | 206903 tok/s +step 12550/19560 | loss 3.349226 (-0.54z)| norm 0.2716 (-0.45z)| lr 1.82e-04 | 2535.49 ms | 53.3% bf16 MFU | 206897 tok/s +step 12551/19560 | loss 3.349591 (-0.54z)| norm 0.2901 (+0.43z)| lr 1.82e-04 | 2536.50 ms | 53.2% bf16 MFU | 206887 tok/s +step 12552/19560 | loss 3.340251 (-0.73z)| norm 0.2759 (-0.25z)| lr 1.82e-04 | 2534.81 ms | 53.3% bf16 MFU | 206885 tok/s +step 12553/19560 | loss 3.365901 (-0.20z)| norm 0.2496 (-1.46z)| lr 1.82e-04 | 2533.97 ms | 53.3% bf16 MFU | 206886 tok/s +step 12554/19560 | loss 3.346568 (-0.60z)| norm 0.2691 (-0.55z)| lr 1.82e-04 | 2533.45 ms | 53.3% bf16 MFU | 206889 tok/s +step 12555/19560 | loss 3.364907 (-0.22z)| norm 0.2824 (+0.08z)| lr 1.82e-04 | 2533.95 ms | 53.3% bf16 MFU | 206890 tok/s +step 12556/19560 | loss 3.306870 (-1.39z)| norm 0.2557 (-1.16z)| lr 1.82e-04 | 2535.09 ms | 53.3% bf16 MFU | 206886 tok/s +step 12557/19560 | loss 3.305780 (-1.39z)| norm 0.2731 (-0.34z)| lr 1.82e-04 | 2534.61 ms | 53.3% bf16 MFU | 206884 tok/s +step 12558/19560 | loss 3.426051 (+1.05z)| norm 0.3009 (+0.95z)| lr 1.82e-04 | 2534.98 ms | 53.3% bf16 MFU | 206881 tok/s +step 12559/19560 | loss 3.406869 (+0.65z)| norm 0.3036 (+1.06z)| lr 1.82e-04 | 2534.01 ms | 53.3% bf16 MFU | 206882 tok/s +step 12560/19560 | loss 3.416954 (+0.85z)| norm 0.2754 (-0.25z)| lr 1.82e-04 | 2533.41 ms | 53.3% bf16 MFU | 206885 tok/s +step 12561/19560 | loss 3.374595 (-0.02z)| norm 0.2617 (-0.89z)| lr 1.82e-04 | 2534.68 ms | 53.3% bf16 MFU | 206883 tok/s +step 12562/19560 | loss 3.394683 (+0.38z)| norm 0.2775 (-0.15z)| lr 1.82e-04 | 2534.91 ms | 53.3% bf16 MFU | 206880 tok/s +step 12563/19560 | loss 3.392479 (+0.32z)| norm 0.2689 (-0.56z)| lr 1.82e-04 | 2534.10 ms | 53.3% bf16 MFU | 206881 tok/s +step 12564/19560 | loss 3.398974 (+0.47z)| norm 0.2690 (-0.55z)| lr 1.82e-04 | 2534.34 ms | 53.3% bf16 MFU | 206881 tok/s +step 12565/19560 | loss 3.377891 (+0.04z)| norm 0.2636 (-0.82z)| lr 1.82e-04 | 2532.49 ms | 53.3% bf16 MFU | 206888 tok/s +step 12566/19560 | loss 3.321912 (-1.13z)| norm 0.2684 (-0.59z)| lr 1.82e-04 | 2533.42 ms | 53.3% bf16 MFU | 206891 tok/s +step 12567/19560 | loss 3.407762 (+0.71z)| norm 0.2689 (-0.56z)| lr 1.82e-04 | 2535.48 ms | 53.3% bf16 MFU | 206885 tok/s +step 12568/19560 | loss 3.394496 (+0.42z)| norm 0.2693 (-0.53z)| lr 1.82e-04 | 2532.28 ms | 53.3% bf16 MFU | 206893 tok/s +step 12569/19560 | loss 3.408953 (+0.73z)| norm 0.2809 (+0.01z)| lr 1.81e-04 | 2534.35 ms | 53.3% bf16 MFU | 206892 tok/s +step 12570/19560 | loss 3.395420 (+0.44z)| norm 0.2649 (-0.73z)| lr 1.81e-04 | 2535.26 ms | 53.3% bf16 MFU | 206888 tok/s +step 12571/19560 | loss 3.336312 (-0.86z)| norm 0.2743 (-0.28z)| lr 1.81e-04 | 2534.66 ms | 53.3% bf16 MFU | 206886 tok/s +step 12572/19560 | loss 3.403317 (+0.68z)| norm 0.2820 (+0.08z)| lr 1.81e-04 | 2533.84 ms | 53.3% bf16 MFU | 206887 tok/s +step 12573/19560 | loss 3.367617 (-0.14z)| norm 0.2809 (+0.02z)| lr 1.81e-04 | 2533.02 ms | 53.3% bf16 MFU | 206892 tok/s +step 12574/19560 | loss 3.285947 (-1.98z)| norm 0.2927 (+0.57z)| lr 1.81e-04 | 2535.01 ms | 53.3% bf16 MFU | 206888 tok/s +step 12575/19560 | loss 3.339738 (-0.75z)| norm 0.2878 (+0.34z)| lr 1.81e-04 | 2532.43 ms | 53.3% bf16 MFU | 206895 tok/s +step 12576/19560 | loss 3.367140 (-0.14z)| norm 0.2752 (-0.26z)| lr 1.81e-04 | 2535.17 ms | 53.3% bf16 MFU | 206891 tok/s +step 12577/19560 | loss 3.384851 (+0.27z)| norm 0.2806 (-0.00z)| lr 1.81e-04 | 2533.12 ms | 53.3% bf16 MFU | 206895 tok/s +step 12578/19560 | loss 3.360625 (-0.29z)| norm 0.2895 (+0.41z)| lr 1.81e-04 | 2535.46 ms | 53.3% bf16 MFU | 206889 tok/s +step 12579/19560 | loss 3.333290 (-0.91z)| norm 0.2758 (-0.23z)| lr 1.81e-04 | 2532.91 ms | 53.3% bf16 MFU | 206894 tok/s +step 12580/19560 | loss 3.364783 (-0.19z)| norm 0.2976 (+0.79z)| lr 1.81e-04 | 2533.05 ms | 53.3% bf16 MFU | 206898 tok/s +step 12581/19560 | loss 3.444032 (+1.64z)| norm 0.2794 (-0.07z)| lr 1.81e-04 | 2535.41 ms | 53.3% bf16 MFU | 206893 tok/s +step 12582/19560 | loss 3.340111 (-0.80z)| norm 0.2637 (-1.05z)| lr 1.81e-04 | 2533.12 ms | 53.3% bf16 MFU | 206897 tok/s +step 12583/19560 | loss 3.342445 (-0.73z)| norm 0.2771 (-0.15z)| lr 1.81e-04 | 2533.67 ms | 53.3% bf16 MFU | 206898 tok/s +step 12584/19560 | loss 3.357798 (-0.34z)| norm 0.2798 (+0.04z)| lr 1.81e-04 | 2534.25 ms | 53.3% bf16 MFU | 206898 tok/s +step 12585/19560 | loss 3.381429 (+0.25z)| norm 0.2906 (+0.79z)| lr 1.81e-04 | 2535.22 ms | 53.3% bf16 MFU | 206893 tok/s +step 12586/19560 | loss 3.376458 (+0.13z)| norm 0.2826 (+0.23z)| lr 1.81e-04 | 2532.61 ms | 53.3% bf16 MFU | 206899 tok/s +step 12587/19560 | loss 3.548444 (+4.15z)| norm 0.3078 (+1.93z)| lr 1.81e-04 | 2534.16 ms | 53.3% bf16 MFU | 206898 tok/s +step 12588/19560 | loss 3.337563 (-0.83z)| norm 0.2595 (-1.35z)| lr 1.81e-04 | 2530.45 ms | 53.4% bf16 MFU | 206913 tok/s +step 12589/19560 | loss 3.364980 (-0.18z)| norm 0.2756 (-0.26z)| lr 1.81e-04 | 2532.89 ms | 53.3% bf16 MFU | 206917 tok/s +step 12590/19560 | loss 3.359177 (-0.33z)| norm 0.2737 (-0.39z)| lr 1.81e-04 | 2531.09 ms | 53.3% bf16 MFU | 206928 tok/s +step 12591/19560 | loss 3.306949 (-1.55z)| norm 0.2866 (+0.48z)| lr 1.80e-04 | 2533.66 ms | 53.3% bf16 MFU | 206928 tok/s +step 12592/19560 | loss 3.386407 (+0.36z)| norm 0.2825 (+0.21z)| lr 1.80e-04 | 2532.60 ms | 53.3% bf16 MFU | 206932 tok/s +step 12593/19560 | loss 3.417986 (+1.10z)| norm 0.2749 (-0.31z)| lr 1.80e-04 | 2531.39 ms | 53.3% bf16 MFU | 206942 tok/s +step 12594/19560 | loss 3.365775 (-0.16z)| norm 0.2789 (-0.04z)| lr 1.80e-04 | 2532.06 ms | 53.3% bf16 MFU | 206948 tok/s +step 12595/19560 | loss 3.343229 (-0.70z)| norm 0.2928 (+0.90z)| lr 1.80e-04 | 2533.47 ms | 53.3% bf16 MFU | 206947 tok/s +step 12596/19560 | loss 3.484298 (+2.61z)| norm 0.3049 (+1.69z)| lr 1.80e-04 | 2530.27 ms | 53.4% bf16 MFU | 206960 tok/s +step 12597/19560 | loss 3.344423 (-0.68z)| norm 0.2755 (-0.29z)| lr 1.80e-04 | 2533.22 ms | 53.3% bf16 MFU | 206961 tok/s +step 12598/19560 | loss 3.343024 (-0.72z)| norm 0.2877 (+0.52z)| lr 1.80e-04 | 2533.84 ms | 53.3% bf16 MFU | 206958 tok/s +step 12599/19560 | loss 3.402534 (+0.68z)| norm 0.2624 (-1.16z)| lr 1.80e-04 | 2531.86 ms | 53.3% bf16 MFU | 206964 tok/s +step 12600/19560 | loss 3.367566 (-0.16z)| norm 0.2721 (-0.51z)| lr 1.80e-04 | 2533.67 ms | 53.3% bf16 MFU | 206962 tok/s +step 12601/19560 | loss 3.360718 (-0.34z)| norm 0.2806 (+0.07z)| lr 1.80e-04 | 2533.06 ms | 53.3% bf16 MFU | 206963 tok/s +step 12602/19560 | loss 3.364739 (-0.25z)| norm 0.2826 (+0.20z)| lr 1.80e-04 | 2531.36 ms | 53.3% bf16 MFU | 206971 tok/s +step 12603/19560 | loss 3.431544 (+1.36z)| norm 0.2919 (+0.82z)| lr 1.80e-04 | 2532.14 ms | 53.3% bf16 MFU | 206975 tok/s +step 12604/19560 | loss 3.373862 (-0.05z)| norm 0.2591 (-1.36z)| lr 1.80e-04 | 2533.50 ms | 53.3% bf16 MFU | 206973 tok/s +step 12605/19560 | loss 3.437015 (+1.46z)| norm 0.3231 (+2.81z)| lr 1.80e-04 | 2532.13 ms | 53.3% bf16 MFU | 206977 tok/s +step 12606/19560 | loss 3.377519 (+0.02z)| norm 0.2658 (-0.89z)| lr 1.80e-04 | 2532.42 ms | 53.3% bf16 MFU | 206980 tok/s +step 12607/19560 | loss 3.325791 (-1.22z)| norm 0.3122 (+2.06z)| lr 1.80e-04 | 2533.87 ms | 53.3% bf16 MFU | 206977 tok/s +step 12608/19560 | loss 3.357611 (-0.45z)| norm 0.2473 (-2.03z)| lr 1.80e-04 | 2532.01 ms | 53.3% bf16 MFU | 206981 tok/s +step 12609/19560 | loss 3.382512 (+0.14z)| norm 0.3248 (+2.74z)| lr 1.80e-04 | 2531.39 ms | 53.3% bf16 MFU | 206988 tok/s +step 12610/19560 | loss 3.366096 (-0.27z)| norm 0.2919 (+0.72z)| lr 1.80e-04 | 2533.80 ms | 53.3% bf16 MFU | 206984 tok/s +step 12611/19560 | loss 3.352910 (-0.60z)| norm 0.2768 (-0.22z)| lr 1.80e-04 | 2533.59 ms | 53.3% bf16 MFU | 206982 tok/s +step 12612/19560 | loss 3.368649 (-0.21z)| norm 0.2772 (-0.18z)| lr 1.80e-04 | 2534.60 ms | 53.3% bf16 MFU | 206975 tok/s +step 12613/19560 | loss 3.362124 (-0.38z)| norm 0.2929 (+0.77z)| lr 1.79e-04 | 2535.34 ms | 53.3% bf16 MFU | 206966 tok/s +step 12614/19560 | loss 3.346660 (-0.76z)| norm 0.2554 (-1.54z)| lr 1.79e-04 | 2532.73 ms | 53.3% bf16 MFU | 206968 tok/s +step 12615/19560 | loss 3.365260 (-0.30z)| norm 0.2624 (-1.09z)| lr 1.79e-04 | 2531.77 ms | 53.3% bf16 MFU | 206974 tok/s +step 12616/19560 | loss 3.434780 (+1.39z)| norm 0.2578 (-1.36z)| lr 1.79e-04 | 2531.99 ms | 53.3% bf16 MFU | 206978 tok/s +step 12617/19560 | loss 3.403784 (+0.61z)| norm 0.2657 (-0.86z)| lr 1.79e-04 | 2534.07 ms | 53.3% bf16 MFU | 206974 tok/s +step 12618/19560 | loss 3.382438 (+0.09z)| norm 0.2658 (-0.85z)| lr 1.79e-04 | 2535.93 ms | 53.2% bf16 MFU | 206963 tok/s +step 12619/19560 | loss 3.336768 (-1.02z)| norm 0.2645 (-0.93z)| lr 1.79e-04 | 2533.59 ms | 53.3% bf16 MFU | 206961 tok/s +step 12620/19560 | loss 3.379776 (+0.03z)| norm 0.2545 (-1.52z)| lr 1.79e-04 | 2533.82 ms | 53.3% bf16 MFU | 206959 tok/s +step 12621/19560 | loss 3.360083 (-0.46z)| norm 0.2710 (-0.51z)| lr 1.79e-04 | 2531.85 ms | 53.3% bf16 MFU | 206965 tok/s +step 12622/19560 | loss 3.384368 (+0.14z)| norm 0.2576 (-1.32z)| lr 1.79e-04 | 2532.66 ms | 53.3% bf16 MFU | 206967 tok/s +step 12623/19560 | loss 3.354966 (-0.60z)| norm 0.2708 (-0.51z)| lr 1.79e-04 | 2531.08 ms | 53.3% bf16 MFU | 206976 tok/s +step 12624/19560 | loss 3.326203 (-1.30z)| norm 0.2597 (-1.18z)| lr 1.79e-04 | 2531.82 ms | 53.3% bf16 MFU | 206981 tok/s +step 12625/19560 | loss 3.312050 (-1.61z)| norm 0.2791 (+0.01z)| lr 1.79e-04 | 2531.74 ms | 53.3% bf16 MFU | 206986 tok/s +step 12626/19560 | loss 3.320508 (-1.38z)| norm 0.2680 (-0.67z)| lr 1.79e-04 | 2532.21 ms | 53.3% bf16 MFU | 206989 tok/s +step 12627/19560 | loss 3.200381 (-4.01z)| norm 0.3418 (+3.62z)| lr 1.79e-04 | 2533.22 ms | 53.3% bf16 MFU | 206988 tok/s +step 12628/19560 | loss 3.331762 (-0.99z)| norm 0.2667 (-0.74z)| lr 1.79e-04 | 2532.10 ms | 53.3% bf16 MFU | 206992 tok/s +step 12629/19560 | loss 3.361552 (-0.30z)| norm 0.2785 (-0.06z)| lr 1.79e-04 | 2535.23 ms | 53.3% bf16 MFU | 206982 tok/s +step 12630/19560 | loss 3.375831 (+0.03z)| norm 0.2655 (-0.81z)| lr 1.79e-04 | 2531.83 ms | 53.3% bf16 MFU | 206987 tok/s +step 12631/19560 | loss 3.413901 (+0.90z)| norm 0.2764 (-0.18z)| lr 1.79e-04 | 2532.31 ms | 53.3% bf16 MFU | 206989 tok/s +step 12632/19560 | loss 3.353940 (-0.46z)| norm 0.2771 (-0.15z)| lr 1.79e-04 | 2532.53 ms | 53.3% bf16 MFU | 206991 tok/s +step 12633/19560 | loss 3.413624 (+0.89z)| norm 0.2806 (+0.06z)| lr 1.79e-04 | 2531.18 ms | 53.3% bf16 MFU | 206998 tok/s +step 12634/19560 | loss 3.310441 (-1.45z)| norm 0.2541 (-1.51z)| lr 1.79e-04 | 2531.41 ms | 53.3% bf16 MFU | 207004 tok/s +step 12635/19560 | loss 3.461415 (+1.93z)| norm 0.3058 (+1.51z)| lr 1.78e-04 | 2533.29 ms | 53.3% bf16 MFU | 207002 tok/s +step 12636/19560 | loss 3.334354 (-0.90z)| norm 0.2645 (-0.94z)| lr 1.78e-04 | 2533.70 ms | 53.3% bf16 MFU | 206998 tok/s +step 12637/19560 | loss 3.377008 (+0.06z)| norm 0.2828 (+0.15z)| lr 1.78e-04 | 2531.61 ms | 53.3% bf16 MFU | 207003 tok/s +step 12638/19560 | loss 3.446713 (+1.59z)| norm 0.2846 (+0.25z)| lr 1.78e-04 | 2530.94 ms | 53.3% bf16 MFU | 207010 tok/s +step 12639/19560 | loss 3.397153 (+0.50z)| norm 0.2947 (+0.85z)| lr 1.78e-04 | 2534.06 ms | 53.3% bf16 MFU | 207005 tok/s +step 12640/19560 | loss 3.325699 (-1.08z)| norm 0.2510 (-1.75z)| lr 1.78e-04 | 2534.07 ms | 53.3% bf16 MFU | 206999 tok/s +step 12641/19560 | loss 3.401046 (+0.59z)| norm 0.2726 (-0.44z)| lr 1.78e-04 | 2533.61 ms | 53.3% bf16 MFU | 206996 tok/s +step 12642/19560 | loss 3.364856 (-0.22z)| norm 0.2573 (-1.34z)| lr 1.78e-04 | 2533.89 ms | 53.3% bf16 MFU | 206992 tok/s +step 12643/19560 | loss 3.391223 (+0.38z)| norm 0.2616 (-1.07z)| lr 1.78e-04 | 2532.09 ms | 53.3% bf16 MFU | 206995 tok/s +step 12644/19560 | loss 3.293108 (-1.78z)| norm 0.2666 (-0.76z)| lr 1.78e-04 | 2533.34 ms | 53.3% bf16 MFU | 206993 tok/s +step 12645/19560 | loss 3.351661 (-0.48z)| norm 0.2641 (-0.90z)| lr 1.78e-04 | 2531.52 ms | 53.3% bf16 MFU | 206998 tok/s +step 12646/19560 | loss 3.337346 (-0.79z)| norm 0.2538 (-1.49z)| lr 1.78e-04 | 2533.20 ms | 53.3% bf16 MFU | 206997 tok/s +step 12647/19560 | loss 3.410642 (+0.82z)| norm 0.2829 (+0.23z)| lr 1.78e-04 | 2531.74 ms | 53.3% bf16 MFU | 207001 tok/s +step 12648/19560 | loss 3.304118 (-1.50z)| norm 0.2873 (+0.49z)| lr 1.78e-04 | 2533.44 ms | 53.3% bf16 MFU | 206999 tok/s +step 12649/19560 | loss 3.350128 (-0.50z)| norm 0.2795 (+0.03z)| lr 1.78e-04 | 2532.74 ms | 53.3% bf16 MFU | 206999 tok/s +step 12650/19560 | loss 3.342359 (-0.66z)| norm 0.2874 (+0.49z)| lr 1.78e-04 | 2534.65 ms | 53.3% bf16 MFU | 206991 tok/s +step 12651/19560 | loss 3.351394 (-0.46z)| norm 0.2599 (-1.14z)| lr 1.78e-04 | 2531.57 ms | 53.3% bf16 MFU | 206997 tok/s +step 12652/19560 | loss 3.337099 (-0.76z)| norm 0.2643 (-0.86z)| lr 1.78e-04 | 2532.98 ms | 53.3% bf16 MFU | 206996 tok/s +step 12653/19560 | loss 3.384482 (+0.29z)| norm 0.2928 (+0.89z)| lr 1.78e-04 | 2534.01 ms | 53.3% bf16 MFU | 206991 tok/s +step 12654/19560 | loss 3.345409 (-0.57z)| norm 0.2867 (+0.55z)| lr 1.78e-04 | 2534.64 ms | 53.3% bf16 MFU | 206984 tok/s +step 12655/19560 | loss 3.376054 (+0.12z)| norm 0.2808 (+0.17z)| lr 1.78e-04 | 2533.20 ms | 53.3% bf16 MFU | 206983 tok/s +step 12656/19560 | loss 3.346145 (-0.54z)| norm 0.2814 (+0.22z)| lr 1.78e-04 | 2533.10 ms | 53.3% bf16 MFU | 206983 tok/s +step 12657/19560 | loss 3.335823 (-0.76z)| norm 0.2931 (+0.96z)| lr 1.77e-04 | 2533.22 ms | 53.3% bf16 MFU | 206982 tok/s +step 12658/19560 | loss 3.371527 (+0.03z)| norm 0.2671 (-0.68z)| lr 1.77e-04 | 2534.23 ms | 53.3% bf16 MFU | 206977 tok/s +step 12659/19560 | loss 3.298851 (-1.55z)| norm 0.2642 (-0.86z)| lr 1.77e-04 | 2534.05 ms | 53.3% bf16 MFU | 206973 tok/s +step 12660/19560 | loss 3.372553 (+0.06z)| norm 0.2732 (-0.29z)| lr 1.77e-04 | 2534.18 ms | 53.3% bf16 MFU | 206969 tok/s +step 12661/19560 | loss 3.407816 (+0.83z)| norm 0.3084 (+1.89z)| lr 1.77e-04 | 2532.93 ms | 53.3% bf16 MFU | 206970 tok/s +step 12662/19560 | loss 3.428333 (+1.27z)| norm 0.2734 (-0.28z)| lr 1.77e-04 | 2534.26 ms | 53.3% bf16 MFU | 206965 tok/s +step 12663/19560 | loss 3.388825 (+0.42z)| norm 0.2748 (-0.19z)| lr 1.77e-04 | 2533.02 ms | 53.3% bf16 MFU | 206966 tok/s +step 12664/19560 | loss 3.388037 (+0.39z)| norm 0.2865 (+0.54z)| lr 1.77e-04 | 2533.14 ms | 53.3% bf16 MFU | 206966 tok/s +step 12665/19560 | loss 3.482150 (+2.40z)| norm 0.2856 (+0.48z)| lr 1.77e-04 | 2534.06 ms | 53.3% bf16 MFU | 206963 tok/s +step 12666/19560 | loss 3.360557 (-0.23z)| norm 0.2626 (-0.94z)| lr 1.77e-04 | 2533.26 ms | 53.3% bf16 MFU | 206963 tok/s +step 12667/19560 | loss 3.387921 (+0.36z)| norm 0.2890 (+0.73z)| lr 1.77e-04 | 2532.47 ms | 53.3% bf16 MFU | 206966 tok/s +step 12668/19560 | loss 3.410211 (+0.83z)| norm 0.2789 (+0.08z)| lr 1.77e-04 | 2532.30 ms | 53.3% bf16 MFU | 206970 tok/s +step 12669/19560 | loss 3.365114 (-0.12z)| norm 0.2780 (+0.05z)| lr 1.77e-04 | 2531.10 ms | 53.3% bf16 MFU | 206978 tok/s +step 12670/19560 | loss 3.295542 (-1.68z)| norm 0.2716 (-0.36z)| lr 1.77e-04 | 2533.35 ms | 53.3% bf16 MFU | 206977 tok/s +step 12671/19560 | loss 3.382339 (+0.27z)| norm 0.2837 (+0.41z)| lr 1.77e-04 | 2532.79 ms | 53.3% bf16 MFU | 206978 tok/s +step 12672/19560 | loss 3.365045 (-0.11z)| norm 0.2611 (-1.03z)| lr 1.77e-04 | 2533.58 ms | 53.3% bf16 MFU | 206976 tok/s +step 12673/19560 | loss 3.346797 (-0.51z)| norm 0.2601 (-1.08z)| lr 1.77e-04 | 2531.41 ms | 53.3% bf16 MFU | 206983 tok/s +step 12674/19560 | loss 3.394218 (+0.58z)| norm 0.2576 (-1.23z)| lr 1.77e-04 | 2532.59 ms | 53.3% bf16 MFU | 206985 tok/s +step 12675/19560 | loss 3.368544 (-0.02z)| norm 0.2717 (-0.32z)| lr 1.77e-04 | 2533.83 ms | 53.3% bf16 MFU | 206981 tok/s +step 12676/19560 | loss 3.346593 (-0.53z)| norm 0.2524 (-1.54z)| lr 1.77e-04 | 2532.61 ms | 53.3% bf16 MFU | 206983 tok/s +step 12677/19560 | loss 3.347466 (-0.50z)| norm 0.2511 (-1.60z)| lr 1.77e-04 | 2531.22 ms | 53.3% bf16 MFU | 206990 tok/s +step 12678/19560 | loss 3.399271 (+0.70z)| norm 0.2545 (-1.37z)| lr 1.77e-04 | 2531.49 ms | 53.3% bf16 MFU | 206996 tok/s +step 12679/19560 | loss 3.396931 (+0.64z)| norm 0.2646 (-0.73z)| lr 1.76e-04 | 2531.68 ms | 53.3% bf16 MFU | 207001 tok/s +step 12680/19560 | loss 3.318176 (-1.18z)| norm 0.2720 (-0.26z)| lr 1.76e-04 | 2530.90 ms | 53.3% bf16 MFU | 207008 tok/s +step 12681/19560 | loss 3.402804 (+0.76z)| norm 0.2586 (-1.11z)| lr 1.76e-04 | 2531.84 ms | 53.3% bf16 MFU | 207012 tok/s +step 12682/19560 | loss 3.427631 (+1.31z)| norm 0.2670 (-0.58z)| lr 1.76e-04 | 2532.06 ms | 53.3% bf16 MFU | 207014 tok/s +step 12683/19560 | loss 3.375324 (+0.11z)| norm 0.2667 (-0.59z)| lr 1.76e-04 | 2533.22 ms | 53.3% bf16 MFU | 207012 tok/s +step 12684/19560 | loss 3.316520 (-1.24z)| norm 0.2644 (-0.74z)| lr 1.76e-04 | 2532.22 ms | 53.3% bf16 MFU | 207013 tok/s +step 12685/19560 | loss 3.375588 (+0.11z)| norm 0.2677 (-0.53z)| lr 1.76e-04 | 2533.20 ms | 53.3% bf16 MFU | 207011 tok/s +step 12686/19560 | loss 3.388298 (+0.41z)| norm 0.2629 (-0.82z)| lr 1.76e-04 | 2533.08 ms | 53.3% bf16 MFU | 207009 tok/s +step 12687/19560 | loss 3.359981 (-0.24z)| norm 0.2655 (-0.64z)| lr 1.76e-04 | 2530.75 ms | 53.4% bf16 MFU | 207017 tok/s +step 12688/19560 | loss 3.368359 (-0.04z)| norm 0.2791 (+0.23z)| lr 1.76e-04 | 2533.24 ms | 53.3% bf16 MFU | 207015 tok/s +step 12689/19560 | loss 3.371943 (+0.05z)| norm 0.2791 (+0.22z)| lr 1.76e-04 | 2531.28 ms | 53.3% bf16 MFU | 207020 tok/s +step 12690/19560 | loss 3.442584 (+1.68z)| norm 0.2975 (+1.39z)| lr 1.76e-04 | 2531.31 ms | 53.3% bf16 MFU | 207025 tok/s +step 12691/19560 | loss 3.352918 (-0.40z)| norm 0.2829 (+0.45z)| lr 1.76e-04 | 2530.83 ms | 53.3% bf16 MFU | 207032 tok/s +step 12692/19560 | loss 3.333459 (-0.84z)| norm 0.2536 (-1.41z)| lr 1.76e-04 | 2533.78 ms | 53.3% bf16 MFU | 207026 tok/s +step 12693/19560 | loss 3.411793 (+0.97z)| norm 0.2765 (+0.04z)| lr 1.76e-04 | 2533.03 ms | 53.3% bf16 MFU | 207024 tok/s +step 12694/19560 | loss 3.279075 (-2.07z)| norm 0.2632 (-0.81z)| lr 1.76e-04 | 2533.72 ms | 53.3% bf16 MFU | 207019 tok/s +step 12695/19560 | loss 3.305451 (-1.44z)| norm 0.2688 (-0.45z)| lr 1.76e-04 | 2532.48 ms | 53.3% bf16 MFU | 207019 tok/s +step 12696/19560 | loss 3.362732 (-0.13z)| norm 0.2590 (-1.06z)| lr 1.76e-04 | 2534.32 ms | 53.3% bf16 MFU | 207012 tok/s +step 12697/19560 | loss 3.310308 (-1.30z)| norm 0.2540 (-1.35z)| lr 1.76e-04 | 2534.97 ms | 53.3% bf16 MFU | 207003 tok/s +step 12698/19560 | loss 3.317851 (-1.11z)| norm 0.2717 (-0.24z)| lr 1.76e-04 | 2532.54 ms | 53.3% bf16 MFU | 207003 tok/s +step 12699/19560 | loss 3.311876 (-1.24z)| norm 0.2655 (-0.63z)| lr 1.76e-04 | 2534.64 ms | 53.3% bf16 MFU | 206996 tok/s +step 12700/19560 | loss 3.647330 (+5.50z)| norm 0.3148 (+2.39z)| lr 1.76e-04 | 2534.69 ms | 53.3% bf16 MFU | 206988 tok/s +step 12701/19560 | loss 3.321800 (-0.92z)| norm 0.2972 (+1.30z)| lr 1.75e-04 | 2534.70 ms | 53.3% bf16 MFU | 206981 tok/s +step 12702/19560 | loss 3.344139 (-0.49z)| norm 0.2871 (+0.69z)| lr 1.75e-04 | 2533.58 ms | 53.3% bf16 MFU | 206979 tok/s +step 12703/19560 | loss 3.426323 (+1.12z)| norm 0.2822 (+0.39z)| lr 1.75e-04 | 2534.04 ms | 53.3% bf16 MFU | 206975 tok/s +step 12704/19560 | loss 3.400500 (+0.61z)| norm 0.2932 (+1.05z)| lr 1.75e-04 | 2532.80 ms | 53.3% bf16 MFU | 206976 tok/s +step 12705/19560 | loss 3.305999 (-1.24z)| norm 0.2762 (+0.02z)| lr 1.75e-04 | 2534.34 ms | 53.3% bf16 MFU | 206971 tok/s +step 12706/19560 | loss 3.356202 (-0.25z)| norm 0.2709 (-0.30z)| lr 1.75e-04 | 2532.19 ms | 53.3% bf16 MFU | 206975 tok/s +step 12707/19560 | loss 3.272109 (-1.87z)| norm 0.2859 (+0.61z)| lr 1.75e-04 | 2533.59 ms | 53.3% bf16 MFU | 206973 tok/s +step 12708/19560 | loss 3.346580 (-0.43z)| norm 0.2725 (-0.20z)| lr 1.75e-04 | 2532.19 ms | 53.3% bf16 MFU | 206976 tok/s +step 12709/19560 | loss 3.338462 (-0.57z)| norm 0.2596 (-0.98z)| lr 1.75e-04 | 2533.81 ms | 53.3% bf16 MFU | 206974 tok/s +step 12710/19560 | loss 3.287668 (-1.54z)| norm 0.2650 (-0.65z)| lr 1.75e-04 | 2533.98 ms | 53.3% bf16 MFU | 206970 tok/s +step 12711/19560 | loss 3.386056 (+0.36z)| norm 0.2627 (-0.78z)| lr 1.75e-04 | 2534.77 ms | 53.3% bf16 MFU | 206963 tok/s +step 12712/19560 | loss 3.305831 (-1.18z)| norm 0.2587 (-1.01z)| lr 1.75e-04 | 2533.72 ms | 53.3% bf16 MFU | 206961 tok/s +step 12713/19560 | loss 3.322286 (-0.85z)| norm 0.2741 (-0.07z)| lr 1.75e-04 | 2532.03 ms | 53.3% bf16 MFU | 206966 tok/s +step 12714/19560 | loss 3.326536 (-0.76z)| norm 0.2664 (-0.53z)| lr 1.75e-04 | 2533.99 ms | 53.3% bf16 MFU | 206963 tok/s +step 12715/19560 | loss 3.334226 (-0.61z)| norm 0.2613 (-0.83z)| lr 1.75e-04 | 2532.22 ms | 53.3% bf16 MFU | 206967 tok/s +step 12716/19560 | loss 3.320116 (-0.89z)| norm 0.2458 (-1.76z)| lr 1.75e-04 | 2534.43 ms | 53.3% bf16 MFU | 206962 tok/s +step 12717/19560 | loss 3.343991 (-0.41z)| norm 0.2574 (-1.04z)| lr 1.75e-04 | 2532.93 ms | 53.3% bf16 MFU | 206964 tok/s +step 12718/19560 | loss 3.326323 (-0.76z)| norm 0.2789 (+0.27z)| lr 1.75e-04 | 2534.59 ms | 53.3% bf16 MFU | 206958 tok/s +step 12719/19560 | loss 3.339132 (-0.51z)| norm 0.2676 (-0.41z)| lr 1.75e-04 | 2532.55 ms | 53.3% bf16 MFU | 206961 tok/s +step 12720/19560 | loss 3.316650 (-0.95z)| norm 0.2518 (-1.35z)| lr 1.75e-04 | 2531.92 ms | 53.3% bf16 MFU | 206967 tok/s +step 12721/19560 | loss 3.327046 (-0.73z)| norm 0.2769 (+0.17z)| lr 1.75e-04 | 2532.87 ms | 53.3% bf16 MFU | 206968 tok/s +step 12722/19560 | loss 3.424937 (+1.23z)| norm 0.2948 (+1.24z)| lr 1.75e-04 | 2533.40 ms | 53.3% bf16 MFU | 206967 tok/s +step 12723/19560 | loss 3.383265 (+0.39z)| norm 0.2717 (-0.14z)| lr 1.74e-04 | 2533.88 ms | 53.3% bf16 MFU | 206964 tok/s +step 12724/19560 | loss 3.357162 (-0.12z)| norm 0.2831 (+0.57z)| lr 1.74e-04 | 2532.27 ms | 53.3% bf16 MFU | 206968 tok/s +step 12725/19560 | loss 3.314991 (-0.97z)| norm 0.3038 (+1.80z)| lr 1.74e-04 | 2534.22 ms | 53.3% bf16 MFU | 206964 tok/s +step 12726/19560 | loss 3.326730 (-0.73z)| norm 0.2709 (-0.18z)| lr 1.74e-04 | 2532.57 ms | 53.3% bf16 MFU | 206967 tok/s +step 12727/19560 | loss 3.330878 (-0.64z)| norm 0.3007 (+1.60z)| lr 1.74e-04 | 2531.08 ms | 53.3% bf16 MFU | 206975 tok/s +step 12728/19560 | loss 3.274076 (-1.76z)| norm 0.2716 (-0.16z)| lr 1.74e-04 | 2531.71 ms | 53.3% bf16 MFU | 206981 tok/s +step 12729/19560 | loss 3.330905 (-0.61z)| norm 0.2720 (-0.13z)| lr 1.74e-04 | 2531.38 ms | 53.3% bf16 MFU | 206988 tok/s +step 12730/19560 | loss 3.397741 (+0.73z)| norm 0.2936 (+1.16z)| lr 1.74e-04 | 2531.80 ms | 53.3% bf16 MFU | 206993 tok/s +step 12731/19560 | loss 3.326417 (-0.69z)| norm 0.2503 (-1.42z)| lr 1.74e-04 | 2534.00 ms | 53.3% bf16 MFU | 206988 tok/s +step 12732/19560 | loss 3.372280 (+0.24z)| norm 0.2719 (-0.13z)| lr 1.74e-04 | 2533.62 ms | 53.3% bf16 MFU | 206985 tok/s +step 12733/19560 | loss 3.311625 (-0.97z)| norm 0.2614 (-0.76z)| lr 1.74e-04 | 2532.73 ms | 53.3% bf16 MFU | 206986 tok/s +step 12734/19560 | loss 3.372040 (+0.26z)| norm 0.2685 (-0.31z)| lr 1.74e-04 | 2533.23 ms | 53.3% bf16 MFU | 206985 tok/s +step 12735/19560 | loss 3.357673 (-0.04z)| norm 0.2492 (-1.51z)| lr 1.74e-04 | 2532.90 ms | 53.3% bf16 MFU | 206985 tok/s +step 12736/19560 | loss 3.379126 (+0.39z)| norm 0.2635 (-0.62z)| lr 1.74e-04 | 2532.47 ms | 53.3% bf16 MFU | 206987 tok/s +step 12737/19560 | loss 3.410716 (+1.03z)| norm 0.2537 (-1.25z)| lr 1.74e-04 | 2530.95 ms | 53.3% bf16 MFU | 206996 tok/s +step 12738/19560 | loss 3.371672 (+0.23z)| norm 0.2591 (-0.88z)| lr 1.74e-04 | 2531.63 ms | 53.3% bf16 MFU | 207001 tok/s +step 12739/19560 | loss 3.345772 (-0.29z)| norm 0.2817 (+0.61z)| lr 1.74e-04 | 2533.70 ms | 53.3% bf16 MFU | 206997 tok/s +step 12740/19560 | loss 3.458440 (+1.95z)| norm 0.2701 (-0.15z)| lr 1.74e-04 | 2530.86 ms | 53.3% bf16 MFU | 207005 tok/s +step 12741/19560 | loss 3.350991 (-0.19z)| norm 0.2922 (+1.31z)| lr 1.74e-04 | 2530.71 ms | 53.4% bf16 MFU | 207013 tok/s +step 12742/19560 | loss 3.336556 (-0.48z)| norm 0.2663 (-0.41z)| lr 1.74e-04 | 2530.70 ms | 53.4% bf16 MFU | 207021 tok/s +step 12743/19560 | loss 3.350331 (-0.20z)| norm 0.2897 (+1.13z)| lr 1.74e-04 | 2531.64 ms | 53.3% bf16 MFU | 207025 tok/s +step 12744/19560 | loss 3.342888 (-0.34z)| norm 0.2773 (+0.30z)| lr 1.74e-04 | 2532.09 ms | 53.3% bf16 MFU | 207026 tok/s +step 12745/19560 | loss 3.345737 (-0.27z)| norm 0.2604 (-0.82z)| lr 1.73e-04 | 2532.45 ms | 53.3% bf16 MFU | 207026 tok/s +step 12746/19560 | loss 3.385484 (+0.53z)| norm 0.2925 (+1.29z)| lr 1.73e-04 | 2533.11 ms | 53.3% bf16 MFU | 207024 tok/s +step 12747/19560 | loss 3.357493 (-0.04z)| norm 0.2616 (-0.75z)| lr 1.73e-04 | 2533.36 ms | 53.3% bf16 MFU | 207020 tok/s +step 12748/19560 | loss 3.365785 (+0.13z)| norm 0.2822 (+0.60z)| lr 1.73e-04 | 2531.29 ms | 53.3% bf16 MFU | 207025 tok/s +step 12749/19560 | loss 3.418240 (+1.17z)| norm 0.2792 (+0.40z)| lr 1.73e-04 | 2532.74 ms | 53.3% bf16 MFU | 207024 tok/s +step 12750/19560 | loss 3.281520 (-1.55z)| norm 0.2957 (+1.46z)| lr 1.73e-04 | 2532.14 ms | 53.3% bf16 MFU | 207026 tok/s +val loss 3.349633 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2955/10042 = 0.294264 +step 12751/19560 | loss 3.335203 (-0.47z)| norm 0.2702 (-0.22z)| lr 1.73e-04 | 2532.06 ms | 53.3% bf16 MFU | 207027 tok/s +step 12752/19560 | loss 3.279422 (-1.56z)| norm 0.2837 (+0.66z)| lr 1.73e-04 | 2532.62 ms | 53.3% bf16 MFU | 207027 tok/s +step 12753/19560 | loss 3.612158 (+4.56z)| norm 0.2916 (+1.17z)| lr 1.73e-04 | 2532.78 ms | 53.3% bf16 MFU | 207025 tok/s +step 12754/19560 | loss 3.284327 (-1.37z)| norm 0.2998 (+1.68z)| lr 1.73e-04 | 2533.45 ms | 53.3% bf16 MFU | 207021 tok/s +step 12755/19560 | loss 3.375398 (+0.25z)| norm 0.2771 (+0.25z)| lr 1.73e-04 | 2531.94 ms | 53.3% bf16 MFU | 207024 tok/s +step 12756/19560 | loss 3.371084 (+0.17z)| norm 0.2730 (-0.04z)| lr 1.73e-04 | 2534.15 ms | 53.3% bf16 MFU | 207017 tok/s +step 12757/19560 | loss 3.329715 (-0.60z)| norm 0.2981 (+1.70z)| lr 1.73e-04 | 2534.44 ms | 53.3% bf16 MFU | 207010 tok/s +step 12758/19560 | loss 3.367220 (+0.10z)| norm 0.2482 (-1.76z)| lr 1.73e-04 | 2534.71 ms | 53.3% bf16 MFU | 207001 tok/s +step 12759/19560 | loss 3.359956 (-0.03z)| norm 0.2977 (+1.64z)| lr 1.73e-04 | 2534.06 ms | 53.3% bf16 MFU | 206996 tok/s +step 12760/19560 | loss 3.294611 (-1.24z)| norm 0.2641 (-0.66z)| lr 1.73e-04 | 2534.37 ms | 53.3% bf16 MFU | 206990 tok/s +step 12761/19560 | loss 3.359980 (-0.01z)| norm 0.2718 (-0.13z)| lr 1.73e-04 | 2533.23 ms | 53.3% bf16 MFU | 206988 tok/s +step 12762/19560 | loss 3.303805 (-1.06z)| norm 0.2574 (-1.12z)| lr 1.73e-04 | 2533.43 ms | 53.3% bf16 MFU | 206986 tok/s +step 12763/19560 | loss 3.320226 (-0.74z)| norm 0.2723 (-0.08z)| lr 1.73e-04 | 2533.55 ms | 53.3% bf16 MFU | 206984 tok/s +step 12764/19560 | loss 3.375294 (+0.29z)| norm 0.2453 (-1.93z)| lr 1.73e-04 | 2531.25 ms | 53.3% bf16 MFU | 206991 tok/s +step 12765/19560 | loss 3.408026 (+0.90z)| norm 0.2824 (+0.63z)| lr 1.73e-04 | 2534.07 ms | 53.3% bf16 MFU | 206986 tok/s +step 12766/19560 | loss 3.360266 (+0.02z)| norm 0.2501 (-1.57z)| lr 1.73e-04 | 2534.32 ms | 53.3% bf16 MFU | 206981 tok/s +step 12767/19560 | loss 3.321652 (-0.71z)| norm 0.2674 (-0.37z)| lr 1.72e-04 | 2533.90 ms | 53.3% bf16 MFU | 206977 tok/s +step 12768/19560 | loss 3.278165 (-1.52z)| norm 0.2612 (-0.81z)| lr 1.72e-04 | 2532.18 ms | 53.3% bf16 MFU | 206981 tok/s +step 12769/19560 | loss 3.359212 (+0.02z)| norm 0.2570 (-1.09z)| lr 1.72e-04 | 2533.83 ms | 53.3% bf16 MFU | 206978 tok/s +step 12770/19560 | loss 3.333089 (-0.47z)| norm 0.2589 (-0.96z)| lr 1.72e-04 | 2534.11 ms | 53.3% bf16 MFU | 206973 tok/s +step 12771/19560 | loss 3.389645 (+0.60z)| norm 0.2598 (-0.90z)| lr 1.72e-04 | 2533.89 ms | 53.3% bf16 MFU | 206970 tok/s +step 12772/19560 | loss 3.395692 (+0.71z)| norm 0.2591 (-0.94z)| lr 1.72e-04 | 2534.85 ms | 53.3% bf16 MFU | 206963 tok/s +step 12773/19560 | loss 3.421966 (+1.19z)| norm 0.2929 (+1.38z)| lr 1.72e-04 | 2534.08 ms | 53.3% bf16 MFU | 206960 tok/s +step 12774/19560 | loss 3.303333 (-1.05z)| norm 0.2895 (+1.12z)| lr 1.72e-04 | 2533.03 ms | 53.3% bf16 MFU | 206961 tok/s +step 12775/19560 | loss 3.369062 (+0.20z)| norm 0.2772 (+0.28z)| lr 1.72e-04 | 2531.57 ms | 53.3% bf16 MFU | 206968 tok/s +step 12776/19560 | loss 3.404076 (+0.85z)| norm 0.2738 (+0.05z)| lr 1.72e-04 | 2533.65 ms | 53.3% bf16 MFU | 206966 tok/s +step 12777/19560 | loss 3.288246 (-1.33z)| norm 0.2774 (+0.31z)| lr 1.72e-04 | 2533.82 ms | 53.3% bf16 MFU | 206963 tok/s +step 12778/19560 | loss 3.340339 (-0.35z)| norm 0.2998 (+1.83z)| lr 1.72e-04 | 2532.73 ms | 53.3% bf16 MFU | 206966 tok/s +step 12779/19560 | loss 3.399768 (+0.76z)| norm 0.2940 (+1.41z)| lr 1.72e-04 | 2533.24 ms | 53.3% bf16 MFU | 206965 tok/s +step 12780/19560 | loss 3.433069 (+1.36z)| norm 0.3363 (+3.99z)| lr 1.72e-04 | 2531.81 ms | 53.3% bf16 MFU | 206971 tok/s +step 12781/19560 | loss 3.317075 (-0.79z)| norm 0.2914 (+1.12z)| lr 1.72e-04 | 2533.07 ms | 53.3% bf16 MFU | 206971 tok/s +step 12782/19560 | loss 3.303245 (-1.04z)| norm 0.2930 (+1.21z)| lr 1.72e-04 | 2531.60 ms | 53.3% bf16 MFU | 206978 tok/s +step 12783/19560 | loss 3.365495 (+0.12z)| norm 0.2876 (+0.86z)| lr 1.72e-04 | 2532.36 ms | 53.3% bf16 MFU | 206981 tok/s +step 12784/19560 | loss 3.319932 (-0.72z)| norm 0.2670 (-0.44z)| lr 1.72e-04 | 2532.41 ms | 53.3% bf16 MFU | 206983 tok/s +step 12785/19560 | loss 3.339649 (-0.36z)| norm 0.2852 (+0.73z)| lr 1.72e-04 | 2533.80 ms | 53.3% bf16 MFU | 206980 tok/s +step 12786/19560 | loss 3.293981 (-1.18z)| norm 0.2661 (-0.50z)| lr 1.72e-04 | 2533.84 ms | 53.3% bf16 MFU | 206977 tok/s +step 12787/19560 | loss 3.305838 (-0.97z)| norm 0.2776 (+0.23z)| lr 1.72e-04 | 2533.47 ms | 53.3% bf16 MFU | 206975 tok/s +step 12788/19560 | loss 3.374468 (+0.30z)| norm 0.2766 (+0.17z)| lr 1.72e-04 | 2533.47 ms | 53.3% bf16 MFU | 206974 tok/s +step 12789/19560 | loss 3.388492 (+0.56z)| norm 0.2858 (+0.78z)| lr 1.71e-04 | 2532.27 ms | 53.3% bf16 MFU | 206977 tok/s +step 12790/19560 | loss 3.376365 (+0.35z)| norm 0.2754 (+0.10z)| lr 1.71e-04 | 2532.37 ms | 53.3% bf16 MFU | 206980 tok/s +step 12791/19560 | loss 3.360112 (+0.05z)| norm 0.2818 (+0.52z)| lr 1.71e-04 | 2533.86 ms | 53.3% bf16 MFU | 206976 tok/s +step 12792/19560 | loss 3.299442 (-1.06z)| norm 0.2674 (-0.41z)| lr 1.71e-04 | 2532.35 ms | 53.3% bf16 MFU | 206979 tok/s +step 12793/19560 | loss 3.342793 (-0.25z)| norm 0.2733 (-0.02z)| lr 1.71e-04 | 2535.18 ms | 53.3% bf16 MFU | 206971 tok/s +step 12794/19560 | loss 3.361077 (+0.10z)| norm 0.2854 (+0.76z)| lr 1.71e-04 | 2532.91 ms | 53.3% bf16 MFU | 206972 tok/s +step 12795/19560 | loss 3.327890 (-0.52z)| norm 0.2689 (-0.31z)| lr 1.71e-04 | 2532.43 ms | 53.3% bf16 MFU | 206975 tok/s +step 12796/19560 | loss 3.375755 (+0.39z)| norm 0.2926 (+1.23z)| lr 1.71e-04 | 2533.14 ms | 53.3% bf16 MFU | 206974 tok/s +step 12797/19560 | loss 3.320766 (-0.64z)| norm 0.2920 (+1.17z)| lr 1.71e-04 | 2533.89 ms | 53.3% bf16 MFU | 206971 tok/s +step 12798/19560 | loss 3.336790 (-0.35z)| norm 0.2810 (+0.46z)| lr 1.71e-04 | 2531.73 ms | 53.3% bf16 MFU | 206977 tok/s +step 12799/19560 | loss 3.380932 (+0.50z)| norm 0.2924 (+1.18z)| lr 1.71e-04 | 2532.79 ms | 53.3% bf16 MFU | 206978 tok/s +step 12800/19560 | loss 3.306646 (-0.91z)| norm 0.2673 (-0.43z)| lr 1.71e-04 | 2530.92 ms | 53.3% bf16 MFU | 206987 tok/s +step 12801/19560 | loss 3.416097 (+1.16z)| norm 0.2840 (+0.63z)| lr 1.71e-04 | 2531.65 ms | 53.3% bf16 MFU | 206992 tok/s +step 12802/19560 | loss 3.339426 (-0.29z)| norm 0.2761 (+0.12z)| lr 1.71e-04 | 2533.67 ms | 53.3% bf16 MFU | 206989 tok/s +step 12803/19560 | loss 3.330602 (-0.45z)| norm 0.2631 (-0.73z)| lr 1.71e-04 | 2531.79 ms | 53.3% bf16 MFU | 206994 tok/s +step 12804/19560 | loss 3.377204 (+0.43z)| norm 0.2709 (-0.23z)| lr 1.71e-04 | 2533.54 ms | 53.3% bf16 MFU | 206991 tok/s +step 12805/19560 | loss 3.321484 (-0.62z)| norm 0.2692 (-0.36z)| lr 1.71e-04 | 2533.73 ms | 53.3% bf16 MFU | 206988 tok/s +step 12806/19560 | loss 3.387959 (+0.64z)| norm 0.5570 (+9.64z)| lr 1.71e-04 | 2531.97 ms | 53.3% bf16 MFU | 206992 tok/s +step 12807/19560 | loss 3.308949 (-0.85z)| norm 0.3675 (+2.98z)| lr 1.71e-04 | 2533.03 ms | 53.3% bf16 MFU | 206991 tok/s +step 12808/19560 | loss 3.328933 (-0.47z)| norm 0.3080 (+0.99z)| lr 1.71e-04 | 2534.49 ms | 53.3% bf16 MFU | 206984 tok/s +step 12809/19560 | loss 3.337147 (-0.30z)| norm 0.3256 (+1.55z)| lr 1.71e-04 | 2531.46 ms | 53.3% bf16 MFU | 206991 tok/s +step 12810/19560 | loss 3.355365 (+0.05z)| norm 0.2925 (+0.45z)| lr 1.71e-04 | 2532.37 ms | 53.3% bf16 MFU | 206993 tok/s +step 12811/19560 | loss 3.312679 (-0.75z)| norm 0.2940 (+0.49z)| lr 1.70e-04 | 2532.69 ms | 53.3% bf16 MFU | 206994 tok/s +step 12812/19560 | loss 3.306084 (-0.88z)| norm 0.2970 (+0.58z)| lr 1.70e-04 | 2531.47 ms | 53.3% bf16 MFU | 206999 tok/s +step 12813/19560 | loss 3.308006 (-0.83z)| norm 0.2677 (-0.38z)| lr 1.70e-04 | 2531.78 ms | 53.3% bf16 MFU | 207004 tok/s +step 12814/19560 | loss 3.341708 (-0.18z)| norm 0.2769 (-0.08z)| lr 1.70e-04 | 2531.96 ms | 53.3% bf16 MFU | 207007 tok/s +step 12815/19560 | loss 3.291366 (-1.13z)| norm 0.2903 (+0.35z)| lr 1.70e-04 | 2531.60 ms | 53.3% bf16 MFU | 207011 tok/s +step 12816/19560 | loss 3.358664 (+0.15z)| norm 0.2584 (-0.69z)| lr 1.70e-04 | 2531.82 ms | 53.3% bf16 MFU | 207015 tok/s +step 12817/19560 | loss 3.333294 (-0.32z)| norm 0.2828 (+0.11z)| lr 1.70e-04 | 2531.50 ms | 53.3% bf16 MFU | 207019 tok/s +step 12818/19560 | loss 3.329962 (-0.37z)| norm 0.2704 (-0.29z)| lr 1.70e-04 | 2532.15 ms | 53.3% bf16 MFU | 207021 tok/s +step 12819/19560 | loss 3.327946 (-0.41z)| norm 0.2751 (-0.13z)| lr 1.70e-04 | 2530.59 ms | 53.4% bf16 MFU | 207029 tok/s +step 12820/19560 | loss 3.414824 (+1.24z)| norm 0.2644 (-0.49z)| lr 1.70e-04 | 2532.75 ms | 53.3% bf16 MFU | 207028 tok/s +step 12821/19560 | loss 3.329577 (-0.38z)| norm 0.2748 (-0.15z)| lr 1.70e-04 | 2531.45 ms | 53.3% bf16 MFU | 207032 tok/s +step 12822/19560 | loss 3.368873 (+0.37z)| norm 0.2629 (-0.54z)| lr 1.70e-04 | 2532.79 ms | 53.3% bf16 MFU | 207030 tok/s +step 12823/19560 | loss 3.352326 (+0.04z)| norm 0.2691 (-0.33z)| lr 1.70e-04 | 2531.92 ms | 53.3% bf16 MFU | 207032 tok/s +step 12824/19560 | loss 3.298745 (-0.99z)| norm 0.2567 (-0.74z)| lr 1.70e-04 | 2532.01 ms | 53.3% bf16 MFU | 207034 tok/s +step 12825/19560 | loss 3.306980 (-0.83z)| norm 0.2549 (-0.80z)| lr 1.70e-04 | 2532.00 ms | 53.3% bf16 MFU | 207035 tok/s +step 12826/19560 | loss 3.316021 (-0.65z)| norm 0.2584 (-0.68z)| lr 1.70e-04 | 2532.21 ms | 53.3% bf16 MFU | 207036 tok/s +step 12827/19560 | loss 3.366848 (+0.32z)| norm 0.2603 (-0.62z)| lr 1.70e-04 | 2531.31 ms | 53.3% bf16 MFU | 207040 tok/s +step 12828/19560 | loss 3.411808 (+1.42z)| norm 0.2513 (-0.89z)| lr 1.70e-04 | 2531.27 ms | 53.3% bf16 MFU | 207044 tok/s +step 12829/19560 | loss 3.297628 (-1.12z)| norm 0.2632 (-0.50z)| lr 1.70e-04 | 2532.73 ms | 53.3% bf16 MFU | 207042 tok/s +step 12830/19560 | loss 3.348940 (+0.02z)| norm 0.2511 (-0.88z)| lr 1.70e-04 | 2531.97 ms | 53.3% bf16 MFU | 207044 tok/s +step 12831/19560 | loss 3.369062 (+0.48z)| norm 0.2726 (-0.17z)| lr 1.70e-04 | 2530.81 ms | 53.3% bf16 MFU | 207050 tok/s +step 12832/19560 | loss 3.379417 (+0.72z)| norm 0.2948 (+0.55z)| lr 1.70e-04 | 2532.31 ms | 53.3% bf16 MFU | 207049 tok/s +step 12833/19560 | loss 3.316822 (-0.70z)| norm 0.2641 (-0.45z)| lr 1.69e-04 | 2532.18 ms | 53.3% bf16 MFU | 207049 tok/s +step 12834/19560 | loss 3.376044 (+0.64z)| norm 0.2568 (-0.68z)| lr 1.69e-04 | 2532.91 ms | 53.3% bf16 MFU | 207046 tok/s +step 12835/19560 | loss 3.317971 (-0.69z)| norm 0.2702 (-0.24z)| lr 1.69e-04 | 2533.20 ms | 53.3% bf16 MFU | 207042 tok/s +step 12836/19560 | loss 3.278112 (-1.57z)| norm 0.2583 (-0.63z)| lr 1.69e-04 | 2531.12 ms | 53.3% bf16 MFU | 207047 tok/s +step 12837/19560 | loss 3.385831 (+0.85z)| norm 0.2823 (+0.15z)| lr 1.69e-04 | 2530.65 ms | 53.4% bf16 MFU | 207053 tok/s +step 12838/19560 | loss 3.293878 (-1.22z)| norm 0.2642 (-0.44z)| lr 1.69e-04 | 2530.98 ms | 53.3% bf16 MFU | 207058 tok/s +step 12839/19560 | loss 3.291550 (-1.25z)| norm 0.2663 (-0.37z)| lr 1.69e-04 | 2531.83 ms | 53.3% bf16 MFU | 207059 tok/s +step 12840/19560 | loss 3.364349 (+0.38z)| norm 0.2891 (+0.36z)| lr 1.69e-04 | 2531.20 ms | 53.3% bf16 MFU | 207063 tok/s +step 12841/19560 | loss 3.415065 (+1.49z)| norm 0.2836 (+0.18z)| lr 1.69e-04 | 2533.30 ms | 53.3% bf16 MFU | 207057 tok/s +step 12842/19560 | loss 3.270858 (-1.71z)| norm 0.3185 (+1.30z)| lr 1.69e-04 | 2531.41 ms | 53.3% bf16 MFU | 207060 tok/s +step 12843/19560 | loss 3.364716 (+0.36z)| norm 0.2844 (+0.19z)| lr 1.69e-04 | 2531.55 ms | 53.3% bf16 MFU | 207062 tok/s +step 12844/19560 | loss 3.395779 (+1.04z)| norm 0.2909 (+0.39z)| lr 1.69e-04 | 2532.89 ms | 53.3% bf16 MFU | 207059 tok/s +step 12845/19560 | loss 3.324672 (-0.53z)| norm 0.2868 (+0.25z)| lr 1.69e-04 | 2534.68 ms | 53.3% bf16 MFU | 207048 tok/s +step 12846/19560 | loss 3.333689 (-0.33z)| norm 0.2821 (+0.09z)| lr 1.69e-04 | 2532.12 ms | 53.3% bf16 MFU | 207048 tok/s +step 12847/19560 | loss 3.347420 (-0.03z)| norm 0.2922 (+0.42z)| lr 1.69e-04 | 2532.31 ms | 53.3% bf16 MFU | 207048 tok/s +step 12848/19560 | loss 3.363524 (+0.32z)| norm 0.2931 (+0.44z)| lr 1.69e-04 | 2533.26 ms | 53.3% bf16 MFU | 207044 tok/s +step 12849/19560 | loss 3.381895 (+0.71z)| norm 0.2849 (+0.17z)| lr 1.69e-04 | 2533.18 ms | 53.3% bf16 MFU | 207040 tok/s +step 12850/19560 | loss 3.334177 (-0.33z)| norm 0.3014 (+0.71z)| lr 1.69e-04 | 2533.55 ms | 53.3% bf16 MFU | 207035 tok/s +step 12851/19560 | loss 3.401132 (+1.16z)| norm 0.2661 (-0.45z)| lr 1.69e-04 | 2536.74 ms | 53.2% bf16 MFU | 207017 tok/s +step 12852/19560 | loss 3.381224 (+0.71z)| norm 0.3023 (+0.73z)| lr 1.69e-04 | 2534.74 ms | 53.3% bf16 MFU | 207008 tok/s +step 12853/19560 | loss 3.319747 (-0.66z)| norm 0.2816 (+0.06z)| lr 1.69e-04 | 2535.07 ms | 53.3% bf16 MFU | 206998 tok/s +step 12854/19560 | loss 3.392452 (+0.95z)| norm 0.2902 (+0.34z)| lr 1.69e-04 | 2536.23 ms | 53.2% bf16 MFU | 206984 tok/s +step 12855/19560 | loss 3.428671 (+1.71z)| norm 0.2830 (+0.11z)| lr 1.68e-04 | 2535.93 ms | 53.2% bf16 MFU | 206972 tok/s +step 12856/19560 | loss 3.351039 (-0.00z)| norm 0.2704 (-0.31z)| lr 1.68e-04 | 2534.85 ms | 53.3% bf16 MFU | 206965 tok/s +step 12857/19560 | loss 3.357167 (+0.13z)| norm 0.2823 (+0.08z)| lr 1.68e-04 | 2535.34 ms | 53.3% bf16 MFU | 206957 tok/s +step 12858/19560 | loss 3.344646 (-0.14z)| norm 0.2609 (-0.61z)| lr 1.68e-04 | 2535.78 ms | 53.2% bf16 MFU | 206947 tok/s +step 12859/19560 | loss 3.363147 (+0.27z)| norm 0.2771 (-0.09z)| lr 1.68e-04 | 2534.00 ms | 53.3% bf16 MFU | 206944 tok/s +step 12860/19560 | loss 3.407984 (+1.25z)| norm 0.2778 (-0.07z)| lr 1.68e-04 | 2533.68 ms | 53.3% bf16 MFU | 206944 tok/s +step 12861/19560 | loss 3.345118 (-0.15z)| norm 0.2866 (+0.22z)| lr 1.68e-04 | 2533.71 ms | 53.3% bf16 MFU | 206943 tok/s +step 12862/19560 | loss 3.420410 (+1.51z)| norm 0.2850 (+0.16z)| lr 1.68e-04 | 2534.54 ms | 53.3% bf16 MFU | 206938 tok/s +step 12863/19560 | loss 3.349545 (-0.06z)| norm 0.2619 (-0.61z)| lr 1.68e-04 | 2535.64 ms | 53.2% bf16 MFU | 206930 tok/s +step 12864/19560 | loss 3.356072 (+0.09z)| norm 0.2756 (-0.16z)| lr 1.68e-04 | 2533.71 ms | 53.3% bf16 MFU | 206930 tok/s +step 12865/19560 | loss 3.371871 (+0.45z)| norm 0.2430 (-1.23z)| lr 1.68e-04 | 2533.80 ms | 53.3% bf16 MFU | 206929 tok/s +step 12866/19560 | loss 3.355079 (+0.08z)| norm 0.2730 (-0.24z)| lr 1.68e-04 | 2533.83 ms | 53.3% bf16 MFU | 206928 tok/s +step 12867/19560 | loss 3.350453 (-0.02z)| norm 0.2629 (-0.57z)| lr 1.68e-04 | 2534.49 ms | 53.3% bf16 MFU | 206925 tok/s +step 12868/19560 | loss 3.285261 (-1.46z)| norm 0.2717 (-0.28z)| lr 1.68e-04 | 2535.60 ms | 53.2% bf16 MFU | 206917 tok/s +step 12869/19560 | loss 3.314056 (-0.81z)| norm 0.2707 (-0.31z)| lr 1.68e-04 | 2533.47 ms | 53.3% bf16 MFU | 206919 tok/s +step 12870/19560 | loss 3.291130 (-1.30z)| norm 0.2637 (-0.54z)| lr 1.68e-04 | 2535.45 ms | 53.3% bf16 MFU | 206912 tok/s +step 12871/19560 | loss 3.400111 (+1.12z)| norm 0.2775 (-0.08z)| lr 1.68e-04 | 2533.32 ms | 53.3% bf16 MFU | 206914 tok/s +step 12872/19560 | loss 3.352635 (+0.06z)| norm 0.2570 (-0.75z)| lr 1.68e-04 | 2533.01 ms | 53.3% bf16 MFU | 206917 tok/s +step 12873/19560 | loss 3.326210 (-0.52z)| norm 0.2626 (-0.57z)| lr 1.68e-04 | 2534.11 ms | 53.3% bf16 MFU | 206916 tok/s +step 12874/19560 | loss 3.406368 (+1.25z)| norm 0.2598 (-0.65z)| lr 1.68e-04 | 2533.81 ms | 53.3% bf16 MFU | 206916 tok/s +step 12875/19560 | loss 3.318094 (-0.70z)| norm 0.2796 (-0.00z)| lr 1.68e-04 | 2533.75 ms | 53.3% bf16 MFU | 206916 tok/s +step 12876/19560 | loss 3.292827 (-1.24z)| norm 0.2678 (-0.39z)| lr 1.68e-04 | 2533.71 ms | 53.3% bf16 MFU | 206917 tok/s +step 12877/19560 | loss 3.328086 (-0.45z)| norm 0.2587 (-0.68z)| lr 1.68e-04 | 2534.23 ms | 53.3% bf16 MFU | 206915 tok/s +step 12878/19560 | loss 3.349785 (+0.02z)| norm 0.2633 (-0.52z)| lr 1.67e-04 | 2532.77 ms | 53.3% bf16 MFU | 206920 tok/s +step 12879/19560 | loss 3.340677 (-0.19z)| norm 0.2741 (-0.17z)| lr 1.67e-04 | 2534.74 ms | 53.3% bf16 MFU | 206916 tok/s +step 12880/19560 | loss 3.428682 (+1.75z)| norm 0.2640 (-0.49z)| lr 1.67e-04 | 2532.66 ms | 53.3% bf16 MFU | 206920 tok/s +step 12881/19560 | loss 3.318026 (-0.78z)| norm 0.2665 (-0.41z)| lr 1.67e-04 | 2532.31 ms | 53.3% bf16 MFU | 206926 tok/s +step 12882/19560 | loss 3.317130 (-0.81z)| norm 0.2566 (-0.72z)| lr 1.67e-04 | 2532.39 ms | 53.3% bf16 MFU | 206932 tok/s +step 12883/19560 | loss 3.358597 (+0.28z)| norm 0.2636 (-0.49z)| lr 1.67e-04 | 2531.07 ms | 53.3% bf16 MFU | 206942 tok/s +step 12884/19560 | loss 3.310419 (-0.97z)| norm 0.2590 (-0.64z)| lr 1.67e-04 | 2531.81 ms | 53.3% bf16 MFU | 206949 tok/s +step 12885/19560 | loss 3.316347 (-0.81z)| norm 0.2860 (+0.25z)| lr 1.67e-04 | 2533.17 ms | 53.3% bf16 MFU | 206950 tok/s +step 12886/19560 | loss 3.374911 (+0.72z)| norm 0.2637 (-0.48z)| lr 1.67e-04 | 2532.72 ms | 53.3% bf16 MFU | 206953 tok/s +step 12887/19560 | loss 3.370932 (+0.61z)| norm 0.2653 (-0.42z)| lr 1.67e-04 | 2534.04 ms | 53.3% bf16 MFU | 206950 tok/s +step 12888/19560 | loss 3.322634 (-0.66z)| norm 0.2936 (+0.51z)| lr 1.67e-04 | 2533.69 ms | 53.3% bf16 MFU | 206949 tok/s +step 12889/19560 | loss 3.286427 (-1.58z)| norm 0.2582 (-0.66z)| lr 1.67e-04 | 2531.60 ms | 53.3% bf16 MFU | 206956 tok/s +step 12890/19560 | loss 3.327306 (-0.52z)| norm 0.2699 (-0.28z)| lr 1.67e-04 | 2534.88 ms | 53.3% bf16 MFU | 206950 tok/s +step 12891/19560 | loss 3.354311 (+0.18z)| norm 0.2781 (-0.01z)| lr 1.67e-04 | 2532.89 ms | 53.3% bf16 MFU | 206952 tok/s +step 12892/19560 | loss 3.424547 (+1.98z)| norm 0.2648 (-0.45z)| lr 1.67e-04 | 2532.64 ms | 53.3% bf16 MFU | 206955 tok/s +step 12893/19560 | loss 3.342223 (-0.14z)| norm 0.2957 (+0.57z)| lr 1.67e-04 | 2532.46 ms | 53.3% bf16 MFU | 206959 tok/s +step 12894/19560 | loss 3.336474 (-0.28z)| norm 0.2773 (-0.05z)| lr 1.67e-04 | 2532.61 ms | 53.3% bf16 MFU | 206961 tok/s +step 12895/19560 | loss 3.329239 (-0.47z)| norm 0.2865 (+0.25z)| lr 1.67e-04 | 2532.58 ms | 53.3% bf16 MFU | 206964 tok/s +step 12896/19560 | loss 3.392807 (+1.17z)| norm 0.2775 (-0.05z)| lr 1.67e-04 | 2535.29 ms | 53.3% bf16 MFU | 206956 tok/s +step 12897/19560 | loss 3.378397 (+0.79z)| norm 0.2737 (-0.18z)| lr 1.67e-04 | 2533.94 ms | 53.3% bf16 MFU | 206953 tok/s +step 12898/19560 | loss 3.388645 (+1.04z)| norm 0.2691 (-0.34z)| lr 1.67e-04 | 2533.81 ms | 53.3% bf16 MFU | 206952 tok/s +step 12899/19560 | loss 3.476191 (+3.19z)| norm 0.2871 (+0.25z)| lr 1.67e-04 | 2534.77 ms | 53.3% bf16 MFU | 206946 tok/s +step 12900/19560 | loss 3.340439 (-0.22z)| norm 0.2597 (-0.66z)| lr 1.66e-04 | 2537.02 ms | 53.2% bf16 MFU | 206931 tok/s +step 12901/19560 | loss 3.305119 (-1.10z)| norm 0.2645 (-0.50z)| lr 1.66e-04 | 2536.13 ms | 53.2% bf16 MFU | 206921 tok/s +step 12902/19560 | loss 3.357267 (+0.22z)| norm 0.2749 (-0.14z)| lr 1.66e-04 | 2536.03 ms | 53.2% bf16 MFU | 206912 tok/s +step 12903/19560 | loss 3.410760 (+1.57z)| norm 0.2589 (-0.67z)| lr 1.66e-04 | 2537.33 ms | 53.2% bf16 MFU | 206898 tok/s +step 12904/19560 | loss 3.336988 (-0.29z)| norm 0.2625 (-0.55z)| lr 1.66e-04 | 2534.65 ms | 53.3% bf16 MFU | 206895 tok/s +step 12905/19560 | loss 3.321757 (-0.70z)| norm 0.2490 (-0.99z)| lr 1.66e-04 | 2534.39 ms | 53.3% bf16 MFU | 206894 tok/s +step 12906/19560 | loss 3.290482 (-1.48z)| norm 0.2677 (-0.36z)| lr 1.66e-04 | 2536.83 ms | 53.2% bf16 MFU | 206883 tok/s +step 12907/19560 | loss 3.312770 (-0.90z)| norm 0.2660 (-0.41z)| lr 1.66e-04 | 2535.99 ms | 53.2% bf16 MFU | 206876 tok/s +step 12908/19560 | loss 3.384017 (+0.96z)| norm 0.2780 (+0.01z)| lr 1.66e-04 | 2535.03 ms | 53.3% bf16 MFU | 206873 tok/s +step 12909/19560 | loss 3.338295 (-0.24z)| norm 0.2611 (-0.56z)| lr 1.66e-04 | 2535.31 ms | 53.3% bf16 MFU | 206869 tok/s +step 12910/19560 | loss 3.302362 (-1.18z)| norm 0.2838 (+0.21z)| lr 1.66e-04 | 2536.14 ms | 53.2% bf16 MFU | 206862 tok/s +step 12911/19560 | loss 3.292373 (-1.41z)| norm 0.2519 (-0.85z)| lr 1.66e-04 | 2535.60 ms | 53.2% bf16 MFU | 206857 tok/s +step 12912/19560 | loss 3.307809 (-1.01z)| norm 0.2996 (+0.74z)| lr 1.66e-04 | 2533.57 ms | 53.3% bf16 MFU | 206861 tok/s +step 12913/19560 | loss 3.321337 (-0.65z)| norm 0.2895 (+0.40z)| lr 1.66e-04 | 2534.93 ms | 53.3% bf16 MFU | 206859 tok/s +step 12914/19560 | loss 3.368018 (+0.54z)| norm 0.2785 (+0.03z)| lr 1.66e-04 | 2534.43 ms | 53.3% bf16 MFU | 206860 tok/s +step 12915/19560 | loss 3.362407 (+0.39z)| norm 0.2943 (+0.56z)| lr 1.66e-04 | 2535.08 ms | 53.3% bf16 MFU | 206857 tok/s +step 12916/19560 | loss 3.336743 (-0.28z)| norm 0.2655 (-0.41z)| lr 1.66e-04 | 2535.68 ms | 53.2% bf16 MFU | 206853 tok/s +step 12917/19560 | loss 3.332271 (-0.38z)| norm 0.2793 (+0.06z)| lr 1.66e-04 | 2534.44 ms | 53.3% bf16 MFU | 206853 tok/s +step 12918/19560 | loss 3.344885 (-0.05z)| norm 0.2826 (+0.17z)| lr 1.66e-04 | 2534.47 ms | 53.3% bf16 MFU | 206854 tok/s +step 12919/19560 | loss 3.309444 (-0.97z)| norm 0.2775 (-0.00z)| lr 1.66e-04 | 2533.85 ms | 53.3% bf16 MFU | 206857 tok/s +step 12920/19560 | loss 3.318258 (-0.74z)| norm 0.2657 (-0.40z)| lr 1.66e-04 | 2533.19 ms | 53.3% bf16 MFU | 206862 tok/s +step 12921/19560 | loss 3.403323 (+1.48z)| norm 0.2964 (+0.62z)| lr 1.66e-04 | 2532.09 ms | 53.3% bf16 MFU | 206872 tok/s +step 12922/19560 | loss 3.355133 (+0.22z)| norm 0.2885 (+0.36z)| lr 1.65e-04 | 2534.78 ms | 53.3% bf16 MFU | 206870 tok/s +step 12923/19560 | loss 3.318924 (-0.73z)| norm 0.2823 (+0.15z)| lr 1.65e-04 | 2532.59 ms | 53.3% bf16 MFU | 206878 tok/s +step 12924/19560 | loss 3.323812 (-0.59z)| norm 0.2772 (-0.02z)| lr 1.65e-04 | 2533.10 ms | 53.3% bf16 MFU | 206882 tok/s +step 12925/19560 | loss 3.328417 (-0.47z)| norm 0.2844 (+0.22z)| lr 1.65e-04 | 2534.49 ms | 53.3% bf16 MFU | 206881 tok/s +step 12926/19560 | loss 3.378004 (+0.82z)| norm 0.2690 (-0.29z)| lr 1.65e-04 | 2532.52 ms | 53.3% bf16 MFU | 206888 tok/s +step 12927/19560 | loss 3.320303 (-0.68z)| norm 0.3056 (+0.93z)| lr 1.65e-04 | 2536.56 ms | 53.2% bf16 MFU | 206879 tok/s +step 12928/19560 | loss 3.330410 (-0.42z)| norm 0.2883 (+0.35z)| lr 1.65e-04 | 2534.03 ms | 53.3% bf16 MFU | 206880 tok/s +step 12929/19560 | loss 3.382593 (+0.97z)| norm 0.2859 (+0.27z)| lr 1.65e-04 | 2532.34 ms | 53.3% bf16 MFU | 206888 tok/s +step 12930/19560 | loss 3.314302 (-0.84z)| norm 0.2779 (-0.00z)| lr 1.65e-04 | 2533.21 ms | 53.3% bf16 MFU | 206891 tok/s +step 12931/19560 | loss 3.420164 (+1.92z)| norm 0.2774 (-0.02z)| lr 1.65e-04 | 2534.43 ms | 53.3% bf16 MFU | 206890 tok/s +step 12932/19560 | loss 3.384738 (+0.99z)| norm 0.2684 (-0.32z)| lr 1.65e-04 | 2533.85 ms | 53.3% bf16 MFU | 206891 tok/s +step 12933/19560 | loss 3.340377 (-0.17z)| norm 0.2912 (+0.44z)| lr 1.65e-04 | 2533.78 ms | 53.3% bf16 MFU | 206893 tok/s +step 12934/19560 | loss 3.322332 (-0.63z)| norm 0.2647 (-0.67z)| lr 1.65e-04 | 2532.33 ms | 53.3% bf16 MFU | 206900 tok/s +step 12935/19560 | loss 3.330234 (-0.43z)| norm 0.2860 (+0.73z)| lr 1.65e-04 | 2532.66 ms | 53.3% bf16 MFU | 206906 tok/s +step 12936/19560 | loss 3.328390 (-0.47z)| norm 0.2848 (+0.68z)| lr 1.65e-04 | 2532.31 ms | 53.3% bf16 MFU | 206912 tok/s +step 12937/19560 | loss 3.407466 (+1.57z)| norm 0.2735 (-0.09z)| lr 1.65e-04 | 2532.79 ms | 53.3% bf16 MFU | 206917 tok/s +step 12938/19560 | loss 3.309872 (-0.95z)| norm 0.2720 (-0.19z)| lr 1.65e-04 | 2532.26 ms | 53.3% bf16 MFU | 206923 tok/s +step 12939/19560 | loss 3.288692 (-1.49z)| norm 0.2901 (+1.16z)| lr 1.65e-04 | 2531.31 ms | 53.3% bf16 MFU | 206933 tok/s +step 12940/19560 | loss 3.345532 (-0.03z)| norm 0.2568 (-1.30z)| lr 1.65e-04 | 2531.70 ms | 53.3% bf16 MFU | 206941 tok/s +step 12941/19560 | loss 3.336034 (-0.29z)| norm 0.2973 (+1.68z)| lr 1.65e-04 | 2532.09 ms | 53.3% bf16 MFU | 206947 tok/s +step 12942/19560 | loss 3.337632 (-0.24z)| norm 0.2715 (-0.21z)| lr 1.65e-04 | 2531.80 ms | 53.3% bf16 MFU | 206953 tok/s +step 12943/19560 | loss 3.371219 (+0.62z)| norm 0.2755 (+0.09z)| lr 1.65e-04 | 2534.76 ms | 53.3% bf16 MFU | 206948 tok/s +step 12944/19560 | loss 3.296131 (-1.32z)| norm 0.2899 (+1.14z)| lr 1.65e-04 | 2532.92 ms | 53.3% bf16 MFU | 206950 tok/s +step 12945/19560 | loss 3.313634 (-0.86z)| norm 0.2829 (+0.62z)| lr 1.64e-04 | 2534.06 ms | 53.3% bf16 MFU | 206947 tok/s +step 12946/19560 | loss 3.353627 (+0.17z)| norm 0.2983 (+1.73z)| lr 1.64e-04 | 2533.72 ms | 53.3% bf16 MFU | 206946 tok/s +step 12947/19560 | loss 3.339530 (-0.20z)| norm 0.2724 (-0.17z)| lr 1.64e-04 | 2533.81 ms | 53.3% bf16 MFU | 206944 tok/s +step 12948/19560 | loss 3.349729 (+0.08z)| norm 0.2994 (+1.77z)| lr 1.64e-04 | 2532.29 ms | 53.3% bf16 MFU | 206949 tok/s +step 12949/19560 | loss 3.372111 (+0.66z)| norm 0.3055 (+2.15z)| lr 1.64e-04 | 2534.86 ms | 53.3% bf16 MFU | 206943 tok/s +step 12950/19560 | loss 3.350727 (+0.10z)| norm 0.2795 (+0.29z)| lr 1.64e-04 | 2533.77 ms | 53.3% bf16 MFU | 206942 tok/s +step 12951/19560 | loss 3.387267 (+1.05z)| norm 0.2885 (+0.92z)| lr 1.64e-04 | 2533.09 ms | 53.3% bf16 MFU | 206944 tok/s +step 12952/19560 | loss 3.394307 (+1.21z)| norm 0.3022 (+1.86z)| lr 1.64e-04 | 2534.67 ms | 53.3% bf16 MFU | 206939 tok/s +step 12953/19560 | loss 3.356218 (+0.21z)| norm 0.3184 (+2.89z)| lr 1.64e-04 | 2535.83 ms | 53.2% bf16 MFU | 206930 tok/s +step 12954/19560 | loss 3.302969 (-1.19z)| norm 0.2691 (-0.51z)| lr 1.64e-04 | 2536.98 ms | 53.2% bf16 MFU | 206916 tok/s +step 12955/19560 | loss 3.262690 (-2.18z)| norm 0.3081 (+2.14z)| lr 1.64e-04 | 2537.01 ms | 53.2% bf16 MFU | 206903 tok/s +step 12956/19560 | loss 3.373277 (+0.68z)| norm 0.2658 (-0.77z)| lr 1.64e-04 | 2536.24 ms | 53.2% bf16 MFU | 206894 tok/s +step 12957/19560 | loss 3.343569 (-0.10z)| norm 0.2875 (+0.72z)| lr 1.64e-04 | 2534.43 ms | 53.3% bf16 MFU | 206892 tok/s +step 12958/19560 | loss 3.315748 (-0.82z)| norm 0.2791 (+0.12z)| lr 1.64e-04 | 2535.90 ms | 53.2% bf16 MFU | 206885 tok/s +step 12959/19560 | loss 3.353711 (+0.17z)| norm 0.2587 (-1.29z)| lr 1.64e-04 | 2535.85 ms | 53.2% bf16 MFU | 206878 tok/s +step 12960/19560 | loss 3.373198 (+0.68z)| norm 0.2811 (+0.27z)| lr 1.64e-04 | 2535.84 ms | 53.2% bf16 MFU | 206872 tok/s +step 12961/19560 | loss 3.373237 (+0.67z)| norm 0.2785 (+0.08z)| lr 1.64e-04 | 2534.76 ms | 53.3% bf16 MFU | 206870 tok/s +step 12962/19560 | loss 3.305544 (-1.08z)| norm 0.2737 (-0.26z)| lr 1.64e-04 | 2535.45 ms | 53.3% bf16 MFU | 206866 tok/s +step 12963/19560 | loss 3.312439 (-0.90z)| norm 0.2673 (-0.71z)| lr 1.64e-04 | 2533.75 ms | 53.3% bf16 MFU | 206869 tok/s +step 12964/19560 | loss 3.340811 (-0.17z)| norm 0.2663 (-0.79z)| lr 1.64e-04 | 2532.99 ms | 53.3% bf16 MFU | 206875 tok/s +step 12965/19560 | loss 3.317188 (-0.78z)| norm 0.2718 (-0.40z)| lr 1.64e-04 | 2534.46 ms | 53.3% bf16 MFU | 206874 tok/s +step 12966/19560 | loss 3.301524 (-1.20z)| norm 0.2533 (-1.69z)| lr 1.64e-04 | 2533.04 ms | 53.3% bf16 MFU | 206879 tok/s +step 12967/19560 | loss 3.367013 (+0.52z)| norm 0.2712 (-0.43z)| lr 1.63e-04 | 2533.00 ms | 53.3% bf16 MFU | 206885 tok/s +step 12968/19560 | loss 3.307137 (-1.06z)| norm 0.2622 (-1.05z)| lr 1.63e-04 | 2533.64 ms | 53.3% bf16 MFU | 206887 tok/s +step 12969/19560 | loss 3.300372 (-1.23z)| norm 0.2651 (-0.83z)| lr 1.63e-04 | 2532.78 ms | 53.3% bf16 MFU | 206893 tok/s +step 12970/19560 | loss 3.354245 (+0.20z)| norm 0.2618 (-1.06z)| lr 1.63e-04 | 2533.00 ms | 53.3% bf16 MFU | 206897 tok/s +step 12971/19560 | loss 3.361375 (+0.40z)| norm 0.2553 (-1.51z)| lr 1.63e-04 | 2533.10 ms | 53.3% bf16 MFU | 206901 tok/s +step 12972/19560 | loss 3.388021 (+1.13z)| norm 0.2718 (-0.32z)| lr 1.63e-04 | 2534.79 ms | 53.3% bf16 MFU | 206898 tok/s +step 12973/19560 | loss 3.315182 (-0.86z)| norm 0.2720 (-0.29z)| lr 1.63e-04 | 2534.00 ms | 53.3% bf16 MFU | 206898 tok/s +step 12974/19560 | loss 3.385995 (+1.06z)| norm 0.2591 (-1.20z)| lr 1.63e-04 | 2532.58 ms | 53.3% bf16 MFU | 206904 tok/s +step 12975/19560 | loss 3.298520 (-1.30z)| norm 0.2629 (-0.92z)| lr 1.63e-04 | 2531.45 ms | 53.3% bf16 MFU | 206914 tok/s +step 12976/19560 | loss 3.322428 (-0.64z)| norm 0.2584 (-1.22z)| lr 1.63e-04 | 2534.11 ms | 53.3% bf16 MFU | 206913 tok/s +step 12977/19560 | loss 3.357504 (+0.31z)| norm 0.2638 (-0.82z)| lr 1.63e-04 | 2534.35 ms | 53.3% bf16 MFU | 206911 tok/s +step 12978/19560 | loss 3.270636 (-2.00z)| norm 0.2590 (-1.15z)| lr 1.63e-04 | 2532.08 ms | 53.3% bf16 MFU | 206918 tok/s +step 12979/19560 | loss 3.300365 (-1.19z)| norm 0.2689 (-0.43z)| lr 1.63e-04 | 2531.21 ms | 53.3% bf16 MFU | 206929 tok/s +step 12980/19560 | loss 3.312306 (-0.86z)| norm 0.2927 (+1.31z)| lr 1.63e-04 | 2532.49 ms | 53.3% bf16 MFU | 206934 tok/s +step 12981/19560 | loss 3.416359 (+1.88z)| norm 0.2801 (+0.39z)| lr 1.63e-04 | 2531.65 ms | 53.3% bf16 MFU | 206942 tok/s +step 12982/19560 | loss 3.312057 (-0.86z)| norm 0.2899 (+1.11z)| lr 1.63e-04 | 2533.14 ms | 53.3% bf16 MFU | 206943 tok/s +step 12983/19560 | loss 3.344372 (+0.02z)| norm 0.2577 (-1.23z)| lr 1.63e-04 | 2531.40 ms | 53.3% bf16 MFU | 206952 tok/s +step 12984/19560 | loss 3.327277 (-0.44z)| norm 0.2861 (+0.83z)| lr 1.63e-04 | 2532.78 ms | 53.3% bf16 MFU | 206954 tok/s +step 12985/19560 | loss 3.381900 (+1.02z)| norm 0.2694 (-0.38z)| lr 1.63e-04 | 2531.08 ms | 53.3% bf16 MFU | 206964 tok/s +step 12986/19560 | loss 3.326423 (-0.46z)| norm 0.2654 (-0.67z)| lr 1.63e-04 | 2533.31 ms | 53.3% bf16 MFU | 206963 tok/s +step 12987/19560 | loss 3.374911 (+0.84z)| norm 0.2844 (+0.71z)| lr 1.63e-04 | 2530.70 ms | 53.4% bf16 MFU | 206974 tok/s +step 12988/19560 | loss 3.294301 (-1.31z)| norm 0.2596 (-1.09z)| lr 1.63e-04 | 2532.08 ms | 53.3% bf16 MFU | 206978 tok/s +step 12989/19560 | loss 3.313777 (-0.78z)| norm 0.2702 (-0.31z)| lr 1.63e-04 | 2533.57 ms | 53.3% bf16 MFU | 206976 tok/s +step 12990/19560 | loss 3.303412 (-1.04z)| norm 0.2748 (+0.03z)| lr 1.62e-04 | 2532.20 ms | 53.3% bf16 MFU | 206979 tok/s +step 12991/19560 | loss 3.325655 (-0.43z)| norm 0.2723 (-0.15z)| lr 1.62e-04 | 2533.11 ms | 53.3% bf16 MFU | 206979 tok/s +step 12992/19560 | loss 3.438360 (+2.55z)| norm 0.2879 (+0.98z)| lr 1.62e-04 | 2531.91 ms | 53.3% bf16 MFU | 206984 tok/s +step 12993/19560 | loss 3.316590 (-0.67z)| norm 0.2823 (+0.56z)| lr 1.62e-04 | 2532.42 ms | 53.3% bf16 MFU | 206986 tok/s +step 12994/19560 | loss 3.381043 (+1.03z)| norm 0.2623 (-0.93z)| lr 1.62e-04 | 2531.69 ms | 53.3% bf16 MFU | 206991 tok/s +step 12995/19560 | loss 3.252276 (-2.30z)| norm 0.2785 (+0.27z)| lr 1.62e-04 | 2532.63 ms | 53.3% bf16 MFU | 206992 tok/s +step 12996/19560 | loss 3.315863 (-0.67z)| norm 0.2779 (+0.23z)| lr 1.62e-04 | 2531.18 ms | 53.3% bf16 MFU | 206999 tok/s +step 12997/19560 | loss 3.327250 (-0.37z)| norm 0.2685 (-0.47z)| lr 1.62e-04 | 2531.26 ms | 53.3% bf16 MFU | 207006 tok/s +step 12998/19560 | loss 3.352704 (+0.28z)| norm 0.2558 (-1.41z)| lr 1.62e-04 | 2531.37 ms | 53.3% bf16 MFU | 207011 tok/s +step 12999/19560 | loss 3.366804 (+0.66z)| norm 0.2773 (+0.18z)| lr 1.62e-04 | 2531.85 ms | 53.3% bf16 MFU | 207014 tok/s +step 13000/19560 | loss 3.358592 (+0.44z)| norm 0.2546 (-1.49z)| lr 1.62e-04 | 2532.28 ms | 53.3% bf16 MFU | 207016 tok/s +val loss 3.345843 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2976/10042 = 0.296355 +step 13001/19560 | loss 3.418516 (+1.98z)| norm 0.2805 (+0.41z)| lr 1.62e-04 | 2532.76 ms | 53.3% bf16 MFU | 207015 tok/s +step 13002/19560 | loss 3.289670 (-1.36z)| norm 0.2534 (-1.59z)| lr 1.62e-04 | 2531.77 ms | 53.3% bf16 MFU | 207019 tok/s +step 13003/19560 | loss 3.392730 (+1.32z)| norm 0.2774 (+0.19z)| lr 1.62e-04 | 2530.40 ms | 53.4% bf16 MFU | 207027 tok/s +step 13004/19560 | loss 3.349899 (+0.19z)| norm 0.2690 (-0.44z)| lr 1.62e-04 | 2531.04 ms | 53.3% bf16 MFU | 207033 tok/s +step 13005/19560 | loss 3.331353 (-0.30z)| norm 0.3083 (+2.39z)| lr 1.62e-04 | 2532.45 ms | 53.3% bf16 MFU | 207033 tok/s +step 13006/19560 | loss 3.343260 (+0.02z)| norm 0.2520 (-1.67z)| lr 1.62e-04 | 2531.28 ms | 53.3% bf16 MFU | 207038 tok/s +step 13007/19560 | loss 3.272151 (-1.81z)| norm 0.2877 (+0.89z)| lr 1.62e-04 | 2531.89 ms | 53.3% bf16 MFU | 207039 tok/s +step 13008/19560 | loss 3.297499 (-1.14z)| norm 0.2623 (-0.93z)| lr 1.62e-04 | 2532.20 ms | 53.3% bf16 MFU | 207040 tok/s +step 13009/19560 | loss 3.323694 (-0.46z)| norm 0.2954 (+1.41z)| lr 1.62e-04 | 2531.80 ms | 53.3% bf16 MFU | 207042 tok/s +step 13010/19560 | loss 3.370896 (+0.77z)| norm 0.2789 (+0.23z)| lr 1.62e-04 | 2533.45 ms | 53.3% bf16 MFU | 207037 tok/s +step 13011/19560 | loss 3.305888 (-0.92z)| norm 0.3130 (+2.58z)| lr 1.62e-04 | 2532.84 ms | 53.3% bf16 MFU | 207035 tok/s +step 13012/19560 | loss 3.361067 (+0.51z)| norm 0.2684 (-0.54z)| lr 1.61e-04 | 2562.51 ms | 52.7% bf16 MFU | 206913 tok/s +step 13013/19560 | loss 3.313811 (-0.73z)| norm 0.2862 (+0.71z)| lr 1.61e-04 | 2533.74 ms | 53.3% bf16 MFU | 206914 tok/s +step 13014/19560 | loss 3.425398 (+2.16z)| norm 0.2948 (+1.29z)| lr 1.61e-04 | 2532.69 ms | 53.3% bf16 MFU | 206918 tok/s +step 13015/19560 | loss 3.346588 (+0.13z)| norm 0.2763 (-0.01z)| lr 1.61e-04 | 2532.08 ms | 53.3% bf16 MFU | 206925 tok/s +step 13016/19560 | loss 3.319557 (-0.57z)| norm 0.2914 (+1.05z)| lr 1.61e-04 | 2530.78 ms | 53.4% bf16 MFU | 206937 tok/s +step 13017/19560 | loss 3.334363 (-0.20z)| norm 0.3010 (+1.70z)| lr 1.61e-04 | 2531.37 ms | 53.3% bf16 MFU | 206946 tok/s +step 13018/19560 | loss 3.290648 (-1.32z)| norm 0.2814 (+0.32z)| lr 1.61e-04 | 2531.96 ms | 53.3% bf16 MFU | 206952 tok/s +step 13019/19560 | loss 3.367241 (+0.66z)| norm 0.2728 (-0.28z)| lr 1.61e-04 | 2533.74 ms | 53.3% bf16 MFU | 206951 tok/s +step 13020/19560 | loss 3.310856 (-0.79z)| norm 0.2866 (+0.67z)| lr 1.61e-04 | 2533.19 ms | 53.3% bf16 MFU | 206952 tok/s +step 13021/19560 | loss 3.349334 (+0.22z)| norm 0.2693 (-0.53z)| lr 1.61e-04 | 2532.78 ms | 53.3% bf16 MFU | 206954 tok/s +step 13022/19560 | loss 3.316015 (-0.65z)| norm 0.2615 (-1.06z)| lr 1.61e-04 | 2534.34 ms | 53.3% bf16 MFU | 206950 tok/s +step 13023/19560 | loss 3.357780 (+0.44z)| norm 0.2725 (-0.28z)| lr 1.61e-04 | 2532.75 ms | 53.3% bf16 MFU | 206953 tok/s +step 13024/19560 | loss 3.349473 (+0.23z)| norm 0.2624 (-0.98z)| lr 1.61e-04 | 2532.50 ms | 53.3% bf16 MFU | 206956 tok/s +step 13025/19560 | loss 3.372307 (+0.84z)| norm 0.2803 (+0.27z)| lr 1.61e-04 | 2532.58 ms | 53.3% bf16 MFU | 206959 tok/s +step 13026/19560 | loss 3.335442 (-0.13z)| norm 0.2726 (-0.28z)| lr 1.61e-04 | 2533.28 ms | 53.3% bf16 MFU | 206959 tok/s +step 13027/19560 | loss 3.386561 (+1.31z)| norm 0.2727 (-0.26z)| lr 1.61e-04 | 2532.44 ms | 53.3% bf16 MFU | 206963 tok/s +step 13028/19560 | loss 3.371562 (+0.88z)| norm 0.2679 (-0.60z)| lr 1.61e-04 | 2533.47 ms | 53.3% bf16 MFU | 206962 tok/s +step 13029/19560 | loss 3.312026 (-0.78z)| norm 0.2787 (+0.15z)| lr 1.61e-04 | 2533.13 ms | 53.3% bf16 MFU | 206963 tok/s +step 13030/19560 | loss 3.354643 (+0.41z)| norm 0.2728 (-0.27z)| lr 1.61e-04 | 2534.38 ms | 53.3% bf16 MFU | 206958 tok/s +step 13031/19560 | loss 3.333856 (-0.15z)| norm 0.2720 (-0.33z)| lr 1.61e-04 | 2531.47 ms | 53.3% bf16 MFU | 206965 tok/s +step 13032/19560 | loss 3.341087 (+0.05z)| norm 0.2627 (-0.99z)| lr 1.61e-04 | 2532.88 ms | 53.3% bf16 MFU | 206967 tok/s +step 13033/19560 | loss 3.346946 (+0.21z)| norm 0.2859 (+0.64z)| lr 1.61e-04 | 2532.36 ms | 53.3% bf16 MFU | 206970 tok/s +step 13034/19560 | loss 3.443431 (+2.84z)| norm 0.2695 (-0.54z)| lr 1.61e-04 | 2533.41 ms | 53.3% bf16 MFU | 206969 tok/s +step 13035/19560 | loss 3.314685 (-0.72z)| norm 0.2759 (-0.08z)| lr 1.60e-04 | 2533.34 ms | 53.3% bf16 MFU | 206968 tok/s +step 13036/19560 | loss 3.295760 (-1.22z)| norm 0.2574 (-1.40z)| lr 1.60e-04 | 2534.49 ms | 53.3% bf16 MFU | 206963 tok/s +step 13037/19560 | loss 3.339478 (-0.01z)| norm 0.2757 (-0.09z)| lr 1.60e-04 | 2535.79 ms | 53.2% bf16 MFU | 206953 tok/s +step 13038/19560 | loss 3.279217 (-1.66z)| norm 0.2710 (-0.42z)| lr 1.60e-04 | 2534.90 ms | 53.3% bf16 MFU | 206946 tok/s +step 13039/19560 | loss 3.236573 (-2.75z)| norm 0.2553 (-1.56z)| lr 1.60e-04 | 2533.38 ms | 53.3% bf16 MFU | 206947 tok/s +step 13040/19560 | loss 3.284637 (-1.45z)| norm 0.2647 (-0.87z)| lr 1.60e-04 | 2535.40 ms | 53.3% bf16 MFU | 206939 tok/s +step 13041/19560 | loss 3.331626 (-0.20z)| norm 0.2689 (-0.56z)| lr 1.60e-04 | 2533.08 ms | 53.3% bf16 MFU | 206941 tok/s +step 13042/19560 | loss 3.314883 (-0.64z)| norm 0.2812 (+0.35z)| lr 1.60e-04 | 2532.68 ms | 53.3% bf16 MFU | 206944 tok/s +step 13043/19560 | loss 3.277430 (-1.61z)| norm 0.2789 (+0.18z)| lr 1.60e-04 | 2535.08 ms | 53.3% bf16 MFU | 206937 tok/s +step 13044/19560 | loss 3.359273 (+0.55z)| norm 0.2830 (+0.48z)| lr 1.60e-04 | 2532.66 ms | 53.3% bf16 MFU | 206941 tok/s +step 13045/19560 | loss 3.340215 (+0.05z)| norm 0.2873 (+0.79z)| lr 1.60e-04 | 2534.09 ms | 53.3% bf16 MFU | 206939 tok/s +step 13046/19560 | loss 3.332385 (-0.16z)| norm 0.2698 (-0.49z)| lr 1.60e-04 | 2534.43 ms | 53.3% bf16 MFU | 206935 tok/s +step 13047/19560 | loss 3.326281 (-0.32z)| norm 0.2646 (-0.86z)| lr 1.60e-04 | 2533.25 ms | 53.3% bf16 MFU | 206937 tok/s +step 13048/19560 | loss 3.359246 (+0.54z)| norm 0.2732 (-0.24z)| lr 1.60e-04 | 2534.92 ms | 53.3% bf16 MFU | 206931 tok/s +step 13049/19560 | loss 3.372783 (+0.91z)| norm 0.2617 (-1.06z)| lr 1.60e-04 | 2533.27 ms | 53.3% bf16 MFU | 206933 tok/s +step 13050/19560 | loss 3.364177 (+0.68z)| norm 0.2759 (-0.01z)| lr 1.60e-04 | 2532.29 ms | 53.3% bf16 MFU | 206938 tok/s +step 13051/19560 | loss 3.370844 (+0.85z)| norm 0.2743 (-0.13z)| lr 1.60e-04 | 2533.79 ms | 53.3% bf16 MFU | 206937 tok/s +step 13052/19560 | loss 3.350188 (+0.29z)| norm 0.2567 (-1.40z)| lr 1.60e-04 | 2532.25 ms | 53.3% bf16 MFU | 206942 tok/s +step 13053/19560 | loss 3.267807 (-1.86z)| norm 0.2643 (-0.84z)| lr 1.60e-04 | 2532.94 ms | 53.3% bf16 MFU | 206945 tok/s +step 13054/19560 | loss 3.373946 (+0.93z)| norm 0.2561 (-1.42z)| lr 1.60e-04 | 2535.63 ms | 53.2% bf16 MFU | 206936 tok/s +step 13055/19560 | loss 3.462409 (+3.10z)| norm 0.2703 (-0.37z)| lr 1.60e-04 | 2532.92 ms | 53.3% bf16 MFU | 206938 tok/s +step 13056/19560 | loss 3.305469 (-0.86z)| norm 0.2684 (-0.50z)| lr 1.60e-04 | 2533.31 ms | 53.3% bf16 MFU | 206939 tok/s +step 13057/19560 | loss 3.414203 (+1.86z)| norm 0.2527 (-1.64z)| lr 1.60e-04 | 2533.06 ms | 53.3% bf16 MFU | 206941 tok/s +step 13058/19560 | loss 3.327010 (-0.32z)| norm 0.2821 (+0.52z)| lr 1.59e-04 | 2534.14 ms | 53.3% bf16 MFU | 206939 tok/s +step 13059/19560 | loss 3.361696 (+0.56z)| norm 0.2917 (+1.22z)| lr 1.59e-04 | 2532.63 ms | 53.3% bf16 MFU | 206943 tok/s +step 13060/19560 | loss 3.410168 (+1.78z)| norm 0.3004 (+1.81z)| lr 1.59e-04 | 2531.81 ms | 53.3% bf16 MFU | 206949 tok/s +step 13061/19560 | loss 3.398613 (+1.46z)| norm 0.3337 (+3.95z)| lr 1.59e-04 | 2531.76 ms | 53.3% bf16 MFU | 206956 tok/s +step 13062/19560 | loss 3.380158 (+0.98z)| norm 0.2628 (-0.88z)| lr 1.59e-04 | 2532.53 ms | 53.3% bf16 MFU | 206959 tok/s +step 13063/19560 | loss 3.293723 (-1.16z)| norm 0.2940 (+1.24z)| lr 1.59e-04 | 2532.49 ms | 53.3% bf16 MFU | 206963 tok/s +step 13064/19560 | loss 3.403853 (+1.55z)| norm 0.2867 (+0.74z)| lr 1.59e-04 | 2532.76 ms | 53.3% bf16 MFU | 206965 tok/s +step 13065/19560 | loss 3.286639 (-1.32z)| norm 0.2750 (-0.05z)| lr 1.59e-04 | 2534.65 ms | 53.3% bf16 MFU | 206959 tok/s +step 13066/19560 | loss 3.318035 (-0.54z)| norm 0.2630 (-0.85z)| lr 1.59e-04 | 2533.46 ms | 53.3% bf16 MFU | 206958 tok/s +step 13067/19560 | loss 3.383440 (+1.05z)| norm 0.2805 (+0.34z)| lr 1.59e-04 | 2533.30 ms | 53.3% bf16 MFU | 206958 tok/s +step 13068/19560 | loss 3.333018 (-0.19z)| norm 0.2684 (-0.49z)| lr 1.59e-04 | 2532.90 ms | 53.3% bf16 MFU | 206960 tok/s +step 13069/19560 | loss 3.372331 (+0.77z)| norm 0.2670 (-0.58z)| lr 1.59e-04 | 2534.09 ms | 53.3% bf16 MFU | 206957 tok/s +step 13070/19560 | loss 3.334464 (-0.16z)| norm 0.3004 (+1.68z)| lr 1.59e-04 | 2532.65 ms | 53.3% bf16 MFU | 206959 tok/s +step 13071/19560 | loss 3.345068 (+0.11z)| norm 0.2844 (+0.59z)| lr 1.59e-04 | 2533.10 ms | 53.3% bf16 MFU | 206960 tok/s +step 13072/19560 | loss 3.376044 (+0.86z)| norm 0.2781 (+0.17z)| lr 1.59e-04 | 2533.51 ms | 53.3% bf16 MFU | 206959 tok/s +step 13073/19560 | loss 3.398658 (+1.39z)| norm 0.3188 (+2.83z)| lr 1.59e-04 | 2534.16 ms | 53.3% bf16 MFU | 206956 tok/s +step 13074/19560 | loss 3.297127 (-1.09z)| norm 0.2841 (+0.55z)| lr 1.59e-04 | 2532.78 ms | 53.3% bf16 MFU | 206958 tok/s +step 13075/19560 | loss 3.399440 (+1.40z)| norm 0.3509 (+4.53z)| lr 1.59e-04 | 2534.35 ms | 53.3% bf16 MFU | 206954 tok/s +step 13076/19560 | loss 3.337208 (-0.12z)| norm 0.2940 (+1.08z)| lr 1.59e-04 | 2534.23 ms | 53.3% bf16 MFU | 206950 tok/s +step 13077/19560 | loss 3.407570 (+1.58z)| norm 0.2996 (+1.43z)| lr 1.59e-04 | 2533.67 ms | 53.3% bf16 MFU | 206949 tok/s +step 13078/19560 | loss 3.391162 (+1.17z)| norm 0.3005 (+1.46z)| lr 1.59e-04 | 2533.66 ms | 53.3% bf16 MFU | 206948 tok/s +step 13079/19560 | loss 3.387322 (+1.07z)| norm 0.2895 (+0.79z)| lr 1.59e-04 | 2535.02 ms | 53.3% bf16 MFU | 206941 tok/s +step 13080/19560 | loss 3.411978 (+1.66z)| norm 0.3203 (+2.61z)| lr 1.58e-04 | 2533.78 ms | 53.3% bf16 MFU | 206940 tok/s +step 13081/19560 | loss 3.303466 (-0.93z)| norm 0.2922 (+0.96z)| lr 1.58e-04 | 2532.90 ms | 53.3% bf16 MFU | 206943 tok/s +step 13082/19560 | loss 3.306105 (-0.86z)| norm 0.2811 (+0.28z)| lr 1.58e-04 | 2532.33 ms | 53.3% bf16 MFU | 206948 tok/s +step 13083/19560 | loss 3.358889 (+0.38z)| norm 0.3235 (+2.81z)| lr 1.58e-04 | 2533.84 ms | 53.3% bf16 MFU | 206946 tok/s +step 13084/19560 | loss 3.372536 (+0.71z)| norm 0.3109 (+2.01z)| lr 1.58e-04 | 2533.46 ms | 53.3% bf16 MFU | 206946 tok/s +step 13085/19560 | loss 3.376075 (+0.79z)| norm 0.3270 (+2.85z)| lr 1.58e-04 | 2534.98 ms | 53.3% bf16 MFU | 206940 tok/s +step 13086/19560 | loss 3.391014 (+1.13z)| norm 0.3137 (+2.04z)| lr 1.58e-04 | 2534.38 ms | 53.3% bf16 MFU | 206936 tok/s +step 13087/19560 | loss 3.344708 (+0.02z)| norm 0.2956 (+1.00z)| lr 1.58e-04 | 2534.14 ms | 53.3% bf16 MFU | 206934 tok/s +step 13088/19560 | loss 3.330153 (-0.32z)| norm 0.3108 (+1.82z)| lr 1.58e-04 | 2532.13 ms | 53.3% bf16 MFU | 206940 tok/s +step 13089/19560 | loss 3.308496 (-0.83z)| norm 0.2677 (-0.57z)| lr 1.58e-04 | 2531.96 ms | 53.3% bf16 MFU | 206946 tok/s +step 13090/19560 | loss 3.400657 (+1.36z)| norm 0.2838 (+0.32z)| lr 1.58e-04 | 2532.69 ms | 53.3% bf16 MFU | 206949 tok/s +step 13091/19560 | loss 3.328874 (-0.36z)| norm 0.2684 (-0.54z)| lr 1.58e-04 | 2532.74 ms | 53.3% bf16 MFU | 206952 tok/s +step 13092/19560 | loss 3.371954 (+0.67z)| norm 0.2725 (-0.32z)| lr 1.58e-04 | 2533.27 ms | 53.3% bf16 MFU | 206953 tok/s +step 13093/19560 | loss 3.324728 (-0.47z)| norm 0.2967 (+1.02z)| lr 1.58e-04 | 2532.83 ms | 53.3% bf16 MFU | 206955 tok/s +step 13094/19560 | loss 3.358048 (+0.32z)| norm 0.2675 (-0.61z)| lr 1.58e-04 | 2532.00 ms | 53.3% bf16 MFU | 206960 tok/s +step 13095/19560 | loss 3.363683 (+0.46z)| norm 0.2735 (-0.28z)| lr 1.58e-04 | 2534.59 ms | 53.3% bf16 MFU | 206955 tok/s +step 13096/19560 | loss 3.318161 (-0.64z)| norm 0.2668 (-0.66z)| lr 1.58e-04 | 2532.69 ms | 53.3% bf16 MFU | 206958 tok/s +step 13097/19560 | loss 3.388091 (+1.03z)| norm 0.2693 (-0.52z)| lr 1.58e-04 | 2532.38 ms | 53.3% bf16 MFU | 206962 tok/s +step 13098/19560 | loss 3.324567 (-0.50z)| norm 0.2431 (-1.95z)| lr 1.58e-04 | 2531.28 ms | 53.3% bf16 MFU | 206970 tok/s +step 13099/19560 | loss 3.376503 (+0.75z)| norm 0.2620 (-0.91z)| lr 1.58e-04 | 2533.15 ms | 53.3% bf16 MFU | 206970 tok/s +step 13100/19560 | loss 3.343164 (-0.04z)| norm 0.2522 (-1.44z)| lr 1.58e-04 | 2531.15 ms | 53.3% bf16 MFU | 206978 tok/s +step 13101/19560 | loss 3.385661 (+0.97z)| norm 0.2610 (-0.94z)| lr 1.58e-04 | 2534.36 ms | 53.3% bf16 MFU | 206973 tok/s +step 13102/19560 | loss 3.356678 (+0.28z)| norm 0.2516 (-1.45z)| lr 1.58e-04 | 2531.62 ms | 53.3% bf16 MFU | 206979 tok/s +step 13103/19560 | loss 3.308532 (-0.89z)| norm 0.2522 (-1.41z)| lr 1.57e-04 | 2533.21 ms | 53.3% bf16 MFU | 206978 tok/s +step 13104/19560 | loss 3.372950 (+0.66z)| norm 0.2448 (-1.79z)| lr 1.57e-04 | 2531.74 ms | 53.3% bf16 MFU | 206984 tok/s +step 13105/19560 | loss 3.422001 (+1.82z)| norm 0.2724 (-0.30z)| lr 1.57e-04 | 2531.99 ms | 53.3% bf16 MFU | 206988 tok/s +step 13106/19560 | loss 3.370022 (+0.56z)| norm 0.2628 (-0.82z)| lr 1.57e-04 | 2532.36 ms | 53.3% bf16 MFU | 206990 tok/s +step 13107/19560 | loss 3.340603 (-0.16z)| norm 0.2515 (-1.42z)| lr 1.57e-04 | 2532.09 ms | 53.3% bf16 MFU | 206993 tok/s +step 13108/19560 | loss 3.331091 (-0.40z)| norm 0.2674 (-0.55z)| lr 1.57e-04 | 2532.97 ms | 53.3% bf16 MFU | 206993 tok/s +step 13109/19560 | loss 3.386016 (+0.95z)| norm 0.2733 (-0.24z)| lr 1.57e-04 | 2532.91 ms | 53.3% bf16 MFU | 206993 tok/s +step 13110/19560 | loss 3.349044 (+0.04z)| norm 0.2710 (-0.35z)| lr 1.57e-04 | 2531.92 ms | 53.3% bf16 MFU | 206997 tok/s +step 13111/19560 | loss 3.398451 (+1.24z)| norm 0.2726 (-0.27z)| lr 1.57e-04 | 2533.99 ms | 53.3% bf16 MFU | 206992 tok/s +step 13112/19560 | loss 3.370251 (+0.54z)| norm 0.2700 (-0.41z)| lr 1.57e-04 | 2532.00 ms | 53.3% bf16 MFU | 206996 tok/s +step 13113/19560 | loss 3.297647 (-1.22z)| norm 0.2749 (-0.14z)| lr 1.57e-04 | 2532.71 ms | 53.3% bf16 MFU | 206996 tok/s +step 13114/19560 | loss 3.309365 (-0.93z)| norm 0.2772 (-0.02z)| lr 1.57e-04 | 2531.97 ms | 53.3% bf16 MFU | 207000 tok/s +step 13115/19560 | loss 3.399724 (+1.26z)| norm 0.2712 (-0.34z)| lr 1.57e-04 | 2531.89 ms | 53.3% bf16 MFU | 207003 tok/s +step 13116/19560 | loss 3.368911 (+0.50z)| norm 0.2693 (-0.45z)| lr 1.57e-04 | 2533.28 ms | 53.3% bf16 MFU | 207001 tok/s +step 13117/19560 | loss 3.331231 (-0.42z)| norm 0.2770 (-0.03z)| lr 1.57e-04 | 2533.51 ms | 53.3% bf16 MFU | 206998 tok/s +step 13118/19560 | loss 3.461163 (+2.66z)| norm 0.2738 (-0.21z)| lr 1.57e-04 | 2533.66 ms | 53.3% bf16 MFU | 206995 tok/s +step 13119/19560 | loss 3.308707 (-0.97z)| norm 0.2679 (-0.53z)| lr 1.57e-04 | 2533.51 ms | 53.3% bf16 MFU | 206992 tok/s +step 13120/19560 | loss 3.337738 (-0.27z)| norm 0.2723 (-0.28z)| lr 1.57e-04 | 2532.38 ms | 53.3% bf16 MFU | 206994 tok/s +step 13121/19560 | loss 3.304634 (-1.06z)| norm 0.2674 (-0.55z)| lr 1.57e-04 | 2533.45 ms | 53.3% bf16 MFU | 206992 tok/s +step 13122/19560 | loss 3.302664 (-1.09z)| norm 0.2718 (-0.31z)| lr 1.57e-04 | 2532.68 ms | 53.3% bf16 MFU | 206993 tok/s +step 13123/19560 | loss 3.420037 (+1.72z)| norm 0.2724 (-0.27z)| lr 1.57e-04 | 2533.59 ms | 53.3% bf16 MFU | 206990 tok/s +step 13124/19560 | loss 3.345117 (-0.11z)| norm 0.2748 (-0.14z)| lr 1.57e-04 | 2534.23 ms | 53.3% bf16 MFU | 206984 tok/s +step 13125/19560 | loss 3.349896 (+0.01z)| norm 0.2710 (-0.35z)| lr 1.57e-04 | 2534.36 ms | 53.3% bf16 MFU | 206979 tok/s +step 13126/19560 | loss 3.374576 (+0.60z)| norm 0.2684 (-0.50z)| lr 1.56e-04 | 2533.62 ms | 53.3% bf16 MFU | 206976 tok/s +step 13127/19560 | loss 3.357683 (+0.19z)| norm 0.3093 (+1.71z)| lr 1.56e-04 | 2535.28 ms | 53.3% bf16 MFU | 206968 tok/s +step 13128/19560 | loss 3.311997 (-0.91z)| norm 0.2748 (-0.17z)| lr 1.56e-04 | 2532.70 ms | 53.3% bf16 MFU | 206970 tok/s +step 13129/19560 | loss 3.371598 (+0.55z)| norm 0.2614 (-0.89z)| lr 1.56e-04 | 2533.19 ms | 53.3% bf16 MFU | 206969 tok/s +step 13130/19560 | loss 3.438546 (+2.15z)| norm 0.2756 (-0.13z)| lr 1.56e-04 | 2534.52 ms | 53.3% bf16 MFU | 206964 tok/s +step 13131/19560 | loss 3.368092 (+0.44z)| norm 0.2647 (-0.72z)| lr 1.56e-04 | 2533.26 ms | 53.3% bf16 MFU | 206964 tok/s +step 13132/19560 | loss 3.305473 (-1.08z)| norm 0.2815 (+0.20z)| lr 1.56e-04 | 2532.92 ms | 53.3% bf16 MFU | 206965 tok/s +step 13133/19560 | loss 3.337273 (-0.30z)| norm 0.2724 (-0.29z)| lr 1.56e-04 | 2532.83 ms | 53.3% bf16 MFU | 206967 tok/s +step 13134/19560 | loss 3.342163 (-0.18z)| norm 0.2858 (+0.44z)| lr 1.56e-04 | 2534.28 ms | 53.3% bf16 MFU | 206962 tok/s +step 13135/19560 | loss 3.338636 (-0.29z)| norm 0.2600 (-0.98z)| lr 1.56e-04 | 2533.10 ms | 53.3% bf16 MFU | 206963 tok/s +step 13136/19560 | loss 3.322577 (-0.69z)| norm 0.2771 (-0.04z)| lr 1.56e-04 | 2533.72 ms | 53.3% bf16 MFU | 206961 tok/s +step 13137/19560 | loss 3.358352 (+0.19z)| norm 0.2812 (+0.20z)| lr 1.56e-04 | 2534.32 ms | 53.3% bf16 MFU | 206957 tok/s +step 13138/19560 | loss 3.389362 (+0.95z)| norm 0.2855 (+0.43z)| lr 1.56e-04 | 2534.59 ms | 53.3% bf16 MFU | 206952 tok/s +step 13139/19560 | loss 3.296968 (-1.33z)| norm 0.2690 (-0.48z)| lr 1.56e-04 | 2532.20 ms | 53.3% bf16 MFU | 206956 tok/s +step 13140/19560 | loss 3.418183 (+1.64z)| norm 0.2700 (-0.42z)| lr 1.56e-04 | 2533.74 ms | 53.3% bf16 MFU | 206955 tok/s +step 13141/19560 | loss 3.292433 (-1.43z)| norm 0.2797 (+0.13z)| lr 1.56e-04 | 2532.56 ms | 53.3% bf16 MFU | 206958 tok/s +step 13142/19560 | loss 3.327788 (-0.55z)| norm 0.2640 (-0.75z)| lr 1.56e-04 | 2533.33 ms | 53.3% bf16 MFU | 206958 tok/s +step 13143/19560 | loss 3.378784 (+0.69z)| norm 0.2686 (-0.48z)| lr 1.56e-04 | 2532.59 ms | 53.3% bf16 MFU | 206961 tok/s +step 13144/19560 | loss 3.328715 (-0.54z)| norm 0.2659 (-0.62z)| lr 1.56e-04 | 2534.15 ms | 53.3% bf16 MFU | 206957 tok/s +step 13145/19560 | loss 3.363204 (+0.30z)| norm 0.2701 (-0.38z)| lr 1.56e-04 | 2531.17 ms | 53.3% bf16 MFU | 206966 tok/s +step 13146/19560 | loss 3.363067 (+0.29z)| norm 0.2638 (-0.72z)| lr 1.56e-04 | 2533.40 ms | 53.3% bf16 MFU | 206965 tok/s +step 13147/19560 | loss 3.287093 (-1.57z)| norm 0.2812 (+0.27z)| lr 1.56e-04 | 2534.30 ms | 53.3% bf16 MFU | 206961 tok/s +step 13148/19560 | loss 3.352524 (+0.03z)| norm 0.2601 (-0.92z)| lr 1.56e-04 | 2533.45 ms | 53.3% bf16 MFU | 206960 tok/s +step 13149/19560 | loss 3.295697 (-1.35z)| norm 0.2855 (+0.51z)| lr 1.55e-04 | 2533.83 ms | 53.3% bf16 MFU | 206958 tok/s +step 13150/19560 | loss 3.354243 (+0.08z)| norm 0.2757 (-0.05z)| lr 1.55e-04 | 2533.15 ms | 53.3% bf16 MFU | 206958 tok/s +step 13151/19560 | loss 3.295582 (-1.34z)| norm 0.2786 (+0.11z)| lr 1.55e-04 | 2532.79 ms | 53.3% bf16 MFU | 206961 tok/s +step 13152/19560 | loss 3.406689 (+1.35z)| norm 0.2892 (+0.70z)| lr 1.55e-04 | 2534.19 ms | 53.3% bf16 MFU | 206957 tok/s +step 13153/19560 | loss 3.326591 (-0.58z)| norm 0.2612 (-0.88z)| lr 1.55e-04 | 2533.11 ms | 53.3% bf16 MFU | 206958 tok/s +step 13154/19560 | loss 3.324131 (-0.64z)| norm 0.2740 (-0.16z)| lr 1.55e-04 | 2533.58 ms | 53.3% bf16 MFU | 206956 tok/s +step 13155/19560 | loss 3.336929 (-0.32z)| norm 0.2516 (-1.41z)| lr 1.55e-04 | 2532.28 ms | 53.3% bf16 MFU | 206961 tok/s +step 13156/19560 | loss 3.341980 (-0.19z)| norm 0.2686 (-0.45z)| lr 1.55e-04 | 2531.51 ms | 53.3% bf16 MFU | 206968 tok/s +step 13157/19560 | loss 3.306164 (-1.06z)| norm 0.2575 (-1.06z)| lr 1.55e-04 | 2533.71 ms | 53.3% bf16 MFU | 206966 tok/s +step 13158/19560 | loss 3.301061 (-1.17z)| norm 0.2581 (-1.02z)| lr 1.55e-04 | 2531.54 ms | 53.3% bf16 MFU | 206973 tok/s +step 13159/19560 | loss 3.289169 (-1.44z)| norm 0.2535 (-1.26z)| lr 1.55e-04 | 2533.68 ms | 53.3% bf16 MFU | 206970 tok/s +step 13160/19560 | loss 3.352363 (+0.08z)| norm 0.2926 (+0.91z)| lr 1.55e-04 | 2533.96 ms | 53.3% bf16 MFU | 206967 tok/s +step 13161/19560 | loss 3.410761 (+1.45z)| norm 0.2615 (-0.81z)| lr 1.55e-04 | 2532.01 ms | 53.3% bf16 MFU | 206972 tok/s +step 13162/19560 | loss 3.367808 (+0.45z)| norm 0.2529 (-1.28z)| lr 1.55e-04 | 2535.37 ms | 53.3% bf16 MFU | 206963 tok/s +step 13163/19560 | loss 3.410354 (+1.46z)| norm 0.2641 (-0.66z)| lr 1.55e-04 | 2533.19 ms | 53.3% bf16 MFU | 206963 tok/s +step 13164/19560 | loss 3.291429 (-1.40z)| norm 0.2638 (-0.67z)| lr 1.55e-04 | 2532.77 ms | 53.3% bf16 MFU | 206965 tok/s +step 13165/19560 | loss 3.324745 (-0.60z)| norm 0.2566 (-1.06z)| lr 1.55e-04 | 2532.07 ms | 53.3% bf16 MFU | 206970 tok/s +step 13166/19560 | loss 3.354094 (+0.09z)| norm 0.2569 (-1.03z)| lr 1.55e-04 | 2532.59 ms | 53.3% bf16 MFU | 206972 tok/s +step 13167/19560 | loss 3.345916 (-0.13z)| norm 0.2428 (-1.79z)| lr 1.55e-04 | 2534.81 ms | 53.3% bf16 MFU | 206965 tok/s +step 13168/19560 | loss 3.373862 (+0.56z)| norm 0.2601 (-0.84z)| lr 1.55e-04 | 2532.22 ms | 53.3% bf16 MFU | 206969 tok/s +step 13169/19560 | loss 3.305424 (-1.16z)| norm 0.2639 (-0.63z)| lr 1.55e-04 | 2533.65 ms | 53.3% bf16 MFU | 206967 tok/s +step 13170/19560 | loss 3.327322 (-0.62z)| norm 0.2681 (-0.40z)| lr 1.55e-04 | 2533.47 ms | 53.3% bf16 MFU | 206966 tok/s +step 13171/19560 | loss 3.320158 (-0.81z)| norm 0.2694 (-0.33z)| lr 1.54e-04 | 2532.65 ms | 53.3% bf16 MFU | 206968 tok/s +step 13172/19560 | loss 3.393809 (+1.06z)| norm 0.2716 (-0.20z)| lr 1.54e-04 | 2533.49 ms | 53.3% bf16 MFU | 206967 tok/s +step 13173/19560 | loss 3.333836 (-0.47z)| norm 0.2590 (-0.87z)| lr 1.54e-04 | 2534.07 ms | 53.3% bf16 MFU | 206964 tok/s +step 13174/19560 | loss 3.334569 (-0.45z)| norm 0.2737 (-0.08z)| lr 1.54e-04 | 2532.55 ms | 53.3% bf16 MFU | 206966 tok/s +step 13175/19560 | loss 3.400562 (+1.21z)| norm 0.2774 (+0.12z)| lr 1.54e-04 | 2533.51 ms | 53.3% bf16 MFU | 206965 tok/s +step 13176/19560 | loss 3.387389 (+0.87z)| norm 0.2605 (-0.79z)| lr 1.54e-04 | 2534.02 ms | 53.3% bf16 MFU | 206962 tok/s +step 13177/19560 | loss 3.342554 (-0.26z)| norm 0.3134 (+2.02z)| lr 1.54e-04 | 2532.11 ms | 53.3% bf16 MFU | 206967 tok/s +step 13178/19560 | loss 3.346821 (-0.15z)| norm 0.2601 (-0.81z)| lr 1.54e-04 | 2532.69 ms | 53.3% bf16 MFU | 206969 tok/s +step 13179/19560 | loss 3.314658 (-0.95z)| norm 0.3126 (+1.94z)| lr 1.54e-04 | 2532.34 ms | 53.3% bf16 MFU | 206972 tok/s +step 13180/19560 | loss 3.313248 (-0.97z)| norm 0.2957 (+1.03z)| lr 1.54e-04 | 2535.23 ms | 53.3% bf16 MFU | 206964 tok/s +step 13181/19560 | loss 3.327992 (-0.62z)| norm 0.2652 (-0.57z)| lr 1.54e-04 | 2534.48 ms | 53.3% bf16 MFU | 206958 tok/s +step 13182/19560 | loss 3.335863 (-0.42z)| norm 0.2815 (+0.28z)| lr 1.54e-04 | 2535.07 ms | 53.3% bf16 MFU | 206951 tok/s +step 13183/19560 | loss 3.313103 (-1.00z)| norm 0.2659 (-0.54z)| lr 1.54e-04 | 2533.34 ms | 53.3% bf16 MFU | 206951 tok/s +step 13184/19560 | loss 3.374437 (+0.61z)| norm 0.2591 (-0.89z)| lr 1.54e-04 | 2530.97 ms | 53.3% bf16 MFU | 206961 tok/s +step 13185/19560 | loss 3.302818 (-1.27z)| norm 0.2702 (-0.32z)| lr 1.54e-04 | 2534.38 ms | 53.3% bf16 MFU | 206957 tok/s +step 13186/19560 | loss 3.339056 (-0.31z)| norm 0.2824 (+0.33z)| lr 1.54e-04 | 2533.89 ms | 53.3% bf16 MFU | 206954 tok/s +step 13187/19560 | loss 3.355520 (+0.13z)| norm 0.2873 (+0.59z)| lr 1.54e-04 | 2532.68 ms | 53.3% bf16 MFU | 206957 tok/s +step 13188/19560 | loss 3.295199 (-1.45z)| norm 0.2652 (-0.57z)| lr 1.54e-04 | 2532.98 ms | 53.3% bf16 MFU | 206959 tok/s +step 13189/19560 | loss 3.385753 (+0.96z)| norm 0.2765 (+0.06z)| lr 1.54e-04 | 2532.65 ms | 53.3% bf16 MFU | 206961 tok/s +step 13190/19560 | loss 3.337039 (-0.33z)| norm 0.2795 (+0.22z)| lr 1.54e-04 | 2533.73 ms | 53.3% bf16 MFU | 206959 tok/s +step 13191/19560 | loss 3.380347 (+0.82z)| norm 0.2798 (+0.24z)| lr 1.54e-04 | 2534.94 ms | 53.3% bf16 MFU | 206953 tok/s +step 13192/19560 | loss 3.383104 (+0.90z)| norm 0.2832 (+0.43z)| lr 1.54e-04 | 2534.23 ms | 53.3% bf16 MFU | 206949 tok/s +step 13193/19560 | loss 3.326245 (-0.65z)| norm 0.2567 (-1.03z)| lr 1.54e-04 | 2535.24 ms | 53.3% bf16 MFU | 206942 tok/s +step 13194/19560 | loss 3.338301 (-0.33z)| norm 0.2780 (+0.14z)| lr 1.53e-04 | 2533.13 ms | 53.3% bf16 MFU | 206943 tok/s +step 13195/19560 | loss 3.383434 (+0.91z)| norm 0.2691 (-0.35z)| lr 1.53e-04 | 2534.88 ms | 53.3% bf16 MFU | 206937 tok/s +step 13196/19560 | loss 3.326404 (-0.65z)| norm 0.2784 (+0.16z)| lr 1.53e-04 | 2534.11 ms | 53.3% bf16 MFU | 206935 tok/s +step 13197/19560 | loss 3.346323 (-0.10z)| norm 0.2535 (-1.20z)| lr 1.53e-04 | 2534.07 ms | 53.3% bf16 MFU | 206933 tok/s +step 13198/19560 | loss 3.355859 (+0.15z)| norm 0.2620 (-0.72z)| lr 1.53e-04 | 2533.47 ms | 53.3% bf16 MFU | 206934 tok/s +step 13199/19560 | loss 3.497727 (+3.78z)| norm 0.2747 (-0.01z)| lr 1.53e-04 | 2532.61 ms | 53.3% bf16 MFU | 206938 tok/s +step 13200/19560 | loss 3.328417 (-0.59z)| norm 0.2747 (-0.01z)| lr 1.53e-04 | 2532.57 ms | 53.3% bf16 MFU | 206942 tok/s +step 13201/19560 | loss 3.401747 (+1.31z)| norm 0.2830 (+0.47z)| lr 1.53e-04 | 2532.52 ms | 53.3% bf16 MFU | 206946 tok/s +step 13202/19560 | loss 3.356496 (+0.13z)| norm 0.2735 (-0.06z)| lr 1.53e-04 | 2535.59 ms | 53.2% bf16 MFU | 206937 tok/s +step 13203/19560 | loss 3.337903 (-0.34z)| norm 0.2817 (+0.47z)| lr 1.53e-04 | 2534.40 ms | 53.3% bf16 MFU | 206934 tok/s +step 13204/19560 | loss 3.446470 (+2.42z)| norm 0.2560 (-1.09z)| lr 1.53e-04 | 2532.70 ms | 53.3% bf16 MFU | 206937 tok/s +step 13205/19560 | loss 3.315579 (-0.92z)| norm 0.2791 (+0.34z)| lr 1.53e-04 | 2531.85 ms | 53.3% bf16 MFU | 206944 tok/s +step 13206/19560 | loss 3.349697 (-0.03z)| norm 0.2553 (-1.12z)| lr 1.53e-04 | 2533.45 ms | 53.3% bf16 MFU | 206944 tok/s +step 13207/19560 | loss 3.372402 (+0.56z)| norm 0.2616 (-0.72z)| lr 1.53e-04 | 2534.15 ms | 53.3% bf16 MFU | 206942 tok/s +step 13208/19560 | loss 3.348312 (-0.05z)| norm 0.2553 (-1.12z)| lr 1.53e-04 | 2532.80 ms | 53.3% bf16 MFU | 206945 tok/s +step 13209/19560 | loss 3.348535 (-0.05z)| norm 0.2607 (-0.75z)| lr 1.53e-04 | 2531.60 ms | 53.3% bf16 MFU | 206952 tok/s +step 13210/19560 | loss 3.388599 (+0.98z)| norm 0.2804 (+0.53z)| lr 1.53e-04 | 2533.49 ms | 53.3% bf16 MFU | 206952 tok/s +step 13211/19560 | loss 3.336931 (-0.37z)| norm 0.2577 (-0.95z)| lr 1.53e-04 | 2532.55 ms | 53.3% bf16 MFU | 206955 tok/s +step 13212/19560 | loss 3.335856 (-0.39z)| norm 0.2575 (-0.96z)| lr 1.53e-04 | 2533.23 ms | 53.3% bf16 MFU | 206956 tok/s +step 13213/19560 | loss 3.377746 (+0.71z)| norm 0.2564 (-1.06z)| lr 1.53e-04 | 2533.05 ms | 53.3% bf16 MFU | 206957 tok/s +step 13214/19560 | loss 3.319577 (-0.81z)| norm 0.2691 (-0.10z)| lr 1.53e-04 | 2532.59 ms | 53.3% bf16 MFU | 206960 tok/s +step 13215/19560 | loss 3.373013 (+0.59z)| norm 0.2772 (+0.54z)| lr 1.53e-04 | 2534.77 ms | 53.3% bf16 MFU | 206954 tok/s +step 13216/19560 | loss 3.334881 (-0.41z)| norm 0.2635 (-0.52z)| lr 1.53e-04 | 2530.91 ms | 53.3% bf16 MFU | 206964 tok/s +step 13217/19560 | loss 3.310958 (-1.04z)| norm 0.2799 (+0.80z)| lr 1.52e-04 | 2532.87 ms | 53.3% bf16 MFU | 206965 tok/s +step 13218/19560 | loss 3.377180 (+0.71z)| norm 0.2742 (+0.35z)| lr 1.52e-04 | 2532.63 ms | 53.3% bf16 MFU | 206968 tok/s +step 13219/19560 | loss 3.297040 (-1.40z)| norm 0.2566 (-1.07z)| lr 1.52e-04 | 2532.62 ms | 53.3% bf16 MFU | 206970 tok/s +step 13220/19560 | loss 3.326682 (-0.61z)| norm 0.2654 (-0.36z)| lr 1.52e-04 | 2531.77 ms | 53.3% bf16 MFU | 206976 tok/s +step 13221/19560 | loss 3.368667 (+0.49z)| norm 0.2593 (-0.84z)| lr 1.52e-04 | 2531.75 ms | 53.3% bf16 MFU | 206981 tok/s +step 13222/19560 | loss 3.287813 (-1.61z)| norm 0.2465 (-1.85z)| lr 1.52e-04 | 2535.41 ms | 53.3% bf16 MFU | 206971 tok/s +step 13223/19560 | loss 3.388017 (+0.99z)| norm 0.2749 (+0.46z)| lr 1.52e-04 | 2531.84 ms | 53.3% bf16 MFU | 206977 tok/s +step 13224/19560 | loss 3.341999 (-0.21z)| norm 0.2622 (-0.58z)| lr 1.52e-04 | 2532.69 ms | 53.3% bf16 MFU | 206978 tok/s +step 13225/19560 | loss 3.314310 (-0.91z)| norm 0.2500 (-1.53z)| lr 1.52e-04 | 2532.94 ms | 53.3% bf16 MFU | 206979 tok/s +step 13226/19560 | loss 3.326704 (-0.59z)| norm 0.2852 (+1.27z)| lr 1.52e-04 | 2533.59 ms | 53.3% bf16 MFU | 206977 tok/s +step 13227/19560 | loss 3.319115 (-0.78z)| norm 0.2618 (-0.62z)| lr 1.52e-04 | 2534.32 ms | 53.3% bf16 MFU | 206971 tok/s +step 13228/19560 | loss 3.300723 (-1.24z)| norm 0.2793 (+0.78z)| lr 1.52e-04 | 2536.07 ms | 53.2% bf16 MFU | 206960 tok/s +step 13229/19560 | loss 3.329095 (-0.49z)| norm 0.2748 (+0.41z)| lr 1.52e-04 | 2532.98 ms | 53.3% bf16 MFU | 206961 tok/s +step 13230/19560 | loss 3.319306 (-0.74z)| norm 0.2661 (-0.31z)| lr 1.52e-04 | 2533.83 ms | 53.3% bf16 MFU | 206958 tok/s +step 13231/19560 | loss 3.394013 (+1.18z)| norm 0.2768 (+0.56z)| lr 1.52e-04 | 2534.14 ms | 53.3% bf16 MFU | 206955 tok/s +step 13232/19560 | loss 3.305944 (-1.08z)| norm 0.2838 (+1.12z)| lr 1.52e-04 | 2531.91 ms | 53.3% bf16 MFU | 206961 tok/s +step 13233/19560 | loss 3.375665 (+0.73z)| norm 0.2751 (+0.40z)| lr 1.52e-04 | 2531.24 ms | 53.3% bf16 MFU | 206969 tok/s +step 13234/19560 | loss 3.355306 (+0.20z)| norm 0.2768 (+0.53z)| lr 1.52e-04 | 2532.89 ms | 53.3% bf16 MFU | 206970 tok/s +step 13235/19560 | loss 3.328373 (-0.50z)| norm 0.2786 (+0.67z)| lr 1.52e-04 | 2533.24 ms | 53.3% bf16 MFU | 206970 tok/s +step 13236/19560 | loss 3.367502 (+0.52z)| norm 0.2663 (-0.38z)| lr 1.52e-04 | 2532.67 ms | 53.3% bf16 MFU | 206972 tok/s +step 13237/19560 | loss 3.354489 (+0.18z)| norm 0.2818 (+0.93z)| lr 1.52e-04 | 2532.61 ms | 53.3% bf16 MFU | 206974 tok/s +step 13238/19560 | loss 3.403012 (+1.43z)| norm 0.2784 (+0.64z)| lr 1.52e-04 | 2534.92 ms | 53.3% bf16 MFU | 206967 tok/s +step 13239/19560 | loss 3.330247 (-0.45z)| norm 0.3048 (+2.76z)| lr 1.52e-04 | 2532.72 ms | 53.3% bf16 MFU | 206969 tok/s +step 13240/19560 | loss 3.386036 (+1.01z)| norm 0.2891 (+1.45z)| lr 1.51e-04 | 2535.24 ms | 53.3% bf16 MFU | 206960 tok/s +step 13241/19560 | loss 3.352451 (+0.12z)| norm 0.2951 (+1.89z)| lr 1.51e-04 | 2534.14 ms | 53.3% bf16 MFU | 206957 tok/s +step 13242/19560 | loss 3.358497 (+0.27z)| norm 0.2874 (+1.27z)| lr 1.51e-04 | 2534.35 ms | 53.3% bf16 MFU | 206953 tok/s +step 13243/19560 | loss 3.373182 (+0.67z)| norm 0.2874 (+1.25z)| lr 1.51e-04 | 2535.23 ms | 53.3% bf16 MFU | 206945 tok/s +step 13244/19560 | loss 3.351325 (+0.09z)| norm 0.2751 (+0.27z)| lr 1.51e-04 | 2535.21 ms | 53.3% bf16 MFU | 206938 tok/s +step 13245/19560 | loss 3.309848 (-1.00z)| norm 0.2785 (+0.54z)| lr 1.51e-04 | 2534.65 ms | 53.3% bf16 MFU | 206933 tok/s +step 13246/19560 | loss 3.328726 (-0.49z)| norm 0.2725 (+0.06z)| lr 1.51e-04 | 2533.12 ms | 53.3% bf16 MFU | 206935 tok/s +step 13247/19560 | loss 3.418816 (+1.93z)| norm 0.2961 (+1.88z)| lr 1.51e-04 | 2533.78 ms | 53.3% bf16 MFU | 206935 tok/s +step 13248/19560 | loss 3.335967 (-0.32z)| norm 0.2736 (+0.13z)| lr 1.51e-04 | 2533.74 ms | 53.3% bf16 MFU | 206934 tok/s +step 13249/19560 | loss 3.304599 (-1.17z)| norm 0.2838 (+0.91z)| lr 1.51e-04 | 2534.94 ms | 53.3% bf16 MFU | 206929 tok/s +step 13250/19560 | loss 3.276497 (-1.91z)| norm 0.2647 (-0.56z)| lr 1.51e-04 | 2534.07 ms | 53.3% bf16 MFU | 206927 tok/s +val loss 3.341838 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2997/10042 = 0.298447 +step 13251/19560 | loss 3.360597 (+0.37z)| norm 0.2592 (-0.98z)| lr 1.51e-04 | 2534.66 ms | 53.3% bf16 MFU | 206923 tok/s +step 13252/19560 | loss 3.344270 (-0.07z)| norm 0.2575 (-1.09z)| lr 1.51e-04 | 2532.33 ms | 53.3% bf16 MFU | 206929 tok/s +step 13253/19560 | loss 3.367371 (+0.55z)| norm 0.2568 (-1.13z)| lr 1.51e-04 | 2534.30 ms | 53.3% bf16 MFU | 206926 tok/s +step 13254/19560 | loss 3.343252 (-0.10z)| norm 0.2901 (+1.39z)| lr 1.51e-04 | 2532.30 ms | 53.3% bf16 MFU | 206932 tok/s +step 13255/19560 | loss 3.358232 (+0.31z)| norm 0.2692 (-0.17z)| lr 1.51e-04 | 2532.55 ms | 53.3% bf16 MFU | 206936 tok/s +step 13256/19560 | loss 3.312939 (-0.93z)| norm 0.2705 (-0.07z)| lr 1.51e-04 | 2533.64 ms | 53.3% bf16 MFU | 206936 tok/s +step 13257/19560 | loss 3.311064 (-0.96z)| norm 0.2650 (-0.51z)| lr 1.51e-04 | 2532.87 ms | 53.3% bf16 MFU | 206939 tok/s +step 13258/19560 | loss 3.474861 (+3.42z)| norm 0.2789 (+0.59z)| lr 1.51e-04 | 2533.45 ms | 53.3% bf16 MFU | 206939 tok/s +step 13259/19560 | loss 3.328264 (-0.48z)| norm 0.2760 (+0.35z)| lr 1.51e-04 | 2533.75 ms | 53.3% bf16 MFU | 206938 tok/s +step 13260/19560 | loss 3.300123 (-1.23z)| norm 0.2724 (+0.07z)| lr 1.51e-04 | 2533.24 ms | 53.3% bf16 MFU | 206939 tok/s +step 13261/19560 | loss 3.327877 (-0.49z)| norm 0.3091 (+2.84z)| lr 1.51e-04 | 2532.49 ms | 53.3% bf16 MFU | 206944 tok/s +step 13262/19560 | loss 3.350702 (+0.12z)| norm 0.2753 (+0.28z)| lr 1.51e-04 | 2534.81 ms | 53.3% bf16 MFU | 206938 tok/s +step 13263/19560 | loss 3.341853 (-0.12z)| norm 0.2857 (+1.06z)| lr 1.50e-04 | 2532.56 ms | 53.3% bf16 MFU | 206942 tok/s +step 13264/19560 | loss 3.371290 (+0.66z)| norm 0.2725 (+0.05z)| lr 1.50e-04 | 2533.75 ms | 53.3% bf16 MFU | 206941 tok/s +step 13265/19560 | loss 3.313694 (-0.87z)| norm 0.2782 (+0.49z)| lr 1.50e-04 | 2532.90 ms | 53.3% bf16 MFU | 206944 tok/s +step 13266/19560 | loss 3.294352 (-1.36z)| norm 0.2682 (-0.27z)| lr 1.50e-04 | 2533.62 ms | 53.3% bf16 MFU | 206943 tok/s +step 13267/19560 | loss 3.351069 (+0.14z)| norm 0.2701 (-0.12z)| lr 1.50e-04 | 2531.79 ms | 53.3% bf16 MFU | 206950 tok/s +step 13268/19560 | loss 3.353588 (+0.22z)| norm 0.2715 (-0.02z)| lr 1.50e-04 | 2533.49 ms | 53.3% bf16 MFU | 206950 tok/s +step 13269/19560 | loss 3.400861 (+1.47z)| norm 0.2711 (-0.05z)| lr 1.50e-04 | 2532.58 ms | 53.3% bf16 MFU | 206953 tok/s +step 13270/19560 | loss 3.314194 (-0.86z)| norm 0.2816 (+0.76z)| lr 1.50e-04 | 2531.99 ms | 53.3% bf16 MFU | 206959 tok/s +step 13271/19560 | loss 3.321199 (-0.66z)| norm 0.2561 (-1.20z)| lr 1.50e-04 | 2535.09 ms | 53.3% bf16 MFU | 206951 tok/s +step 13272/19560 | loss 3.334034 (-0.32z)| norm 0.2893 (+1.33z)| lr 1.50e-04 | 2533.10 ms | 53.3% bf16 MFU | 206953 tok/s +step 13273/19560 | loss 3.331173 (-0.39z)| norm 0.2672 (-0.36z)| lr 1.50e-04 | 2532.49 ms | 53.3% bf16 MFU | 206956 tok/s +step 13274/19560 | loss 3.355997 (+0.28z)| norm 0.2688 (-0.24z)| lr 1.50e-04 | 2533.25 ms | 53.3% bf16 MFU | 206957 tok/s +step 13275/19560 | loss 3.320570 (-0.69z)| norm 0.2656 (-0.47z)| lr 1.50e-04 | 2532.25 ms | 53.3% bf16 MFU | 206961 tok/s +step 13276/19560 | loss 3.340639 (-0.14z)| norm 0.2604 (-0.87z)| lr 1.50e-04 | 2532.26 ms | 53.3% bf16 MFU | 206965 tok/s +step 13277/19560 | loss 3.303397 (-1.16z)| norm 0.2476 (-1.81z)| lr 1.50e-04 | 2532.88 ms | 53.3% bf16 MFU | 206966 tok/s +step 13278/19560 | loss 3.297131 (-1.31z)| norm 0.2696 (-0.14z)| lr 1.50e-04 | 2533.39 ms | 53.3% bf16 MFU | 206966 tok/s +step 13279/19560 | loss 3.321897 (-0.65z)| norm 0.2461 (-1.87z)| lr 1.50e-04 | 2534.40 ms | 53.3% bf16 MFU | 206961 tok/s +step 13280/19560 | loss 3.316020 (-0.79z)| norm 0.2478 (-1.72z)| lr 1.50e-04 | 2534.31 ms | 53.3% bf16 MFU | 206957 tok/s +step 13281/19560 | loss 3.297009 (-1.30z)| norm 0.2599 (-0.81z)| lr 1.50e-04 | 2532.95 ms | 53.3% bf16 MFU | 206958 tok/s +step 13282/19560 | loss 3.327504 (-0.47z)| norm 0.2427 (-2.04z)| lr 1.50e-04 | 2533.75 ms | 53.3% bf16 MFU | 206956 tok/s +step 13283/19560 | loss 3.264612 (-2.14z)| norm 0.2649 (-0.43z)| lr 1.50e-04 | 2531.09 ms | 53.3% bf16 MFU | 206965 tok/s +step 13284/19560 | loss 3.355657 (+0.31z)| norm 0.2516 (-1.39z)| lr 1.50e-04 | 2532.74 ms | 53.3% bf16 MFU | 206967 tok/s +step 13285/19560 | loss 3.348155 (+0.10z)| norm 0.2632 (-0.54z)| lr 1.50e-04 | 2533.26 ms | 53.3% bf16 MFU | 206967 tok/s +step 13286/19560 | loss 3.304121 (-1.09z)| norm 0.2545 (-1.18z)| lr 1.49e-04 | 2532.17 ms | 53.3% bf16 MFU | 206971 tok/s +step 13287/19560 | loss 3.301239 (-1.18z)| norm 0.2555 (-1.11z)| lr 1.49e-04 | 2532.34 ms | 53.3% bf16 MFU | 206975 tok/s +step 13288/19560 | loss 3.335834 (-0.23z)| norm 0.2566 (-1.01z)| lr 1.49e-04 | 2534.40 ms | 53.3% bf16 MFU | 206969 tok/s +step 13289/19560 | loss 3.278783 (-1.75z)| norm 0.2561 (-1.05z)| lr 1.49e-04 | 2534.48 ms | 53.3% bf16 MFU | 206964 tok/s +step 13290/19560 | loss 3.401195 (+1.55z)| norm 0.2656 (-0.36z)| lr 1.49e-04 | 2532.34 ms | 53.3% bf16 MFU | 206968 tok/s +step 13291/19560 | loss 3.365890 (+0.61z)| norm 0.2679 (-0.18z)| lr 1.49e-04 | 2533.42 ms | 53.3% bf16 MFU | 206967 tok/s +step 13292/19560 | loss 3.382963 (+1.06z)| norm 0.2544 (-1.17z)| lr 1.49e-04 | 2533.54 ms | 53.3% bf16 MFU | 206965 tok/s +step 13293/19560 | loss 3.357563 (+0.36z)| norm 0.2516 (-1.37z)| lr 1.49e-04 | 2531.67 ms | 53.3% bf16 MFU | 206972 tok/s +step 13294/19560 | loss 3.361763 (+0.48z)| norm 0.2684 (-0.14z)| lr 1.49e-04 | 2532.70 ms | 53.3% bf16 MFU | 206973 tok/s +step 13295/19560 | loss 3.335475 (-0.24z)| norm 0.2526 (-1.33z)| lr 1.49e-04 | 2533.56 ms | 53.3% bf16 MFU | 206972 tok/s +step 13296/19560 | loss 3.340900 (-0.09z)| norm 0.2517 (-1.38z)| lr 1.49e-04 | 2535.44 ms | 53.3% bf16 MFU | 206962 tok/s +step 13297/19560 | loss 3.316257 (-0.76z)| norm 0.2681 (-0.17z)| lr 1.49e-04 | 2533.28 ms | 53.3% bf16 MFU | 206962 tok/s +step 13298/19560 | loss 3.347475 (+0.09z)| norm 0.2773 (+0.50z)| lr 1.49e-04 | 2532.89 ms | 53.3% bf16 MFU | 206964 tok/s +step 13299/19560 | loss 3.365328 (+0.57z)| norm 0.2567 (-1.01z)| lr 1.49e-04 | 2535.04 ms | 53.3% bf16 MFU | 206956 tok/s +step 13300/19560 | loss 3.373222 (+0.79z)| norm 0.2774 (+0.51z)| lr 1.49e-04 | 2532.50 ms | 53.3% bf16 MFU | 206960 tok/s +step 13301/19560 | loss 3.406834 (+1.69z)| norm 0.2616 (-0.65z)| lr 1.49e-04 | 2532.84 ms | 53.3% bf16 MFU | 206961 tok/s +step 13302/19560 | loss 3.339081 (-0.16z)| norm 0.2680 (-0.18z)| lr 1.49e-04 | 2533.91 ms | 53.3% bf16 MFU | 206959 tok/s +step 13303/19560 | loss 3.305125 (-1.08z)| norm 0.2611 (-0.68z)| lr 1.49e-04 | 2531.90 ms | 53.3% bf16 MFU | 206964 tok/s +step 13304/19560 | loss 3.373754 (+0.81z)| norm 0.2578 (-0.91z)| lr 1.49e-04 | 2534.37 ms | 53.3% bf16 MFU | 206960 tok/s +step 13305/19560 | loss 3.359352 (+0.41z)| norm 0.2641 (-0.45z)| lr 1.49e-04 | 2533.82 ms | 53.3% bf16 MFU | 206958 tok/s +step 13306/19560 | loss 3.306217 (-1.04z)| norm 0.2747 (+0.36z)| lr 1.49e-04 | 2532.13 ms | 53.3% bf16 MFU | 206962 tok/s +step 13307/19560 | loss 3.339019 (-0.14z)| norm 0.2530 (-1.32z)| lr 1.49e-04 | 2532.55 ms | 53.3% bf16 MFU | 206965 tok/s +step 13308/19560 | loss 3.328418 (-0.44z)| norm 0.2898 (+1.62z)| lr 1.49e-04 | 2532.18 ms | 53.3% bf16 MFU | 206970 tok/s +step 13309/19560 | loss 3.301919 (-1.16z)| norm 0.2630 (-0.52z)| lr 1.49e-04 | 2533.73 ms | 53.3% bf16 MFU | 206967 tok/s +step 13310/19560 | loss 3.306341 (-1.03z)| norm 0.2805 (+0.88z)| lr 1.48e-04 | 2535.41 ms | 53.3% bf16 MFU | 206958 tok/s +step 13311/19560 | loss 3.371999 (+0.75z)| norm 0.2615 (-0.64z)| lr 1.48e-04 | 2533.77 ms | 53.3% bf16 MFU | 206956 tok/s +step 13312/19560 | loss 3.300575 (-1.18z)| norm 0.2664 (-0.25z)| lr 1.48e-04 | 2533.63 ms | 53.3% bf16 MFU | 206955 tok/s +step 13313/19560 | loss 3.430180 (+2.29z)| norm 0.2998 (+2.36z)| lr 1.48e-04 | 2533.69 ms | 53.3% bf16 MFU | 206954 tok/s +step 13314/19560 | loss 3.294257 (-1.33z)| norm 0.2563 (-1.04z)| lr 1.48e-04 | 2533.77 ms | 53.3% bf16 MFU | 206952 tok/s +step 13315/19560 | loss 3.338983 (-0.14z)| norm 0.2956 (+2.02z)| lr 1.48e-04 | 2533.93 ms | 53.3% bf16 MFU | 206950 tok/s +step 13316/19560 | loss 3.397544 (+1.39z)| norm 0.2697 (+0.01z)| lr 1.48e-04 | 2534.22 ms | 53.3% bf16 MFU | 206946 tok/s +step 13317/19560 | loss 3.326440 (-0.49z)| norm 0.2847 (+1.16z)| lr 1.48e-04 | 2533.44 ms | 53.3% bf16 MFU | 206946 tok/s +step 13318/19560 | loss 3.290367 (-1.43z)| norm 0.2722 (+0.20z)| lr 1.48e-04 | 2534.58 ms | 53.3% bf16 MFU | 206942 tok/s +step 13319/19560 | loss 3.364261 (+0.53z)| norm 0.2631 (-0.50z)| lr 1.48e-04 | 2534.78 ms | 53.3% bf16 MFU | 206937 tok/s +step 13320/19560 | loss 3.308075 (-0.94z)| norm 0.2697 (+0.02z)| lr 1.48e-04 | 2531.97 ms | 53.3% bf16 MFU | 206943 tok/s +step 13321/19560 | loss 3.321151 (-0.59z)| norm 0.2887 (+1.48z)| lr 1.48e-04 | 2533.67 ms | 53.3% bf16 MFU | 206942 tok/s +step 13322/19560 | loss 3.330266 (-0.35z)| norm 0.2728 (+0.25z)| lr 1.48e-04 | 2533.96 ms | 53.3% bf16 MFU | 206941 tok/s +step 13323/19560 | loss 3.313298 (-0.79z)| norm 0.2748 (+0.40z)| lr 1.48e-04 | 2533.47 ms | 53.3% bf16 MFU | 206941 tok/s +step 13324/19560 | loss 3.267984 (-1.95z)| norm 0.2738 (+0.33z)| lr 1.48e-04 | 2536.30 ms | 53.2% bf16 MFU | 206929 tok/s +step 13325/19560 | loss 3.342071 (-0.01z)| norm 0.2961 (+2.01z)| lr 1.48e-04 | 2534.15 ms | 53.3% bf16 MFU | 206927 tok/s +step 13326/19560 | loss 3.294232 (-1.24z)| norm 0.2867 (+1.27z)| lr 1.48e-04 | 2534.41 ms | 53.3% bf16 MFU | 206924 tok/s +step 13327/19560 | loss 3.362593 (+0.61z)| norm 0.2795 (+0.71z)| lr 1.48e-04 | 2535.48 ms | 53.3% bf16 MFU | 206917 tok/s +step 13328/19560 | loss 3.381564 (+1.12z)| norm 0.2660 (-0.31z)| lr 1.48e-04 | 2534.38 ms | 53.3% bf16 MFU | 206915 tok/s +step 13329/19560 | loss 3.354538 (+0.38z)| norm 0.2783 (+0.63z)| lr 1.48e-04 | 2536.01 ms | 53.2% bf16 MFU | 206906 tok/s +step 13330/19560 | loss 3.257155 (-2.28z)| norm 0.2713 (+0.10z)| lr 1.48e-04 | 2535.01 ms | 53.3% bf16 MFU | 206902 tok/s +step 13331/19560 | loss 3.402165 (+1.67z)| norm 0.2932 (+1.75z)| lr 1.48e-04 | 2535.96 ms | 53.2% bf16 MFU | 206894 tok/s +step 13332/19560 | loss 3.419688 (+2.19z)| norm 0.3007 (+2.26z)| lr 1.48e-04 | 2533.57 ms | 53.3% bf16 MFU | 206896 tok/s +step 13333/19560 | loss 3.315986 (-0.68z)| norm 0.2816 (+0.83z)| lr 1.47e-04 | 2535.65 ms | 53.2% bf16 MFU | 206889 tok/s +step 13334/19560 | loss 3.297004 (-1.18z)| norm 0.3337 (+4.34z)| lr 1.47e-04 | 2533.72 ms | 53.3% bf16 MFU | 206891 tok/s +step 13335/19560 | loss 3.367452 (+0.75z)| norm 0.2813 (+0.69z)| lr 1.47e-04 | 2534.15 ms | 53.3% bf16 MFU | 206891 tok/s +step 13336/19560 | loss 3.418744 (+2.11z)| norm 0.2713 (-0.01z)| lr 1.47e-04 | 2533.19 ms | 53.3% bf16 MFU | 206895 tok/s +step 13337/19560 | loss 3.355416 (+0.40z)| norm 0.2813 (+0.67z)| lr 1.47e-04 | 2533.74 ms | 53.3% bf16 MFU | 206896 tok/s +step 13338/19560 | loss 3.361078 (+0.56z)| norm 0.2824 (+0.75z)| lr 1.47e-04 | 2533.29 ms | 53.3% bf16 MFU | 206899 tok/s +step 13339/19560 | loss 3.328605 (-0.32z)| norm 0.2720 (+0.02z)| lr 1.47e-04 | 2534.40 ms | 53.3% bf16 MFU | 206898 tok/s +step 13340/19560 | loss 3.273742 (-1.77z)| norm 0.2784 (+0.46z)| lr 1.47e-04 | 2533.38 ms | 53.3% bf16 MFU | 206900 tok/s +step 13341/19560 | loss 3.302574 (-0.98z)| norm 0.2715 (-0.03z)| lr 1.47e-04 | 2534.07 ms | 53.3% bf16 MFU | 206900 tok/s +step 13342/19560 | loss 3.296446 (-1.14z)| norm 0.2744 (+0.17z)| lr 1.47e-04 | 2532.92 ms | 53.3% bf16 MFU | 206905 tok/s +step 13343/19560 | loss 3.295248 (-1.15z)| norm 0.2590 (-0.91z)| lr 1.47e-04 | 2534.52 ms | 53.3% bf16 MFU | 206902 tok/s +step 13344/19560 | loss 3.315075 (-0.62z)| norm 0.2609 (-0.77z)| lr 1.47e-04 | 2535.80 ms | 53.2% bf16 MFU | 206895 tok/s +step 13345/19560 | loss 3.354346 (+0.42z)| norm 0.2835 (+0.81z)| lr 1.47e-04 | 2533.61 ms | 53.3% bf16 MFU | 206897 tok/s +step 13346/19560 | loss 3.304986 (-0.88z)| norm 0.2721 (+0.01z)| lr 1.47e-04 | 2533.64 ms | 53.3% bf16 MFU | 206899 tok/s +step 13347/19560 | loss 3.358387 (+0.53z)| norm 0.2807 (+0.61z)| lr 1.47e-04 | 2534.64 ms | 53.3% bf16 MFU | 206896 tok/s +step 13348/19560 | loss 3.310183 (-0.75z)| norm 0.2704 (-0.12z)| lr 1.47e-04 | 2533.01 ms | 53.3% bf16 MFU | 206900 tok/s +step 13349/19560 | loss 3.349513 (+0.30z)| norm 0.2781 (+0.41z)| lr 1.47e-04 | 2534.21 ms | 53.3% bf16 MFU | 206900 tok/s +step 13350/19560 | loss 3.291499 (-1.25z)| norm 0.2626 (-0.70z)| lr 1.47e-04 | 2533.53 ms | 53.3% bf16 MFU | 206902 tok/s +step 13351/19560 | loss 3.342985 (+0.14z)| norm 0.2669 (-0.39z)| lr 1.47e-04 | 2533.96 ms | 53.3% bf16 MFU | 206902 tok/s +step 13352/19560 | loss 3.337579 (-0.01z)| norm 0.2679 (-0.32z)| lr 1.47e-04 | 2535.05 ms | 53.3% bf16 MFU | 206897 tok/s +step 13353/19560 | loss 3.395836 (+1.53z)| norm 0.3474 (+4.84z)| lr 1.47e-04 | 2532.29 ms | 53.3% bf16 MFU | 206905 tok/s +step 13354/19560 | loss 3.292025 (-1.23z)| norm 0.2847 (+0.75z)| lr 1.47e-04 | 2535.41 ms | 53.3% bf16 MFU | 206899 tok/s +step 13355/19560 | loss 3.380955 (+1.12z)| norm 0.3077 (+2.19z)| lr 1.47e-04 | 2533.51 ms | 53.3% bf16 MFU | 206901 tok/s +step 13356/19560 | loss 3.368086 (+0.76z)| norm 0.2785 (+0.32z)| lr 1.46e-04 | 2533.70 ms | 53.3% bf16 MFU | 206902 tok/s +step 13357/19560 | loss 3.321815 (-0.46z)| norm 0.3010 (+1.73z)| lr 1.46e-04 | 2533.79 ms | 53.3% bf16 MFU | 206903 tok/s +step 13358/19560 | loss 3.394381 (+1.44z)| norm 0.2793 (+0.35z)| lr 1.46e-04 | 2534.03 ms | 53.3% bf16 MFU | 206903 tok/s +step 13359/19560 | loss 3.277295 (-1.62z)| norm 0.2891 (+0.96z)| lr 1.46e-04 | 2534.76 ms | 53.3% bf16 MFU | 206899 tok/s +step 13360/19560 | loss 3.406111 (+1.73z)| norm 0.2780 (+0.26z)| lr 1.46e-04 | 2534.66 ms | 53.3% bf16 MFU | 206897 tok/s +step 13361/19560 | loss 3.276489 (-1.62z)| norm 0.2673 (-0.41z)| lr 1.46e-04 | 2531.42 ms | 53.3% bf16 MFU | 206908 tok/s +step 13362/19560 | loss 3.313385 (-0.65z)| norm 0.2639 (-0.62z)| lr 1.46e-04 | 2535.00 ms | 53.3% bf16 MFU | 206903 tok/s +step 13363/19560 | loss 3.398571 (+1.52z)| norm 0.2692 (-0.28z)| lr 1.46e-04 | 2533.24 ms | 53.3% bf16 MFU | 206906 tok/s +step 13364/19560 | loss 3.363054 (+0.61z)| norm 0.2701 (-0.22z)| lr 1.46e-04 | 2533.32 ms | 53.3% bf16 MFU | 206909 tok/s +step 13365/19560 | loss 3.304930 (-0.86z)| norm 0.2813 (+0.49z)| lr 1.46e-04 | 2532.84 ms | 53.3% bf16 MFU | 206913 tok/s +step 13366/19560 | loss 3.346430 (+0.21z)| norm 0.2686 (-0.31z)| lr 1.46e-04 | 2532.48 ms | 53.3% bf16 MFU | 206919 tok/s +step 13367/19560 | loss 3.298817 (-1.01z)| norm 0.2720 (-0.08z)| lr 1.46e-04 | 2534.36 ms | 53.3% bf16 MFU | 206916 tok/s +step 13368/19560 | loss 3.286476 (-1.31z)| norm 0.2732 (+0.00z)| lr 1.46e-04 | 2531.88 ms | 53.3% bf16 MFU | 206924 tok/s +step 13369/19560 | loss 3.313494 (-0.60z)| norm 0.2568 (-1.04z)| lr 1.46e-04 | 2534.79 ms | 53.3% bf16 MFU | 206920 tok/s +step 13370/19560 | loss 3.357019 (+0.52z)| norm 0.2606 (-0.78z)| lr 1.46e-04 | 2535.83 ms | 53.2% bf16 MFU | 206912 tok/s +step 13371/19560 | loss 3.328128 (-0.22z)| norm 0.2647 (-0.50z)| lr 1.46e-04 | 2535.63 ms | 53.2% bf16 MFU | 206904 tok/s +step 13372/19560 | loss 3.333385 (-0.08z)| norm 0.2575 (-0.96z)| lr 1.46e-04 | 2533.86 ms | 53.3% bf16 MFU | 206905 tok/s +step 13373/19560 | loss 3.302775 (-0.87z)| norm 0.2703 (-0.13z)| lr 1.46e-04 | 2534.49 ms | 53.3% bf16 MFU | 206903 tok/s +step 13374/19560 | loss 3.349663 (+0.34z)| norm 0.2723 (+0.00z)| lr 1.46e-04 | 2536.45 ms | 53.2% bf16 MFU | 206893 tok/s +step 13375/19560 | loss 3.285880 (-1.29z)| norm 0.2606 (-0.75z)| lr 1.46e-04 | 2533.24 ms | 53.3% bf16 MFU | 206896 tok/s +step 13376/19560 | loss 3.380650 (+1.16z)| norm 0.2821 (+0.65z)| lr 1.46e-04 | 2533.86 ms | 53.3% bf16 MFU | 206897 tok/s +step 13377/19560 | loss 3.348390 (+0.32z)| norm 0.2880 (+1.03z)| lr 1.46e-04 | 2533.61 ms | 53.3% bf16 MFU | 206899 tok/s +step 13378/19560 | loss 3.302156 (-0.90z)| norm 0.2815 (+0.60z)| lr 1.46e-04 | 2533.60 ms | 53.3% bf16 MFU | 206901 tok/s +step 13379/19560 | loss 3.311782 (-0.63z)| norm 0.2703 (-0.13z)| lr 1.45e-04 | 2534.36 ms | 53.3% bf16 MFU | 206899 tok/s +step 13380/19560 | loss 3.342037 (+0.16z)| norm 0.2627 (-0.63z)| lr 1.45e-04 | 2533.92 ms | 53.3% bf16 MFU | 206900 tok/s +step 13381/19560 | loss 3.273393 (-1.61z)| norm 0.2650 (-0.49z)| lr 1.45e-04 | 2534.96 ms | 53.3% bf16 MFU | 206896 tok/s +step 13382/19560 | loss 3.282077 (-1.36z)| norm 0.2693 (-0.20z)| lr 1.45e-04 | 2533.11 ms | 53.3% bf16 MFU | 206900 tok/s +step 13383/19560 | loss 3.427082 (+2.32z)| norm 0.2762 (+0.26z)| lr 1.45e-04 | 2533.51 ms | 53.3% bf16 MFU | 206902 tok/s +step 13384/19560 | loss 3.311421 (-0.60z)| norm 0.2650 (-0.48z)| lr 1.45e-04 | 2532.44 ms | 53.3% bf16 MFU | 206908 tok/s +step 13385/19560 | loss 3.339465 (+0.10z)| norm 0.2667 (-0.36z)| lr 1.45e-04 | 2533.46 ms | 53.3% bf16 MFU | 206910 tok/s +step 13386/19560 | loss 3.322823 (-0.31z)| norm 0.2645 (-0.51z)| lr 1.45e-04 | 2533.69 ms | 53.3% bf16 MFU | 206911 tok/s +step 13387/19560 | loss 3.326394 (-0.21z)| norm 0.2562 (-1.04z)| lr 1.45e-04 | 2533.89 ms | 53.3% bf16 MFU | 206911 tok/s +step 13388/19560 | loss 3.336470 (+0.05z)| norm 0.2821 (+0.65z)| lr 1.45e-04 | 2534.96 ms | 53.3% bf16 MFU | 206906 tok/s +step 13389/19560 | loss 3.304907 (-0.79z)| norm 0.2674 (-0.29z)| lr 1.45e-04 | 2534.87 ms | 53.3% bf16 MFU | 206903 tok/s +step 13390/19560 | loss 3.351177 (+0.45z)| norm 0.2715 (-0.01z)| lr 1.45e-04 | 2532.09 ms | 53.3% bf16 MFU | 206910 tok/s +step 13391/19560 | loss 3.339328 (+0.13z)| norm 0.2736 (+0.13z)| lr 1.45e-04 | 2534.71 ms | 53.3% bf16 MFU | 206907 tok/s +step 13392/19560 | loss 3.336609 (+0.07z)| norm 0.2806 (+0.60z)| lr 1.45e-04 | 2532.81 ms | 53.3% bf16 MFU | 206912 tok/s +step 13393/19560 | loss 3.318060 (-0.43z)| norm 0.2963 (+1.62z)| lr 1.45e-04 | 2534.42 ms | 53.3% bf16 MFU | 206909 tok/s +step 13394/19560 | loss 3.275701 (-1.55z)| norm 0.2727 (+0.05z)| lr 1.45e-04 | 2533.87 ms | 53.3% bf16 MFU | 206910 tok/s +step 13395/19560 | loss 3.308450 (-0.67z)| norm 0.2824 (+0.69z)| lr 1.45e-04 | 2533.28 ms | 53.3% bf16 MFU | 206912 tok/s +step 13396/19560 | loss 3.320839 (-0.34z)| norm 0.2830 (+0.72z)| lr 1.45e-04 | 2533.27 ms | 53.3% bf16 MFU | 206914 tok/s +step 13397/19560 | loss 3.349742 (+0.45z)| norm 0.2586 (-0.88z)| lr 1.45e-04 | 2531.40 ms | 53.3% bf16 MFU | 206924 tok/s +step 13398/19560 | loss 3.344926 (+0.31z)| norm 0.2642 (-0.50z)| lr 1.45e-04 | 2533.78 ms | 53.3% bf16 MFU | 206924 tok/s +step 13399/19560 | loss 3.352492 (+0.51z)| norm 0.2777 (+0.38z)| lr 1.45e-04 | 2533.86 ms | 53.3% bf16 MFU | 206924 tok/s +step 13400/19560 | loss 3.325693 (-0.21z)| norm 0.2780 (+0.41z)| lr 1.45e-04 | 2531.98 ms | 53.3% bf16 MFU | 206931 tok/s +step 13401/19560 | loss 3.329824 (-0.10z)| norm 0.2601 (-0.78z)| lr 1.45e-04 | 2535.84 ms | 53.2% bf16 MFU | 206922 tok/s +step 13402/19560 | loss 3.331428 (-0.05z)| norm 0.2920 (+1.32z)| lr 1.45e-04 | 2533.63 ms | 53.3% bf16 MFU | 206922 tok/s +step 13403/19560 | loss 3.327135 (-0.17z)| norm 0.2659 (-0.40z)| lr 1.44e-04 | 2535.03 ms | 53.3% bf16 MFU | 206917 tok/s +step 13404/19560 | loss 3.395782 (+1.66z)| norm 0.2802 (+0.53z)| lr 1.44e-04 | 2535.80 ms | 53.2% bf16 MFU | 206909 tok/s +step 13405/19560 | loss 3.313843 (-0.53z)| norm 0.2747 (+0.16z)| lr 1.44e-04 | 2534.25 ms | 53.3% bf16 MFU | 206907 tok/s +step 13406/19560 | loss 3.368639 (+0.92z)| norm 0.2750 (+0.17z)| lr 1.44e-04 | 2534.22 ms | 53.3% bf16 MFU | 206906 tok/s +step 13407/19560 | loss 3.325195 (-0.25z)| norm 0.2841 (+0.77z)| lr 1.44e-04 | 2535.40 ms | 53.3% bf16 MFU | 206900 tok/s +step 13408/19560 | loss 3.277219 (-1.51z)| norm 0.2586 (-0.97z)| lr 1.44e-04 | 2534.98 ms | 53.3% bf16 MFU | 206896 tok/s +step 13409/19560 | loss 3.309965 (-0.65z)| norm 0.2730 (+0.00z)| lr 1.44e-04 | 2533.89 ms | 53.3% bf16 MFU | 206897 tok/s +step 13410/19560 | loss 3.285813 (-1.27z)| norm 0.2545 (-1.27z)| lr 1.44e-04 | 2533.59 ms | 53.3% bf16 MFU | 206899 tok/s +step 13411/19560 | loss 3.309604 (-0.66z)| norm 0.2646 (-0.58z)| lr 1.44e-04 | 2533.34 ms | 53.3% bf16 MFU | 206902 tok/s +step 13412/19560 | loss 3.266700 (-1.77z)| norm 0.2623 (-0.74z)| lr 1.44e-04 | 2535.18 ms | 53.3% bf16 MFU | 206897 tok/s +step 13413/19560 | loss 3.473956 (+3.51z)| norm 0.2686 (-0.31z)| lr 1.44e-04 | 2534.29 ms | 53.3% bf16 MFU | 206896 tok/s +step 13414/19560 | loss 3.364267 (+0.74z)| norm 0.2837 (+0.72z)| lr 1.44e-04 | 2534.10 ms | 53.3% bf16 MFU | 206896 tok/s +step 13415/19560 | loss 3.332878 (-0.06z)| norm 0.2908 (+1.20z)| lr 1.44e-04 | 2533.45 ms | 53.3% bf16 MFU | 206898 tok/s +step 13416/19560 | loss 3.276283 (-1.46z)| norm 0.2860 (+0.84z)| lr 1.44e-04 | 2535.13 ms | 53.3% bf16 MFU | 206894 tok/s +step 13417/19560 | loss 3.335943 (+0.02z)| norm 0.3040 (+2.06z)| lr 1.44e-04 | 2533.20 ms | 53.3% bf16 MFU | 206897 tok/s +step 13418/19560 | loss 3.273376 (-1.54z)| norm 0.2807 (+0.44z)| lr 1.44e-04 | 2533.65 ms | 53.3% bf16 MFU | 206899 tok/s +step 13419/19560 | loss 3.369884 (+0.90z)| norm 0.2896 (+1.04z)| lr 1.44e-04 | 2532.40 ms | 53.3% bf16 MFU | 206906 tok/s +step 13420/19560 | loss 3.310578 (-0.59z)| norm 0.2875 (+0.88z)| lr 1.44e-04 | 2533.60 ms | 53.3% bf16 MFU | 206907 tok/s +step 13421/19560 | loss 3.295065 (-0.96z)| norm 0.2981 (+1.59z)| lr 1.44e-04 | 2533.88 ms | 53.3% bf16 MFU | 206907 tok/s +step 13422/19560 | loss 3.331728 (-0.03z)| norm 0.2965 (+1.46z)| lr 1.44e-04 | 2532.01 ms | 53.3% bf16 MFU | 206915 tok/s +step 13423/19560 | loss 3.312938 (-0.50z)| norm 0.2638 (-0.81z)| lr 1.44e-04 | 2532.33 ms | 53.3% bf16 MFU | 206921 tok/s +step 13424/19560 | loss 3.398562 (+1.64z)| norm 0.3191 (+2.92z)| lr 1.44e-04 | 2533.71 ms | 53.3% bf16 MFU | 206921 tok/s +step 13425/19560 | loss 3.297926 (-0.88z)| norm 0.2764 (+0.03z)| lr 1.44e-04 | 2533.61 ms | 53.3% bf16 MFU | 206922 tok/s +step 13426/19560 | loss 3.361405 (+0.70z)| norm 0.2853 (+0.62z)| lr 1.43e-04 | 2533.64 ms | 53.3% bf16 MFU | 206922 tok/s +step 13427/19560 | loss 3.380344 (+1.17z)| norm 0.2870 (+0.72z)| lr 1.43e-04 | 2534.74 ms | 53.3% bf16 MFU | 206918 tok/s +step 13428/19560 | loss 3.325166 (-0.20z)| norm 0.2776 (+0.08z)| lr 1.43e-04 | 2533.54 ms | 53.3% bf16 MFU | 206919 tok/s +step 13429/19560 | loss 3.354700 (+0.56z)| norm 0.2726 (-0.26z)| lr 1.43e-04 | 2535.54 ms | 53.3% bf16 MFU | 206912 tok/s +step 13430/19560 | loss 3.314471 (-0.45z)| norm 0.2717 (-0.33z)| lr 1.43e-04 | 2533.44 ms | 53.3% bf16 MFU | 206914 tok/s +step 13431/19560 | loss 3.315140 (-0.44z)| norm 0.2808 (+0.28z)| lr 1.43e-04 | 2534.95 ms | 53.3% bf16 MFU | 206909 tok/s +step 13432/19560 | loss 3.387228 (+1.38z)| norm 0.2615 (-1.04z)| lr 1.43e-04 | 2533.75 ms | 53.3% bf16 MFU | 206910 tok/s +step 13433/19560 | loss 3.352930 (+0.52z)| norm 0.2746 (-0.15z)| lr 1.43e-04 | 2534.02 ms | 53.3% bf16 MFU | 206910 tok/s +step 13434/19560 | loss 3.325119 (-0.19z)| norm 0.2673 (-0.65z)| lr 1.43e-04 | 2532.42 ms | 53.3% bf16 MFU | 206916 tok/s +step 13435/19560 | loss 3.319020 (-0.34z)| norm 0.2604 (-1.13z)| lr 1.43e-04 | 2531.85 ms | 53.3% bf16 MFU | 206924 tok/s +step 13436/19560 | loss 3.371311 (+0.97z)| norm 0.2934 (+1.15z)| lr 1.43e-04 | 2532.66 ms | 53.3% bf16 MFU | 206928 tok/s +step 13437/19560 | loss 3.338423 (+0.13z)| norm 0.2711 (-0.39z)| lr 1.43e-04 | 2530.84 ms | 53.3% bf16 MFU | 206940 tok/s +step 13438/19560 | loss 3.386394 (+1.32z)| norm 0.2878 (+0.75z)| lr 1.43e-04 | 2533.28 ms | 53.3% bf16 MFU | 206941 tok/s +step 13439/19560 | loss 3.321048 (-0.31z)| norm 0.2759 (-0.07z)| lr 1.43e-04 | 2533.53 ms | 53.3% bf16 MFU | 206940 tok/s +step 13440/19560 | loss 3.280160 (-1.33z)| norm 0.2604 (-1.14z)| lr 1.43e-04 | 2535.42 ms | 53.3% bf16 MFU | 206933 tok/s +step 13441/19560 | loss 3.370450 (+0.97z)| norm 0.2837 (+0.48z)| lr 1.43e-04 | 2532.26 ms | 53.3% bf16 MFU | 206938 tok/s +step 13442/19560 | loss 3.359480 (+0.67z)| norm 0.2670 (-0.70z)| lr 1.43e-04 | 2533.28 ms | 53.3% bf16 MFU | 206939 tok/s +step 13443/19560 | loss 3.311469 (-0.55z)| norm 0.2724 (-0.31z)| lr 1.43e-04 | 2531.97 ms | 53.3% bf16 MFU | 206946 tok/s +step 13444/19560 | loss 3.321664 (-0.28z)| norm 0.2649 (-0.83z)| lr 1.43e-04 | 2533.84 ms | 53.3% bf16 MFU | 206944 tok/s +step 13445/19560 | loss 3.285881 (-1.19z)| norm 0.2708 (-0.41z)| lr 1.43e-04 | 2533.35 ms | 53.3% bf16 MFU | 206945 tok/s +step 13446/19560 | loss 3.316325 (-0.41z)| norm 0.2718 (-0.34z)| lr 1.43e-04 | 2532.93 ms | 53.3% bf16 MFU | 206947 tok/s +step 13447/19560 | loss 3.364239 (+0.83z)| norm 0.3124 (+2.45z)| lr 1.43e-04 | 2532.41 ms | 53.3% bf16 MFU | 206951 tok/s +step 13448/19560 | loss 3.298406 (-0.87z)| norm 0.2642 (-0.88z)| lr 1.43e-04 | 2533.31 ms | 53.3% bf16 MFU | 206951 tok/s +step 13449/19560 | loss 3.321502 (-0.28z)| norm 0.2701 (-0.46z)| lr 1.43e-04 | 2532.59 ms | 53.3% bf16 MFU | 206955 tok/s +step 13450/19560 | loss 3.404600 (+1.83z)| norm 0.2819 (+0.35z)| lr 1.42e-04 | 2533.99 ms | 53.3% bf16 MFU | 206952 tok/s +step 13451/19560 | loss 3.348211 (+0.39z)| norm 0.2824 (+0.38z)| lr 1.42e-04 | 2534.49 ms | 53.3% bf16 MFU | 206948 tok/s +step 13452/19560 | loss 3.435949 (+2.55z)| norm 0.2748 (-0.14z)| lr 1.42e-04 | 2534.58 ms | 53.3% bf16 MFU | 206943 tok/s +step 13453/19560 | loss 3.329287 (-0.13z)| norm 0.2828 (+0.42z)| lr 1.42e-04 | 2531.38 ms | 53.3% bf16 MFU | 206951 tok/s +step 13454/19560 | loss 3.331085 (-0.09z)| norm 0.2742 (-0.17z)| lr 1.42e-04 | 2533.50 ms | 53.3% bf16 MFU | 206951 tok/s +step 13455/19560 | loss 3.332227 (-0.05z)| norm 0.2862 (+0.66z)| lr 1.42e-04 | 2535.18 ms | 53.3% bf16 MFU | 206944 tok/s +step 13456/19560 | loss 3.325716 (-0.21z)| norm 0.2650 (-0.82z)| lr 1.42e-04 | 2534.56 ms | 53.3% bf16 MFU | 206939 tok/s +step 13457/19560 | loss 3.317943 (-0.40z)| norm 0.2524 (-1.66z)| lr 1.42e-04 | 2533.83 ms | 53.3% bf16 MFU | 206938 tok/s +step 13458/19560 | loss 3.271551 (-1.59z)| norm 0.2829 (+0.43z)| lr 1.42e-04 | 2535.44 ms | 53.3% bf16 MFU | 206930 tok/s +step 13459/19560 | loss 3.366550 (+0.85z)| norm 0.2591 (-1.19z)| lr 1.42e-04 | 2533.97 ms | 53.3% bf16 MFU | 206929 tok/s +step 13460/19560 | loss 3.271683 (-1.58z)| norm 0.2619 (-0.98z)| lr 1.42e-04 | 2534.79 ms | 53.3% bf16 MFU | 206924 tok/s +step 13461/19560 | loss 3.309372 (-0.59z)| norm 0.2869 (+0.74z)| lr 1.42e-04 | 2533.96 ms | 53.3% bf16 MFU | 206923 tok/s +step 13462/19560 | loss 3.304269 (-0.73z)| norm 0.2685 (-0.52z)| lr 1.42e-04 | 2534.32 ms | 53.3% bf16 MFU | 206921 tok/s +step 13463/19560 | loss 3.341197 (+0.24z)| norm 0.2620 (-0.99z)| lr 1.42e-04 | 2532.85 ms | 53.3% bf16 MFU | 206925 tok/s +step 13464/19560 | loss 3.345145 (+0.36z)| norm 0.2629 (-0.92z)| lr 1.42e-04 | 2534.69 ms | 53.3% bf16 MFU | 206921 tok/s +step 13465/19560 | loss 3.300736 (-0.81z)| norm 0.2687 (-0.48z)| lr 1.42e-04 | 2533.13 ms | 53.3% bf16 MFU | 206923 tok/s +step 13466/19560 | loss 3.374511 (+1.15z)| norm 0.2819 (+0.48z)| lr 1.42e-04 | 2531.92 ms | 53.3% bf16 MFU | 206931 tok/s +step 13467/19560 | loss 3.328308 (-0.08z)| norm 0.2679 (-0.54z)| lr 1.42e-04 | 2534.24 ms | 53.3% bf16 MFU | 206928 tok/s +step 13468/19560 | loss 3.379754 (+1.27z)| norm 0.2671 (-0.59z)| lr 1.42e-04 | 2532.56 ms | 53.3% bf16 MFU | 206933 tok/s +step 13469/19560 | loss 3.325758 (-0.17z)| norm 0.2519 (-1.67z)| lr 1.42e-04 | 2532.27 ms | 53.3% bf16 MFU | 206938 tok/s +step 13470/19560 | loss 3.316338 (-0.43z)| norm 0.2686 (-0.46z)| lr 1.42e-04 | 2533.23 ms | 53.3% bf16 MFU | 206940 tok/s +step 13471/19560 | loss 3.329933 (-0.07z)| norm 0.2787 (+0.26z)| lr 1.42e-04 | 2534.60 ms | 53.3% bf16 MFU | 206935 tok/s +step 13472/19560 | loss 3.244633 (-2.30z)| norm 0.2745 (-0.05z)| lr 1.42e-04 | 2531.93 ms | 53.3% bf16 MFU | 206942 tok/s +step 13473/19560 | loss 3.394573 (+1.62z)| norm 0.2606 (-1.05z)| lr 1.41e-04 | 2533.42 ms | 53.3% bf16 MFU | 206942 tok/s +step 13474/19560 | loss 3.373387 (+1.05z)| norm 0.2526 (-1.61z)| lr 1.41e-04 | 2532.40 ms | 53.3% bf16 MFU | 206947 tok/s +step 13475/19560 | loss 3.394464 (+1.58z)| norm 0.2446 (-2.12z)| lr 1.41e-04 | 2533.17 ms | 53.3% bf16 MFU | 206948 tok/s +step 13476/19560 | loss 3.311127 (-0.57z)| norm 0.2704 (-0.30z)| lr 1.41e-04 | 2532.56 ms | 53.3% bf16 MFU | 206951 tok/s +step 13477/19560 | loss 3.289855 (-1.10z)| norm 0.2546 (-1.39z)| lr 1.41e-04 | 2533.31 ms | 53.3% bf16 MFU | 206952 tok/s +step 13478/19560 | loss 3.377248 (+1.13z)| norm 0.2764 (+0.13z)| lr 1.41e-04 | 2533.48 ms | 53.3% bf16 MFU | 206951 tok/s +step 13479/19560 | loss 3.318125 (-0.39z)| norm 0.2557 (-1.32z)| lr 1.41e-04 | 2535.23 ms | 53.3% bf16 MFU | 206944 tok/s +step 13480/19560 | loss 3.273207 (-1.51z)| norm 0.2770 (+0.18z)| lr 1.41e-04 | 2533.88 ms | 53.3% bf16 MFU | 206942 tok/s +step 13481/19560 | loss 3.330987 (-0.03z)| norm 0.2658 (-0.63z)| lr 1.41e-04 | 2534.54 ms | 53.3% bf16 MFU | 206938 tok/s +step 13482/19560 | loss 3.331927 (-0.02z)| norm 0.2675 (-0.49z)| lr 1.41e-04 | 2533.92 ms | 53.3% bf16 MFU | 206936 tok/s +step 13483/19560 | loss 3.280667 (-1.32z)| norm 0.2660 (-0.60z)| lr 1.41e-04 | 2534.85 ms | 53.3% bf16 MFU | 206931 tok/s +step 13484/19560 | loss 3.298266 (-0.85z)| norm 0.3183 (+3.43z)| lr 1.41e-04 | 2534.80 ms | 53.3% bf16 MFU | 206926 tok/s +step 13485/19560 | loss 3.350691 (+0.50z)| norm 0.2980 (+1.88z)| lr 1.41e-04 | 2534.31 ms | 53.3% bf16 MFU | 206924 tok/s +step 13486/19560 | loss 3.325392 (-0.14z)| norm 0.3212 (+3.47z)| lr 1.41e-04 | 2535.26 ms | 53.3% bf16 MFU | 206918 tok/s +step 13487/19560 | loss 3.470684 (+3.46z)| norm 0.2940 (+1.46z)| lr 1.41e-04 | 2533.79 ms | 53.3% bf16 MFU | 206918 tok/s +step 13488/19560 | loss 3.478942 (+3.51z)| norm 0.2828 (+0.63z)| lr 1.41e-04 | 2533.71 ms | 53.3% bf16 MFU | 206918 tok/s +step 13489/19560 | loss 3.324756 (-0.21z)| norm 0.2778 (+0.26z)| lr 1.41e-04 | 2533.85 ms | 53.3% bf16 MFU | 206918 tok/s +step 13490/19560 | loss 3.319437 (-0.34z)| norm 0.2687 (-0.40z)| lr 1.41e-04 | 2533.76 ms | 53.3% bf16 MFU | 206918 tok/s +step 13491/19560 | loss 3.348302 (+0.37z)| norm 0.2683 (-0.43z)| lr 1.41e-04 | 2534.05 ms | 53.3% bf16 MFU | 206917 tok/s +step 13492/19560 | loss 3.374891 (+1.02z)| norm 0.2830 (+0.64z)| lr 1.41e-04 | 2533.80 ms | 53.3% bf16 MFU | 206917 tok/s +step 13493/19560 | loss 3.383373 (+1.21z)| norm 0.2893 (+1.09z)| lr 1.41e-04 | 2531.50 ms | 53.3% bf16 MFU | 206926 tok/s +step 13494/19560 | loss 3.368031 (+0.83z)| norm 0.2656 (-0.64z)| lr 1.41e-04 | 2533.22 ms | 53.3% bf16 MFU | 206928 tok/s +step 13495/19560 | loss 3.297758 (-0.88z)| norm 0.2819 (+0.54z)| lr 1.41e-04 | 2535.17 ms | 53.3% bf16 MFU | 206922 tok/s +step 13496/19560 | loss 3.385922 (+1.24z)| norm 0.2677 (-0.49z)| lr 1.41e-04 | 2533.28 ms | 53.3% bf16 MFU | 206924 tok/s +step 13497/19560 | loss 3.399455 (+1.54z)| norm 0.3266 (+3.59z)| lr 1.40e-04 | 2535.33 ms | 53.3% bf16 MFU | 206918 tok/s +step 13498/19560 | loss 3.331558 (-0.09z)| norm 0.2764 (+0.09z)| lr 1.40e-04 | 2533.87 ms | 53.3% bf16 MFU | 206917 tok/s +step 13499/19560 | loss 3.309042 (-0.62z)| norm 0.2975 (+1.54z)| lr 1.40e-04 | 2532.96 ms | 53.3% bf16 MFU | 206921 tok/s +step 13500/19560 | loss 3.307087 (-0.66z)| norm 0.2572 (-1.26z)| lr 1.40e-04 | 2533.11 ms | 53.3% bf16 MFU | 206923 tok/s +val loss 3.337931 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2999/10042 = 0.298646 +step 13501/19560 | loss 3.459008 (+2.86z)| norm 0.3024 (+1.83z)| lr 1.40e-04 | 2531.78 ms | 53.3% bf16 MFU | 206931 tok/s +step 13502/19560 | loss 3.313113 (-0.53z)| norm 0.2696 (-0.41z)| lr 1.40e-04 | 2533.48 ms | 53.3% bf16 MFU | 206932 tok/s +step 13503/19560 | loss 3.329657 (-0.15z)| norm 0.2718 (-0.26z)| lr 1.40e-04 | 2534.04 ms | 53.3% bf16 MFU | 206930 tok/s +step 13504/19560 | loss 3.369045 (+0.77z)| norm 0.2629 (-0.86z)| lr 1.40e-04 | 2532.92 ms | 53.3% bf16 MFU | 206933 tok/s +step 13505/19560 | loss 3.327874 (-0.19z)| norm 0.2743 (-0.08z)| lr 1.40e-04 | 2531.76 ms | 53.3% bf16 MFU | 206941 tok/s +step 13506/19560 | loss 3.319741 (-0.38z)| norm 0.2713 (-0.27z)| lr 1.40e-04 | 2531.83 ms | 53.3% bf16 MFU | 206948 tok/s +step 13507/19560 | loss 3.402821 (+1.54z)| norm 0.2731 (-0.15z)| lr 1.40e-04 | 2534.05 ms | 53.3% bf16 MFU | 206945 tok/s +step 13508/19560 | loss 3.276003 (-1.39z)| norm 0.2758 (+0.03z)| lr 1.40e-04 | 2533.02 ms | 53.3% bf16 MFU | 206947 tok/s +step 13509/19560 | loss 3.428680 (+2.09z)| norm 0.2616 (-0.95z)| lr 1.40e-04 | 2533.53 ms | 53.3% bf16 MFU | 206947 tok/s +step 13510/19560 | loss 3.345310 (+0.17z)| norm 0.2635 (-0.82z)| lr 1.40e-04 | 2532.85 ms | 53.3% bf16 MFU | 206949 tok/s +step 13511/19560 | loss 3.412985 (+1.74z)| norm 0.2527 (-1.53z)| lr 1.40e-04 | 2534.54 ms | 53.3% bf16 MFU | 206944 tok/s +step 13512/19560 | loss 3.364898 (+0.62z)| norm 0.2560 (-1.29z)| lr 1.40e-04 | 2532.72 ms | 53.3% bf16 MFU | 206948 tok/s +step 13513/19560 | loss 3.328926 (-0.21z)| norm 0.2519 (-1.55z)| lr 1.40e-04 | 2534.47 ms | 53.3% bf16 MFU | 206943 tok/s +step 13514/19560 | loss 3.379343 (+0.94z)| norm 0.2609 (-0.94z)| lr 1.40e-04 | 2533.52 ms | 53.3% bf16 MFU | 206943 tok/s +step 13515/19560 | loss 3.382585 (+1.00z)| norm 0.2548 (-1.35z)| lr 1.40e-04 | 2534.32 ms | 53.3% bf16 MFU | 206940 tok/s +step 13516/19560 | loss 3.345373 (+0.15z)| norm 0.2648 (-0.67z)| lr 1.40e-04 | 2534.40 ms | 53.3% bf16 MFU | 206936 tok/s +step 13517/19560 | loss 3.362844 (+0.54z)| norm 0.2477 (-1.78z)| lr 1.40e-04 | 2533.19 ms | 53.3% bf16 MFU | 206938 tok/s +step 13518/19560 | loss 3.322968 (-0.37z)| norm 0.2704 (-0.28z)| lr 1.40e-04 | 2532.73 ms | 53.3% bf16 MFU | 206941 tok/s +step 13519/19560 | loss 3.324540 (-0.34z)| norm 0.2741 (-0.03z)| lr 1.40e-04 | 2532.40 ms | 53.3% bf16 MFU | 206946 tok/s +step 13520/19560 | loss 3.418804 (+1.79z)| norm 0.2531 (-1.40z)| lr 1.39e-04 | 2533.10 ms | 53.3% bf16 MFU | 206947 tok/s +step 13521/19560 | loss 3.330315 (-0.22z)| norm 0.2609 (-0.87z)| lr 1.39e-04 | 2531.52 ms | 53.3% bf16 MFU | 206955 tok/s +step 13522/19560 | loss 3.496322 (+3.38z)| norm 0.2712 (-0.20z)| lr 1.39e-04 | 2533.45 ms | 53.3% bf16 MFU | 206955 tok/s +step 13523/19560 | loss 3.330324 (-0.25z)| norm 0.2651 (-0.59z)| lr 1.39e-04 | 2532.65 ms | 53.3% bf16 MFU | 206957 tok/s +step 13524/19560 | loss 3.335268 (-0.14z)| norm 0.2545 (-1.27z)| lr 1.39e-04 | 2532.49 ms | 53.3% bf16 MFU | 206961 tok/s +step 13525/19560 | loss 3.366272 (+0.53z)| norm 0.2633 (-0.70z)| lr 1.39e-04 | 2533.74 ms | 53.3% bf16 MFU | 206959 tok/s +step 13526/19560 | loss 3.341287 (-0.02z)| norm 0.2469 (-1.74z)| lr 1.39e-04 | 2534.47 ms | 53.3% bf16 MFU | 206954 tok/s +step 13527/19560 | loss 3.341434 (-0.01z)| norm 0.2730 (-0.04z)| lr 1.39e-04 | 2532.47 ms | 53.3% bf16 MFU | 206958 tok/s +step 13528/19560 | loss 3.306702 (-0.77z)| norm 0.2558 (-1.15z)| lr 1.39e-04 | 2532.86 ms | 53.3% bf16 MFU | 206959 tok/s +step 13529/19560 | loss 3.318839 (-0.50z)| norm 0.2656 (-0.52z)| lr 1.39e-04 | 2532.80 ms | 53.3% bf16 MFU | 206961 tok/s +step 13530/19560 | loss 3.328846 (-0.28z)| norm 0.2568 (-1.07z)| lr 1.39e-04 | 2534.04 ms | 53.3% bf16 MFU | 206958 tok/s +step 13531/19560 | loss 3.307969 (-0.73z)| norm 0.2824 (+0.59z)| lr 1.39e-04 | 2535.73 ms | 53.2% bf16 MFU | 206948 tok/s +step 13532/19560 | loss 3.336637 (-0.10z)| norm 0.2648 (-0.55z)| lr 1.39e-04 | 2533.73 ms | 53.3% bf16 MFU | 206947 tok/s +step 13533/19560 | loss 3.322294 (-0.41z)| norm 0.2582 (-0.97z)| lr 1.39e-04 | 2533.40 ms | 53.3% bf16 MFU | 206947 tok/s +step 13534/19560 | loss 3.375886 (+0.76z)| norm 0.2712 (-0.12z)| lr 1.39e-04 | 2535.41 ms | 53.3% bf16 MFU | 206939 tok/s +step 13535/19560 | loss 3.295244 (-1.00z)| norm 0.2517 (-1.36z)| lr 1.39e-04 | 2534.92 ms | 53.3% bf16 MFU | 206934 tok/s +step 13536/19560 | loss 3.285927 (-1.21z)| norm 0.2757 (+0.17z)| lr 1.39e-04 | 2535.06 ms | 53.3% bf16 MFU | 206928 tok/s +step 13537/19560 | loss 3.352645 (+0.25z)| norm 0.2545 (-1.18z)| lr 1.39e-04 | 2532.11 ms | 53.3% bf16 MFU | 206934 tok/s +step 13538/19560 | loss 3.438935 (+2.09z)| norm 0.2569 (-1.03z)| lr 1.39e-04 | 2534.21 ms | 53.3% bf16 MFU | 206932 tok/s +step 13539/19560 | loss 3.406865 (+1.37z)| norm 0.2774 (+0.29z)| lr 1.39e-04 | 2532.04 ms | 53.3% bf16 MFU | 206938 tok/s +step 13540/19560 | loss 3.305990 (-0.82z)| norm 0.2607 (-0.79z)| lr 1.39e-04 | 2532.72 ms | 53.3% bf16 MFU | 206941 tok/s +step 13541/19560 | loss 3.325433 (-0.38z)| norm 0.2740 (+0.07z)| lr 1.39e-04 | 2532.82 ms | 53.3% bf16 MFU | 206944 tok/s +step 13542/19560 | loss 3.373240 (+0.69z)| norm 0.2516 (-1.35z)| lr 1.39e-04 | 2531.82 ms | 53.3% bf16 MFU | 206951 tok/s +step 13543/19560 | loss 3.379455 (+0.82z)| norm 0.2463 (-1.66z)| lr 1.39e-04 | 2532.31 ms | 53.3% bf16 MFU | 206955 tok/s +step 13544/19560 | loss 3.329726 (-0.31z)| norm 0.2617 (-0.67z)| lr 1.38e-04 | 2533.07 ms | 53.3% bf16 MFU | 206956 tok/s +step 13545/19560 | loss 3.327140 (-0.36z)| norm 0.2804 (+0.54z)| lr 1.38e-04 | 2533.13 ms | 53.3% bf16 MFU | 206957 tok/s +step 13546/19560 | loss 3.359366 (+0.35z)| norm 0.2446 (-1.73z)| lr 1.38e-04 | 2532.99 ms | 53.3% bf16 MFU | 206959 tok/s +step 13547/19560 | loss 3.379662 (+0.81z)| norm 0.2736 (+0.13z)| lr 1.38e-04 | 2532.90 ms | 53.3% bf16 MFU | 206960 tok/s +step 13548/19560 | loss 3.401087 (+1.28z)| norm 0.2724 (+0.06z)| lr 1.38e-04 | 2532.92 ms | 53.3% bf16 MFU | 206962 tok/s +step 13549/19560 | loss 3.430606 (+1.90z)| norm 0.2603 (-0.71z)| lr 1.38e-04 | 2531.67 ms | 53.3% bf16 MFU | 206968 tok/s +step 13550/19560 | loss 3.354980 (+0.20z)| norm 0.2690 (-0.13z)| lr 1.38e-04 | 2532.99 ms | 53.3% bf16 MFU | 206969 tok/s +step 13551/19560 | loss 3.355496 (+0.21z)| norm 0.2658 (-0.34z)| lr 1.38e-04 | 2533.91 ms | 53.3% bf16 MFU | 206966 tok/s +step 13552/19560 | loss 3.253201 (-2.04z)| norm 0.2583 (-0.84z)| lr 1.38e-04 | 2533.57 ms | 53.3% bf16 MFU | 206964 tok/s +step 13553/19560 | loss 3.363956 (+0.41z)| norm 0.2782 (+0.52z)| lr 1.38e-04 | 2532.70 ms | 53.3% bf16 MFU | 206967 tok/s +step 13554/19560 | loss 3.277335 (-1.50z)| norm 0.2457 (-1.67z)| lr 1.38e-04 | 2533.72 ms | 53.3% bf16 MFU | 206964 tok/s +step 13555/19560 | loss 3.346054 (+0.03z)| norm 0.2655 (-0.31z)| lr 1.38e-04 | 2532.40 ms | 53.3% bf16 MFU | 206968 tok/s +step 13556/19560 | loss 3.325557 (-0.42z)| norm 0.2804 (+0.71z)| lr 1.38e-04 | 2533.30 ms | 53.3% bf16 MFU | 206967 tok/s +step 13557/19560 | loss 3.314321 (-0.67z)| norm 0.2704 (+0.03z)| lr 1.38e-04 | 2533.63 ms | 53.3% bf16 MFU | 206966 tok/s +step 13558/19560 | loss 3.318060 (-0.58z)| norm 0.2931 (+1.55z)| lr 1.38e-04 | 2533.30 ms | 53.3% bf16 MFU | 206965 tok/s +step 13559/19560 | loss 3.354954 (+0.23z)| norm 0.2594 (-0.72z)| lr 1.38e-04 | 2533.16 ms | 53.3% bf16 MFU | 206965 tok/s +step 13560/19560 | loss 3.339556 (-0.11z)| norm 0.2808 (+0.71z)| lr 1.38e-04 | 2534.79 ms | 53.3% bf16 MFU | 206959 tok/s +step 13561/19560 | loss 3.363520 (+0.42z)| norm 0.2578 (-0.83z)| lr 1.38e-04 | 2534.65 ms | 53.3% bf16 MFU | 206953 tok/s +step 13562/19560 | loss 3.276965 (-1.48z)| norm 0.2669 (-0.22z)| lr 1.38e-04 | 2534.41 ms | 53.3% bf16 MFU | 206949 tok/s +step 13563/19560 | loss 3.262982 (-1.76z)| norm 0.2608 (-0.63z)| lr 1.38e-04 | 2533.90 ms | 53.3% bf16 MFU | 206947 tok/s +step 13564/19560 | loss 3.278985 (-1.39z)| norm 0.2931 (+1.55z)| lr 1.38e-04 | 2535.32 ms | 53.3% bf16 MFU | 206939 tok/s +step 13565/19560 | loss 3.319837 (-0.50z)| norm 0.2831 (+0.87z)| lr 1.38e-04 | 2533.60 ms | 53.3% bf16 MFU | 206939 tok/s +step 13566/19560 | loss 3.303123 (-0.84z)| norm 0.2784 (+0.56z)| lr 1.38e-04 | 2535.02 ms | 53.3% bf16 MFU | 206933 tok/s +step 13567/19560 | loss 3.344629 (+0.05z)| norm 0.2721 (+0.14z)| lr 1.38e-04 | 2533.07 ms | 53.3% bf16 MFU | 206935 tok/s +step 13568/19560 | loss 3.335451 (-0.16z)| norm 0.2776 (+0.50z)| lr 1.37e-04 | 2534.38 ms | 53.3% bf16 MFU | 206932 tok/s +step 13569/19560 | loss 3.408473 (+1.42z)| norm 0.2702 (+0.01z)| lr 1.37e-04 | 2532.25 ms | 53.3% bf16 MFU | 206938 tok/s +step 13570/19560 | loss 3.310380 (-0.70z)| norm 0.2707 (+0.04z)| lr 1.37e-04 | 2531.73 ms | 53.3% bf16 MFU | 206945 tok/s +step 13571/19560 | loss 3.341484 (-0.03z)| norm 0.2712 (+0.08z)| lr 1.37e-04 | 2534.81 ms | 53.3% bf16 MFU | 206940 tok/s +step 13572/19560 | loss 3.293872 (-1.06z)| norm 0.2856 (+1.04z)| lr 1.37e-04 | 2532.81 ms | 53.3% bf16 MFU | 206943 tok/s +step 13573/19560 | loss 3.337672 (-0.12z)| norm 0.2659 (-0.29z)| lr 1.37e-04 | 2532.03 ms | 53.3% bf16 MFU | 206949 tok/s +step 13574/19560 | loss 3.318622 (-0.53z)| norm 0.2915 (+1.41z)| lr 1.37e-04 | 2534.51 ms | 53.3% bf16 MFU | 206944 tok/s +step 13575/19560 | loss 3.335400 (-0.16z)| norm 0.2631 (-0.48z)| lr 1.37e-04 | 2535.83 ms | 53.2% bf16 MFU | 206934 tok/s +step 13576/19560 | loss 3.336576 (-0.14z)| norm 0.2719 (+0.13z)| lr 1.37e-04 | 2533.88 ms | 53.3% bf16 MFU | 206933 tok/s +step 13577/19560 | loss 3.365539 (+0.48z)| norm 0.2658 (-0.29z)| lr 1.37e-04 | 2534.15 ms | 53.3% bf16 MFU | 206931 tok/s +step 13578/19560 | loss 3.465727 (+2.61z)| norm 0.2631 (-0.47z)| lr 1.37e-04 | 2533.18 ms | 53.3% bf16 MFU | 206933 tok/s +step 13579/19560 | loss 3.339093 (-0.10z)| norm 0.2739 (+0.28z)| lr 1.37e-04 | 2536.87 ms | 53.2% bf16 MFU | 206920 tok/s +step 13580/19560 | loss 3.374258 (+0.67z)| norm 0.2701 (+0.02z)| lr 1.37e-04 | 2534.18 ms | 53.3% bf16 MFU | 206918 tok/s +step 13581/19560 | loss 3.327467 (-0.35z)| norm 0.2798 (+0.70z)| lr 1.37e-04 | 2535.54 ms | 53.2% bf16 MFU | 206911 tok/s +step 13582/19560 | loss 3.359211 (+0.34z)| norm 0.2850 (+1.05z)| lr 1.37e-04 | 2533.82 ms | 53.3% bf16 MFU | 206911 tok/s +step 13583/19560 | loss 3.317298 (-0.57z)| norm 0.2647 (-0.35z)| lr 1.37e-04 | 2533.44 ms | 53.3% bf16 MFU | 206913 tok/s +step 13584/19560 | loss 3.325966 (-0.38z)| norm 0.2795 (+0.68z)| lr 1.37e-04 | 2532.04 ms | 53.3% bf16 MFU | 206920 tok/s +step 13585/19560 | loss 3.318564 (-0.54z)| norm 0.2635 (-0.45z)| lr 1.37e-04 | 2534.27 ms | 53.3% bf16 MFU | 206918 tok/s +step 13586/19560 | loss 3.342282 (-0.04z)| norm 0.2684 (-0.10z)| lr 1.37e-04 | 2534.38 ms | 53.3% bf16 MFU | 206916 tok/s +step 13587/19560 | loss 3.343501 (-0.01z)| norm 0.2997 (+2.05z)| lr 1.37e-04 | 2534.62 ms | 53.3% bf16 MFU | 206913 tok/s +step 13588/19560 | loss 3.440959 (+2.08z)| norm 0.2728 (+0.18z)| lr 1.37e-04 | 2533.84 ms | 53.3% bf16 MFU | 206913 tok/s +step 13589/19560 | loss 3.380988 (+0.77z)| norm 0.2822 (+0.84z)| lr 1.37e-04 | 2532.89 ms | 53.3% bf16 MFU | 206917 tok/s +step 13590/19560 | loss 3.294896 (-1.10z)| norm 0.2630 (-0.49z)| lr 1.37e-04 | 2534.12 ms | 53.3% bf16 MFU | 206915 tok/s +step 13591/19560 | loss 3.362615 (+0.36z)| norm 0.3013 (+2.11z)| lr 1.37e-04 | 2532.61 ms | 53.3% bf16 MFU | 206920 tok/s +step 13592/19560 | loss 3.452426 (+2.25z)| norm 0.2894 (+1.28z)| lr 1.36e-04 | 2533.13 ms | 53.3% bf16 MFU | 206923 tok/s +step 13593/19560 | loss 3.340295 (-0.14z)| norm 0.3576 (+5.20z)| lr 1.36e-04 | 2532.32 ms | 53.3% bf16 MFU | 206929 tok/s +step 13594/19560 | loss 3.325696 (-0.45z)| norm 0.2996 (+1.68z)| lr 1.36e-04 | 2534.20 ms | 53.3% bf16 MFU | 206927 tok/s +step 13595/19560 | loss 3.518191 (+3.46z)| norm 0.3388 (+3.76z)| lr 1.36e-04 | 2532.47 ms | 53.3% bf16 MFU | 206932 tok/s +step 13596/19560 | loss 3.350555 (+0.05z)| norm 0.2952 (+1.29z)| lr 1.36e-04 | 2532.58 ms | 53.3% bf16 MFU | 206936 tok/s +step 13597/19560 | loss 3.348105 (+0.00z)| norm 0.2896 (+0.96z)| lr 1.36e-04 | 2533.93 ms | 53.3% bf16 MFU | 206934 tok/s +step 13598/19560 | loss 3.332005 (-0.33z)| norm 0.2776 (+0.28z)| lr 1.36e-04 | 2531.69 ms | 53.3% bf16 MFU | 206942 tok/s +step 13599/19560 | loss 3.359295 (+0.22z)| norm 0.2739 (+0.08z)| lr 1.36e-04 | 2532.57 ms | 53.3% bf16 MFU | 206946 tok/s +step 13600/19560 | loss 3.327669 (-0.44z)| norm 0.2762 (+0.20z)| lr 1.36e-04 | 2533.86 ms | 53.3% bf16 MFU | 206944 tok/s +step 13601/19560 | loss 3.387914 (+0.81z)| norm 0.2832 (+0.58z)| lr 1.36e-04 | 2533.19 ms | 53.3% bf16 MFU | 206945 tok/s +step 13602/19560 | loss 3.245270 (-2.10z)| norm 0.2873 (+0.80z)| lr 1.36e-04 | 2532.10 ms | 53.3% bf16 MFU | 206951 tok/s +step 13603/19560 | loss 3.301258 (-0.94z)| norm 0.2891 (+0.89z)| lr 1.36e-04 | 2534.00 ms | 53.3% bf16 MFU | 206949 tok/s +step 13604/19560 | loss 3.268195 (-1.60z)| norm 0.2837 (+0.58z)| lr 1.36e-04 | 2532.76 ms | 53.3% bf16 MFU | 206951 tok/s +step 13605/19560 | loss 3.362012 (+0.30z)| norm 0.2672 (-0.36z)| lr 1.36e-04 | 2532.83 ms | 53.3% bf16 MFU | 206954 tok/s +step 13606/19560 | loss 3.361503 (+0.29z)| norm 0.2917 (+1.02z)| lr 1.36e-04 | 2534.68 ms | 53.3% bf16 MFU | 206948 tok/s +step 13607/19560 | loss 3.293749 (-1.09z)| norm 0.2886 (+0.83z)| lr 1.36e-04 | 2533.69 ms | 53.3% bf16 MFU | 206947 tok/s +step 13608/19560 | loss 3.367041 (+0.39z)| norm 0.2564 (-0.98z)| lr 1.36e-04 | 2533.86 ms | 53.3% bf16 MFU | 206945 tok/s +step 13609/19560 | loss 3.288191 (-1.21z)| norm 0.2729 (-0.05z)| lr 1.36e-04 | 2531.61 ms | 53.3% bf16 MFU | 206953 tok/s +step 13610/19560 | loss 3.307653 (-0.81z)| norm 0.2713 (-0.14z)| lr 1.36e-04 | 2531.88 ms | 53.3% bf16 MFU | 206959 tok/s +step 13611/19560 | loss 3.319278 (-0.58z)| norm 0.2687 (-0.29z)| lr 1.36e-04 | 2533.59 ms | 53.3% bf16 MFU | 206958 tok/s +step 13612/19560 | loss 3.270140 (-1.58z)| norm 0.2529 (-1.18z)| lr 1.36e-04 | 2531.44 ms | 53.3% bf16 MFU | 206965 tok/s +step 13613/19560 | loss 3.333447 (-0.28z)| norm 0.2585 (-0.84z)| lr 1.36e-04 | 2534.02 ms | 53.3% bf16 MFU | 206962 tok/s +step 13614/19560 | loss 3.360512 (+0.26z)| norm 0.2778 (+0.30z)| lr 1.36e-04 | 2534.71 ms | 53.3% bf16 MFU | 206956 tok/s +step 13615/19560 | loss 3.314651 (-0.66z)| norm 0.2611 (-0.68z)| lr 1.36e-04 | 2532.59 ms | 53.3% bf16 MFU | 206959 tok/s +step 13616/19560 | loss 3.365428 (+0.43z)| norm 0.2607 (-0.70z)| lr 1.35e-04 | 2531.76 ms | 53.3% bf16 MFU | 206965 tok/s +step 13617/19560 | loss 3.351254 (+0.12z)| norm 0.2666 (-0.34z)| lr 1.35e-04 | 2533.75 ms | 53.3% bf16 MFU | 206963 tok/s +step 13618/19560 | loss 3.308955 (-0.79z)| norm 0.2553 (-1.00z)| lr 1.35e-04 | 2533.34 ms | 53.3% bf16 MFU | 206963 tok/s +step 13619/19560 | loss 3.361133 (+0.33z)| norm 0.2686 (-0.21z)| lr 1.35e-04 | 2534.29 ms | 53.3% bf16 MFU | 206959 tok/s +step 13620/19560 | loss 3.354406 (+0.19z)| norm 0.2620 (-0.59z)| lr 1.35e-04 | 2536.09 ms | 53.2% bf16 MFU | 206947 tok/s +step 13621/19560 | loss 3.402357 (+1.22z)| norm 0.2704 (-0.08z)| lr 1.35e-04 | 2533.64 ms | 53.3% bf16 MFU | 206946 tok/s +step 13622/19560 | loss 3.313817 (-0.67z)| norm 0.2833 (+0.68z)| lr 1.35e-04 | 2533.71 ms | 53.3% bf16 MFU | 206945 tok/s +step 13623/19560 | loss 3.285595 (-1.27z)| norm 0.2863 (+0.86z)| lr 1.35e-04 | 2534.56 ms | 53.3% bf16 MFU | 206941 tok/s +step 13624/19560 | loss 3.345223 (+0.01z)| norm 0.2829 (+0.65z)| lr 1.35e-04 | 2535.18 ms | 53.3% bf16 MFU | 206934 tok/s +step 13625/19560 | loss 3.276086 (-1.45z)| norm 0.2864 (+0.91z)| lr 1.35e-04 | 2535.58 ms | 53.2% bf16 MFU | 206926 tok/s +step 13626/19560 | loss 3.352474 (+0.18z)| norm 0.2576 (-0.87z)| lr 1.35e-04 | 2536.20 ms | 53.2% bf16 MFU | 206916 tok/s +step 13627/19560 | loss 3.339444 (-0.10z)| norm 0.2734 (+0.13z)| lr 1.35e-04 | 2535.32 ms | 53.3% bf16 MFU | 206910 tok/s +step 13628/19560 | loss 3.385487 (+0.87z)| norm 0.2822 (+0.66z)| lr 1.35e-04 | 2534.78 ms | 53.3% bf16 MFU | 206906 tok/s +step 13629/19560 | loss 3.311207 (-0.71z)| norm 0.2536 (-1.12z)| lr 1.35e-04 | 2532.08 ms | 53.3% bf16 MFU | 206914 tok/s +step 13630/19560 | loss 3.306865 (-0.81z)| norm 0.2661 (-0.32z)| lr 1.35e-04 | 2535.16 ms | 53.3% bf16 MFU | 206908 tok/s +step 13631/19560 | loss 3.295886 (-1.04z)| norm 0.2649 (-0.40z)| lr 1.35e-04 | 2533.23 ms | 53.3% bf16 MFU | 206911 tok/s +step 13632/19560 | loss 3.391290 (+1.04z)| norm 0.2696 (-0.10z)| lr 1.35e-04 | 2533.24 ms | 53.3% bf16 MFU | 206914 tok/s +step 13633/19560 | loss 3.412254 (+1.47z)| norm 0.2621 (-0.57z)| lr 1.35e-04 | 2532.31 ms | 53.3% bf16 MFU | 206920 tok/s +step 13634/19560 | loss 3.304615 (-0.85z)| norm 0.2622 (-0.56z)| lr 1.35e-04 | 2536.73 ms | 53.2% bf16 MFU | 206908 tok/s +step 13635/19560 | loss 3.349109 (+0.12z)| norm 0.2614 (-0.60z)| lr 1.35e-04 | 2533.56 ms | 53.3% bf16 MFU | 206909 tok/s +step 13636/19560 | loss 3.405746 (+1.32z)| norm 0.2734 (+0.15z)| lr 1.35e-04 | 2534.31 ms | 53.3% bf16 MFU | 206908 tok/s +step 13637/19560 | loss 3.280001 (-1.39z)| norm 0.2611 (-0.62z)| lr 1.35e-04 | 2532.76 ms | 53.3% bf16 MFU | 206912 tok/s +step 13638/19560 | loss 3.394064 (+1.09z)| norm 0.2611 (-0.62z)| lr 1.35e-04 | 2532.60 ms | 53.3% bf16 MFU | 206918 tok/s +step 13639/19560 | loss 3.435320 (+1.97z)| norm 0.3270 (+3.36z)| lr 1.35e-04 | 2535.55 ms | 53.2% bf16 MFU | 206910 tok/s +step 13640/19560 | loss 3.320212 (-0.51z)| norm 0.2805 (+0.54z)| lr 1.34e-04 | 2533.84 ms | 53.3% bf16 MFU | 206911 tok/s +step 13641/19560 | loss 3.356863 (+0.28z)| norm 0.2667 (-0.31z)| lr 1.34e-04 | 2535.14 ms | 53.3% bf16 MFU | 206905 tok/s +step 13642/19560 | loss 3.407489 (+1.36z)| norm 0.2765 (+0.28z)| lr 1.34e-04 | 2535.13 ms | 53.3% bf16 MFU | 206901 tok/s +step 13643/19560 | loss 3.370425 (+0.56z)| norm 0.2627 (-0.57z)| lr 1.34e-04 | 2533.92 ms | 53.3% bf16 MFU | 206901 tok/s +step 13644/19560 | loss 3.269774 (-1.57z)| norm 0.2667 (-0.33z)| lr 1.34e-04 | 2535.68 ms | 53.2% bf16 MFU | 206894 tok/s +step 13645/19560 | loss 3.345137 (+0.04z)| norm 0.2736 (+0.09z)| lr 1.34e-04 | 2533.95 ms | 53.3% bf16 MFU | 206895 tok/s +step 13646/19560 | loss 3.404206 (+1.27z)| norm 0.2697 (-0.15z)| lr 1.34e-04 | 2532.61 ms | 53.3% bf16 MFU | 206901 tok/s +step 13647/19560 | loss 3.346881 (+0.06z)| norm 0.2617 (-0.64z)| lr 1.34e-04 | 2535.15 ms | 53.3% bf16 MFU | 206896 tok/s +step 13648/19560 | loss 3.355633 (+0.25z)| norm 0.2888 (+1.02z)| lr 1.34e-04 | 2534.06 ms | 53.3% bf16 MFU | 206896 tok/s +step 13649/19560 | loss 3.356024 (+0.26z)| norm 0.2879 (+0.95z)| lr 1.34e-04 | 2534.88 ms | 53.3% bf16 MFU | 206893 tok/s +step 13650/19560 | loss 3.383195 (+0.89z)| norm 0.3511 (+4.42z)| lr 1.34e-04 | 2532.55 ms | 53.3% bf16 MFU | 206899 tok/s +step 13651/19560 | loss 3.305998 (-0.82z)| norm 0.2779 (+0.26z)| lr 1.34e-04 | 2535.20 ms | 53.3% bf16 MFU | 206894 tok/s +step 13652/19560 | loss 3.275597 (-1.47z)| norm 0.3182 (+2.47z)| lr 1.34e-04 | 2535.15 ms | 53.3% bf16 MFU | 206890 tok/s +step 13653/19560 | loss 3.349445 (+0.16z)| norm 0.2686 (-0.29z)| lr 1.34e-04 | 2535.17 ms | 53.3% bf16 MFU | 206886 tok/s +step 13654/19560 | loss 3.332305 (-0.22z)| norm 0.2815 (+0.42z)| lr 1.34e-04 | 2534.87 ms | 53.3% bf16 MFU | 206883 tok/s +step 13655/19560 | loss 3.354901 (+0.28z)| norm 0.2734 (-0.04z)| lr 1.34e-04 | 2535.01 ms | 53.3% bf16 MFU | 206880 tok/s +step 13656/19560 | loss 3.333007 (-0.21z)| norm 0.2832 (+0.50z)| lr 1.34e-04 | 2532.92 ms | 53.3% bf16 MFU | 206885 tok/s +step 13657/19560 | loss 3.319638 (-0.51z)| norm 0.2605 (-0.77z)| lr 1.34e-04 | 2532.93 ms | 53.3% bf16 MFU | 206890 tok/s +step 13658/19560 | loss 3.497327 (+3.25z)| norm 0.3078 (+1.84z)| lr 1.34e-04 | 2534.04 ms | 53.3% bf16 MFU | 206891 tok/s +step 13659/19560 | loss 3.353236 (+0.19z)| norm 0.2644 (-0.56z)| lr 1.34e-04 | 2533.14 ms | 53.3% bf16 MFU | 206895 tok/s +step 13660/19560 | loss 3.316526 (-0.58z)| norm 0.2934 (+1.04z)| lr 1.34e-04 | 2534.71 ms | 53.3% bf16 MFU | 206892 tok/s +step 13661/19560 | loss 3.418690 (+1.55z)| norm 0.2891 (+0.78z)| lr 1.34e-04 | 2532.83 ms | 53.3% bf16 MFU | 206897 tok/s +step 13662/19560 | loss 3.345808 (+0.03z)| norm 0.2893 (+0.78z)| lr 1.34e-04 | 2533.20 ms | 53.3% bf16 MFU | 206901 tok/s +step 13663/19560 | loss 3.298501 (-0.97z)| norm 0.2706 (-0.26z)| lr 1.34e-04 | 2533.93 ms | 53.3% bf16 MFU | 206901 tok/s +step 13664/19560 | loss 3.373962 (+0.61z)| norm 0.2683 (-0.39z)| lr 1.33e-04 | 2535.07 ms | 53.3% bf16 MFU | 206897 tok/s +step 13665/19560 | loss 3.314855 (-0.64z)| norm 0.2640 (-0.63z)| lr 1.33e-04 | 2536.51 ms | 53.2% bf16 MFU | 206887 tok/s +step 13666/19560 | loss 3.456307 (+2.33z)| norm 0.3030 (+1.52z)| lr 1.33e-04 | 2533.70 ms | 53.3% bf16 MFU | 206889 tok/s +step 13667/19560 | loss 3.320014 (-0.52z)| norm 0.2837 (+0.44z)| lr 1.33e-04 | 2534.59 ms | 53.3% bf16 MFU | 206887 tok/s +step 13668/19560 | loss 3.304259 (-0.85z)| norm 0.2737 (-0.12z)| lr 1.33e-04 | 2533.47 ms | 53.3% bf16 MFU | 206890 tok/s +step 13669/19560 | loss 3.355453 (+0.23z)| norm 0.2979 (+1.21z)| lr 1.33e-04 | 2534.59 ms | 53.3% bf16 MFU | 206888 tok/s +step 13670/19560 | loss 3.331135 (-0.28z)| norm 0.2627 (-0.75z)| lr 1.33e-04 | 2533.65 ms | 53.3% bf16 MFU | 206890 tok/s +step 13671/19560 | loss 3.518194 (+3.48z)| norm 0.3285 (+2.83z)| lr 1.33e-04 | 2533.89 ms | 53.3% bf16 MFU | 206891 tok/s +step 13672/19560 | loss 3.347763 (+0.04z)| norm 0.3351 (+3.05z)| lr 1.33e-04 | 2532.94 ms | 53.3% bf16 MFU | 206896 tok/s +step 13673/19560 | loss 3.359159 (+0.27z)| norm 0.2949 (+0.92z)| lr 1.33e-04 | 2533.43 ms | 53.3% bf16 MFU | 206899 tok/s +step 13674/19560 | loss 3.359274 (+0.27z)| norm 0.3098 (+1.67z)| lr 1.33e-04 | 2535.47 ms | 53.3% bf16 MFU | 206893 tok/s +step 13675/19560 | loss 3.382720 (+0.74z)| norm 0.2999 (+1.14z)| lr 1.33e-04 | 2534.01 ms | 53.3% bf16 MFU | 206893 tok/s +step 13676/19560 | loss 3.369479 (+0.48z)| norm 0.3090 (+1.58z)| lr 1.33e-04 | 2532.61 ms | 53.3% bf16 MFU | 206899 tok/s +step 13677/19560 | loss 3.374322 (+0.60z)| norm 0.3007 (+1.14z)| lr 1.33e-04 | 2534.44 ms | 53.3% bf16 MFU | 206898 tok/s +step 13678/19560 | loss 3.462280 (+2.33z)| norm 0.2753 (-0.18z)| lr 1.33e-04 | 2535.06 ms | 53.3% bf16 MFU | 206893 tok/s +step 13679/19560 | loss 3.356260 (+0.21z)| norm 0.2997 (+1.06z)| lr 1.33e-04 | 2532.55 ms | 53.3% bf16 MFU | 206900 tok/s +step 13680/19560 | loss 3.317382 (-0.59z)| norm 0.2842 (+0.26z)| lr 1.33e-04 | 2533.74 ms | 53.3% bf16 MFU | 206901 tok/s +step 13681/19560 | loss 3.265918 (-1.60z)| norm 0.2843 (+0.26z)| lr 1.33e-04 | 2534.39 ms | 53.3% bf16 MFU | 206899 tok/s +step 13682/19560 | loss 3.344348 (-0.04z)| norm 0.2765 (-0.16z)| lr 1.33e-04 | 2536.24 ms | 53.2% bf16 MFU | 206890 tok/s +step 13683/19560 | loss 3.277021 (-1.38z)| norm 0.2832 (+0.18z)| lr 1.33e-04 | 2535.74 ms | 53.2% bf16 MFU | 206884 tok/s +step 13684/19560 | loss 3.340786 (-0.10z)| norm 0.2756 (-0.21z)| lr 1.33e-04 | 2534.98 ms | 53.3% bf16 MFU | 206881 tok/s +step 13685/19560 | loss 3.512761 (+3.19z)| norm 0.3027 (+1.19z)| lr 1.33e-04 | 2532.72 ms | 53.3% bf16 MFU | 206887 tok/s +step 13686/19560 | loss 3.326085 (-0.41z)| norm 0.2970 (+0.89z)| lr 1.33e-04 | 2535.30 ms | 53.3% bf16 MFU | 206882 tok/s +step 13687/19560 | loss 3.301400 (-0.88z)| norm 0.2899 (+0.51z)| lr 1.33e-04 | 2534.20 ms | 53.3% bf16 MFU | 206882 tok/s +step 13688/19560 | loss 3.343300 (-0.07z)| norm 0.3233 (+2.20z)| lr 1.32e-04 | 2532.49 ms | 53.3% bf16 MFU | 206890 tok/s +step 13689/19560 | loss 3.394710 (+0.91z)| norm 0.2812 (+0.02z)| lr 1.32e-04 | 2534.42 ms | 53.3% bf16 MFU | 206888 tok/s +step 13690/19560 | loss 3.353441 (+0.11z)| norm 0.3046 (+1.22z)| lr 1.32e-04 | 2532.59 ms | 53.3% bf16 MFU | 206895 tok/s +step 13691/19560 | loss 3.369538 (+0.41z)| norm 0.2795 (-0.09z)| lr 1.32e-04 | 2533.84 ms | 53.3% bf16 MFU | 206896 tok/s +step 13692/19560 | loss 3.327190 (-0.43z)| norm 0.3252 (+2.23z)| lr 1.32e-04 | 2535.66 ms | 53.2% bf16 MFU | 206889 tok/s +step 13693/19560 | loss 3.339424 (-0.19z)| norm 0.2734 (-0.40z)| lr 1.32e-04 | 2533.88 ms | 53.3% bf16 MFU | 206890 tok/s +step 13694/19560 | loss 3.386578 (+0.72z)| norm 0.3176 (+1.81z)| lr 1.32e-04 | 2536.46 ms | 53.2% bf16 MFU | 206881 tok/s +step 13695/19560 | loss 3.308066 (-0.82z)| norm 0.2837 (+0.10z)| lr 1.32e-04 | 2533.81 ms | 53.3% bf16 MFU | 206883 tok/s +step 13696/19560 | loss 3.311733 (-0.74z)| norm 0.2709 (-0.54z)| lr 1.32e-04 | 2534.52 ms | 53.3% bf16 MFU | 206882 tok/s +step 13697/19560 | loss 3.304633 (-0.86z)| norm 0.2793 (-0.12z)| lr 1.32e-04 | 2535.60 ms | 53.2% bf16 MFU | 206876 tok/s +step 13698/19560 | loss 3.302383 (-0.91z)| norm 0.2671 (-0.73z)| lr 1.32e-04 | 2535.02 ms | 53.3% bf16 MFU | 206873 tok/s +step 13699/19560 | loss 3.414611 (+1.28z)| norm 0.2842 (+0.12z)| lr 1.32e-04 | 2534.38 ms | 53.3% bf16 MFU | 206873 tok/s +step 13700/19560 | loss 3.399523 (+0.97z)| norm 0.3156 (+1.67z)| lr 1.32e-04 | 2534.81 ms | 53.3% bf16 MFU | 206871 tok/s +step 13701/19560 | loss 3.321920 (-0.54z)| norm 0.2933 (+0.55z)| lr 1.32e-04 | 2534.89 ms | 53.3% bf16 MFU | 206869 tok/s +step 13702/19560 | loss 3.355797 (+0.11z)| norm 0.2935 (+0.56z)| lr 1.32e-04 | 2533.88 ms | 53.3% bf16 MFU | 206871 tok/s +step 13703/19560 | loss 3.346215 (-0.08z)| norm 0.2771 (-0.26z)| lr 1.32e-04 | 2534.35 ms | 53.3% bf16 MFU | 206871 tok/s +step 13704/19560 | loss 3.565824 (+3.92z)| norm 0.3038 (+1.05z)| lr 1.32e-04 | 2534.86 ms | 53.3% bf16 MFU | 206869 tok/s +step 13705/19560 | loss 3.294873 (-1.03z)| norm 0.2935 (+0.53z)| lr 1.32e-04 | 2535.09 ms | 53.3% bf16 MFU | 206866 tok/s +step 13706/19560 | loss 3.378534 (+0.52z)| norm 0.2919 (+0.44z)| lr 1.32e-04 | 2532.97 ms | 53.3% bf16 MFU | 206872 tok/s +step 13707/19560 | loss 3.307721 (-0.79z)| norm 0.2696 (-0.67z)| lr 1.32e-04 | 2532.92 ms | 53.3% bf16 MFU | 206878 tok/s +step 13708/19560 | loss 3.252731 (-1.77z)| norm 0.2736 (-0.47z)| lr 1.32e-04 | 2533.56 ms | 53.3% bf16 MFU | 206881 tok/s +step 13709/19560 | loss 3.475554 (+2.24z)| norm 0.3217 (+1.88z)| lr 1.32e-04 | 2533.41 ms | 53.3% bf16 MFU | 206884 tok/s +step 13710/19560 | loss 3.392175 (+0.74z)| norm 0.2605 (-1.11z)| lr 1.32e-04 | 2534.79 ms | 53.3% bf16 MFU | 206882 tok/s +step 13711/19560 | loss 3.315347 (-0.64z)| norm 0.2773 (-0.30z)| lr 1.32e-04 | 2533.11 ms | 53.3% bf16 MFU | 206887 tok/s +step 13712/19560 | loss 3.423541 (+1.28z)| norm 0.2814 (-0.09z)| lr 1.31e-04 | 2534.12 ms | 53.3% bf16 MFU | 206887 tok/s +step 13713/19560 | loss 3.339445 (-0.22z)| norm 0.2705 (-0.63z)| lr 1.31e-04 | 2533.48 ms | 53.3% bf16 MFU | 206890 tok/s +step 13714/19560 | loss 3.349181 (-0.05z)| norm 0.2834 (-0.01z)| lr 1.31e-04 | 2533.76 ms | 53.3% bf16 MFU | 206891 tok/s +step 13715/19560 | loss 3.263794 (-1.55z)| norm 0.2860 (+0.13z)| lr 1.31e-04 | 2534.60 ms | 53.3% bf16 MFU | 206889 tok/s +step 13716/19560 | loss 3.325832 (-0.44z)| norm 0.2828 (-0.03z)| lr 1.31e-04 | 2533.64 ms | 53.3% bf16 MFU | 206891 tok/s +step 13717/19560 | loss 3.344003 (-0.11z)| norm 0.2703 (-0.64z)| lr 1.31e-04 | 2532.34 ms | 53.3% bf16 MFU | 206899 tok/s +step 13718/19560 | loss 3.340971 (-0.17z)| norm 0.2525 (-1.52z)| lr 1.31e-04 | 2532.34 ms | 53.3% bf16 MFU | 206906 tok/s +step 13719/19560 | loss 3.338729 (-0.21z)| norm 0.2725 (-0.52z)| lr 1.31e-04 | 2532.88 ms | 53.3% bf16 MFU | 206910 tok/s +step 13720/19560 | loss 3.326779 (-0.41z)| norm 0.2650 (-0.88z)| lr 1.31e-04 | 2534.56 ms | 53.3% bf16 MFU | 206907 tok/s +step 13721/19560 | loss 3.425851 (+1.37z)| norm 0.2761 (-0.32z)| lr 1.31e-04 | 2533.04 ms | 53.3% bf16 MFU | 206911 tok/s +step 13722/19560 | loss 3.402349 (+0.93z)| norm 0.2934 (+0.58z)| lr 1.31e-04 | 2534.59 ms | 53.3% bf16 MFU | 206908 tok/s +step 13723/19560 | loss 3.339355 (-0.18z)| norm 0.2652 (-0.88z)| lr 1.31e-04 | 2533.86 ms | 53.3% bf16 MFU | 206908 tok/s +step 13724/19560 | loss 3.323811 (-0.47z)| norm 0.2711 (-0.55z)| lr 1.31e-04 | 2535.55 ms | 53.2% bf16 MFU | 206902 tok/s +step 13725/19560 | loss 3.331532 (-0.32z)| norm 0.2980 (+0.89z)| lr 1.31e-04 | 2532.91 ms | 53.3% bf16 MFU | 206906 tok/s +step 13726/19560 | loss 3.372821 (+0.44z)| norm 0.3133 (+1.68z)| lr 1.31e-04 | 2535.07 ms | 53.3% bf16 MFU | 206901 tok/s +step 13727/19560 | loss 3.395071 (+0.85z)| norm 0.2912 (+0.50z)| lr 1.31e-04 | 2535.16 ms | 53.3% bf16 MFU | 206897 tok/s +step 13728/19560 | loss 3.336023 (-0.25z)| norm 0.2667 (-0.80z)| lr 1.31e-04 | 2535.54 ms | 53.2% bf16 MFU | 206891 tok/s +step 13729/19560 | loss 3.307239 (-0.77z)| norm 0.2707 (-0.58z)| lr 1.31e-04 | 2533.45 ms | 53.3% bf16 MFU | 206893 tok/s +step 13730/19560 | loss 3.369641 (+0.37z)| norm 0.2752 (-0.34z)| lr 1.31e-04 | 2532.63 ms | 53.3% bf16 MFU | 206899 tok/s +step 13731/19560 | loss 3.360393 (+0.19z)| norm 0.2669 (-0.77z)| lr 1.31e-04 | 2533.73 ms | 53.3% bf16 MFU | 206901 tok/s +step 13732/19560 | loss 3.354189 (+0.06z)| norm 0.2748 (-0.35z)| lr 1.31e-04 | 2532.54 ms | 53.3% bf16 MFU | 206907 tok/s +step 13733/19560 | loss 3.349402 (-0.03z)| norm 0.2536 (-1.45z)| lr 1.31e-04 | 2534.07 ms | 53.3% bf16 MFU | 206906 tok/s +step 13734/19560 | loss 3.357573 (+0.13z)| norm 0.2677 (-0.70z)| lr 1.31e-04 | 2533.44 ms | 53.3% bf16 MFU | 206908 tok/s +step 13735/19560 | loss 3.348214 (-0.06z)| norm 0.2661 (-0.77z)| lr 1.31e-04 | 2532.50 ms | 53.3% bf16 MFU | 206914 tok/s +step 13736/19560 | loss 3.347046 (-0.08z)| norm 0.2534 (-1.44z)| lr 1.30e-04 | 2533.23 ms | 53.3% bf16 MFU | 206916 tok/s +step 13737/19560 | loss 3.420177 (+1.30z)| norm 0.2680 (-0.67z)| lr 1.30e-04 | 2534.49 ms | 53.3% bf16 MFU | 206914 tok/s +step 13738/19560 | loss 3.338024 (-0.28z)| norm 0.2665 (-0.75z)| lr 1.30e-04 | 2534.64 ms | 53.3% bf16 MFU | 206910 tok/s +step 13739/19560 | loss 3.350143 (-0.05z)| norm 0.2761 (-0.25z)| lr 1.30e-04 | 2535.21 ms | 53.3% bf16 MFU | 206905 tok/s +step 13740/19560 | loss 3.420289 (+1.28z)| norm 0.2704 (-0.56z)| lr 1.30e-04 | 2535.09 ms | 53.3% bf16 MFU | 206900 tok/s +step 13741/19560 | loss 3.357206 (+0.06z)| norm 0.2826 (+0.07z)| lr 1.30e-04 | 2532.26 ms | 53.3% bf16 MFU | 206908 tok/s +step 13742/19560 | loss 3.387226 (+0.64z)| norm 0.2839 (+0.14z)| lr 1.30e-04 | 2533.82 ms | 53.3% bf16 MFU | 206908 tok/s +step 13743/19560 | loss 3.464863 (+2.08z)| norm 0.2915 (+0.53z)| lr 1.30e-04 | 2534.02 ms | 53.3% bf16 MFU | 206908 tok/s +step 13744/19560 | loss 3.343248 (-0.23z)| norm 0.2953 (+0.72z)| lr 1.30e-04 | 2532.95 ms | 53.3% bf16 MFU | 206912 tok/s +step 13745/19560 | loss 3.356469 (+0.02z)| norm 0.2532 (-1.51z)| lr 1.30e-04 | 2532.69 ms | 53.3% bf16 MFU | 206916 tok/s +step 13746/19560 | loss 3.348937 (-0.13z)| norm 0.2777 (-0.22z)| lr 1.30e-04 | 2532.80 ms | 53.3% bf16 MFU | 206921 tok/s +step 13747/19560 | loss 3.374427 (+0.36z)| norm 0.2639 (-0.95z)| lr 1.30e-04 | 2534.01 ms | 53.3% bf16 MFU | 206920 tok/s +step 13748/19560 | loss 3.380763 (+0.47z)| norm 0.2756 (-0.33z)| lr 1.30e-04 | 2532.87 ms | 53.3% bf16 MFU | 206923 tok/s +step 13749/19560 | loss 3.353047 (-0.05z)| norm 0.2682 (-0.73z)| lr 1.30e-04 | 2534.29 ms | 53.3% bf16 MFU | 206921 tok/s +step 13750/19560 | loss 3.350124 (-0.11z)| norm 0.2734 (-0.45z)| lr 1.30e-04 | 2534.38 ms | 53.3% bf16 MFU | 206918 tok/s +val loss 3.332983 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2985/10042 = 0.297252 +step 13751/19560 | loss 3.377908 (+0.41z)| norm 0.2682 (-0.72z)| lr 1.30e-04 | 2533.36 ms | 53.3% bf16 MFU | 206920 tok/s +step 13752/19560 | loss 3.361751 (+0.10z)| norm 0.2610 (-1.09z)| lr 1.30e-04 | 2534.05 ms | 53.3% bf16 MFU | 206919 tok/s +step 13753/19560 | loss 3.267038 (-1.72z)| norm 0.2705 (-0.57z)| lr 1.30e-04 | 2534.24 ms | 53.3% bf16 MFU | 206917 tok/s +step 13754/19560 | loss 3.319661 (-0.70z)| norm 0.2536 (-1.47z)| lr 1.30e-04 | 2532.41 ms | 53.3% bf16 MFU | 206923 tok/s +step 13755/19560 | loss 3.348276 (-0.15z)| norm 0.2656 (-0.82z)| lr 1.30e-04 | 2533.96 ms | 53.3% bf16 MFU | 206922 tok/s +step 13756/19560 | loss 3.443246 (+1.64z)| norm 0.3095 (+1.46z)| lr 1.30e-04 | 2531.98 ms | 53.3% bf16 MFU | 206929 tok/s +step 13757/19560 | loss 3.367027 (+0.19z)| norm 0.2754 (-0.33z)| lr 1.30e-04 | 2532.41 ms | 53.3% bf16 MFU | 206934 tok/s +step 13758/19560 | loss 3.345281 (-0.23z)| norm 0.2584 (-1.22z)| lr 1.30e-04 | 2532.40 ms | 53.3% bf16 MFU | 206939 tok/s +step 13759/19560 | loss 3.355737 (-0.04z)| norm 0.2741 (-0.40z)| lr 1.30e-04 | 2533.08 ms | 53.3% bf16 MFU | 206941 tok/s +step 13760/19560 | loss 3.350926 (-0.13z)| norm 0.2638 (-0.93z)| lr 1.29e-04 | 2534.24 ms | 53.3% bf16 MFU | 206938 tok/s +step 13761/19560 | loss 3.337238 (-0.38z)| norm 0.2668 (-0.78z)| lr 1.29e-04 | 2533.58 ms | 53.3% bf16 MFU | 206938 tok/s +step 13762/19560 | loss 3.336283 (-0.41z)| norm 0.2711 (-0.56z)| lr 1.29e-04 | 2532.13 ms | 53.3% bf16 MFU | 206944 tok/s +step 13763/19560 | loss 3.331513 (-0.50z)| norm 0.2714 (-0.55z)| lr 1.29e-04 | 2531.56 ms | 53.3% bf16 MFU | 206952 tok/s +step 13764/19560 | loss 3.373259 (+0.32z)| norm 0.3108 (+1.51z)| lr 1.29e-04 | 2532.08 ms | 53.3% bf16 MFU | 206957 tok/s +step 13765/19560 | loss 3.278630 (-1.52z)| norm 0.2925 (+0.54z)| lr 1.29e-04 | 2533.85 ms | 53.3% bf16 MFU | 206955 tok/s +step 13766/19560 | loss 3.384997 (+0.55z)| norm 0.2967 (+0.75z)| lr 1.29e-04 | 2533.96 ms | 53.3% bf16 MFU | 206952 tok/s +step 13767/19560 | loss 3.382473 (+0.51z)| norm 0.2720 (-0.55z)| lr 1.29e-04 | 2533.18 ms | 53.3% bf16 MFU | 206953 tok/s +step 13768/19560 | loss 3.404485 (+0.93z)| norm 0.2823 (+0.01z)| lr 1.29e-04 | 2532.17 ms | 53.3% bf16 MFU | 206958 tok/s +step 13769/19560 | loss 3.409908 (+1.02z)| norm 0.2970 (+0.79z)| lr 1.29e-04 | 2533.98 ms | 53.3% bf16 MFU | 206955 tok/s +step 13770/19560 | loss 3.392531 (+0.69z)| norm 0.2570 (-1.35z)| lr 1.29e-04 | 2533.79 ms | 53.3% bf16 MFU | 206953 tok/s +step 13771/19560 | loss 3.383839 (+0.51z)| norm 0.2885 (+0.32z)| lr 1.29e-04 | 2532.19 ms | 53.3% bf16 MFU | 206958 tok/s +step 13772/19560 | loss 3.337842 (-0.40z)| norm 0.2869 (+0.23z)| lr 1.29e-04 | 2531.72 ms | 53.3% bf16 MFU | 206965 tok/s +step 13773/19560 | loss 3.361698 (+0.07z)| norm 0.2829 (+0.01z)| lr 1.29e-04 | 2533.78 ms | 53.3% bf16 MFU | 206962 tok/s +step 13774/19560 | loss 3.341721 (-0.32z)| norm 0.2683 (-0.78z)| lr 1.29e-04 | 2531.14 ms | 53.3% bf16 MFU | 206971 tok/s +step 13775/19560 | loss 3.372248 (+0.28z)| norm 0.3050 (+1.19z)| lr 1.29e-04 | 2535.05 ms | 53.3% bf16 MFU | 206963 tok/s +step 13776/19560 | loss 3.411265 (+1.04z)| norm 0.2699 (-0.70z)| lr 1.29e-04 | 2534.00 ms | 53.3% bf16 MFU | 206960 tok/s +step 13777/19560 | loss 3.375170 (+0.33z)| norm 0.2806 (-0.12z)| lr 1.29e-04 | 2533.46 ms | 53.3% bf16 MFU | 206959 tok/s +step 13778/19560 | loss 3.436211 (+1.51z)| norm 0.2835 (+0.07z)| lr 1.29e-04 | 2534.90 ms | 53.3% bf16 MFU | 206953 tok/s +step 13779/19560 | loss 3.346261 (-0.25z)| norm 0.2634 (-1.07z)| lr 1.29e-04 | 2532.13 ms | 53.3% bf16 MFU | 206958 tok/s +step 13780/19560 | loss 3.348339 (-0.23z)| norm 0.2709 (-0.63z)| lr 1.29e-04 | 2532.94 ms | 53.3% bf16 MFU | 206959 tok/s +step 13781/19560 | loss 3.341644 (-0.36z)| norm 0.2709 (-0.63z)| lr 1.29e-04 | 2531.57 ms | 53.3% bf16 MFU | 206966 tok/s +step 13782/19560 | loss 3.385886 (+0.51z)| norm 0.2564 (-1.45z)| lr 1.29e-04 | 2533.15 ms | 53.3% bf16 MFU | 206967 tok/s +step 13783/19560 | loss 3.321867 (-0.75z)| norm 0.2814 (-0.02z)| lr 1.29e-04 | 2532.37 ms | 53.3% bf16 MFU | 206970 tok/s +step 13784/19560 | loss 3.360339 (+0.00z)| norm 0.2630 (-1.06z)| lr 1.29e-04 | 2533.24 ms | 53.3% bf16 MFU | 206970 tok/s +step 13785/19560 | loss 3.305583 (-1.07z)| norm 0.2798 (-0.11z)| lr 1.28e-04 | 2533.71 ms | 53.3% bf16 MFU | 206967 tok/s +step 13786/19560 | loss 3.300639 (-1.17z)| norm 0.2666 (-0.85z)| lr 1.28e-04 | 2533.68 ms | 53.3% bf16 MFU | 206965 tok/s +step 13787/19560 | loss 3.314017 (-0.89z)| norm 0.2845 (+0.17z)| lr 1.28e-04 | 2533.56 ms | 53.3% bf16 MFU | 206964 tok/s +step 13788/19560 | loss 3.415590 (+1.14z)| norm 0.2804 (-0.06z)| lr 1.28e-04 | 2533.59 ms | 53.3% bf16 MFU | 206963 tok/s +step 13789/19560 | loss 3.342250 (-0.33z)| norm 0.2575 (-1.36z)| lr 1.28e-04 | 2532.67 ms | 53.3% bf16 MFU | 206965 tok/s +step 13790/19560 | loss 3.345270 (-0.26z)| norm 0.2833 (+0.13z)| lr 1.28e-04 | 2533.13 ms | 53.3% bf16 MFU | 206965 tok/s +step 13791/19560 | loss 3.359557 (+0.02z)| norm 0.2656 (-0.89z)| lr 1.28e-04 | 2533.89 ms | 53.3% bf16 MFU | 206963 tok/s +step 13792/19560 | loss 3.329747 (-0.58z)| norm 0.2525 (-1.63z)| lr 1.28e-04 | 2532.83 ms | 53.3% bf16 MFU | 206964 tok/s +step 13793/19560 | loss 3.405581 (+0.94z)| norm 0.2772 (-0.23z)| lr 1.28e-04 | 2534.06 ms | 53.3% bf16 MFU | 206961 tok/s +step 13794/19560 | loss 3.357374 (-0.02z)| norm 0.2446 (-2.04z)| lr 1.28e-04 | 2534.37 ms | 53.3% bf16 MFU | 206956 tok/s +step 13795/19560 | loss 3.358265 (-0.01z)| norm 0.2654 (-0.85z)| lr 1.28e-04 | 2532.38 ms | 53.3% bf16 MFU | 206960 tok/s +step 13796/19560 | loss 3.306652 (-1.08z)| norm 0.2596 (-1.17z)| lr 1.28e-04 | 2533.89 ms | 53.3% bf16 MFU | 206958 tok/s +step 13797/19560 | loss 3.368964 (+0.21z)| norm 0.2559 (-1.35z)| lr 1.28e-04 | 2531.29 ms | 53.3% bf16 MFU | 206966 tok/s +step 13798/19560 | loss 3.454175 (+1.93z)| norm 0.2748 (-0.30z)| lr 1.28e-04 | 2531.94 ms | 53.3% bf16 MFU | 206971 tok/s +step 13799/19560 | loss 3.398293 (+0.84z)| norm 0.2798 (+0.00z)| lr 1.28e-04 | 2534.31 ms | 53.3% bf16 MFU | 206966 tok/s +step 13800/19560 | loss 3.450189 (+1.90z)| norm 0.2669 (-0.75z)| lr 1.28e-04 | 2532.33 ms | 53.3% bf16 MFU | 206970 tok/s +step 13801/19560 | loss 3.416643 (+1.18z)| norm 0.2748 (-0.26z)| lr 1.28e-04 | 2534.06 ms | 53.3% bf16 MFU | 206966 tok/s +step 13802/19560 | loss 3.370782 (+0.22z)| norm 0.2761 (-0.17z)| lr 1.28e-04 | 2532.24 ms | 53.3% bf16 MFU | 206970 tok/s +step 13803/19560 | loss 3.491541 (+2.65z)| norm 0.2749 (-0.23z)| lr 1.28e-04 | 2533.20 ms | 53.3% bf16 MFU | 206970 tok/s +step 13804/19560 | loss 3.388003 (+0.54z)| norm 0.2741 (-0.27z)| lr 1.28e-04 | 2537.29 ms | 53.2% bf16 MFU | 206953 tok/s +step 13805/19560 | loss 3.299968 (-1.22z)| norm 0.2798 (+0.10z)| lr 1.28e-04 | 2532.03 ms | 53.3% bf16 MFU | 206959 tok/s +step 13806/19560 | loss 3.333872 (-0.53z)| norm 0.2619 (-1.02z)| lr 1.28e-04 | 2533.46 ms | 53.3% bf16 MFU | 206958 tok/s +step 13807/19560 | loss 3.335167 (-0.50z)| norm 0.2744 (-0.22z)| lr 1.28e-04 | 2533.49 ms | 53.3% bf16 MFU | 206957 tok/s +step 13808/19560 | loss 3.341036 (-0.38z)| norm 0.2719 (-0.37z)| lr 1.28e-04 | 2533.16 ms | 53.3% bf16 MFU | 206958 tok/s +step 13809/19560 | loss 3.383730 (+0.48z)| norm 0.2761 (-0.10z)| lr 1.27e-04 | 2532.96 ms | 53.3% bf16 MFU | 206959 tok/s +step 13810/19560 | loss 3.466409 (+2.14z)| norm 0.3000 (+1.39z)| lr 1.27e-04 | 2533.43 ms | 53.3% bf16 MFU | 206959 tok/s +step 13811/19560 | loss 3.399287 (+0.76z)| norm 0.2642 (-0.85z)| lr 1.27e-04 | 2530.81 ms | 53.3% bf16 MFU | 206969 tok/s +step 13812/19560 | loss 3.375231 (+0.26z)| norm 0.2763 (-0.09z)| lr 1.27e-04 | 2532.70 ms | 53.3% bf16 MFU | 206971 tok/s +step 13813/19560 | loss 3.362740 (+0.03z)| norm 0.2736 (-0.25z)| lr 1.27e-04 | 2531.75 ms | 53.3% bf16 MFU | 206976 tok/s +step 13814/19560 | loss 3.333053 (-0.61z)| norm 0.2517 (-1.61z)| lr 1.27e-04 | 2533.38 ms | 53.3% bf16 MFU | 206975 tok/s +step 13815/19560 | loss 3.395188 (+0.71z)| norm 0.2713 (-0.36z)| lr 1.27e-04 | 2534.32 ms | 53.3% bf16 MFU | 206970 tok/s +step 13816/19560 | loss 3.349807 (-0.27z)| norm 0.2660 (-0.69z)| lr 1.27e-04 | 2533.34 ms | 53.3% bf16 MFU | 206969 tok/s +step 13817/19560 | loss 3.376307 (+0.30z)| norm 0.2623 (-0.92z)| lr 1.27e-04 | 2533.36 ms | 53.3% bf16 MFU | 206969 tok/s +step 13818/19560 | loss 3.409483 (+1.01z)| norm 0.2667 (-0.62z)| lr 1.27e-04 | 2531.64 ms | 53.3% bf16 MFU | 206975 tok/s +step 13819/19560 | loss 3.380508 (+0.38z)| norm 0.2729 (-0.21z)| lr 1.27e-04 | 2531.69 ms | 53.3% bf16 MFU | 206981 tok/s +step 13820/19560 | loss 3.333310 (-0.63z)| norm 0.2737 (-0.14z)| lr 1.27e-04 | 2533.76 ms | 53.3% bf16 MFU | 206978 tok/s +step 13821/19560 | loss 3.337263 (-0.55z)| norm 0.2498 (-1.75z)| lr 1.27e-04 | 2531.70 ms | 53.3% bf16 MFU | 206983 tok/s +step 13822/19560 | loss 3.447957 (+1.80z)| norm 0.2779 (+0.19z)| lr 1.27e-04 | 2532.30 ms | 53.3% bf16 MFU | 206986 tok/s +step 13823/19560 | loss 3.331882 (-0.67z)| norm 0.2624 (-0.88z)| lr 1.27e-04 | 2532.39 ms | 53.3% bf16 MFU | 206988 tok/s +step 13824/19560 | loss 3.504477 (+2.89z)| norm 0.2604 (-1.02z)| lr 1.27e-04 | 2531.71 ms | 53.3% bf16 MFU | 206994 tok/s +step 13825/19560 | loss 3.324286 (-0.85z)| norm 0.2618 (-0.90z)| lr 1.27e-04 | 2533.03 ms | 53.3% bf16 MFU | 206993 tok/s +step 13826/19560 | loss 3.329661 (-0.74z)| norm 0.2713 (-0.25z)| lr 1.27e-04 | 2534.51 ms | 53.3% bf16 MFU | 206986 tok/s +step 13827/19560 | loss 3.280258 (-1.74z)| norm 0.2596 (-1.05z)| lr 1.27e-04 | 2532.67 ms | 53.3% bf16 MFU | 206987 tok/s +step 13828/19560 | loss 3.381329 (+0.36z)| norm 0.2605 (-0.98z)| lr 1.27e-04 | 2534.59 ms | 53.3% bf16 MFU | 206981 tok/s +step 13829/19560 | loss 3.333988 (-0.63z)| norm 0.2584 (-1.12z)| lr 1.27e-04 | 2533.80 ms | 53.3% bf16 MFU | 206978 tok/s +step 13830/19560 | loss 3.468287 (+2.11z)| norm 0.2746 (+0.06z)| lr 1.27e-04 | 2534.15 ms | 53.3% bf16 MFU | 206973 tok/s +step 13831/19560 | loss 3.308015 (-1.15z)| norm 0.2595 (-1.02z)| lr 1.27e-04 | 2532.92 ms | 53.3% bf16 MFU | 206974 tok/s +step 13832/19560 | loss 3.322913 (-0.87z)| norm 0.2719 (-0.11z)| lr 1.27e-04 | 2534.35 ms | 53.3% bf16 MFU | 206969 tok/s +step 13833/19560 | loss 3.360620 (-0.06z)| norm 0.2735 (+0.02z)| lr 1.27e-04 | 2532.46 ms | 53.3% bf16 MFU | 206972 tok/s +step 13834/19560 | loss 3.344839 (-0.40z)| norm 0.2690 (-0.31z)| lr 1.26e-04 | 2532.06 ms | 53.3% bf16 MFU | 206976 tok/s +step 13835/19560 | loss 3.400802 (+0.81z)| norm 0.2853 (+0.90z)| lr 1.26e-04 | 2534.24 ms | 53.3% bf16 MFU | 206971 tok/s +step 13836/19560 | loss 3.290648 (-1.64z)| norm 0.2832 (+0.74z)| lr 1.26e-04 | 2533.49 ms | 53.3% bf16 MFU | 206970 tok/s +step 13837/19560 | loss 3.328410 (-0.79z)| norm 0.2657 (-0.56z)| lr 1.26e-04 | 2533.19 ms | 53.3% bf16 MFU | 206970 tok/s +step 13838/19560 | loss 3.221459 (-3.08z)| norm 0.2728 (-0.01z)| lr 1.26e-04 | 2535.24 ms | 53.3% bf16 MFU | 206961 tok/s +step 13839/19560 | loss 3.312510 (-1.08z)| norm 0.2915 (+1.44z)| lr 1.26e-04 | 2532.46 ms | 53.3% bf16 MFU | 206965 tok/s +step 13840/19560 | loss 3.358186 (-0.07z)| norm 0.2860 (+1.01z)| lr 1.26e-04 | 2534.30 ms | 53.3% bf16 MFU | 206960 tok/s +step 13841/19560 | loss 3.387890 (+0.58z)| norm 0.2813 (+0.63z)| lr 1.26e-04 | 2534.93 ms | 53.3% bf16 MFU | 206954 tok/s +step 13842/19560 | loss 3.378380 (+0.37z)| norm 0.2874 (+1.10z)| lr 1.26e-04 | 2533.66 ms | 53.3% bf16 MFU | 206952 tok/s +step 13843/19560 | loss 3.292850 (-1.54z)| norm 0.2805 (+0.57z)| lr 1.26e-04 | 2533.50 ms | 53.3% bf16 MFU | 206952 tok/s +step 13844/19560 | loss 3.250891 (-2.42z)| norm 0.2801 (+0.54z)| lr 1.26e-04 | 2535.11 ms | 53.3% bf16 MFU | 206945 tok/s +step 13845/19560 | loss 3.315164 (-1.00z)| norm 0.2665 (-0.51z)| lr 1.26e-04 | 2534.39 ms | 53.3% bf16 MFU | 206941 tok/s +step 13846/19560 | loss 3.205023 (-3.24z)| norm 0.2632 (-0.79z)| lr 1.26e-04 | 2533.47 ms | 53.3% bf16 MFU | 206941 tok/s +step 13847/19560 | loss 3.294194 (-1.36z)| norm 0.2824 (+0.71z)| lr 1.26e-04 | 2534.52 ms | 53.3% bf16 MFU | 206937 tok/s +step 13848/19560 | loss 3.284177 (-1.55z)| norm 0.2490 (-1.86z)| lr 1.26e-04 | 2533.02 ms | 53.3% bf16 MFU | 206939 tok/s +step 13849/19560 | loss 3.293252 (-1.34z)| norm 0.2807 (+0.58z)| lr 1.26e-04 | 2532.35 ms | 53.3% bf16 MFU | 206944 tok/s +step 13850/19560 | loss 3.250792 (-2.16z)| norm 0.2775 (+0.34z)| lr 1.26e-04 | 2534.12 ms | 53.3% bf16 MFU | 206942 tok/s +step 13851/19560 | loss 3.312210 (-0.91z)| norm 0.2613 (-0.91z)| lr 1.26e-04 | 2534.81 ms | 53.3% bf16 MFU | 206936 tok/s +step 13852/19560 | loss 3.364673 (+0.15z)| norm 0.2544 (-1.42z)| lr 1.26e-04 | 2535.96 ms | 53.2% bf16 MFU | 206926 tok/s +step 13853/19560 | loss 3.348977 (-0.17z)| norm 0.2743 (+0.12z)| lr 1.26e-04 | 2534.28 ms | 53.3% bf16 MFU | 206924 tok/s +step 13854/19560 | loss 3.268061 (-1.77z)| norm 0.2599 (-1.01z)| lr 1.26e-04 | 2535.84 ms | 53.2% bf16 MFU | 206915 tok/s +step 13855/19560 | loss 3.303969 (-1.04z)| norm 0.2934 (+1.71z)| lr 1.26e-04 | 2534.87 ms | 53.3% bf16 MFU | 206911 tok/s +step 13856/19560 | loss 3.346092 (-0.20z)| norm 0.2631 (-0.74z)| lr 1.26e-04 | 2533.99 ms | 53.3% bf16 MFU | 206911 tok/s +step 13857/19560 | loss 3.335509 (-0.42z)| norm 0.2634 (-0.71z)| lr 1.26e-04 | 2535.76 ms | 53.2% bf16 MFU | 206903 tok/s +step 13858/19560 | loss 3.359057 (+0.06z)| norm 0.2947 (+1.78z)| lr 1.25e-04 | 2534.04 ms | 53.3% bf16 MFU | 206903 tok/s +step 13859/19560 | loss 3.245919 (-2.15z)| norm 0.2663 (-0.48z)| lr 1.25e-04 | 2533.95 ms | 53.3% bf16 MFU | 206903 tok/s +step 13860/19560 | loss 3.286304 (-1.34z)| norm 0.2740 (+0.13z)| lr 1.25e-04 | 2534.53 ms | 53.3% bf16 MFU | 206901 tok/s +step 13861/19560 | loss 3.394882 (+0.78z)| norm 0.3020 (+2.30z)| lr 1.25e-04 | 2534.27 ms | 53.3% bf16 MFU | 206900 tok/s +step 13862/19560 | loss 3.349288 (-0.11z)| norm 0.2686 (-0.33z)| lr 1.25e-04 | 2532.29 ms | 53.3% bf16 MFU | 206907 tok/s +step 13863/19560 | loss 3.344931 (-0.20z)| norm 0.2889 (+1.25z)| lr 1.25e-04 | 2534.56 ms | 53.3% bf16 MFU | 206904 tok/s +step 13864/19560 | loss 3.241051 (-2.17z)| norm 0.2647 (-0.66z)| lr 1.25e-04 | 2535.26 ms | 53.3% bf16 MFU | 206899 tok/s +step 13865/19560 | loss 3.330003 (-0.45z)| norm 0.2660 (-0.55z)| lr 1.25e-04 | 2534.96 ms | 53.3% bf16 MFU | 206895 tok/s +step 13866/19560 | loss 3.321905 (-0.60z)| norm 0.2859 (+1.01z)| lr 1.25e-04 | 2535.13 ms | 53.3% bf16 MFU | 206891 tok/s +step 13867/19560 | loss 3.188869 (-3.02z)| norm 0.2793 (+0.48z)| lr 1.25e-04 | 2535.14 ms | 53.3% bf16 MFU | 206887 tok/s +step 13868/19560 | loss 3.279495 (-1.32z)| norm 0.2781 (+0.38z)| lr 1.25e-04 | 2533.20 ms | 53.3% bf16 MFU | 206891 tok/s +step 13869/19560 | loss 3.334304 (-0.31z)| norm 0.2739 (+0.06z)| lr 1.25e-04 | 2533.65 ms | 53.3% bf16 MFU | 206893 tok/s +step 13870/19560 | loss 3.306417 (-0.81z)| norm 0.2764 (+0.26z)| lr 1.25e-04 | 2532.87 ms | 53.3% bf16 MFU | 206898 tok/s +step 13871/19560 | loss 3.340784 (-0.16z)| norm 0.2681 (-0.38z)| lr 1.25e-04 | 2533.69 ms | 53.3% bf16 MFU | 206899 tok/s +step 13872/19560 | loss 3.331755 (-0.33z)| norm 0.2835 (+0.85z)| lr 1.25e-04 | 2535.23 ms | 53.3% bf16 MFU | 206894 tok/s +step 13873/19560 | loss 3.295269 (-1.00z)| norm 0.2762 (+0.26z)| lr 1.25e-04 | 2533.48 ms | 53.3% bf16 MFU | 206897 tok/s +step 13874/19560 | loss 3.348578 (-0.00z)| norm 0.2692 (-0.31z)| lr 1.25e-04 | 2535.12 ms | 53.3% bf16 MFU | 206892 tok/s +step 13875/19560 | loss 3.343860 (-0.08z)| norm 0.2722 (-0.07z)| lr 1.25e-04 | 2533.53 ms | 53.3% bf16 MFU | 206895 tok/s +step 13876/19560 | loss 3.390497 (+0.79z)| norm 0.2910 (+1.44z)| lr 1.25e-04 | 2531.68 ms | 53.3% bf16 MFU | 206905 tok/s +step 13877/19560 | loss 3.267271 (-1.50z)| norm 0.2920 (+1.50z)| lr 1.25e-04 | 2532.27 ms | 53.3% bf16 MFU | 206911 tok/s +step 13878/19560 | loss 3.303701 (-0.81z)| norm 0.2770 (+0.30z)| lr 1.25e-04 | 2533.01 ms | 53.3% bf16 MFU | 206915 tok/s +step 13879/19560 | loss 3.323519 (-0.44z)| norm 0.2799 (+0.51z)| lr 1.25e-04 | 2533.20 ms | 53.3% bf16 MFU | 206917 tok/s +step 13880/19560 | loss 3.331190 (-0.29z)| norm 0.2833 (+0.77z)| lr 1.25e-04 | 2533.01 ms | 53.3% bf16 MFU | 206921 tok/s +step 13881/19560 | loss 3.249568 (-1.79z)| norm 0.2700 (-0.29z)| lr 1.25e-04 | 2535.05 ms | 53.3% bf16 MFU | 206915 tok/s +step 13882/19560 | loss 3.283511 (-1.16z)| norm 0.2789 (+0.41z)| lr 1.25e-04 | 2532.22 ms | 53.3% bf16 MFU | 206922 tok/s +step 13883/19560 | loss 3.356361 (+0.18z)| norm 0.2678 (-0.49z)| lr 1.24e-04 | 2535.04 ms | 53.3% bf16 MFU | 206917 tok/s +step 13884/19560 | loss 3.329618 (-0.30z)| norm 0.2880 (+1.20z)| lr 1.24e-04 | 2535.25 ms | 53.3% bf16 MFU | 206911 tok/s +step 13885/19560 | loss 3.397164 (+0.95z)| norm 0.2738 (+0.01z)| lr 1.24e-04 | 2533.45 ms | 53.3% bf16 MFU | 206913 tok/s +step 13886/19560 | loss 3.356798 (+0.20z)| norm 0.2940 (+1.66z)| lr 1.24e-04 | 2533.16 ms | 53.3% bf16 MFU | 206916 tok/s +step 13887/19560 | loss 3.308936 (-0.68z)| norm 0.2926 (+1.52z)| lr 1.24e-04 | 2531.97 ms | 53.3% bf16 MFU | 206923 tok/s +step 13888/19560 | loss 3.270975 (-1.36z)| norm 0.2700 (-0.34z)| lr 1.24e-04 | 2532.44 ms | 53.3% bf16 MFU | 206928 tok/s +step 13889/19560 | loss 3.337142 (-0.14z)| norm 0.2830 (+0.72z)| lr 1.24e-04 | 2531.81 ms | 53.3% bf16 MFU | 206936 tok/s +step 13890/19560 | loss 3.304562 (-0.73z)| norm 0.2835 (+0.75z)| lr 1.24e-04 | 2534.17 ms | 53.3% bf16 MFU | 206934 tok/s +step 13891/19560 | loss 3.300078 (-0.81z)| norm 0.2569 (-1.41z)| lr 1.24e-04 | 2532.09 ms | 53.3% bf16 MFU | 206940 tok/s +step 13892/19560 | loss 3.233902 (-1.97z)| norm 0.2632 (-0.90z)| lr 1.24e-04 | 2534.08 ms | 53.3% bf16 MFU | 206937 tok/s +step 13893/19560 | loss 3.257418 (-1.54z)| norm 0.2519 (-1.81z)| lr 1.24e-04 | 2534.78 ms | 53.3% bf16 MFU | 206932 tok/s +step 13894/19560 | loss 3.346349 (+0.06z)| norm 0.2780 (+0.39z)| lr 1.24e-04 | 2533.55 ms | 53.3% bf16 MFU | 206933 tok/s +step 13895/19560 | loss 3.313070 (-0.53z)| norm 0.2528 (-1.72z)| lr 1.24e-04 | 2533.44 ms | 53.3% bf16 MFU | 206933 tok/s +step 13896/19560 | loss 3.314529 (-0.49z)| norm 0.2635 (-0.81z)| lr 1.24e-04 | 2532.37 ms | 53.3% bf16 MFU | 206939 tok/s +step 13897/19560 | loss 3.314939 (-0.47z)| norm 0.2573 (-1.32z)| lr 1.24e-04 | 2532.52 ms | 53.3% bf16 MFU | 206943 tok/s +step 13898/19560 | loss 3.248703 (-1.64z)| norm 0.2765 (+0.30z)| lr 1.24e-04 | 2535.51 ms | 53.3% bf16 MFU | 206934 tok/s +step 13899/19560 | loss 3.260593 (-1.40z)| norm 0.2670 (-0.50z)| lr 1.24e-04 | 2533.25 ms | 53.3% bf16 MFU | 206936 tok/s +step 13900/19560 | loss 3.331861 (-0.12z)| norm 0.2608 (-1.01z)| lr 1.24e-04 | 2531.32 ms | 53.3% bf16 MFU | 206945 tok/s +step 13901/19560 | loss 3.293777 (-0.80z)| norm 0.2797 (+0.62z)| lr 1.24e-04 | 2532.00 ms | 53.3% bf16 MFU | 206951 tok/s +step 13902/19560 | loss 3.247945 (-1.59z)| norm 0.2640 (-0.74z)| lr 1.24e-04 | 2533.07 ms | 53.3% bf16 MFU | 206952 tok/s +step 13903/19560 | loss 3.351451 (+0.25z)| norm 0.2794 (+0.63z)| lr 1.24e-04 | 2534.21 ms | 53.3% bf16 MFU | 206949 tok/s +step 13904/19560 | loss 3.331640 (-0.09z)| norm 0.2821 (+0.86z)| lr 1.24e-04 | 2533.14 ms | 53.3% bf16 MFU | 206950 tok/s +step 13905/19560 | loss 3.278060 (-1.03z)| norm 0.2969 (+2.13z)| lr 1.24e-04 | 2533.75 ms | 53.3% bf16 MFU | 206949 tok/s +step 13906/19560 | loss 3.255408 (-1.42z)| norm 0.2805 (+0.70z)| lr 1.24e-04 | 2532.89 ms | 53.3% bf16 MFU | 206951 tok/s +step 13907/19560 | loss 3.439749 (+1.85z)| norm 0.3475 (+5.64z)| lr 1.24e-04 | 2533.94 ms | 53.3% bf16 MFU | 206949 tok/s +step 13908/19560 | loss 3.398522 (+1.11z)| norm 0.2710 (-0.16z)| lr 1.23e-04 | 2535.74 ms | 53.2% bf16 MFU | 206939 tok/s +step 13909/19560 | loss 3.354624 (+0.33z)| norm 0.3090 (+2.62z)| lr 1.23e-04 | 2533.20 ms | 53.3% bf16 MFU | 206941 tok/s +step 13910/19560 | loss 3.370904 (+0.62z)| norm 0.2884 (+1.08z)| lr 1.23e-04 | 2533.53 ms | 53.3% bf16 MFU | 206940 tok/s +step 13911/19560 | loss 3.253405 (-1.43z)| norm 0.2638 (-0.72z)| lr 1.23e-04 | 2533.67 ms | 53.3% bf16 MFU | 206940 tok/s +step 13912/19560 | loss 3.341424 (+0.11z)| norm 0.3094 (+2.55z)| lr 1.23e-04 | 2535.58 ms | 53.2% bf16 MFU | 206932 tok/s +step 13913/19560 | loss 3.265491 (-1.20z)| norm 0.2749 (+0.08z)| lr 1.23e-04 | 2533.56 ms | 53.3% bf16 MFU | 206932 tok/s +step 13914/19560 | loss 3.337353 (+0.04z)| norm 0.2788 (+0.35z)| lr 1.23e-04 | 2533.58 ms | 53.3% bf16 MFU | 206932 tok/s +step 13915/19560 | loss 3.278540 (-0.98z)| norm 0.2737 (-0.02z)| lr 1.23e-04 | 2534.28 ms | 53.3% bf16 MFU | 206929 tok/s +step 13916/19560 | loss 3.281067 (-0.92z)| norm 0.2655 (-0.60z)| lr 1.23e-04 | 2532.01 ms | 53.3% bf16 MFU | 206936 tok/s +step 13917/19560 | loss 3.256932 (-1.32z)| norm 0.2735 (-0.03z)| lr 1.23e-04 | 2534.68 ms | 53.3% bf16 MFU | 206932 tok/s +step 13918/19560 | loss 3.304855 (-0.48z)| norm 0.2648 (-0.65z)| lr 1.23e-04 | 2533.00 ms | 53.3% bf16 MFU | 206934 tok/s +step 13919/19560 | loss 3.307195 (-0.43z)| norm 0.2659 (-0.57z)| lr 1.23e-04 | 2534.86 ms | 53.3% bf16 MFU | 206929 tok/s +step 13920/19560 | loss 3.340180 (+0.14z)| norm 0.2781 (+0.30z)| lr 1.23e-04 | 2533.15 ms | 53.3% bf16 MFU | 206931 tok/s +step 13921/19560 | loss 3.318201 (-0.23z)| norm 0.2718 (-0.16z)| lr 1.23e-04 | 2534.33 ms | 53.3% bf16 MFU | 206928 tok/s +step 13922/19560 | loss 3.333919 (+0.04z)| norm 0.2619 (-0.91z)| lr 1.23e-04 | 2534.18 ms | 53.3% bf16 MFU | 206926 tok/s +step 13923/19560 | loss 3.350085 (+0.33z)| norm 0.2733 (-0.06z)| lr 1.23e-04 | 2534.17 ms | 53.3% bf16 MFU | 206924 tok/s +step 13924/19560 | loss 3.299872 (-0.55z)| norm 0.2799 (+0.42z)| lr 1.23e-04 | 2533.66 ms | 53.3% bf16 MFU | 206924 tok/s +step 13925/19560 | loss 3.291819 (-0.68z)| norm 0.2667 (-0.58z)| lr 1.23e-04 | 2533.11 ms | 53.3% bf16 MFU | 206927 tok/s +step 13926/19560 | loss 3.482145 (+2.62z)| norm 0.3028 (+2.09z)| lr 1.23e-04 | 2532.77 ms | 53.3% bf16 MFU | 206931 tok/s +step 13927/19560 | loss 3.299469 (-0.53z)| norm 0.2619 (-0.92z)| lr 1.23e-04 | 2533.63 ms | 53.3% bf16 MFU | 206931 tok/s +step 13928/19560 | loss 3.304684 (-0.43z)| norm 0.2545 (-1.46z)| lr 1.23e-04 | 2533.13 ms | 53.3% bf16 MFU | 206933 tok/s +step 13929/19560 | loss 3.290900 (-0.66z)| norm 0.2810 (+0.48z)| lr 1.23e-04 | 2533.35 ms | 53.3% bf16 MFU | 206934 tok/s +step 13930/19560 | loss 3.317734 (-0.18z)| norm 0.2605 (-1.01z)| lr 1.23e-04 | 2533.42 ms | 53.3% bf16 MFU | 206935 tok/s +step 13931/19560 | loss 3.298365 (-0.51z)| norm 0.2535 (-1.49z)| lr 1.23e-04 | 2534.07 ms | 53.3% bf16 MFU | 206933 tok/s +step 13932/19560 | loss 3.288602 (-0.68z)| norm 0.2537 (-1.45z)| lr 1.22e-04 | 2532.02 ms | 53.3% bf16 MFU | 206939 tok/s +step 13933/19560 | loss 3.316970 (-0.16z)| norm 0.3093 (+2.46z)| lr 1.22e-04 | 2534.72 ms | 53.3% bf16 MFU | 206934 tok/s +step 13934/19560 | loss 3.260501 (-1.19z)| norm 0.2688 (-0.38z)| lr 1.22e-04 | 2533.18 ms | 53.3% bf16 MFU | 206936 tok/s +step 13935/19560 | loss 3.329018 (+0.08z)| norm 0.2857 (+0.80z)| lr 1.22e-04 | 2534.00 ms | 53.3% bf16 MFU | 206934 tok/s +step 13936/19560 | loss 3.262120 (-1.14z)| norm 0.2583 (-1.11z)| lr 1.22e-04 | 2533.36 ms | 53.3% bf16 MFU | 206935 tok/s +step 13937/19560 | loss 3.305120 (-0.34z)| norm 0.2652 (-0.62z)| lr 1.22e-04 | 2534.44 ms | 53.3% bf16 MFU | 206932 tok/s +step 13938/19560 | loss 3.357221 (+0.65z)| norm 0.2572 (-1.16z)| lr 1.22e-04 | 2533.52 ms | 53.3% bf16 MFU | 206932 tok/s +step 13939/19560 | loss 3.229591 (-1.73z)| norm 0.2704 (-0.24z)| lr 1.22e-04 | 2531.93 ms | 53.3% bf16 MFU | 206939 tok/s +step 13940/19560 | loss 3.291798 (-0.55z)| norm 0.2545 (-1.34z)| lr 1.22e-04 | 2534.94 ms | 53.3% bf16 MFU | 206933 tok/s +step 13941/19560 | loss 3.313287 (-0.13z)| norm 0.2426 (-2.11z)| lr 1.22e-04 | 2534.96 ms | 53.3% bf16 MFU | 206928 tok/s +step 13942/19560 | loss 3.273763 (-0.87z)| norm 0.2653 (-0.57z)| lr 1.22e-04 | 2532.52 ms | 53.3% bf16 MFU | 206933 tok/s +step 13943/19560 | loss 3.284557 (-0.66z)| norm 0.2684 (-0.35z)| lr 1.22e-04 | 2533.43 ms | 53.3% bf16 MFU | 206933 tok/s +step 13944/19560 | loss 3.250496 (-1.28z)| norm 0.2663 (-0.50z)| lr 1.22e-04 | 2533.95 ms | 53.3% bf16 MFU | 206932 tok/s +step 13945/19560 | loss 3.291519 (-0.50z)| norm 0.2555 (-1.23z)| lr 1.22e-04 | 2533.60 ms | 53.3% bf16 MFU | 206932 tok/s +step 13946/19560 | loss 3.409522 (+1.75z)| norm 0.2992 (+1.73z)| lr 1.22e-04 | 2533.97 ms | 53.3% bf16 MFU | 206931 tok/s +step 13947/19560 | loss 3.297359 (-0.37z)| norm 0.2592 (-0.98z)| lr 1.22e-04 | 2532.74 ms | 53.3% bf16 MFU | 206934 tok/s +step 13948/19560 | loss 3.281810 (-0.66z)| norm 0.2767 (+0.21z)| lr 1.22e-04 | 2534.11 ms | 53.3% bf16 MFU | 206932 tok/s +step 13949/19560 | loss 3.419454 (+1.92z)| norm 0.2915 (+1.20z)| lr 1.22e-04 | 2531.86 ms | 53.3% bf16 MFU | 206939 tok/s +step 13950/19560 | loss 3.266113 (-0.95z)| norm 0.2617 (-0.83z)| lr 1.22e-04 | 2533.21 ms | 53.3% bf16 MFU | 206941 tok/s +step 13951/19560 | loss 3.327764 (+0.23z)| norm 0.2681 (-0.40z)| lr 1.22e-04 | 2532.20 ms | 53.3% bf16 MFU | 206946 tok/s +step 13952/19560 | loss 3.263074 (-1.03z)| norm 0.2724 (-0.11z)| lr 1.22e-04 | 2533.73 ms | 53.3% bf16 MFU | 206945 tok/s +step 13953/19560 | loss 3.287530 (-0.52z)| norm 0.2734 (-0.05z)| lr 1.22e-04 | 2533.12 ms | 53.3% bf16 MFU | 206946 tok/s +step 13954/19560 | loss 3.321403 (+0.16z)| norm 0.2748 (+0.05z)| lr 1.22e-04 | 2533.97 ms | 53.3% bf16 MFU | 206944 tok/s +step 13955/19560 | loss 3.357113 (+0.87z)| norm 0.2951 (+1.41z)| lr 1.22e-04 | 2533.39 ms | 53.3% bf16 MFU | 206945 tok/s +step 13956/19560 | loss 3.326896 (+0.27z)| norm 0.2684 (-0.41z)| lr 1.22e-04 | 2533.80 ms | 53.3% bf16 MFU | 206943 tok/s +step 13957/19560 | loss 3.359119 (+0.92z)| norm 0.2800 (+0.37z)| lr 1.21e-04 | 2532.58 ms | 53.3% bf16 MFU | 206947 tok/s +step 13958/19560 | loss 3.338915 (+0.55z)| norm 0.2909 (+1.11z)| lr 1.21e-04 | 2533.65 ms | 53.3% bf16 MFU | 206946 tok/s +step 13959/19560 | loss 3.329274 (+0.34z)| norm 0.2855 (+0.72z)| lr 1.21e-04 | 2534.56 ms | 53.3% bf16 MFU | 206942 tok/s +step 13960/19560 | loss 3.372370 (+1.23z)| norm 0.2879 (+0.88z)| lr 1.21e-04 | 2534.78 ms | 53.3% bf16 MFU | 206936 tok/s +step 13961/19560 | loss 3.332902 (+0.41z)| norm 0.2941 (+1.28z)| lr 1.21e-04 | 2533.00 ms | 53.3% bf16 MFU | 206939 tok/s +step 13962/19560 | loss 3.289464 (-0.49z)| norm 0.2846 (+0.63z)| lr 1.21e-04 | 2534.60 ms | 53.3% bf16 MFU | 206934 tok/s +step 13963/19560 | loss 3.332058 (+0.42z)| norm 0.2697 (-0.37z)| lr 1.21e-04 | 2532.87 ms | 53.3% bf16 MFU | 206937 tok/s +step 13964/19560 | loss 3.332884 (+0.43z)| norm 0.3124 (+2.45z)| lr 1.21e-04 | 2530.77 ms | 53.4% bf16 MFU | 206949 tok/s +step 13965/19560 | loss 3.330605 (+0.38z)| norm 0.2785 (+0.20z)| lr 1.21e-04 | 2532.72 ms | 53.3% bf16 MFU | 206952 tok/s +step 13966/19560 | loss 3.288519 (-0.53z)| norm 0.2830 (+0.49z)| lr 1.21e-04 | 2534.26 ms | 53.3% bf16 MFU | 206948 tok/s +step 13967/19560 | loss 3.319142 (+0.13z)| norm 0.2679 (-0.50z)| lr 1.21e-04 | 2531.82 ms | 53.3% bf16 MFU | 206955 tok/s +step 13968/19560 | loss 3.342793 (+0.64z)| norm 0.2902 (+0.98z)| lr 1.21e-04 | 2534.15 ms | 53.3% bf16 MFU | 206951 tok/s +step 13969/19560 | loss 3.264270 (-1.04z)| norm 0.2699 (-0.36z)| lr 1.21e-04 | 2533.28 ms | 53.3% bf16 MFU | 206952 tok/s +step 13970/19560 | loss 3.311679 (+0.00z)| norm 0.2629 (-0.81z)| lr 1.21e-04 | 2531.89 ms | 53.3% bf16 MFU | 206958 tok/s +step 13971/19560 | loss 3.309608 (-0.04z)| norm 0.2888 (+0.90z)| lr 1.21e-04 | 2533.69 ms | 53.3% bf16 MFU | 206956 tok/s +step 13972/19560 | loss 3.299415 (-0.28z)| norm 0.2812 (+0.40z)| lr 1.21e-04 | 2533.16 ms | 53.3% bf16 MFU | 206957 tok/s +step 13973/19560 | loss 3.319051 (+0.15z)| norm 0.2830 (+0.51z)| lr 1.21e-04 | 2534.01 ms | 53.3% bf16 MFU | 206954 tok/s +step 13974/19560 | loss 3.369092 (+1.25z)| norm 0.2785 (+0.20z)| lr 1.21e-04 | 2532.42 ms | 53.3% bf16 MFU | 206958 tok/s +step 13975/19560 | loss 3.300659 (-0.29z)| norm 0.3068 (+2.04z)| lr 1.21e-04 | 2534.08 ms | 53.3% bf16 MFU | 206955 tok/s +step 13976/19560 | loss 3.352975 (+0.88z)| norm 0.2798 (+0.25z)| lr 1.21e-04 | 2532.08 ms | 53.3% bf16 MFU | 206960 tok/s +step 13977/19560 | loss 3.276596 (-0.83z)| norm 0.3128 (+2.37z)| lr 1.21e-04 | 2532.33 ms | 53.3% bf16 MFU | 206964 tok/s +step 13978/19560 | loss 3.313745 (-0.01z)| norm 0.2739 (-0.15z)| lr 1.21e-04 | 2533.46 ms | 53.3% bf16 MFU | 206963 tok/s +step 13979/19560 | loss 3.290934 (-0.52z)| norm 0.2828 (+0.42z)| lr 1.21e-04 | 2533.88 ms | 53.3% bf16 MFU | 206960 tok/s +step 13980/19560 | loss 3.359213 (+1.02z)| norm 0.2626 (-0.90z)| lr 1.21e-04 | 2534.39 ms | 53.3% bf16 MFU | 206956 tok/s +step 13981/19560 | loss 3.315607 (+0.04z)| norm 0.2887 (+0.80z)| lr 1.21e-04 | 2533.33 ms | 53.3% bf16 MFU | 206956 tok/s +step 13982/19560 | loss 3.353141 (+0.88z)| norm 0.3000 (+1.50z)| lr 1.20e-04 | 2531.55 ms | 53.3% bf16 MFU | 206963 tok/s +step 13983/19560 | loss 3.285215 (-0.66z)| norm 0.2720 (-0.30z)| lr 1.20e-04 | 2531.94 ms | 53.3% bf16 MFU | 206968 tok/s +step 13984/19560 | loss 3.340800 (+0.60z)| norm 0.2755 (-0.08z)| lr 1.20e-04 | 2532.11 ms | 53.3% bf16 MFU | 206973 tok/s +step 13985/19560 | loss 3.372733 (+1.31z)| norm 0.2683 (-0.55z)| lr 1.20e-04 | 2532.93 ms | 53.3% bf16 MFU | 206974 tok/s +step 13986/19560 | loss 3.315397 (+0.03z)| norm 0.2753 (-0.09z)| lr 1.20e-04 | 2534.85 ms | 53.3% bf16 MFU | 206966 tok/s +step 13987/19560 | loss 3.332806 (+0.41z)| norm 0.2817 (+0.33z)| lr 1.20e-04 | 2531.23 ms | 53.3% bf16 MFU | 206974 tok/s +step 13988/19560 | loss 3.388825 (+1.65z)| norm 0.2565 (-1.32z)| lr 1.20e-04 | 2530.30 ms | 53.4% bf16 MFU | 206986 tok/s +step 13989/19560 | loss 3.324623 (+0.22z)| norm 0.2564 (-1.31z)| lr 1.20e-04 | 2531.30 ms | 53.3% bf16 MFU | 206993 tok/s +step 13990/19560 | loss 3.245268 (-1.57z)| norm 0.2585 (-1.16z)| lr 1.20e-04 | 2532.27 ms | 53.3% bf16 MFU | 206995 tok/s +step 13991/19560 | loss 3.288856 (-0.57z)| norm 0.2538 (-1.44z)| lr 1.20e-04 | 2533.96 ms | 53.3% bf16 MFU | 206991 tok/s +step 13992/19560 | loss 3.348308 (+0.77z)| norm 0.2658 (-0.66z)| lr 1.20e-04 | 2530.74 ms | 53.4% bf16 MFU | 207000 tok/s +step 13993/19560 | loss 3.328621 (+0.32z)| norm 0.2532 (-1.46z)| lr 1.20e-04 | 2531.61 ms | 53.3% bf16 MFU | 207004 tok/s +step 13994/19560 | loss 3.341998 (+0.62z)| norm 0.2881 (+0.79z)| lr 1.20e-04 | 2532.20 ms | 53.3% bf16 MFU | 207007 tok/s +step 13995/19560 | loss 3.321028 (+0.12z)| norm 0.2575 (-1.17z)| lr 1.20e-04 | 2533.88 ms | 53.3% bf16 MFU | 207002 tok/s +step 13996/19560 | loss 3.294190 (-0.52z)| norm 0.2635 (-0.77z)| lr 1.20e-04 | 2530.64 ms | 53.4% bf16 MFU | 207011 tok/s +step 13997/19560 | loss 3.244440 (-1.67z)| norm 0.2715 (-0.26z)| lr 1.20e-04 | 2532.16 ms | 53.3% bf16 MFU | 207013 tok/s +step 13998/19560 | loss 3.288220 (-0.63z)| norm 0.2523 (-1.47z)| lr 1.20e-04 | 2534.02 ms | 53.3% bf16 MFU | 207007 tok/s +step 13999/19560 | loss 3.299789 (-0.35z)| norm 0.2667 (-0.55z)| lr 1.20e-04 | 2531.43 ms | 53.3% bf16 MFU | 207012 tok/s +step 14000/19560 | loss 3.291043 (-0.55z)| norm 0.2760 (+0.05z)| lr 1.20e-04 | 2534.46 ms | 53.3% bf16 MFU | 207005 tok/s +val loss 3.327283 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3023/10042 = 0.301036 +step 14001/19560 | loss 3.366496 (+1.20z)| norm 0.2712 (-0.25z)| lr 1.20e-04 | 2533.41 ms | 53.3% bf16 MFU | 207002 tok/s +step 14002/19560 | loss 3.304378 (-0.24z)| norm 0.2687 (-0.42z)| lr 1.20e-04 | 2533.55 ms | 53.3% bf16 MFU | 206999 tok/s +step 14003/19560 | loss 3.299738 (-0.34z)| norm 0.2482 (-1.69z)| lr 1.20e-04 | 2531.61 ms | 53.3% bf16 MFU | 207004 tok/s +step 14004/19560 | loss 3.321053 (+0.17z)| norm 0.2597 (-0.95z)| lr 1.20e-04 | 2532.17 ms | 53.3% bf16 MFU | 207006 tok/s +step 14005/19560 | loss 3.328006 (+0.33z)| norm 0.2523 (-1.39z)| lr 1.20e-04 | 2534.47 ms | 53.3% bf16 MFU | 206999 tok/s +step 14006/19560 | loss 3.327487 (+0.31z)| norm 0.2556 (-1.17z)| lr 1.20e-04 | 2533.53 ms | 53.3% bf16 MFU | 206996 tok/s +step 14007/19560 | loss 3.298718 (-0.37z)| norm 0.2665 (-0.48z)| lr 1.19e-04 | 2531.61 ms | 53.3% bf16 MFU | 207001 tok/s +step 14008/19560 | loss 3.269255 (-1.06z)| norm 0.2635 (-0.66z)| lr 1.19e-04 | 2532.93 ms | 53.3% bf16 MFU | 207000 tok/s +step 14009/19560 | loss 3.302149 (-0.29z)| norm 0.2663 (-0.48z)| lr 1.19e-04 | 2533.64 ms | 53.3% bf16 MFU | 206997 tok/s +step 14010/19560 | loss 3.256530 (-1.37z)| norm 0.2559 (-1.11z)| lr 1.19e-04 | 2533.70 ms | 53.3% bf16 MFU | 206993 tok/s +step 14011/19560 | loss 3.264128 (-1.17z)| norm 0.2925 (+1.14z)| lr 1.19e-04 | 2532.71 ms | 53.3% bf16 MFU | 206994 tok/s +step 14012/19560 | loss 3.227908 (-1.98z)| norm 0.2670 (-0.43z)| lr 1.19e-04 | 2533.79 ms | 53.3% bf16 MFU | 206990 tok/s +step 14013/19560 | loss 3.267616 (-1.04z)| norm 0.2706 (-0.20z)| lr 1.19e-04 | 2534.13 ms | 53.3% bf16 MFU | 206985 tok/s +step 14014/19560 | loss 3.329875 (+0.44z)| norm 0.2822 (+0.52z)| lr 1.19e-04 | 2534.68 ms | 53.3% bf16 MFU | 206978 tok/s +step 14015/19560 | loss 3.301831 (-0.22z)| norm 0.2636 (-0.62z)| lr 1.19e-04 | 2532.32 ms | 53.3% bf16 MFU | 206981 tok/s +step 14016/19560 | loss 3.353971 (+1.00z)| norm 0.2806 (+0.44z)| lr 1.19e-04 | 2534.72 ms | 53.3% bf16 MFU | 206974 tok/s +step 14017/19560 | loss 3.342911 (+0.73z)| norm 0.2699 (-0.22z)| lr 1.19e-04 | 2533.07 ms | 53.3% bf16 MFU | 206974 tok/s +step 14018/19560 | loss 3.341725 (+0.70z)| norm 0.2705 (-0.18z)| lr 1.19e-04 | 2532.50 ms | 53.3% bf16 MFU | 206977 tok/s +step 14019/19560 | loss 3.316054 (+0.09z)| norm 0.2916 (+1.12z)| lr 1.19e-04 | 2532.61 ms | 53.3% bf16 MFU | 206979 tok/s +step 14020/19560 | loss 3.354176 (+0.98z)| norm 0.2611 (-0.79z)| lr 1.19e-04 | 2532.02 ms | 53.3% bf16 MFU | 206983 tok/s +step 14021/19560 | loss 3.296560 (-0.41z)| norm 0.2838 (+0.62z)| lr 1.19e-04 | 2533.64 ms | 53.3% bf16 MFU | 206980 tok/s +step 14022/19560 | loss 3.312745 (-0.02z)| norm 0.2899 (+0.99z)| lr 1.19e-04 | 2532.12 ms | 53.3% bf16 MFU | 206984 tok/s +step 14023/19560 | loss 3.450964 (+3.17z)| norm 0.2742 (-0.00z)| lr 1.19e-04 | 2534.04 ms | 53.3% bf16 MFU | 206980 tok/s +step 14024/19560 | loss 3.316391 (+0.04z)| norm 0.2887 (+0.90z)| lr 1.19e-04 | 2533.64 ms | 53.3% bf16 MFU | 206977 tok/s +step 14025/19560 | loss 3.365006 (+1.16z)| norm 0.2647 (-0.61z)| lr 1.19e-04 | 2534.11 ms | 53.3% bf16 MFU | 206973 tok/s +step 14026/19560 | loss 3.267485 (-1.10z)| norm 0.2807 (+0.39z)| lr 1.19e-04 | 2533.27 ms | 53.3% bf16 MFU | 206972 tok/s +step 14027/19560 | loss 3.242524 (-1.67z)| norm 0.2823 (+0.48z)| lr 1.19e-04 | 2532.86 ms | 53.3% bf16 MFU | 206974 tok/s +step 14028/19560 | loss 3.333565 (+0.43z)| norm 0.2725 (-0.14z)| lr 1.19e-04 | 2535.10 ms | 53.3% bf16 MFU | 206965 tok/s +step 14029/19560 | loss 3.358679 (+1.00z)| norm 0.2804 (+0.36z)| lr 1.19e-04 | 2533.92 ms | 53.3% bf16 MFU | 206963 tok/s +step 14030/19560 | loss 3.341499 (+0.59z)| norm 0.2811 (+0.40z)| lr 1.19e-04 | 2536.70 ms | 53.2% bf16 MFU | 206948 tok/s +step 14031/19560 | loss 3.343080 (+0.63z)| norm 0.2899 (+0.95z)| lr 1.19e-04 | 2535.16 ms | 53.3% bf16 MFU | 206941 tok/s +step 14032/19560 | loss 3.339486 (+0.54z)| norm 0.2829 (+0.51z)| lr 1.18e-04 | 2534.14 ms | 53.3% bf16 MFU | 206939 tok/s +step 14033/19560 | loss 3.327127 (+0.25z)| norm 0.2831 (+0.53z)| lr 1.18e-04 | 2533.42 ms | 53.3% bf16 MFU | 206939 tok/s +step 14034/19560 | loss 3.335084 (+0.42z)| norm 0.2766 (+0.11z)| lr 1.18e-04 | 2532.38 ms | 53.3% bf16 MFU | 206944 tok/s +step 14035/19560 | loss 3.292514 (-0.57z)| norm 0.2744 (+0.01z)| lr 1.18e-04 | 2534.31 ms | 53.3% bf16 MFU | 206941 tok/s +step 14036/19560 | loss 3.349433 (+0.83z)| norm 0.2783 (+0.28z)| lr 1.18e-04 | 2534.72 ms | 53.3% bf16 MFU | 206936 tok/s +step 14037/19560 | loss 3.343933 (+0.70z)| norm 0.2717 (-0.16z)| lr 1.18e-04 | 2534.88 ms | 53.3% bf16 MFU | 206930 tok/s +step 14038/19560 | loss 3.319042 (+0.10z)| norm 0.2881 (+1.01z)| lr 1.18e-04 | 2533.89 ms | 53.3% bf16 MFU | 206929 tok/s +step 14039/19560 | loss 3.318878 (+0.08z)| norm 0.2656 (-0.59z)| lr 1.18e-04 | 2534.25 ms | 53.3% bf16 MFU | 206927 tok/s +step 14040/19560 | loss 3.443531 (+3.06z)| norm 0.2847 (+0.80z)| lr 1.18e-04 | 2533.84 ms | 53.3% bf16 MFU | 206926 tok/s +step 14041/19560 | loss 3.250269 (-1.58z)| norm 0.2782 (+0.32z)| lr 1.18e-04 | 2533.91 ms | 53.3% bf16 MFU | 206926 tok/s +step 14042/19560 | loss 3.326733 (+0.25z)| norm 0.2798 (+0.43z)| lr 1.18e-04 | 2532.88 ms | 53.3% bf16 MFU | 206929 tok/s +step 14043/19560 | loss 3.354030 (+0.89z)| norm 0.2557 (-1.30z)| lr 1.18e-04 | 2535.59 ms | 53.2% bf16 MFU | 206921 tok/s +step 14044/19560 | loss 3.298134 (-0.45z)| norm 0.2992 (+1.81z)| lr 1.18e-04 | 2533.85 ms | 53.3% bf16 MFU | 206921 tok/s +step 14045/19560 | loss 3.318553 (+0.03z)| norm 0.2834 (+0.67z)| lr 1.18e-04 | 2533.92 ms | 53.3% bf16 MFU | 206920 tok/s +step 14046/19560 | loss 3.322597 (+0.12z)| norm 0.2785 (+0.32z)| lr 1.18e-04 | 2532.30 ms | 53.3% bf16 MFU | 206926 tok/s +step 14047/19560 | loss 3.310048 (-0.18z)| norm 0.2826 (+0.60z)| lr 1.18e-04 | 2535.04 ms | 53.3% bf16 MFU | 206921 tok/s +step 14048/19560 | loss 3.276028 (-0.99z)| norm 0.2862 (+0.85z)| lr 1.18e-04 | 2532.31 ms | 53.3% bf16 MFU | 206926 tok/s +step 14049/19560 | loss 3.314427 (-0.06z)| norm 0.2845 (+0.72z)| lr 1.18e-04 | 2533.89 ms | 53.3% bf16 MFU | 206926 tok/s +step 14050/19560 | loss 3.295665 (-0.51z)| norm 0.2703 (-0.30z)| lr 1.18e-04 | 2534.75 ms | 53.3% bf16 MFU | 206921 tok/s +step 14051/19560 | loss 3.325544 (+0.22z)| norm 0.2703 (-0.30z)| lr 1.18e-04 | 2533.98 ms | 53.3% bf16 MFU | 206920 tok/s +step 14052/19560 | loss 3.348921 (+0.77z)| norm 0.2617 (-0.90z)| lr 1.18e-04 | 2532.94 ms | 53.3% bf16 MFU | 206924 tok/s +step 14053/19560 | loss 3.308568 (-0.21z)| norm 0.2728 (-0.11z)| lr 1.18e-04 | 2533.23 ms | 53.3% bf16 MFU | 206926 tok/s +step 14054/19560 | loss 3.256864 (-1.50z)| norm 0.2748 (+0.05z)| lr 1.18e-04 | 2533.55 ms | 53.3% bf16 MFU | 206926 tok/s +step 14055/19560 | loss 3.325197 (+0.25z)| norm 0.2790 (+0.34z)| lr 1.18e-04 | 2534.82 ms | 53.3% bf16 MFU | 206922 tok/s +step 14056/19560 | loss 3.328685 (+0.33z)| norm 0.2681 (-0.46z)| lr 1.18e-04 | 2533.62 ms | 53.3% bf16 MFU | 206922 tok/s +step 14057/19560 | loss 3.328091 (+0.31z)| norm 0.2631 (-0.81z)| lr 1.17e-04 | 2533.97 ms | 53.3% bf16 MFU | 206921 tok/s +step 14058/19560 | loss 3.308393 (-0.19z)| norm 0.2662 (-0.59z)| lr 1.17e-04 | 2534.07 ms | 53.3% bf16 MFU | 206920 tok/s +step 14059/19560 | loss 3.336307 (+0.52z)| norm 0.2768 (+0.17z)| lr 1.17e-04 | 2532.89 ms | 53.3% bf16 MFU | 206924 tok/s +step 14060/19560 | loss 3.236936 (-2.00z)| norm 0.2617 (-0.96z)| lr 1.17e-04 | 2533.18 ms | 53.3% bf16 MFU | 206926 tok/s +step 14061/19560 | loss 3.327099 (+0.28z)| norm 0.2804 (+0.47z)| lr 1.17e-04 | 2533.21 ms | 53.3% bf16 MFU | 206928 tok/s +step 14062/19560 | loss 3.340741 (+0.62z)| norm 0.2770 (+0.20z)| lr 1.17e-04 | 2534.98 ms | 53.3% bf16 MFU | 206923 tok/s +step 14063/19560 | loss 3.263477 (-1.33z)| norm 0.2637 (-0.81z)| lr 1.17e-04 | 2533.90 ms | 53.3% bf16 MFU | 206922 tok/s +step 14064/19560 | loss 3.305461 (-0.28z)| norm 0.2754 (+0.08z)| lr 1.17e-04 | 2533.99 ms | 53.3% bf16 MFU | 206921 tok/s +step 14065/19560 | loss 3.351343 (+0.88z)| norm 0.2780 (+0.27z)| lr 1.17e-04 | 2533.97 ms | 53.3% bf16 MFU | 206920 tok/s +step 14066/19560 | loss 3.328617 (+0.31z)| norm 0.2637 (-0.83z)| lr 1.17e-04 | 2534.32 ms | 53.3% bf16 MFU | 206918 tok/s +step 14067/19560 | loss 3.337715 (+0.53z)| norm 0.2839 (+0.72z)| lr 1.17e-04 | 2533.32 ms | 53.3% bf16 MFU | 206920 tok/s +step 14068/19560 | loss 3.284483 (-0.85z)| norm 0.2726 (-0.17z)| lr 1.17e-04 | 2535.05 ms | 53.3% bf16 MFU | 206915 tok/s +step 14069/19560 | loss 3.323129 (+0.15z)| norm 0.2652 (-0.78z)| lr 1.17e-04 | 2534.73 ms | 53.3% bf16 MFU | 206911 tok/s +step 14070/19560 | loss 3.363911 (+1.19z)| norm 0.2669 (-0.64z)| lr 1.17e-04 | 2533.08 ms | 53.3% bf16 MFU | 206914 tok/s +step 14071/19560 | loss 3.327355 (+0.23z)| norm 0.2713 (-0.29z)| lr 1.17e-04 | 2534.07 ms | 53.3% bf16 MFU | 206913 tok/s +step 14072/19560 | loss 3.310854 (-0.21z)| norm 0.2568 (-1.43z)| lr 1.17e-04 | 2532.75 ms | 53.3% bf16 MFU | 206918 tok/s +step 14073/19560 | loss 3.246747 (-1.87z)| norm 0.2639 (-0.88z)| lr 1.17e-04 | 2533.58 ms | 53.3% bf16 MFU | 206919 tok/s +step 14074/19560 | loss 3.321846 (+0.11z)| norm 0.2679 (-0.55z)| lr 1.17e-04 | 2535.18 ms | 53.3% bf16 MFU | 206913 tok/s +step 14075/19560 | loss 3.315424 (-0.07z)| norm 0.2717 (-0.25z)| lr 1.17e-04 | 2534.33 ms | 53.3% bf16 MFU | 206911 tok/s +step 14076/19560 | loss 3.287545 (-0.81z)| norm 0.2616 (-1.06z)| lr 1.17e-04 | 2534.42 ms | 53.3% bf16 MFU | 206909 tok/s +step 14077/19560 | loss 3.315076 (-0.06z)| norm 0.2672 (-0.59z)| lr 1.17e-04 | 2532.98 ms | 53.3% bf16 MFU | 206913 tok/s +step 14078/19560 | loss 3.252431 (-1.77z)| norm 0.2700 (-0.37z)| lr 1.17e-04 | 2533.73 ms | 53.3% bf16 MFU | 206913 tok/s +step 14079/19560 | loss 3.326671 (+0.26z)| norm 0.2681 (-0.53z)| lr 1.17e-04 | 2535.69 ms | 53.2% bf16 MFU | 206906 tok/s +step 14080/19560 | loss 3.329879 (+0.34z)| norm 0.2700 (-0.37z)| lr 1.17e-04 | 2532.22 ms | 53.3% bf16 MFU | 206913 tok/s +step 14081/19560 | loss 3.349795 (+0.88z)| norm 0.2693 (-0.42z)| lr 1.17e-04 | 2533.45 ms | 53.3% bf16 MFU | 206914 tok/s +step 14082/19560 | loss 3.448389 (+3.41z)| norm 0.2682 (-0.51z)| lr 1.17e-04 | 2532.66 ms | 53.3% bf16 MFU | 206919 tok/s +step 14083/19560 | loss 3.280545 (-1.00z)| norm 0.2837 (+0.77z)| lr 1.16e-04 | 2533.86 ms | 53.3% bf16 MFU | 206919 tok/s +step 14084/19560 | loss 3.288309 (-0.78z)| norm 0.2525 (-1.77z)| lr 1.16e-04 | 2533.71 ms | 53.3% bf16 MFU | 206919 tok/s +step 14085/19560 | loss 3.340199 (+0.58z)| norm 0.2752 (+0.08z)| lr 1.16e-04 | 2534.40 ms | 53.3% bf16 MFU | 206917 tok/s +step 14086/19560 | loss 3.291062 (-0.70z)| norm 0.2879 (+1.13z)| lr 1.16e-04 | 2532.43 ms | 53.3% bf16 MFU | 206922 tok/s +step 14087/19560 | loss 3.339411 (+0.57z)| norm 0.2719 (-0.17z)| lr 1.16e-04 | 2534.68 ms | 53.3% bf16 MFU | 206918 tok/s +step 14088/19560 | loss 3.364956 (+1.25z)| norm 0.2666 (-0.60z)| lr 1.16e-04 | 2533.14 ms | 53.3% bf16 MFU | 206921 tok/s +step 14089/19560 | loss 3.277562 (-1.04z)| norm 0.2690 (-0.39z)| lr 1.16e-04 | 2533.20 ms | 53.3% bf16 MFU | 206923 tok/s +step 14090/19560 | loss 3.275421 (-1.09z)| norm 0.2838 (+0.84z)| lr 1.16e-04 | 2532.92 ms | 53.3% bf16 MFU | 206927 tok/s +step 14091/19560 | loss 3.304959 (-0.31z)| norm 0.2623 (-0.94z)| lr 1.16e-04 | 2534.14 ms | 53.3% bf16 MFU | 206925 tok/s +step 14092/19560 | loss 3.281463 (-0.92z)| norm 0.2859 (+1.08z)| lr 1.16e-04 | 2534.16 ms | 53.3% bf16 MFU | 206923 tok/s +step 14093/19560 | loss 3.294957 (-0.56z)| norm 0.2836 (+0.87z)| lr 1.16e-04 | 2535.45 ms | 53.3% bf16 MFU | 206916 tok/s +step 14094/19560 | loss 3.256325 (-1.55z)| norm 0.2754 (+0.17z)| lr 1.16e-04 | 2534.67 ms | 53.3% bf16 MFU | 206913 tok/s +step 14095/19560 | loss 3.308515 (-0.19z)| norm 0.2700 (-0.30z)| lr 1.16e-04 | 2533.05 ms | 53.3% bf16 MFU | 206916 tok/s +step 14096/19560 | loss 3.354856 (+1.01z)| norm 0.2661 (-0.62z)| lr 1.16e-04 | 2534.65 ms | 53.3% bf16 MFU | 206913 tok/s +step 14097/19560 | loss 3.290961 (-0.66z)| norm 0.2642 (-0.78z)| lr 1.16e-04 | 2533.85 ms | 53.3% bf16 MFU | 206913 tok/s +step 14098/19560 | loss 3.379936 (+1.63z)| norm 0.2653 (-0.69z)| lr 1.16e-04 | 2536.05 ms | 53.2% bf16 MFU | 206904 tok/s +step 14099/19560 | loss 3.319083 (+0.06z)| norm 0.2717 (-0.13z)| lr 1.16e-04 | 2535.17 ms | 53.3% bf16 MFU | 206899 tok/s +step 14100/19560 | loss 3.377069 (+1.52z)| norm 0.2772 (+0.36z)| lr 1.16e-04 | 2531.74 ms | 53.3% bf16 MFU | 206908 tok/s +step 14101/19560 | loss 3.290344 (-0.68z)| norm 0.2919 (+1.64z)| lr 1.16e-04 | 2531.43 ms | 53.3% bf16 MFU | 206918 tok/s +step 14102/19560 | loss 3.302495 (-0.36z)| norm 0.2562 (-1.45z)| lr 1.16e-04 | 2534.48 ms | 53.3% bf16 MFU | 206915 tok/s +step 14103/19560 | loss 3.279667 (-0.94z)| norm 0.2459 (-2.32z)| lr 1.16e-04 | 2532.42 ms | 53.3% bf16 MFU | 206921 tok/s +step 14104/19560 | loss 3.232775 (-2.09z)| norm 0.2641 (-0.72z)| lr 1.16e-04 | 2532.63 ms | 53.3% bf16 MFU | 206926 tok/s +step 14105/19560 | loss 3.347525 (+0.79z)| norm 0.2561 (-1.44z)| lr 1.16e-04 | 2532.53 ms | 53.3% bf16 MFU | 206931 tok/s +step 14106/19560 | loss 3.283145 (-0.82z)| norm 0.2821 (+0.92z)| lr 1.16e-04 | 2534.20 ms | 53.3% bf16 MFU | 206928 tok/s +step 14107/19560 | loss 3.229833 (-2.12z)| norm 0.2611 (-0.97z)| lr 1.16e-04 | 2535.25 ms | 53.3% bf16 MFU | 206922 tok/s +step 14108/19560 | loss 3.291443 (-0.58z)| norm 0.2760 (+0.38z)| lr 1.15e-04 | 2534.80 ms | 53.3% bf16 MFU | 206918 tok/s +step 14109/19560 | loss 3.228310 (-2.10z)| norm 0.2674 (-0.40z)| lr 1.15e-04 | 2533.56 ms | 53.3% bf16 MFU | 206919 tok/s +step 14110/19560 | loss 3.300820 (-0.32z)| norm 0.2568 (-1.37z)| lr 1.15e-04 | 2534.42 ms | 53.3% bf16 MFU | 206916 tok/s +step 14111/19560 | loss 3.353605 (+0.96z)| norm 0.2654 (-0.56z)| lr 1.15e-04 | 2532.47 ms | 53.3% bf16 MFU | 206922 tok/s +step 14112/19560 | loss 3.267686 (-1.12z)| norm 0.2702 (-0.11z)| lr 1.15e-04 | 2532.58 ms | 53.3% bf16 MFU | 206926 tok/s +step 14113/19560 | loss 3.263868 (-1.20z)| norm 0.2642 (-0.66z)| lr 1.15e-04 | 2531.46 ms | 53.3% bf16 MFU | 206935 tok/s +step 14114/19560 | loss 3.331852 (+0.46z)| norm 0.2742 (+0.28z)| lr 1.15e-04 | 2531.55 ms | 53.3% bf16 MFU | 206944 tok/s +step 14115/19560 | loss 3.294334 (-0.45z)| norm 0.2683 (-0.27z)| lr 1.15e-04 | 2532.16 ms | 53.3% bf16 MFU | 206949 tok/s +step 14116/19560 | loss 3.297306 (-0.36z)| norm 0.2664 (-0.46z)| lr 1.15e-04 | 2533.36 ms | 53.3% bf16 MFU | 206949 tok/s +step 14117/19560 | loss 3.304689 (-0.18z)| norm 0.2897 (+1.72z)| lr 1.15e-04 | 2532.25 ms | 53.3% bf16 MFU | 206954 tok/s +step 14118/19560 | loss 3.339721 (+0.68z)| norm 0.2717 (+0.01z)| lr 1.15e-04 | 2533.35 ms | 53.3% bf16 MFU | 206954 tok/s +step 14119/19560 | loss 3.314541 (+0.04z)| norm 0.2664 (-0.51z)| lr 1.15e-04 | 2533.88 ms | 53.3% bf16 MFU | 206952 tok/s +step 14120/19560 | loss 3.310387 (-0.05z)| norm 0.2769 (+0.49z)| lr 1.15e-04 | 2534.59 ms | 53.3% bf16 MFU | 206947 tok/s +step 14121/19560 | loss 3.271234 (-1.02z)| norm 0.2624 (-0.91z)| lr 1.15e-04 | 2535.55 ms | 53.2% bf16 MFU | 206938 tok/s +step 14122/19560 | loss 3.288006 (-0.59z)| norm 0.2752 (+0.33z)| lr 1.15e-04 | 2534.56 ms | 53.3% bf16 MFU | 206934 tok/s +step 14123/19560 | loss 3.404285 (+2.26z)| norm 0.2561 (-1.52z)| lr 1.15e-04 | 2533.87 ms | 53.3% bf16 MFU | 206933 tok/s +step 14124/19560 | loss 3.303013 (-0.23z)| norm 0.3175 (+4.13z)| lr 1.15e-04 | 2534.65 ms | 53.3% bf16 MFU | 206929 tok/s +step 14125/19560 | loss 3.298919 (-0.34z)| norm 0.2908 (+1.67z)| lr 1.15e-04 | 2533.66 ms | 53.3% bf16 MFU | 206929 tok/s +step 14126/19560 | loss 3.367684 (+1.34z)| norm 0.2548 (-1.59z)| lr 1.15e-04 | 2534.62 ms | 53.3% bf16 MFU | 206925 tok/s +step 14127/19560 | loss 3.321526 (+0.20z)| norm 0.2745 (+0.19z)| lr 1.15e-04 | 2533.90 ms | 53.3% bf16 MFU | 206924 tok/s +step 14128/19560 | loss 3.277165 (-0.89z)| norm 0.2945 (+1.95z)| lr 1.15e-04 | 2534.14 ms | 53.3% bf16 MFU | 206923 tok/s +step 14129/19560 | loss 3.328208 (+0.37z)| norm 0.2590 (-1.19z)| lr 1.15e-04 | 2533.94 ms | 53.3% bf16 MFU | 206922 tok/s +step 14130/19560 | loss 3.326709 (+0.33z)| norm 0.2717 (-0.07z)| lr 1.15e-04 | 2533.34 ms | 53.3% bf16 MFU | 206923 tok/s +step 14131/19560 | loss 3.333889 (+0.50z)| norm 0.2857 (+1.16z)| lr 1.15e-04 | 2535.52 ms | 53.3% bf16 MFU | 206916 tok/s +step 14132/19560 | loss 3.311453 (-0.05z)| norm 0.2588 (-1.25z)| lr 1.15e-04 | 2532.65 ms | 53.3% bf16 MFU | 206921 tok/s +step 14133/19560 | loss 3.320466 (+0.17z)| norm 0.2686 (-0.39z)| lr 1.14e-04 | 2532.77 ms | 53.3% bf16 MFU | 206925 tok/s +step 14134/19560 | loss 3.338193 (+0.61z)| norm 0.2629 (-0.92z)| lr 1.14e-04 | 2534.33 ms | 53.3% bf16 MFU | 206922 tok/s +step 14135/19560 | loss 3.327779 (+0.35z)| norm 0.2912 (+1.64z)| lr 1.14e-04 | 2534.74 ms | 53.3% bf16 MFU | 206918 tok/s +step 14136/19560 | loss 3.370775 (+1.39z)| norm 0.2579 (-1.37z)| lr 1.14e-04 | 2534.60 ms | 53.3% bf16 MFU | 206915 tok/s +step 14137/19560 | loss 3.331835 (+0.42z)| norm 0.2710 (-0.19z)| lr 1.14e-04 | 2533.65 ms | 53.3% bf16 MFU | 206916 tok/s +step 14138/19560 | loss 3.311321 (-0.10z)| norm 0.2875 (+1.28z)| lr 1.14e-04 | 2533.50 ms | 53.3% bf16 MFU | 206917 tok/s +step 14139/19560 | loss 3.313562 (-0.05z)| norm 0.2698 (-0.31z)| lr 1.14e-04 | 2535.69 ms | 53.2% bf16 MFU | 206909 tok/s +step 14140/19560 | loss 3.277038 (-0.99z)| norm 0.2722 (-0.09z)| lr 1.14e-04 | 2533.85 ms | 53.3% bf16 MFU | 206910 tok/s +step 14141/19560 | loss 3.314454 (-0.05z)| norm 0.2632 (-0.91z)| lr 1.14e-04 | 2535.79 ms | 53.2% bf16 MFU | 206902 tok/s +step 14142/19560 | loss 3.293324 (-0.58z)| norm 0.2867 (+1.23z)| lr 1.14e-04 | 2535.06 ms | 53.3% bf16 MFU | 206897 tok/s +step 14143/19560 | loss 3.286227 (-0.76z)| norm 0.2596 (-1.24z)| lr 1.14e-04 | 2534.40 ms | 53.3% bf16 MFU | 206896 tok/s +step 14144/19560 | loss 3.294053 (-0.55z)| norm 0.2555 (-1.58z)| lr 1.14e-04 | 2533.59 ms | 53.3% bf16 MFU | 206898 tok/s +step 14145/19560 | loss 3.292443 (-0.58z)| norm 0.3061 (+2.86z)| lr 1.14e-04 | 2534.91 ms | 53.3% bf16 MFU | 206894 tok/s +step 14146/19560 | loss 3.298073 (-0.43z)| norm 0.2581 (-1.31z)| lr 1.14e-04 | 2536.00 ms | 53.2% bf16 MFU | 206887 tok/s +step 14147/19560 | loss 3.336564 (+0.55z)| norm 0.2781 (+0.44z)| lr 1.14e-04 | 2533.94 ms | 53.3% bf16 MFU | 206888 tok/s +step 14148/19560 | loss 3.259062 (-1.40z)| norm 0.2742 (+0.09z)| lr 1.14e-04 | 2535.61 ms | 53.2% bf16 MFU | 206882 tok/s +step 14149/19560 | loss 3.421536 (+2.64z)| norm 0.3928 (+7.68z)| lr 1.14e-04 | 2533.27 ms | 53.3% bf16 MFU | 206886 tok/s +step 14150/19560 | loss 3.273926 (-1.01z)| norm 0.2740 (+0.01z)| lr 1.14e-04 | 2531.57 ms | 53.3% bf16 MFU | 206896 tok/s +step 14151/19560 | loss 3.332147 (+0.47z)| norm 0.2918 (+1.15z)| lr 1.14e-04 | 2533.65 ms | 53.3% bf16 MFU | 206898 tok/s +step 14152/19560 | loss 3.345655 (+0.81z)| norm 0.3095 (+2.24z)| lr 1.14e-04 | 2533.67 ms | 53.3% bf16 MFU | 206900 tok/s +step 14153/19560 | loss 3.394425 (+2.04z)| norm 0.2738 (-0.03z)| lr 1.14e-04 | 2534.36 ms | 53.3% bf16 MFU | 206898 tok/s +step 14154/19560 | loss 3.328067 (+0.34z)| norm 0.2784 (+0.26z)| lr 1.14e-04 | 2533.49 ms | 53.3% bf16 MFU | 206900 tok/s +step 14155/19560 | loss 3.331596 (+0.42z)| norm 0.2691 (-0.32z)| lr 1.14e-04 | 2533.97 ms | 53.3% bf16 MFU | 206901 tok/s +step 14156/19560 | loss 3.307740 (-0.20z)| norm 0.2783 (+0.26z)| lr 1.14e-04 | 2533.10 ms | 53.3% bf16 MFU | 206904 tok/s +step 14157/19560 | loss 3.308595 (-0.17z)| norm 0.2651 (-0.57z)| lr 1.14e-04 | 2532.30 ms | 53.3% bf16 MFU | 206911 tok/s +step 14158/19560 | loss 3.314828 (+0.00z)| norm 0.3030 (+1.81z)| lr 1.14e-04 | 2533.98 ms | 53.3% bf16 MFU | 206911 tok/s +step 14159/19560 | loss 3.316184 (+0.04z)| norm 0.2688 (-0.34z)| lr 1.13e-04 | 2533.54 ms | 53.3% bf16 MFU | 206912 tok/s +step 14160/19560 | loss 3.335724 (+0.56z)| norm 0.2598 (-0.88z)| lr 1.13e-04 | 2534.17 ms | 53.3% bf16 MFU | 206911 tok/s +step 14161/19560 | loss 3.292104 (-0.58z)| norm 0.2710 (-0.18z)| lr 1.13e-04 | 2533.03 ms | 53.3% bf16 MFU | 206914 tok/s +step 14162/19560 | loss 3.290869 (-0.60z)| norm 0.2571 (-1.04z)| lr 1.13e-04 | 2532.32 ms | 53.3% bf16 MFU | 206921 tok/s +step 14163/19560 | loss 3.296317 (-0.46z)| norm 0.2649 (-0.54z)| lr 1.13e-04 | 2534.22 ms | 53.3% bf16 MFU | 206919 tok/s +step 14164/19560 | loss 3.283347 (-0.79z)| norm 0.2679 (-0.35z)| lr 1.13e-04 | 2534.65 ms | 53.3% bf16 MFU | 206915 tok/s +step 14165/19560 | loss 3.398375 (+2.18z)| norm 0.2751 (+0.10z)| lr 1.13e-04 | 2533.66 ms | 53.3% bf16 MFU | 206916 tok/s +step 14166/19560 | loss 3.292660 (-0.54z)| norm 0.2647 (-0.54z)| lr 1.13e-04 | 2533.79 ms | 53.3% bf16 MFU | 206916 tok/s +step 14167/19560 | loss 3.309096 (-0.11z)| norm 0.2706 (-0.18z)| lr 1.13e-04 | 2534.86 ms | 53.3% bf16 MFU | 206912 tok/s +step 14168/19560 | loss 3.321502 (+0.24z)| norm 0.2468 (-1.64z)| lr 1.13e-04 | 2533.46 ms | 53.3% bf16 MFU | 206913 tok/s +step 14169/19560 | loss 3.336434 (+0.63z)| norm 0.2659 (-0.44z)| lr 1.13e-04 | 2533.78 ms | 53.3% bf16 MFU | 206914 tok/s +step 14170/19560 | loss 3.395753 (+2.20z)| norm 0.2634 (-0.59z)| lr 1.13e-04 | 2534.26 ms | 53.3% bf16 MFU | 206912 tok/s +step 14171/19560 | loss 3.378181 (+1.71z)| norm 0.2545 (-1.14z)| lr 1.13e-04 | 2534.09 ms | 53.3% bf16 MFU | 206911 tok/s +step 14172/19560 | loss 3.394404 (+2.08z)| norm 0.2611 (-0.72z)| lr 1.13e-04 | 2535.05 ms | 53.3% bf16 MFU | 206906 tok/s +step 14173/19560 | loss 3.280066 (-0.90z)| norm 0.2562 (-1.01z)| lr 1.13e-04 | 2532.74 ms | 53.3% bf16 MFU | 206911 tok/s +step 14174/19560 | loss 3.257101 (-1.47z)| norm 0.2533 (-1.17z)| lr 1.13e-04 | 2534.58 ms | 53.3% bf16 MFU | 206908 tok/s +step 14175/19560 | loss 3.346818 (+0.84z)| norm 0.2725 (+0.03z)| lr 1.13e-04 | 2535.00 ms | 53.3% bf16 MFU | 206904 tok/s +step 14176/19560 | loss 3.265070 (-1.26z)| norm 0.2740 (+0.13z)| lr 1.13e-04 | 2535.18 ms | 53.3% bf16 MFU | 206899 tok/s +step 14177/19560 | loss 3.265696 (-1.23z)| norm 0.2736 (+0.10z)| lr 1.13e-04 | 2534.19 ms | 53.3% bf16 MFU | 206898 tok/s +step 14178/19560 | loss 3.371944 (+1.46z)| norm 0.2789 (+0.43z)| lr 1.13e-04 | 2532.17 ms | 53.3% bf16 MFU | 206906 tok/s +step 14179/19560 | loss 3.320223 (+0.15z)| norm 0.2591 (-0.80z)| lr 1.13e-04 | 2534.40 ms | 53.3% bf16 MFU | 206904 tok/s +step 14180/19560 | loss 3.364711 (+1.27z)| norm 0.2820 (+0.62z)| lr 1.13e-04 | 2532.36 ms | 53.3% bf16 MFU | 206911 tok/s +step 14181/19560 | loss 3.271302 (-1.08z)| norm 0.2983 (+1.61z)| lr 1.13e-04 | 2533.39 ms | 53.3% bf16 MFU | 206913 tok/s +step 14182/19560 | loss 3.325078 (+0.26z)| norm 0.2732 (+0.06z)| lr 1.13e-04 | 2531.59 ms | 53.3% bf16 MFU | 206922 tok/s +step 14183/19560 | loss 3.281470 (-0.83z)| norm 0.2660 (-0.38z)| lr 1.13e-04 | 2533.12 ms | 53.3% bf16 MFU | 206925 tok/s +step 14184/19560 | loss 3.344091 (+0.75z)| norm 0.2721 (-0.00z)| lr 1.13e-04 | 2534.05 ms | 53.3% bf16 MFU | 206923 tok/s +step 14185/19560 | loss 3.298883 (-0.39z)| norm 0.2820 (+0.60z)| lr 1.12e-04 | 2533.07 ms | 53.3% bf16 MFU | 206926 tok/s +step 14186/19560 | loss 3.277356 (-0.92z)| norm 0.2636 (-0.54z)| lr 1.12e-04 | 2533.26 ms | 53.3% bf16 MFU | 206928 tok/s +step 14187/19560 | loss 3.368104 (+1.35z)| norm 0.2710 (-0.08z)| lr 1.12e-04 | 2535.90 ms | 53.2% bf16 MFU | 206919 tok/s +step 14188/19560 | loss 3.280725 (-0.86z)| norm 0.2705 (-0.11z)| lr 1.12e-04 | 2533.02 ms | 53.3% bf16 MFU | 206922 tok/s +step 14189/19560 | loss 3.296320 (-0.46z)| norm 0.2505 (-1.32z)| lr 1.12e-04 | 2534.72 ms | 53.3% bf16 MFU | 206918 tok/s +step 14190/19560 | loss 3.365508 (+1.29z)| norm 0.2728 (+0.05z)| lr 1.12e-04 | 2533.99 ms | 53.3% bf16 MFU | 206917 tok/s +step 14191/19560 | loss 3.391119 (+1.89z)| norm 0.2589 (-0.81z)| lr 1.12e-04 | 2532.70 ms | 53.3% bf16 MFU | 206922 tok/s +step 14192/19560 | loss 3.358181 (+1.05z)| norm 0.2565 (-0.94z)| lr 1.12e-04 | 2533.34 ms | 53.3% bf16 MFU | 206923 tok/s +step 14193/19560 | loss 3.345159 (+0.73z)| norm 0.2740 (+0.14z)| lr 1.12e-04 | 2533.68 ms | 53.3% bf16 MFU | 206923 tok/s +step 14194/19560 | loss 3.300438 (-0.38z)| norm 0.2498 (-1.34z)| lr 1.12e-04 | 2531.81 ms | 53.3% bf16 MFU | 206931 tok/s +step 14195/19560 | loss 3.269976 (-1.12z)| norm 0.2529 (-1.13z)| lr 1.12e-04 | 2533.07 ms | 53.3% bf16 MFU | 206934 tok/s +step 14196/19560 | loss 3.316373 (+0.02z)| norm 0.2724 (+0.06z)| lr 1.12e-04 | 2534.57 ms | 53.3% bf16 MFU | 206930 tok/s +step 14197/19560 | loss 3.332080 (+0.41z)| norm 0.2490 (-1.35z)| lr 1.12e-04 | 2534.58 ms | 53.3% bf16 MFU | 206926 tok/s +step 14198/19560 | loss 3.276479 (-0.95z)| norm 0.2546 (-1.00z)| lr 1.12e-04 | 2534.73 ms | 53.3% bf16 MFU | 206922 tok/s +step 14199/19560 | loss 3.389201 (+1.82z)| norm 0.2564 (-0.88z)| lr 1.12e-04 | 2533.92 ms | 53.3% bf16 MFU | 206921 tok/s +step 14200/19560 | loss 3.258574 (-1.37z)| norm 0.2563 (-0.89z)| lr 1.12e-04 | 2532.51 ms | 53.3% bf16 MFU | 206926 tok/s +step 14201/19560 | loss 3.351345 (+0.88z)| norm 0.2837 (+0.74z)| lr 1.12e-04 | 2534.37 ms | 53.3% bf16 MFU | 206923 tok/s +step 14202/19560 | loss 3.326081 (+0.26z)| norm 0.2722 (+0.05z)| lr 1.12e-04 | 2532.62 ms | 53.3% bf16 MFU | 206928 tok/s +step 14203/19560 | loss 3.305999 (-0.24z)| norm 0.2540 (-1.02z)| lr 1.12e-04 | 2532.22 ms | 53.3% bf16 MFU | 206934 tok/s +step 14204/19560 | loss 3.258850 (-1.38z)| norm 0.2637 (-0.45z)| lr 1.12e-04 | 2535.19 ms | 53.3% bf16 MFU | 206927 tok/s +step 14205/19560 | loss 3.242173 (-1.76z)| norm 0.2556 (-0.92z)| lr 1.12e-04 | 2533.44 ms | 53.3% bf16 MFU | 206928 tok/s +step 14206/19560 | loss 3.319215 (+0.09z)| norm 0.2662 (-0.29z)| lr 1.12e-04 | 2533.95 ms | 53.3% bf16 MFU | 206927 tok/s +step 14207/19560 | loss 3.320487 (+0.13z)| norm 0.2701 (-0.06z)| lr 1.12e-04 | 2534.35 ms | 53.3% bf16 MFU | 206924 tok/s +step 14208/19560 | loss 3.257774 (-1.38z)| norm 0.2858 (+0.87z)| lr 1.12e-04 | 2533.55 ms | 53.3% bf16 MFU | 206925 tok/s +step 14209/19560 | loss 3.291370 (-0.56z)| norm 0.2800 (+0.52z)| lr 1.12e-04 | 2533.44 ms | 53.3% bf16 MFU | 206926 tok/s +step 14210/19560 | loss 3.264884 (-1.21z)| norm 0.2584 (-0.76z)| lr 1.11e-04 | 2532.54 ms | 53.3% bf16 MFU | 206931 tok/s +step 14211/19560 | loss 3.269994 (-1.08z)| norm 0.2897 (+1.09z)| lr 1.11e-04 | 2535.64 ms | 53.2% bf16 MFU | 206923 tok/s +step 14212/19560 | loss 3.303676 (-0.23z)| norm 0.2586 (-0.75z)| lr 1.11e-04 | 2532.94 ms | 53.3% bf16 MFU | 206926 tok/s +step 14213/19560 | loss 3.307919 (-0.12z)| norm 0.2729 (+0.09z)| lr 1.11e-04 | 2534.62 ms | 53.3% bf16 MFU | 206922 tok/s +step 14214/19560 | loss 3.415655 (+2.51z)| norm 0.2619 (-0.54z)| lr 1.11e-04 | 2534.87 ms | 53.3% bf16 MFU | 206918 tok/s +step 14215/19560 | loss 3.374381 (+1.48z)| norm 0.2664 (-0.27z)| lr 1.11e-04 | 2533.95 ms | 53.3% bf16 MFU | 206917 tok/s +step 14216/19560 | loss 3.314183 (+0.02z)| norm 0.2759 (+0.28z)| lr 1.11e-04 | 2534.78 ms | 53.3% bf16 MFU | 206913 tok/s +step 14217/19560 | loss 3.359139 (+1.11z)| norm 0.2670 (-0.24z)| lr 1.11e-04 | 2534.79 ms | 53.3% bf16 MFU | 206909 tok/s +step 14218/19560 | loss 3.272214 (-1.03z)| norm 0.2738 (+0.17z)| lr 1.11e-04 | 2534.43 ms | 53.3% bf16 MFU | 206907 tok/s +step 14219/19560 | loss 3.338507 (+0.59z)| norm 0.2800 (+0.52z)| lr 1.11e-04 | 2533.70 ms | 53.3% bf16 MFU | 206908 tok/s +step 14220/19560 | loss 3.351686 (+0.90z)| norm 0.2776 (+0.39z)| lr 1.11e-04 | 2532.72 ms | 53.3% bf16 MFU | 206913 tok/s +step 14221/19560 | loss 3.359206 (+1.07z)| norm 0.2616 (-0.55z)| lr 1.11e-04 | 2533.04 ms | 53.3% bf16 MFU | 206916 tok/s +step 14222/19560 | loss 3.419347 (+2.47z)| norm 0.2710 (+0.01z)| lr 1.11e-04 | 2533.61 ms | 53.3% bf16 MFU | 206917 tok/s +step 14223/19560 | loss 3.330092 (+0.32z)| norm 0.2839 (+0.77z)| lr 1.11e-04 | 2536.00 ms | 53.2% bf16 MFU | 206908 tok/s +step 14224/19560 | loss 3.321481 (+0.12z)| norm 0.2633 (-0.46z)| lr 1.11e-04 | 2534.44 ms | 53.3% bf16 MFU | 206906 tok/s +step 14225/19560 | loss 3.344256 (+0.66z)| norm 0.2595 (-0.68z)| lr 1.11e-04 | 2533.71 ms | 53.3% bf16 MFU | 206907 tok/s +step 14226/19560 | loss 3.269383 (-1.13z)| norm 0.2560 (-0.88z)| lr 1.11e-04 | 2533.40 ms | 53.3% bf16 MFU | 206909 tok/s +step 14227/19560 | loss 3.291691 (-0.59z)| norm 0.2577 (-0.77z)| lr 1.11e-04 | 2532.13 ms | 53.3% bf16 MFU | 206916 tok/s +step 14228/19560 | loss 3.390361 (+1.79z)| norm 0.2718 (+0.06z)| lr 1.11e-04 | 2532.76 ms | 53.3% bf16 MFU | 206921 tok/s +step 14229/19560 | loss 3.394540 (+1.85z)| norm 0.2673 (-0.19z)| lr 1.11e-04 | 2532.52 ms | 53.3% bf16 MFU | 206926 tok/s +step 14230/19560 | loss 3.319929 (+0.07z)| norm 0.2733 (+0.16z)| lr 1.11e-04 | 2533.04 ms | 53.3% bf16 MFU | 206928 tok/s +step 14231/19560 | loss 3.323710 (+0.15z)| norm 0.2470 (-1.41z)| lr 1.11e-04 | 2534.08 ms | 53.3% bf16 MFU | 206927 tok/s +step 14232/19560 | loss 3.295350 (-0.54z)| norm 0.2692 (-0.09z)| lr 1.11e-04 | 2533.65 ms | 53.3% bf16 MFU | 206927 tok/s +step 14233/19560 | loss 3.332747 (+0.37z)| norm 0.2653 (-0.33z)| lr 1.11e-04 | 2535.43 ms | 53.3% bf16 MFU | 206920 tok/s +step 14234/19560 | loss 3.360030 (+1.01z)| norm 0.2616 (-0.54z)| lr 1.11e-04 | 2533.62 ms | 53.3% bf16 MFU | 206920 tok/s +step 14235/19560 | loss 3.368250 (+1.20z)| norm 0.2687 (-0.12z)| lr 1.11e-04 | 2536.29 ms | 53.2% bf16 MFU | 206910 tok/s +step 14236/19560 | loss 3.293242 (-0.64z)| norm 0.2953 (+1.46z)| lr 1.10e-04 | 2533.40 ms | 53.3% bf16 MFU | 206912 tok/s +step 14237/19560 | loss 3.275749 (-1.10z)| norm 0.2678 (-0.18z)| lr 1.10e-04 | 2533.30 ms | 53.3% bf16 MFU | 206914 tok/s +step 14238/19560 | loss 3.283741 (-0.89z)| norm 0.2770 (+0.36z)| lr 1.10e-04 | 2533.14 ms | 53.3% bf16 MFU | 206917 tok/s +step 14239/19560 | loss 3.370027 (+1.25z)| norm 0.2746 (+0.21z)| lr 1.10e-04 | 2532.32 ms | 53.3% bf16 MFU | 206923 tok/s +step 14240/19560 | loss 3.343358 (+0.57z)| norm 0.2750 (+0.23z)| lr 1.10e-04 | 2534.39 ms | 53.3% bf16 MFU | 206921 tok/s +step 14241/19560 | loss 3.385492 (+1.60z)| norm 0.2788 (+0.45z)| lr 1.10e-04 | 2533.74 ms | 53.3% bf16 MFU | 206921 tok/s +step 14242/19560 | loss 3.374711 (+1.31z)| norm 0.2875 (+0.96z)| lr 1.10e-04 | 2533.56 ms | 53.3% bf16 MFU | 206922 tok/s +step 14243/19560 | loss 3.383567 (+1.50z)| norm 0.2995 (+1.64z)| lr 1.10e-04 | 2532.59 ms | 53.3% bf16 MFU | 206926 tok/s +step 14244/19560 | loss 3.396234 (+1.77z)| norm 0.2862 (+0.85z)| lr 1.10e-04 | 2532.84 ms | 53.3% bf16 MFU | 206930 tok/s +step 14245/19560 | loss 3.278136 (-1.08z)| norm 0.2809 (+0.55z)| lr 1.10e-04 | 2532.27 ms | 53.3% bf16 MFU | 206935 tok/s +step 14246/19560 | loss 3.330469 (+0.18z)| norm 0.2690 (-0.16z)| lr 1.10e-04 | 2532.75 ms | 53.3% bf16 MFU | 206939 tok/s +step 14247/19560 | loss 3.356123 (+0.80z)| norm 0.2815 (+0.57z)| lr 1.10e-04 | 2534.38 ms | 53.3% bf16 MFU | 206935 tok/s +step 14248/19560 | loss 3.337690 (+0.35z)| norm 0.3010 (+1.69z)| lr 1.10e-04 | 2533.13 ms | 53.3% bf16 MFU | 206937 tok/s +step 14249/19560 | loss 3.328644 (+0.12z)| norm 0.2720 (+0.00z)| lr 1.10e-04 | 2535.19 ms | 53.3% bf16 MFU | 206931 tok/s +step 14250/19560 | loss 3.309241 (-0.36z)| norm 0.2826 (+0.61z)| lr 1.10e-04 | 2533.44 ms | 53.3% bf16 MFU | 206931 tok/s +val loss 3.325346 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2986/10042 = 0.297351 +step 14251/19560 | loss 3.340337 (+0.42z)| norm 0.2753 (+0.18z)| lr 1.10e-04 | 2535.09 ms | 53.3% bf16 MFU | 206926 tok/s +step 14252/19560 | loss 3.294443 (-0.71z)| norm 0.2676 (-0.25z)| lr 1.10e-04 | 2536.04 ms | 53.2% bf16 MFU | 206916 tok/s +step 14253/19560 | loss 3.340002 (+0.40z)| norm 0.2699 (-0.10z)| lr 1.10e-04 | 2535.55 ms | 53.2% bf16 MFU | 206909 tok/s +step 14254/19560 | loss 3.371114 (+1.17z)| norm 0.2865 (+0.88z)| lr 1.10e-04 | 2533.91 ms | 53.3% bf16 MFU | 206909 tok/s +step 14255/19560 | loss 3.311316 (-0.30z)| norm 0.2614 (-0.63z)| lr 1.10e-04 | 2535.21 ms | 53.3% bf16 MFU | 206904 tok/s +step 14256/19560 | loss 3.341491 (+0.43z)| norm 0.2529 (-1.12z)| lr 1.10e-04 | 2535.13 ms | 53.3% bf16 MFU | 206899 tok/s +step 14257/19560 | loss 3.359707 (+0.87z)| norm 0.2924 (+1.25z)| lr 1.10e-04 | 2534.03 ms | 53.3% bf16 MFU | 206899 tok/s +step 14258/19560 | loss 3.331205 (+0.17z)| norm 0.2488 (-1.36z)| lr 1.10e-04 | 2533.39 ms | 53.3% bf16 MFU | 206902 tok/s +step 14259/19560 | loss 3.239541 (-2.04z)| norm 0.2564 (-0.89z)| lr 1.10e-04 | 2533.89 ms | 53.3% bf16 MFU | 206902 tok/s +step 14260/19560 | loss 3.356398 (+0.78z)| norm 0.3118 (+2.35z)| lr 1.10e-04 | 2533.38 ms | 53.3% bf16 MFU | 206904 tok/s +step 14261/19560 | loss 3.279800 (-1.06z)| norm 0.2639 (-0.46z)| lr 1.10e-04 | 2533.19 ms | 53.3% bf16 MFU | 206908 tok/s +step 14262/19560 | loss 3.328546 (+0.12z)| norm 0.2709 (-0.05z)| lr 1.09e-04 | 2532.00 ms | 53.3% bf16 MFU | 206915 tok/s +step 14263/19560 | loss 3.333070 (+0.23z)| norm 0.2669 (-0.28z)| lr 1.09e-04 | 2534.22 ms | 53.3% bf16 MFU | 206914 tok/s +step 14264/19560 | loss 3.314978 (-0.20z)| norm 0.2726 (+0.06z)| lr 1.09e-04 | 2533.98 ms | 53.3% bf16 MFU | 206913 tok/s +step 14265/19560 | loss 3.338455 (+0.37z)| norm 0.2488 (-1.33z)| lr 1.09e-04 | 2534.24 ms | 53.3% bf16 MFU | 206912 tok/s +step 14266/19560 | loss 3.305445 (-0.43z)| norm 0.2803 (+0.52z)| lr 1.09e-04 | 2533.83 ms | 53.3% bf16 MFU | 206912 tok/s +step 14267/19560 | loss 3.305206 (-0.44z)| norm 0.2737 (+0.13z)| lr 1.09e-04 | 2533.92 ms | 53.3% bf16 MFU | 206912 tok/s +step 14268/19560 | loss 3.342692 (+0.46z)| norm 0.2828 (+0.66z)| lr 1.09e-04 | 2532.77 ms | 53.3% bf16 MFU | 206916 tok/s +step 14269/19560 | loss 3.471967 (+3.41z)| norm 0.3011 (+1.69z)| lr 1.09e-04 | 2533.71 ms | 53.3% bf16 MFU | 206917 tok/s +step 14270/19560 | loss 3.303811 (-0.49z)| norm 0.3039 (+1.83z)| lr 1.09e-04 | 2535.17 ms | 53.3% bf16 MFU | 206911 tok/s +step 14271/19560 | loss 3.374775 (+1.14z)| norm 0.2709 (-0.07z)| lr 1.09e-04 | 2533.67 ms | 53.3% bf16 MFU | 206912 tok/s +step 14272/19560 | loss 3.318653 (-0.17z)| norm 0.2761 (+0.22z)| lr 1.09e-04 | 2534.05 ms | 53.3% bf16 MFU | 206911 tok/s +step 14273/19560 | loss 3.393156 (+1.53z)| norm 0.2883 (+0.94z)| lr 1.09e-04 | 2534.77 ms | 53.3% bf16 MFU | 206908 tok/s +step 14274/19560 | loss 3.388016 (+1.39z)| norm 0.2686 (-0.21z)| lr 1.09e-04 | 2532.49 ms | 53.3% bf16 MFU | 206913 tok/s +step 14275/19560 | loss 3.398452 (+1.60z)| norm 0.3250 (+2.96z)| lr 1.09e-04 | 2532.65 ms | 53.3% bf16 MFU | 206918 tok/s +step 14276/19560 | loss 3.324873 (-0.08z)| norm 0.2712 (-0.08z)| lr 1.09e-04 | 2533.45 ms | 53.3% bf16 MFU | 206920 tok/s +step 14277/19560 | loss 3.325226 (-0.06z)| norm 0.2979 (+1.82z)| lr 1.09e-04 | 2535.06 ms | 53.3% bf16 MFU | 206915 tok/s +step 14278/19560 | loss 3.308706 (-0.45z)| norm 0.2670 (-0.33z)| lr 1.09e-04 | 2531.32 ms | 53.3% bf16 MFU | 206925 tok/s +step 14279/19560 | loss 3.277501 (-1.17z)| norm 0.2983 (+1.84z)| lr 1.09e-04 | 2535.00 ms | 53.3% bf16 MFU | 206920 tok/s +step 14280/19560 | loss 3.313780 (-0.31z)| norm 0.2844 (+0.91z)| lr 1.09e-04 | 2532.45 ms | 53.3% bf16 MFU | 206925 tok/s +step 14281/19560 | loss 3.351617 (+0.58z)| norm 0.2771 (+0.39z)| lr 1.09e-04 | 2533.10 ms | 53.3% bf16 MFU | 206927 tok/s +step 14282/19560 | loss 3.362862 (+0.84z)| norm 0.2888 (+1.21z)| lr 1.09e-04 | 2533.13 ms | 53.3% bf16 MFU | 206930 tok/s +step 14283/19560 | loss 3.340037 (+0.30z)| norm 0.2765 (+0.34z)| lr 1.09e-04 | 2533.59 ms | 53.3% bf16 MFU | 206930 tok/s +step 14284/19560 | loss 3.379868 (+1.21z)| norm 0.2527 (-1.32z)| lr 1.09e-04 | 2532.37 ms | 53.3% bf16 MFU | 206935 tok/s +step 14285/19560 | loss 3.324090 (-0.09z)| norm 0.2959 (+1.68z)| lr 1.09e-04 | 2533.31 ms | 53.3% bf16 MFU | 206936 tok/s +step 14286/19560 | loss 3.338551 (+0.24z)| norm 0.2698 (-0.12z)| lr 1.09e-04 | 2532.09 ms | 53.3% bf16 MFU | 206942 tok/s +step 14287/19560 | loss 3.285841 (-0.98z)| norm 0.2591 (-0.88z)| lr 1.09e-04 | 2532.79 ms | 53.3% bf16 MFU | 206945 tok/s +step 14288/19560 | loss 3.291296 (-0.84z)| norm 0.2733 (+0.12z)| lr 1.08e-04 | 2533.54 ms | 53.3% bf16 MFU | 206945 tok/s +step 14289/19560 | loss 3.315909 (-0.28z)| norm 0.2586 (-0.91z)| lr 1.08e-04 | 2533.74 ms | 53.3% bf16 MFU | 206944 tok/s +step 14290/19560 | loss 3.316681 (-0.26z)| norm 0.2691 (-0.18z)| lr 1.08e-04 | 2534.78 ms | 53.3% bf16 MFU | 206938 tok/s +step 14291/19560 | loss 3.315806 (-0.29z)| norm 0.2624 (-0.65z)| lr 1.08e-04 | 2533.50 ms | 53.3% bf16 MFU | 206939 tok/s +step 14292/19560 | loss 3.317147 (-0.26z)| norm 0.2657 (-0.41z)| lr 1.08e-04 | 2534.46 ms | 53.3% bf16 MFU | 206935 tok/s +step 14293/19560 | loss 3.345356 (+0.41z)| norm 0.2563 (-1.06z)| lr 1.08e-04 | 2535.21 ms | 53.3% bf16 MFU | 206928 tok/s +step 14294/19560 | loss 3.349586 (+0.50z)| norm 0.2593 (-0.85z)| lr 1.08e-04 | 2534.35 ms | 53.3% bf16 MFU | 206926 tok/s +step 14295/19560 | loss 3.344450 (+0.38z)| norm 0.2552 (-1.12z)| lr 1.08e-04 | 2534.68 ms | 53.3% bf16 MFU | 206922 tok/s +step 14296/19560 | loss 3.391823 (+1.47z)| norm 0.2547 (-1.17z)| lr 1.08e-04 | 2533.77 ms | 53.3% bf16 MFU | 206921 tok/s +step 14297/19560 | loss 3.340950 (+0.28z)| norm 0.3185 (+3.17z)| lr 1.08e-04 | 2534.19 ms | 53.3% bf16 MFU | 206920 tok/s +step 14298/19560 | loss 3.289329 (-0.93z)| norm 0.2635 (-0.56z)| lr 1.08e-04 | 2534.84 ms | 53.3% bf16 MFU | 206915 tok/s +step 14299/19560 | loss 3.347621 (+0.46z)| norm 0.2663 (-0.37z)| lr 1.08e-04 | 2532.57 ms | 53.3% bf16 MFU | 206920 tok/s +step 14300/19560 | loss 3.318507 (-0.22z)| norm 0.2642 (-0.52z)| lr 1.08e-04 | 2534.38 ms | 53.3% bf16 MFU | 206918 tok/s +step 14301/19560 | loss 3.403792 (+1.80z)| norm 0.2542 (-1.20z)| lr 1.08e-04 | 2533.53 ms | 53.3% bf16 MFU | 206919 tok/s +step 14302/19560 | loss 3.331102 (+0.05z)| norm 0.2666 (-0.36z)| lr 1.08e-04 | 2534.62 ms | 53.3% bf16 MFU | 206916 tok/s +step 14303/19560 | loss 3.371950 (+1.03z)| norm 0.2666 (-0.36z)| lr 1.08e-04 | 2533.03 ms | 53.3% bf16 MFU | 206919 tok/s +step 14304/19560 | loss 3.269538 (-1.44z)| norm 0.2645 (-0.50z)| lr 1.08e-04 | 2533.38 ms | 53.3% bf16 MFU | 206920 tok/s +step 14305/19560 | loss 3.374567 (+1.07z)| norm 0.2741 (+0.16z)| lr 1.08e-04 | 2533.98 ms | 53.3% bf16 MFU | 206920 tok/s +step 14306/19560 | loss 3.271089 (-1.41z)| norm 0.2774 (+0.38z)| lr 1.08e-04 | 2531.24 ms | 53.3% bf16 MFU | 206930 tok/s +step 14307/19560 | loss 3.341504 (+0.29z)| norm 0.2792 (+0.50z)| lr 1.08e-04 | 2534.41 ms | 53.3% bf16 MFU | 206927 tok/s +step 14308/19560 | loss 3.320733 (-0.21z)| norm 0.2660 (-0.40z)| lr 1.08e-04 | 2534.59 ms | 53.3% bf16 MFU | 206923 tok/s +step 14309/19560 | loss 3.274999 (-1.31z)| norm 0.2690 (-0.18z)| lr 1.08e-04 | 2533.76 ms | 53.3% bf16 MFU | 206923 tok/s +step 14310/19560 | loss 3.371134 (+1.00z)| norm 0.2881 (+1.14z)| lr 1.08e-04 | 2533.63 ms | 53.3% bf16 MFU | 206924 tok/s +step 14311/19560 | loss 3.332050 (+0.05z)| norm 0.2511 (-1.41z)| lr 1.08e-04 | 2532.16 ms | 53.3% bf16 MFU | 206930 tok/s +step 14312/19560 | loss 3.299789 (-0.72z)| norm 0.2749 (+0.23z)| lr 1.08e-04 | 2533.76 ms | 53.3% bf16 MFU | 206929 tok/s +step 14313/19560 | loss 3.320322 (-0.23z)| norm 0.2813 (+0.67z)| lr 1.08e-04 | 2533.92 ms | 53.3% bf16 MFU | 206928 tok/s +step 14314/19560 | loss 3.300947 (-0.71z)| norm 0.2595 (-0.83z)| lr 1.07e-04 | 2533.89 ms | 53.3% bf16 MFU | 206927 tok/s +step 14315/19560 | loss 3.320265 (-0.23z)| norm 0.2612 (-0.70z)| lr 1.07e-04 | 2534.16 ms | 53.3% bf16 MFU | 206925 tok/s +step 14316/19560 | loss 3.298960 (-0.76z)| norm 0.2790 (+0.51z)| lr 1.07e-04 | 2533.91 ms | 53.3% bf16 MFU | 206925 tok/s +step 14317/19560 | loss 3.315811 (-0.35z)| norm 0.2665 (-0.36z)| lr 1.07e-04 | 2534.65 ms | 53.3% bf16 MFU | 206921 tok/s +step 14318/19560 | loss 3.455365 (+2.96z)| norm 0.2681 (-0.25z)| lr 1.07e-04 | 2534.62 ms | 53.3% bf16 MFU | 206917 tok/s +step 14319/19560 | loss 3.308841 (-0.51z)| norm 0.2865 (+1.01z)| lr 1.07e-04 | 2534.26 ms | 53.3% bf16 MFU | 206915 tok/s +step 14320/19560 | loss 3.323690 (-0.15z)| norm 0.2831 (+0.76z)| lr 1.07e-04 | 2533.17 ms | 53.3% bf16 MFU | 206918 tok/s +step 14321/19560 | loss 3.314994 (-0.35z)| norm 0.2808 (+0.60z)| lr 1.07e-04 | 2533.31 ms | 53.3% bf16 MFU | 206920 tok/s +step 14322/19560 | loss 3.318638 (-0.26z)| norm 0.2620 (-0.71z)| lr 1.07e-04 | 2532.78 ms | 53.3% bf16 MFU | 206924 tok/s +step 14323/19560 | loss 3.340197 (+0.24z)| norm 0.2784 (+0.42z)| lr 1.07e-04 | 2533.88 ms | 53.3% bf16 MFU | 206924 tok/s +step 14324/19560 | loss 3.352000 (+0.52z)| norm 0.2871 (+1.01z)| lr 1.07e-04 | 2534.76 ms | 53.3% bf16 MFU | 206919 tok/s +step 14325/19560 | loss 3.337787 (+0.18z)| norm 0.2653 (-0.51z)| lr 1.07e-04 | 2534.01 ms | 53.3% bf16 MFU | 206918 tok/s +step 14326/19560 | loss 3.310464 (-0.49z)| norm 0.2673 (-0.38z)| lr 1.07e-04 | 2534.22 ms | 53.3% bf16 MFU | 206917 tok/s +step 14327/19560 | loss 3.324522 (-0.14z)| norm 0.2493 (-1.64z)| lr 1.07e-04 | 2533.39 ms | 53.3% bf16 MFU | 206918 tok/s +step 14328/19560 | loss 3.352768 (+0.54z)| norm 0.2689 (-0.28z)| lr 1.07e-04 | 2535.34 ms | 53.3% bf16 MFU | 206912 tok/s +step 14329/19560 | loss 3.401460 (+1.72z)| norm 0.2641 (-0.60z)| lr 1.07e-04 | 2533.07 ms | 53.3% bf16 MFU | 206915 tok/s +step 14330/19560 | loss 3.402311 (+1.71z)| norm 0.2668 (-0.41z)| lr 1.07e-04 | 2533.64 ms | 53.3% bf16 MFU | 206916 tok/s +step 14331/19560 | loss 3.356079 (+0.57z)| norm 0.2642 (-0.60z)| lr 1.07e-04 | 2534.42 ms | 53.3% bf16 MFU | 206914 tok/s +step 14332/19560 | loss 3.307896 (-0.61z)| norm 0.2753 (+0.18z)| lr 1.07e-04 | 2534.78 ms | 53.3% bf16 MFU | 206910 tok/s +step 14333/19560 | loss 3.304702 (-0.71z)| norm 0.2647 (-0.58z)| lr 1.07e-04 | 2533.63 ms | 53.3% bf16 MFU | 206911 tok/s +step 14334/19560 | loss 3.318809 (-0.36z)| norm 0.2602 (-0.90z)| lr 1.07e-04 | 2533.97 ms | 53.3% bf16 MFU | 206911 tok/s +step 14335/19560 | loss 3.315948 (-0.43z)| norm 0.2728 (-0.00z)| lr 1.07e-04 | 2533.98 ms | 53.3% bf16 MFU | 206910 tok/s +step 14336/19560 | loss 3.311507 (-0.56z)| norm 0.2713 (-0.10z)| lr 1.07e-04 | 2534.59 ms | 53.3% bf16 MFU | 206907 tok/s +step 14337/19560 | loss 3.332640 (-0.03z)| norm 0.2685 (-0.30z)| lr 1.07e-04 | 2535.19 ms | 53.3% bf16 MFU | 206902 tok/s +step 14338/19560 | loss 3.287803 (-1.19z)| norm 0.2876 (+1.05z)| lr 1.07e-04 | 2534.05 ms | 53.3% bf16 MFU | 206902 tok/s +step 14339/19560 | loss 3.330027 (-0.12z)| norm 0.2713 (-0.10z)| lr 1.07e-04 | 2532.68 ms | 53.3% bf16 MFU | 206907 tok/s +step 14340/19560 | loss 3.293018 (-1.07z)| norm 0.3442 (+4.66z)| lr 1.06e-04 | 2533.60 ms | 53.3% bf16 MFU | 206909 tok/s +step 14341/19560 | loss 3.327477 (-0.19z)| norm 0.2651 (-0.54z)| lr 1.06e-04 | 2533.16 ms | 53.3% bf16 MFU | 206912 tok/s +step 14342/19560 | loss 3.382595 (+1.26z)| norm 0.2950 (+1.40z)| lr 1.06e-04 | 2533.29 ms | 53.3% bf16 MFU | 206914 tok/s +step 14343/19560 | loss 3.297174 (-0.96z)| norm 0.2755 (+0.12z)| lr 1.06e-04 | 2533.46 ms | 53.3% bf16 MFU | 206916 tok/s +step 14344/19560 | loss 3.295219 (-1.00z)| norm 0.2729 (-0.05z)| lr 1.06e-04 | 2531.43 ms | 53.3% bf16 MFU | 206925 tok/s +step 14345/19560 | loss 3.355969 (+0.58z)| norm 0.2683 (-0.35z)| lr 1.06e-04 | 2532.60 ms | 53.3% bf16 MFU | 206930 tok/s +step 14346/19560 | loss 3.298916 (-0.92z)| norm 0.2577 (-1.03z)| lr 1.06e-04 | 2532.68 ms | 53.3% bf16 MFU | 206934 tok/s +step 14347/19560 | loss 3.312863 (-0.55z)| norm 0.2647 (-0.57z)| lr 1.06e-04 | 2533.63 ms | 53.3% bf16 MFU | 206934 tok/s +step 14348/19560 | loss 3.317024 (-0.43z)| norm 0.2811 (+0.50z)| lr 1.06e-04 | 2533.22 ms | 53.3% bf16 MFU | 206935 tok/s +step 14349/19560 | loss 3.314970 (-0.47z)| norm 0.2505 (-1.47z)| lr 1.06e-04 | 2533.23 ms | 53.3% bf16 MFU | 206937 tok/s +step 14350/19560 | loss 3.390842 (+1.54z)| norm 0.2956 (+1.42z)| lr 1.06e-04 | 2533.18 ms | 53.3% bf16 MFU | 206938 tok/s +step 14351/19560 | loss 3.324731 (-0.21z)| norm 0.2775 (+0.26z)| lr 1.06e-04 | 2535.02 ms | 53.3% bf16 MFU | 206932 tok/s +step 14352/19560 | loss 3.287353 (-1.19z)| norm 0.2771 (+0.23z)| lr 1.06e-04 | 2533.69 ms | 53.3% bf16 MFU | 206932 tok/s +step 14353/19560 | loss 3.379334 (+1.22z)| norm 0.2487 (-1.58z)| lr 1.06e-04 | 2532.68 ms | 53.3% bf16 MFU | 206936 tok/s +step 14354/19560 | loss 3.314369 (-0.50z)| norm 0.2681 (-0.35z)| lr 1.06e-04 | 2534.70 ms | 53.3% bf16 MFU | 206931 tok/s +step 14355/19560 | loss 3.332594 (-0.02z)| norm 0.2828 (+0.58z)| lr 1.06e-04 | 2535.55 ms | 53.2% bf16 MFU | 206924 tok/s +step 14356/19560 | loss 3.313877 (-0.51z)| norm 0.2745 (+0.05z)| lr 1.06e-04 | 2534.60 ms | 53.3% bf16 MFU | 206920 tok/s +step 14357/19560 | loss 3.445280 (+2.94z)| norm 0.2798 (+0.38z)| lr 1.06e-04 | 2533.51 ms | 53.3% bf16 MFU | 206921 tok/s +step 14358/19560 | loss 3.321878 (-0.30z)| norm 0.2890 (+0.96z)| lr 1.06e-04 | 2535.26 ms | 53.3% bf16 MFU | 206915 tok/s +step 14359/19560 | loss 3.322636 (-0.28z)| norm 0.2698 (-0.28z)| lr 1.06e-04 | 2534.18 ms | 53.3% bf16 MFU | 206913 tok/s +step 14360/19560 | loss 3.330123 (-0.09z)| norm 0.2726 (-0.10z)| lr 1.06e-04 | 2532.49 ms | 53.3% bf16 MFU | 206919 tok/s +step 14361/19560 | loss 3.354901 (+0.56z)| norm 0.2650 (-0.60z)| lr 1.06e-04 | 2534.96 ms | 53.3% bf16 MFU | 206914 tok/s +step 14362/19560 | loss 3.411387 (+2.00z)| norm 0.2770 (+0.17z)| lr 1.06e-04 | 2532.21 ms | 53.3% bf16 MFU | 206921 tok/s +step 14363/19560 | loss 3.345654 (+0.30z)| norm 0.2814 (+0.45z)| lr 1.06e-04 | 2534.05 ms | 53.3% bf16 MFU | 206920 tok/s +step 14364/19560 | loss 3.329137 (-0.13z)| norm 0.2647 (-0.62z)| lr 1.06e-04 | 2533.66 ms | 53.3% bf16 MFU | 206920 tok/s +step 14365/19560 | loss 3.318295 (-0.43z)| norm 0.2753 (+0.07z)| lr 1.06e-04 | 2533.74 ms | 53.3% bf16 MFU | 206920 tok/s +step 14366/19560 | loss 3.319101 (-0.42z)| norm 0.2756 (+0.09z)| lr 1.05e-04 | 2533.00 ms | 53.3% bf16 MFU | 206924 tok/s +step 14367/19560 | loss 3.311755 (-0.60z)| norm 0.2674 (-0.44z)| lr 1.05e-04 | 2533.17 ms | 53.3% bf16 MFU | 206926 tok/s +step 14368/19560 | loss 3.329856 (-0.12z)| norm 0.2629 (-0.73z)| lr 1.05e-04 | 2534.17 ms | 53.3% bf16 MFU | 206924 tok/s +step 14369/19560 | loss 3.334884 (+0.03z)| norm 0.2822 (+0.53z)| lr 1.05e-04 | 2533.64 ms | 53.3% bf16 MFU | 206924 tok/s +step 14370/19560 | loss 3.352880 (+0.52z)| norm 0.2764 (+0.15z)| lr 1.05e-04 | 2535.11 ms | 53.3% bf16 MFU | 206919 tok/s +step 14371/19560 | loss 3.348031 (+0.40z)| norm 0.2743 (+0.03z)| lr 1.05e-04 | 2533.52 ms | 53.3% bf16 MFU | 206920 tok/s +step 14372/19560 | loss 3.430234 (+2.57z)| norm 0.2863 (+0.82z)| lr 1.05e-04 | 2533.39 ms | 53.3% bf16 MFU | 206921 tok/s +step 14373/19560 | loss 3.430389 (+2.50z)| norm 0.2967 (+1.49z)| lr 1.05e-04 | 2533.53 ms | 53.3% bf16 MFU | 206922 tok/s +step 14374/19560 | loss 3.365642 (+0.80z)| norm 0.2698 (-0.27z)| lr 1.05e-04 | 2534.61 ms | 53.3% bf16 MFU | 206919 tok/s +step 14375/19560 | loss 3.345311 (+0.27z)| norm 0.2827 (+0.57z)| lr 1.05e-04 | 2531.51 ms | 53.3% bf16 MFU | 206928 tok/s +step 14376/19560 | loss 3.334168 (-0.02z)| norm 0.2754 (+0.11z)| lr 1.05e-04 | 2533.25 ms | 53.3% bf16 MFU | 206930 tok/s +step 14377/19560 | loss 3.311766 (-0.60z)| norm 0.2662 (-0.50z)| lr 1.05e-04 | 2534.65 ms | 53.3% bf16 MFU | 206926 tok/s +step 14378/19560 | loss 3.324887 (-0.27z)| norm 0.2650 (-0.57z)| lr 1.05e-04 | 2535.40 ms | 53.3% bf16 MFU | 206919 tok/s +step 14379/19560 | loss 3.347636 (+0.33z)| norm 0.2513 (-1.45z)| lr 1.05e-04 | 2536.63 ms | 53.2% bf16 MFU | 206907 tok/s +step 14380/19560 | loss 3.351640 (+0.42z)| norm 0.2894 (+1.04z)| lr 1.05e-04 | 2534.87 ms | 53.3% bf16 MFU | 206903 tok/s +step 14381/19560 | loss 3.329030 (-0.17z)| norm 0.2535 (-1.30z)| lr 1.05e-04 | 2535.32 ms | 53.3% bf16 MFU | 206898 tok/s +step 14382/19560 | loss 3.359925 (+0.65z)| norm 0.2709 (-0.16z)| lr 1.05e-04 | 2534.19 ms | 53.3% bf16 MFU | 206897 tok/s +step 14383/19560 | loss 3.382517 (+1.22z)| norm 0.2631 (-0.67z)| lr 1.05e-04 | 2533.67 ms | 53.3% bf16 MFU | 206899 tok/s +step 14384/19560 | loss 3.224380 (-2.81z)| norm 0.2710 (-0.16z)| lr 1.05e-04 | 2534.12 ms | 53.3% bf16 MFU | 206898 tok/s +step 14385/19560 | loss 3.314801 (-0.50z)| norm 0.2698 (-0.23z)| lr 1.05e-04 | 2535.12 ms | 53.3% bf16 MFU | 206894 tok/s +step 14386/19560 | loss 3.300509 (-0.86z)| norm 0.2447 (-1.88z)| lr 1.05e-04 | 2534.22 ms | 53.3% bf16 MFU | 206893 tok/s +step 14387/19560 | loss 3.326846 (-0.21z)| norm 0.2946 (+1.39z)| lr 1.05e-04 | 2534.92 ms | 53.3% bf16 MFU | 206890 tok/s +step 14388/19560 | loss 3.258079 (-1.95z)| norm 0.2434 (-1.97z)| lr 1.05e-04 | 2534.79 ms | 53.3% bf16 MFU | 206887 tok/s +step 14389/19560 | loss 3.327626 (-0.18z)| norm 0.2759 (+0.19z)| lr 1.05e-04 | 2534.89 ms | 53.3% bf16 MFU | 206884 tok/s +step 14390/19560 | loss 3.299137 (-0.91z)| norm 0.2439 (-1.91z)| lr 1.05e-04 | 2535.08 ms | 53.3% bf16 MFU | 206881 tok/s +step 14391/19560 | loss 3.347092 (+0.32z)| norm 0.2612 (-0.76z)| lr 1.05e-04 | 2533.85 ms | 53.3% bf16 MFU | 206883 tok/s +step 14392/19560 | loss 3.324351 (-0.26z)| norm 0.2539 (-1.22z)| lr 1.05e-04 | 2534.71 ms | 53.3% bf16 MFU | 206881 tok/s +step 14393/19560 | loss 3.321902 (-0.32z)| norm 0.2677 (-0.34z)| lr 1.04e-04 | 2532.57 ms | 53.3% bf16 MFU | 206887 tok/s +step 14394/19560 | loss 3.266494 (-1.72z)| norm 0.2745 (+0.11z)| lr 1.04e-04 | 2534.17 ms | 53.3% bf16 MFU | 206887 tok/s +step 14395/19560 | loss 3.320715 (-0.35z)| norm 0.2612 (-0.76z)| lr 1.04e-04 | 2535.86 ms | 53.2% bf16 MFU | 206881 tok/s +step 14396/19560 | loss 3.315869 (-0.46z)| norm 0.2598 (-0.84z)| lr 1.04e-04 | 2535.43 ms | 53.3% bf16 MFU | 206876 tok/s +step 14397/19560 | loss 3.336782 (+0.10z)| norm 0.2607 (-0.77z)| lr 1.04e-04 | 2533.34 ms | 53.3% bf16 MFU | 206880 tok/s +step 14398/19560 | loss 3.264363 (-1.81z)| norm 0.2731 (+0.07z)| lr 1.04e-04 | 2533.57 ms | 53.3% bf16 MFU | 206883 tok/s +step 14399/19560 | loss 3.359194 (+0.71z)| norm 0.2677 (-0.29z)| lr 1.04e-04 | 2534.04 ms | 53.3% bf16 MFU | 206883 tok/s +step 14400/19560 | loss 3.355438 (+0.60z)| norm 0.3136 (+2.71z)| lr 1.04e-04 | 2534.00 ms | 53.3% bf16 MFU | 206884 tok/s +step 14401/19560 | loss 3.261150 (-1.87z)| norm 0.2696 (-0.17z)| lr 1.04e-04 | 2533.22 ms | 53.3% bf16 MFU | 206888 tok/s +step 14402/19560 | loss 3.396029 (+1.69z)| norm 0.2678 (-0.28z)| lr 1.04e-04 | 2533.42 ms | 53.3% bf16 MFU | 206891 tok/s +step 14403/19560 | loss 3.271818 (-1.56z)| norm 0.2697 (-0.14z)| lr 1.04e-04 | 2534.00 ms | 53.3% bf16 MFU | 206892 tok/s +step 14404/19560 | loss 3.281218 (-1.30z)| norm 0.2734 (+0.12z)| lr 1.04e-04 | 2534.51 ms | 53.3% bf16 MFU | 206890 tok/s +step 14405/19560 | loss 3.384830 (+1.40z)| norm 0.2653 (-0.43z)| lr 1.04e-04 | 2533.63 ms | 53.3% bf16 MFU | 206892 tok/s +step 14406/19560 | loss 3.314920 (-0.42z)| norm 0.2993 (+1.91z)| lr 1.04e-04 | 2533.66 ms | 53.3% bf16 MFU | 206894 tok/s +step 14407/19560 | loss 3.349577 (+0.47z)| norm 0.2791 (+0.53z)| lr 1.04e-04 | 2533.76 ms | 53.3% bf16 MFU | 206895 tok/s +step 14408/19560 | loss 3.316451 (-0.40z)| norm 0.2816 (+0.70z)| lr 1.04e-04 | 2534.29 ms | 53.3% bf16 MFU | 206895 tok/s +step 14409/19560 | loss 3.306188 (-0.66z)| norm 0.2711 (-0.03z)| lr 1.04e-04 | 2532.24 ms | 53.3% bf16 MFU | 206902 tok/s +step 14410/19560 | loss 3.376261 (+1.17z)| norm 0.2758 (+0.31z)| lr 1.04e-04 | 2532.47 ms | 53.3% bf16 MFU | 206908 tok/s +step 14411/19560 | loss 3.327291 (-0.11z)| norm 0.2824 (+0.77z)| lr 1.04e-04 | 2533.10 ms | 53.3% bf16 MFU | 206912 tok/s +step 14412/19560 | loss 3.293429 (-0.98z)| norm 0.2614 (-0.72z)| lr 1.04e-04 | 2534.07 ms | 53.3% bf16 MFU | 206911 tok/s +step 14413/19560 | loss 3.339965 (+0.24z)| norm 0.2778 (+0.46z)| lr 1.04e-04 | 2535.02 ms | 53.3% bf16 MFU | 206906 tok/s +step 14414/19560 | loss 3.374785 (+1.14z)| norm 0.2632 (-0.58z)| lr 1.04e-04 | 2534.17 ms | 53.3% bf16 MFU | 206905 tok/s +step 14415/19560 | loss 3.312391 (-0.50z)| norm 0.2843 (+0.91z)| lr 1.04e-04 | 2532.93 ms | 53.3% bf16 MFU | 206909 tok/s +step 14416/19560 | loss 3.363126 (+0.82z)| norm 0.2837 (+0.86z)| lr 1.04e-04 | 2533.37 ms | 53.3% bf16 MFU | 206912 tok/s +step 14417/19560 | loss 3.415695 (+2.14z)| norm 0.2768 (+0.36z)| lr 1.04e-04 | 2533.97 ms | 53.3% bf16 MFU | 206911 tok/s +step 14418/19560 | loss 3.354603 (+0.56z)| norm 0.2787 (+0.49z)| lr 1.04e-04 | 2533.56 ms | 53.3% bf16 MFU | 206912 tok/s +step 14419/19560 | loss 3.362687 (+0.76z)| norm 0.2695 (-0.17z)| lr 1.03e-04 | 2534.38 ms | 53.3% bf16 MFU | 206910 tok/s +step 14420/19560 | loss 3.275741 (-1.46z)| norm 0.2697 (-0.16z)| lr 1.03e-04 | 2532.80 ms | 53.3% bf16 MFU | 206915 tok/s +step 14421/19560 | loss 3.386865 (+1.36z)| norm 0.2644 (-0.54z)| lr 1.03e-04 | 2532.32 ms | 53.3% bf16 MFU | 206921 tok/s +step 14422/19560 | loss 3.341630 (+0.21z)| norm 0.2858 (+0.98z)| lr 1.03e-04 | 2533.95 ms | 53.3% bf16 MFU | 206920 tok/s +step 14423/19560 | loss 3.357119 (+0.60z)| norm 0.2661 (-0.44z)| lr 1.03e-04 | 2532.99 ms | 53.3% bf16 MFU | 206923 tok/s +step 14424/19560 | loss 3.348160 (+0.39z)| norm 0.2691 (-0.23z)| lr 1.03e-04 | 2533.36 ms | 53.3% bf16 MFU | 206925 tok/s +step 14425/19560 | loss 3.326618 (-0.16z)| norm 0.2582 (-1.03z)| lr 1.03e-04 | 2533.10 ms | 53.3% bf16 MFU | 206927 tok/s +step 14426/19560 | loss 3.394471 (+1.55z)| norm 0.3077 (+2.61z)| lr 1.03e-04 | 2532.17 ms | 53.3% bf16 MFU | 206933 tok/s +step 14427/19560 | loss 3.359380 (+0.65z)| norm 0.2793 (+0.51z)| lr 1.03e-04 | 2534.03 ms | 53.3% bf16 MFU | 206932 tok/s +step 14428/19560 | loss 3.370418 (+0.92z)| norm 0.2666 (-0.42z)| lr 1.03e-04 | 2533.97 ms | 53.3% bf16 MFU | 206930 tok/s +step 14429/19560 | loss 3.299626 (-0.86z)| norm 0.2583 (-1.04z)| lr 1.03e-04 | 2534.16 ms | 53.3% bf16 MFU | 206928 tok/s +step 14430/19560 | loss 3.332886 (-0.01z)| norm 0.2737 (+0.09z)| lr 1.03e-04 | 2533.25 ms | 53.3% bf16 MFU | 206930 tok/s +step 14431/19560 | loss 3.336536 (+0.09z)| norm 0.2599 (-0.92z)| lr 1.03e-04 | 2534.61 ms | 53.3% bf16 MFU | 206926 tok/s +step 14432/19560 | loss 3.344889 (+0.29z)| norm 0.2580 (-1.05z)| lr 1.03e-04 | 2532.78 ms | 53.3% bf16 MFU | 206930 tok/s +step 14433/19560 | loss 3.368960 (+0.92z)| norm 0.2792 (+0.50z)| lr 1.03e-04 | 2534.44 ms | 53.3% bf16 MFU | 206927 tok/s +step 14434/19560 | loss 3.355128 (+0.55z)| norm 0.2704 (-0.14z)| lr 1.03e-04 | 2533.95 ms | 53.3% bf16 MFU | 206926 tok/s +step 14435/19560 | loss 3.223205 (-2.80z)| norm 0.2638 (-0.62z)| lr 1.03e-04 | 2533.63 ms | 53.3% bf16 MFU | 206926 tok/s +step 14436/19560 | loss 3.383482 (+1.25z)| norm 0.2698 (-0.18z)| lr 1.03e-04 | 2531.95 ms | 53.3% bf16 MFU | 206933 tok/s +step 14437/19560 | loss 3.435864 (+2.50z)| norm 0.3102 (+2.69z)| lr 1.03e-04 | 2534.00 ms | 53.3% bf16 MFU | 206931 tok/s +step 14438/19560 | loss 3.277781 (-1.40z)| norm 0.2731 (+0.05z)| lr 1.03e-04 | 2532.19 ms | 53.3% bf16 MFU | 206937 tok/s +step 14439/19560 | loss 3.324882 (-0.24z)| norm 0.2739 (+0.09z)| lr 1.03e-04 | 2531.78 ms | 53.3% bf16 MFU | 206945 tok/s +step 14440/19560 | loss 3.380832 (+1.13z)| norm 0.2744 (+0.13z)| lr 1.03e-04 | 2533.98 ms | 53.3% bf16 MFU | 206942 tok/s +step 14441/19560 | loss 3.326934 (-0.20z)| norm 0.2913 (+1.34z)| lr 1.03e-04 | 2534.92 ms | 53.3% bf16 MFU | 206937 tok/s +step 14442/19560 | loss 3.331841 (-0.09z)| norm 0.2695 (-0.23z)| lr 1.03e-04 | 2533.74 ms | 53.3% bf16 MFU | 206936 tok/s +step 14443/19560 | loss 3.342358 (+0.17z)| norm 0.2934 (+1.46z)| lr 1.03e-04 | 2534.13 ms | 53.3% bf16 MFU | 206934 tok/s +step 14444/19560 | loss 3.325826 (-0.25z)| norm 0.2610 (-0.85z)| lr 1.03e-04 | 2533.81 ms | 53.3% bf16 MFU | 206933 tok/s +step 14445/19560 | loss 3.303432 (-0.80z)| norm 0.2776 (+0.33z)| lr 1.03e-04 | 2533.57 ms | 53.3% bf16 MFU | 206933 tok/s +step 14446/19560 | loss 3.324682 (-0.25z)| norm 0.2623 (-0.76z)| lr 1.02e-04 | 2533.27 ms | 53.3% bf16 MFU | 206934 tok/s +step 14447/19560 | loss 3.343783 (+0.23z)| norm 0.2592 (-0.96z)| lr 1.02e-04 | 2532.83 ms | 53.3% bf16 MFU | 206938 tok/s +step 14448/19560 | loss 3.328041 (-0.18z)| norm 0.2713 (-0.09z)| lr 1.02e-04 | 2533.15 ms | 53.3% bf16 MFU | 206939 tok/s +step 14449/19560 | loss 3.349769 (+0.38z)| norm 0.2686 (-0.28z)| lr 1.02e-04 | 2535.94 ms | 53.2% bf16 MFU | 206929 tok/s +step 14450/19560 | loss 3.359874 (+0.63z)| norm 0.2538 (-1.33z)| lr 1.02e-04 | 2535.07 ms | 53.3% bf16 MFU | 206924 tok/s +step 14451/19560 | loss 3.307605 (-0.71z)| norm 0.2622 (-0.72z)| lr 1.02e-04 | 2536.35 ms | 53.2% bf16 MFU | 206913 tok/s +step 14452/19560 | loss 3.270075 (-1.64z)| norm 0.2648 (-0.53z)| lr 1.02e-04 | 2534.69 ms | 53.3% bf16 MFU | 206910 tok/s +step 14453/19560 | loss 3.300265 (-0.86z)| norm 0.2473 (-1.74z)| lr 1.02e-04 | 2532.89 ms | 53.3% bf16 MFU | 206914 tok/s +step 14454/19560 | loss 3.284534 (-1.25z)| norm 0.2794 (+0.51z)| lr 1.02e-04 | 2535.05 ms | 53.3% bf16 MFU | 206909 tok/s +step 14455/19560 | loss 3.309773 (-0.61z)| norm 0.2622 (-0.71z)| lr 1.02e-04 | 2533.81 ms | 53.3% bf16 MFU | 206909 tok/s +step 14456/19560 | loss 3.408306 (+1.84z)| norm 0.2593 (-0.91z)| lr 1.02e-04 | 2533.20 ms | 53.3% bf16 MFU | 206912 tok/s +step 14457/19560 | loss 3.302814 (-0.77z)| norm 0.2738 (+0.12z)| lr 1.02e-04 | 2534.28 ms | 53.3% bf16 MFU | 206910 tok/s +step 14458/19560 | loss 3.332123 (-0.02z)| norm 0.2679 (-0.31z)| lr 1.02e-04 | 2533.96 ms | 53.3% bf16 MFU | 206910 tok/s +step 14459/19560 | loss 3.320082 (-0.32z)| norm 0.2872 (+1.05z)| lr 1.02e-04 | 2533.32 ms | 53.3% bf16 MFU | 206912 tok/s +step 14460/19560 | loss 3.346630 (+0.34z)| norm 0.2622 (-0.72z)| lr 1.02e-04 | 2534.37 ms | 53.3% bf16 MFU | 206910 tok/s +step 14461/19560 | loss 3.333816 (+0.01z)| norm 0.2742 (+0.13z)| lr 1.02e-04 | 2534.61 ms | 53.3% bf16 MFU | 206907 tok/s +step 14462/19560 | loss 3.340382 (+0.18z)| norm 0.2609 (-0.81z)| lr 1.02e-04 | 2534.88 ms | 53.3% bf16 MFU | 206903 tok/s +step 14463/19560 | loss 3.358227 (+0.62z)| norm 0.2638 (-0.60z)| lr 1.02e-04 | 2533.08 ms | 53.3% bf16 MFU | 206907 tok/s +step 14464/19560 | loss 3.319100 (-0.38z)| norm 0.2668 (-0.39z)| lr 1.02e-04 | 2534.71 ms | 53.3% bf16 MFU | 206904 tok/s +step 14465/19560 | loss 3.313412 (-0.52z)| norm 0.2843 (+0.84z)| lr 1.02e-04 | 2531.70 ms | 53.3% bf16 MFU | 206913 tok/s +step 14466/19560 | loss 3.516178 (+4.28z)| norm 0.2600 (-0.86z)| lr 1.02e-04 | 2532.43 ms | 53.3% bf16 MFU | 206919 tok/s +step 14467/19560 | loss 3.350173 (+0.34z)| norm 0.2938 (+1.50z)| lr 1.02e-04 | 2533.48 ms | 53.3% bf16 MFU | 206920 tok/s +step 14468/19560 | loss 3.300446 (-0.84z)| norm 0.2624 (-0.72z)| lr 1.02e-04 | 2532.58 ms | 53.3% bf16 MFU | 206925 tok/s +step 14469/19560 | loss 3.402146 (+1.55z)| norm 0.2869 (+1.17z)| lr 1.02e-04 | 2533.94 ms | 53.3% bf16 MFU | 206924 tok/s +step 14470/19560 | loss 3.255075 (-1.87z)| norm 0.2615 (-0.80z)| lr 1.02e-04 | 2531.82 ms | 53.3% bf16 MFU | 206932 tok/s +step 14471/19560 | loss 3.385258 (+1.14z)| norm 0.2685 (-0.24z)| lr 1.02e-04 | 2532.36 ms | 53.3% bf16 MFU | 206937 tok/s +step 14472/19560 | loss 3.334419 (-0.04z)| norm 0.2880 (+1.28z)| lr 1.01e-04 | 2532.32 ms | 53.3% bf16 MFU | 206942 tok/s +step 14473/19560 | loss 3.278141 (-1.33z)| norm 0.3039 (+2.44z)| lr 1.01e-04 | 2532.99 ms | 53.3% bf16 MFU | 206944 tok/s +step 14474/19560 | loss 3.341403 (+0.12z)| norm 0.2832 (+0.85z)| lr 1.01e-04 | 2534.35 ms | 53.3% bf16 MFU | 206941 tok/s +step 14475/19560 | loss 3.330713 (-0.13z)| norm 0.2853 (+0.99z)| lr 1.01e-04 | 2534.12 ms | 53.3% bf16 MFU | 206938 tok/s +step 14476/19560 | loss 3.299902 (-0.84z)| norm 0.2634 (-0.67z)| lr 1.01e-04 | 2533.96 ms | 53.3% bf16 MFU | 206936 tok/s +step 14477/19560 | loss 3.311349 (-0.57z)| norm 0.2687 (-0.28z)| lr 1.01e-04 | 2532.91 ms | 53.3% bf16 MFU | 206939 tok/s +step 14478/19560 | loss 3.336569 (+0.02z)| norm 0.2771 (+0.38z)| lr 1.01e-04 | 2533.91 ms | 53.3% bf16 MFU | 206938 tok/s +step 14479/19560 | loss 3.294765 (-0.95z)| norm 0.2988 (+2.04z)| lr 1.01e-04 | 2533.46 ms | 53.3% bf16 MFU | 206938 tok/s +step 14480/19560 | loss 3.264226 (-1.64z)| norm 0.2745 (+0.17z)| lr 1.01e-04 | 2534.82 ms | 53.3% bf16 MFU | 206933 tok/s +step 14481/19560 | loss 3.376590 (+0.96z)| norm 0.2801 (+0.59z)| lr 1.01e-04 | 2532.89 ms | 53.3% bf16 MFU | 206936 tok/s +step 14482/19560 | loss 3.317954 (-0.40z)| norm 0.2881 (+1.19z)| lr 1.01e-04 | 2532.85 ms | 53.3% bf16 MFU | 206939 tok/s +step 14483/19560 | loss 3.276637 (-1.34z)| norm 0.2715 (-0.09z)| lr 1.01e-04 | 2532.31 ms | 53.3% bf16 MFU | 206944 tok/s +step 14484/19560 | loss 3.314674 (-0.46z)| norm 0.2534 (-1.47z)| lr 1.01e-04 | 2532.43 ms | 53.3% bf16 MFU | 206948 tok/s +step 14485/19560 | loss 3.386624 (+1.23z)| norm 0.2707 (-0.13z)| lr 1.01e-04 | 2534.12 ms | 53.3% bf16 MFU | 206945 tok/s +step 14486/19560 | loss 3.370794 (+0.85z)| norm 0.2748 (+0.19z)| lr 1.01e-04 | 2533.79 ms | 53.3% bf16 MFU | 206944 tok/s +step 14487/19560 | loss 3.325031 (-0.23z)| norm 0.2720 (-0.02z)| lr 1.01e-04 | 2534.32 ms | 53.3% bf16 MFU | 206940 tok/s +step 14488/19560 | loss 3.373711 (+0.90z)| norm 0.2810 (+0.67z)| lr 1.01e-04 | 2532.66 ms | 53.3% bf16 MFU | 206944 tok/s +step 14489/19560 | loss 3.346570 (+0.27z)| norm 0.2860 (+1.04z)| lr 1.01e-04 | 2536.31 ms | 53.2% bf16 MFU | 206932 tok/s +step 14490/19560 | loss 3.347550 (+0.31z)| norm 0.2522 (-1.54z)| lr 1.01e-04 | 2535.81 ms | 53.2% bf16 MFU | 206923 tok/s +step 14491/19560 | loss 3.395330 (+1.42z)| norm 0.2976 (+1.90z)| lr 1.01e-04 | 2534.02 ms | 53.3% bf16 MFU | 206922 tok/s +step 14492/19560 | loss 3.460582 (+2.83z)| norm 0.3373 (+4.46z)| lr 1.01e-04 | 2533.20 ms | 53.3% bf16 MFU | 206924 tok/s +step 14493/19560 | loss 3.308423 (-0.62z)| norm 0.2851 (+0.83z)| lr 1.01e-04 | 2537.08 ms | 53.2% bf16 MFU | 206911 tok/s +step 14494/19560 | loss 3.293384 (-0.96z)| norm 0.2901 (+1.16z)| lr 1.01e-04 | 2536.13 ms | 53.2% bf16 MFU | 206902 tok/s +step 14495/19560 | loss 3.337151 (+0.03z)| norm 0.2728 (-0.03z)| lr 1.01e-04 | 2533.62 ms | 53.3% bf16 MFU | 206903 tok/s +step 14496/19560 | loss 3.334579 (-0.03z)| norm 0.2802 (+0.47z)| lr 1.01e-04 | 2534.35 ms | 53.3% bf16 MFU | 206902 tok/s +step 14497/19560 | loss 3.358069 (+0.50z)| norm 0.2606 (-0.87z)| lr 1.01e-04 | 2533.35 ms | 53.3% bf16 MFU | 206904 tok/s +step 14498/19560 | loss 3.334597 (-0.03z)| norm 0.2790 (+0.40z)| lr 1.01e-04 | 2533.12 ms | 53.3% bf16 MFU | 206908 tok/s +step 14499/19560 | loss 3.301674 (-0.77z)| norm 0.2578 (-1.05z)| lr 1.00e-04 | 2534.23 ms | 53.3% bf16 MFU | 206906 tok/s +step 14500/19560 | loss 3.433770 (+2.22z)| norm 0.2888 (+1.07z)| lr 1.00e-04 | 2532.61 ms | 53.3% bf16 MFU | 206912 tok/s +val loss 3.321125 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 2998/10042 = 0.298546 +step 14501/19560 | loss 3.349220 (+0.33z)| norm 0.2831 (+0.69z)| lr 1.00e-04 | 2533.50 ms | 53.3% bf16 MFU | 206913 tok/s +step 14502/19560 | loss 3.321272 (-0.31z)| norm 0.2990 (+1.76z)| lr 1.00e-04 | 2536.00 ms | 53.2% bf16 MFU | 206905 tok/s +step 14503/19560 | loss 3.264333 (-1.59z)| norm 0.2595 (-0.93z)| lr 1.00e-04 | 2533.69 ms | 53.3% bf16 MFU | 206906 tok/s +step 14504/19560 | loss 3.324432 (-0.22z)| norm 0.2632 (-0.67z)| lr 1.00e-04 | 2532.63 ms | 53.3% bf16 MFU | 206911 tok/s +step 14505/19560 | loss 3.329713 (-0.10z)| norm 0.2732 (+0.01z)| lr 1.00e-04 | 2533.90 ms | 53.3% bf16 MFU | 206911 tok/s +step 14506/19560 | loss 3.314645 (-0.44z)| norm 0.2717 (-0.09z)| lr 1.00e-04 | 2534.40 ms | 53.3% bf16 MFU | 206909 tok/s +step 14507/19560 | loss 3.286321 (-1.07z)| norm 0.2749 (+0.11z)| lr 1.00e-04 | 2533.78 ms | 53.3% bf16 MFU | 206909 tok/s +step 14508/19560 | loss 3.271455 (-1.38z)| norm 0.2896 (+1.12z)| lr 1.00e-04 | 2531.72 ms | 53.3% bf16 MFU | 206918 tok/s +step 14509/19560 | loss 3.320817 (-0.27z)| norm 0.2783 (+0.34z)| lr 1.00e-04 | 2532.09 ms | 53.3% bf16 MFU | 206925 tok/s +step 14510/19560 | loss 3.372866 (+0.90z)| norm 0.2902 (+1.14z)| lr 1.00e-04 | 2531.88 ms | 53.3% bf16 MFU | 206933 tok/s +step 14511/19560 | loss 3.246161 (-1.91z)| norm 0.2862 (+0.85z)| lr 1.00e-04 | 2532.77 ms | 53.3% bf16 MFU | 206936 tok/s +step 14512/19560 | loss 3.390279 (+1.30z)| norm 0.2745 (+0.04z)| lr 1.00e-04 | 2532.57 ms | 53.3% bf16 MFU | 206940 tok/s +step 14513/19560 | loss 3.341210 (+0.18z)| norm 0.2752 (+0.09z)| lr 1.00e-04 | 2535.24 ms | 53.3% bf16 MFU | 206933 tok/s +step 14514/19560 | loss 3.337376 (+0.09z)| norm 0.2664 (-0.54z)| lr 9.99e-05 | 2533.17 ms | 53.3% bf16 MFU | 206935 tok/s +step 14515/19560 | loss 3.310855 (-0.51z)| norm 0.2683 (-0.39z)| lr 9.99e-05 | 2532.22 ms | 53.3% bf16 MFU | 206941 tok/s +step 14516/19560 | loss 3.346208 (+0.28z)| norm 0.2689 (-0.37z)| lr 9.98e-05 | 2532.96 ms | 53.3% bf16 MFU | 206943 tok/s +step 14517/19560 | loss 3.306327 (-0.63z)| norm 0.2726 (-0.10z)| lr 9.98e-05 | 2533.95 ms | 53.3% bf16 MFU | 206941 tok/s +step 14518/19560 | loss 3.374234 (+0.91z)| norm 0.2737 (-0.04z)| lr 9.98e-05 | 2533.21 ms | 53.3% bf16 MFU | 206942 tok/s +step 14519/19560 | loss 3.309005 (-0.58z)| norm 0.2702 (-0.30z)| lr 9.97e-05 | 2532.99 ms | 53.3% bf16 MFU | 206944 tok/s +step 14520/19560 | loss 3.327355 (-0.16z)| norm 0.2644 (-0.74z)| lr 9.97e-05 | 2534.39 ms | 53.3% bf16 MFU | 206941 tok/s +step 14521/19560 | loss 3.337775 (+0.08z)| norm 0.2762 (+0.13z)| lr 9.97e-05 | 2533.87 ms | 53.3% bf16 MFU | 206939 tok/s +step 14522/19560 | loss 3.284899 (-1.14z)| norm 0.2478 (-1.92z)| lr 9.96e-05 | 2534.20 ms | 53.3% bf16 MFU | 206936 tok/s +step 14523/19560 | loss 3.360100 (+0.58z)| norm 0.2761 (+0.13z)| lr 9.96e-05 | 2532.97 ms | 53.3% bf16 MFU | 206939 tok/s +step 14524/19560 | loss 3.318492 (-0.38z)| norm 0.2552 (-1.40z)| lr 9.95e-05 | 2533.47 ms | 53.3% bf16 MFU | 206939 tok/s +step 14525/19560 | loss 3.302179 (-0.74z)| norm 0.2609 (-0.97z)| lr 9.95e-05 | 2532.48 ms | 53.3% bf16 MFU | 206944 tok/s +step 14526/19560 | loss 3.369296 (+0.78z)| norm 0.2701 (-0.31z)| lr 9.95e-05 | 2531.41 ms | 53.3% bf16 MFU | 206952 tok/s +step 14527/19560 | loss 3.315339 (-0.46z)| norm 0.2642 (-0.73z)| lr 9.94e-05 | 2530.74 ms | 53.4% bf16 MFU | 206963 tok/s +step 14528/19560 | loss 3.341151 (+0.14z)| norm 0.2645 (-0.71z)| lr 9.94e-05 | 2529.73 ms | 53.4% bf16 MFU | 206977 tok/s +step 14529/19560 | loss 3.265805 (-1.60z)| norm 0.2627 (-0.83z)| lr 9.94e-05 | 2532.08 ms | 53.3% bf16 MFU | 206981 tok/s +step 14530/19560 | loss 3.291270 (-1.00z)| norm 0.2672 (-0.50z)| lr 9.93e-05 | 2532.51 ms | 53.3% bf16 MFU | 206983 tok/s +step 14531/19560 | loss 3.335821 (+0.03z)| norm 0.2573 (-1.22z)| lr 9.93e-05 | 2533.74 ms | 53.3% bf16 MFU | 206980 tok/s +step 14532/19560 | loss 3.390367 (+1.29z)| norm 0.2758 (+0.15z)| lr 9.92e-05 | 2532.81 ms | 53.3% bf16 MFU | 206981 tok/s +step 14533/19560 | loss 3.360170 (+0.58z)| norm 0.2657 (-0.60z)| lr 9.92e-05 | 2531.87 ms | 53.3% bf16 MFU | 206986 tok/s +step 14534/19560 | loss 3.344565 (+0.21z)| norm 0.2586 (-1.12z)| lr 9.92e-05 | 2532.42 ms | 53.3% bf16 MFU | 206988 tok/s +step 14535/19560 | loss 3.383576 (+1.12z)| norm 0.2519 (-1.58z)| lr 9.91e-05 | 2532.42 ms | 53.3% bf16 MFU | 206990 tok/s +step 14536/19560 | loss 3.263050 (-1.68z)| norm 0.3120 (+2.78z)| lr 9.91e-05 | 2532.74 ms | 53.3% bf16 MFU | 206991 tok/s +step 14537/19560 | loss 3.350054 (+0.33z)| norm 0.2608 (-0.91z)| lr 9.91e-05 | 2533.14 ms | 53.3% bf16 MFU | 206990 tok/s +step 14538/19560 | loss 3.300758 (-0.80z)| norm 0.2668 (-0.47z)| lr 9.90e-05 | 2534.62 ms | 53.3% bf16 MFU | 206983 tok/s +step 14539/19560 | loss 3.368372 (+0.76z)| norm 0.2574 (-1.12z)| lr 9.90e-05 | 2533.43 ms | 53.3% bf16 MFU | 206981 tok/s +step 14540/19560 | loss 3.306348 (-0.68z)| norm 0.2651 (-0.58z)| lr 9.90e-05 | 2533.33 ms | 53.3% bf16 MFU | 206980 tok/s +step 14541/19560 | loss 3.312747 (-0.53z)| norm 0.2620 (-0.79z)| lr 9.89e-05 | 2533.26 ms | 53.3% bf16 MFU | 206979 tok/s +step 14542/19560 | loss 3.321714 (-0.31z)| norm 0.2831 (+0.71z)| lr 9.89e-05 | 2534.37 ms | 53.3% bf16 MFU | 206974 tok/s +step 14543/19560 | loss 3.349023 (+0.32z)| norm 0.2798 (+0.48z)| lr 9.88e-05 | 2531.92 ms | 53.3% bf16 MFU | 206979 tok/s +step 14544/19560 | loss 3.366095 (+0.72z)| norm 0.2938 (+1.47z)| lr 9.88e-05 | 2534.28 ms | 53.3% bf16 MFU | 206974 tok/s +step 14545/19560 | loss 3.242502 (-2.12z)| norm 0.2684 (-0.34z)| lr 9.88e-05 | 2531.75 ms | 53.3% bf16 MFU | 206979 tok/s +step 14546/19560 | loss 3.356992 (+0.53z)| norm 0.2630 (-0.71z)| lr 9.87e-05 | 2532.49 ms | 53.3% bf16 MFU | 206981 tok/s +step 14547/19560 | loss 3.287157 (-1.07z)| norm 0.2681 (-0.35z)| lr 9.87e-05 | 2535.44 ms | 53.3% bf16 MFU | 206972 tok/s +step 14548/19560 | loss 3.311316 (-0.52z)| norm 0.2647 (-0.59z)| lr 9.87e-05 | 2532.98 ms | 53.3% bf16 MFU | 206972 tok/s +step 14549/19560 | loss 3.371096 (+0.88z)| norm 0.2757 (+0.19z)| lr 9.86e-05 | 2533.28 ms | 53.3% bf16 MFU | 206972 tok/s +step 14550/19560 | loss 3.339715 (+0.14z)| norm 0.2783 (+0.37z)| lr 9.86e-05 | 2532.80 ms | 53.3% bf16 MFU | 206973 tok/s +step 14551/19560 | loss 3.307037 (-0.61z)| norm 0.2708 (-0.16z)| lr 9.85e-05 | 2533.68 ms | 53.3% bf16 MFU | 206971 tok/s +step 14552/19560 | loss 3.326888 (-0.14z)| norm 0.2854 (+0.87z)| lr 9.85e-05 | 2535.06 ms | 53.3% bf16 MFU | 206963 tok/s +step 14553/19560 | loss 3.401561 (+1.57z)| norm 0.2841 (+0.77z)| lr 9.85e-05 | 2532.03 ms | 53.3% bf16 MFU | 206968 tok/s +step 14554/19560 | loss 3.281880 (-1.18z)| norm 0.2997 (+1.90z)| lr 9.84e-05 | 2532.89 ms | 53.3% bf16 MFU | 206969 tok/s +step 14555/19560 | loss 3.326616 (-0.13z)| norm 0.2791 (+0.41z)| lr 9.84e-05 | 2531.99 ms | 53.3% bf16 MFU | 206974 tok/s +step 14556/19560 | loss 3.291466 (-0.94z)| norm 0.2791 (+0.41z)| lr 9.84e-05 | 2532.59 ms | 53.3% bf16 MFU | 206976 tok/s +step 14557/19560 | loss 3.407939 (+1.73z)| norm 0.3107 (+2.60z)| lr 9.83e-05 | 2532.67 ms | 53.3% bf16 MFU | 206978 tok/s +step 14558/19560 | loss 3.250641 (-1.85z)| norm 0.2995 (+1.77z)| lr 9.83e-05 | 2535.47 ms | 53.3% bf16 MFU | 206968 tok/s +step 14559/19560 | loss 3.343994 (+0.27z)| norm 0.2769 (+0.19z)| lr 9.82e-05 | 2533.22 ms | 53.3% bf16 MFU | 206968 tok/s +step 14560/19560 | loss 3.360206 (+0.63z)| norm 0.2767 (+0.17z)| lr 9.82e-05 | 2533.45 ms | 53.3% bf16 MFU | 206967 tok/s +step 14561/19560 | loss 3.264551 (-1.50z)| norm 0.2550 (-1.34z)| lr 9.82e-05 | 2535.13 ms | 53.3% bf16 MFU | 206959 tok/s +step 14562/19560 | loss 3.305941 (-0.56z)| norm 0.2928 (+1.28z)| lr 9.81e-05 | 2534.36 ms | 53.3% bf16 MFU | 206954 tok/s +step 14563/19560 | loss 3.313102 (-0.43z)| norm 0.2898 (+1.06z)| lr 9.81e-05 | 2534.71 ms | 53.3% bf16 MFU | 206949 tok/s +step 14564/19560 | loss 3.333318 (+0.05z)| norm 0.2603 (-0.98z)| lr 9.81e-05 | 2532.59 ms | 53.3% bf16 MFU | 206952 tok/s +step 14565/19560 | loss 3.332207 (+0.04z)| norm 0.2682 (-0.41z)| lr 9.80e-05 | 2534.20 ms | 53.3% bf16 MFU | 206949 tok/s +step 14566/19560 | loss 3.308583 (-0.53z)| norm 0.2758 (+0.12z)| lr 9.80e-05 | 2536.14 ms | 53.2% bf16 MFU | 206938 tok/s +step 14567/19560 | loss 3.369460 (+0.91z)| norm 0.2559 (-1.27z)| lr 9.80e-05 | 2534.67 ms | 53.3% bf16 MFU | 206933 tok/s +step 14568/19560 | loss 3.329786 (-0.02z)| norm 0.2555 (-1.28z)| lr 9.79e-05 | 2533.06 ms | 53.3% bf16 MFU | 206936 tok/s +step 14569/19560 | loss 3.335914 (+0.12z)| norm 0.2877 (+0.97z)| lr 9.79e-05 | 2535.18 ms | 53.3% bf16 MFU | 206929 tok/s +step 14570/19560 | loss 3.345444 (+0.35z)| norm 0.2651 (-0.60z)| lr 9.78e-05 | 2532.41 ms | 53.3% bf16 MFU | 206934 tok/s +step 14571/19560 | loss 3.334247 (+0.08z)| norm 0.2673 (-0.44z)| lr 9.78e-05 | 2530.14 ms | 53.4% bf16 MFU | 206948 tok/s +step 14572/19560 | loss 3.244918 (-2.00z)| norm 0.2837 (+0.70z)| lr 9.78e-05 | 2532.83 ms | 53.3% bf16 MFU | 206951 tok/s +step 14573/19560 | loss 3.403918 (+1.70z)| norm 0.2527 (-1.45z)| lr 9.77e-05 | 2532.95 ms | 53.3% bf16 MFU | 206953 tok/s +step 14574/19560 | loss 3.326582 (-0.10z)| norm 0.2692 (-0.31z)| lr 9.77e-05 | 2533.67 ms | 53.3% bf16 MFU | 206951 tok/s +step 14575/19560 | loss 3.255179 (-1.73z)| norm 0.2632 (-0.73z)| lr 9.77e-05 | 2535.07 ms | 53.3% bf16 MFU | 206944 tok/s +step 14576/19560 | loss 3.323248 (-0.16z)| norm 0.2711 (-0.18z)| lr 9.76e-05 | 2534.37 ms | 53.3% bf16 MFU | 206941 tok/s +step 14577/19560 | loss 3.285320 (-1.02z)| norm 0.2575 (-1.12z)| lr 9.76e-05 | 2534.22 ms | 53.3% bf16 MFU | 206938 tok/s +step 14578/19560 | loss 3.427436 (+2.19z)| norm 0.2833 (+0.66z)| lr 9.75e-05 | 2532.84 ms | 53.3% bf16 MFU | 206941 tok/s +step 14579/19560 | loss 3.349191 (+0.42z)| norm 0.2679 (-0.42z)| lr 9.75e-05 | 2535.24 ms | 53.3% bf16 MFU | 206934 tok/s +step 14580/19560 | loss 3.308691 (-0.50z)| norm 0.2684 (-0.38z)| lr 9.75e-05 | 2534.22 ms | 53.3% bf16 MFU | 206931 tok/s +step 14581/19560 | loss 3.394663 (+1.42z)| norm 0.2891 (+1.06z)| lr 9.74e-05 | 2532.59 ms | 53.3% bf16 MFU | 206936 tok/s +step 14582/19560 | loss 3.323649 (-0.19z)| norm 0.2524 (-1.52z)| lr 9.74e-05 | 2533.79 ms | 53.3% bf16 MFU | 206935 tok/s +step 14583/19560 | loss 3.280592 (-1.15z)| norm 0.2761 (+0.14z)| lr 9.74e-05 | 2531.60 ms | 53.3% bf16 MFU | 206943 tok/s +step 14584/19560 | loss 3.391717 (+1.36z)| norm 0.2676 (-0.46z)| lr 9.73e-05 | 2534.89 ms | 53.3% bf16 MFU | 206937 tok/s +step 14585/19560 | loss 3.294291 (-0.84z)| norm 0.2618 (-0.86z)| lr 9.73e-05 | 2533.71 ms | 53.3% bf16 MFU | 206936 tok/s +step 14586/19560 | loss 3.321770 (-0.22z)| norm 0.2857 (+0.81z)| lr 9.73e-05 | 2533.18 ms | 53.3% bf16 MFU | 206938 tok/s +step 14587/19560 | loss 3.327185 (-0.10z)| norm 0.2776 (+0.25z)| lr 9.72e-05 | 2534.66 ms | 53.3% bf16 MFU | 206934 tok/s +step 14588/19560 | loss 3.341425 (+0.23z)| norm 0.2610 (-0.93z)| lr 9.72e-05 | 2533.18 ms | 53.3% bf16 MFU | 206935 tok/s +step 14589/19560 | loss 3.369257 (+0.85z)| norm 0.2881 (+0.98z)| lr 9.71e-05 | 2534.27 ms | 53.3% bf16 MFU | 206932 tok/s +step 14590/19560 | loss 3.276881 (-1.22z)| norm 0.2796 (+0.37z)| lr 9.71e-05 | 2533.83 ms | 53.3% bf16 MFU | 206932 tok/s +step 14591/19560 | loss 3.337871 (+0.15z)| norm 0.2553 (-1.34z)| lr 9.71e-05 | 2531.50 ms | 53.3% bf16 MFU | 206940 tok/s +step 14592/19560 | loss 3.365122 (+0.76z)| norm 0.2739 (-0.03z)| lr 9.70e-05 | 2533.16 ms | 53.3% bf16 MFU | 206942 tok/s +step 14593/19560 | loss 3.277135 (-1.20z)| norm 0.2571 (-1.19z)| lr 9.70e-05 | 2532.96 ms | 53.3% bf16 MFU | 206944 tok/s +step 14594/19560 | loss 3.286430 (-1.02z)| norm 0.2684 (-0.40z)| lr 9.70e-05 | 2533.32 ms | 53.3% bf16 MFU | 206945 tok/s +step 14595/19560 | loss 3.322792 (-0.15z)| norm 0.2768 (+0.20z)| lr 9.69e-05 | 2534.96 ms | 53.3% bf16 MFU | 206938 tok/s +step 14596/19560 | loss 3.338278 (+0.21z)| norm 0.2566 (-1.23z)| lr 9.69e-05 | 2534.08 ms | 53.3% bf16 MFU | 206936 tok/s +step 14597/19560 | loss 3.292766 (-0.86z)| norm 0.2799 (+0.42z)| lr 9.68e-05 | 2533.88 ms | 53.3% bf16 MFU | 206935 tok/s +step 14598/19560 | loss 3.355700 (+0.64z)| norm 0.2626 (-0.81z)| lr 9.68e-05 | 2532.06 ms | 53.3% bf16 MFU | 206941 tok/s +step 14599/19560 | loss 3.318387 (-0.26z)| norm 0.2846 (+0.74z)| lr 9.68e-05 | 2532.63 ms | 53.3% bf16 MFU | 206945 tok/s +step 14600/19560 | loss 3.411722 (+1.99z)| norm 0.2986 (+1.71z)| lr 9.67e-05 | 2532.95 ms | 53.3% bf16 MFU | 206947 tok/s +step 14601/19560 | loss 3.360645 (+0.74z)| norm 0.2738 (-0.01z)| lr 9.67e-05 | 2532.92 ms | 53.3% bf16 MFU | 206949 tok/s +step 14602/19560 | loss 3.329227 (-0.02z)| norm 0.2767 (+0.20z)| lr 9.67e-05 | 2533.36 ms | 53.3% bf16 MFU | 206949 tok/s +step 14603/19560 | loss 3.323774 (-0.15z)| norm 0.2802 (+0.46z)| lr 9.66e-05 | 2531.70 ms | 53.3% bf16 MFU | 206956 tok/s +step 14604/19560 | loss 3.319250 (-0.26z)| norm 0.2549 (-1.34z)| lr 9.66e-05 | 2532.06 ms | 53.3% bf16 MFU | 206962 tok/s +step 14605/19560 | loss 3.325123 (-0.12z)| norm 0.2712 (-0.18z)| lr 9.66e-05 | 2531.06 ms | 53.3% bf16 MFU | 206970 tok/s +step 14606/19560 | loss 3.359788 (+0.72z)| norm 0.2683 (-0.39z)| lr 9.65e-05 | 2533.44 ms | 53.3% bf16 MFU | 206969 tok/s +step 14607/19560 | loss 3.318276 (-0.30z)| norm 0.2615 (-0.86z)| lr 9.65e-05 | 2533.07 ms | 53.3% bf16 MFU | 206970 tok/s +step 14608/19560 | loss 3.344468 (+0.33z)| norm 0.2729 (-0.04z)| lr 9.64e-05 | 2535.32 ms | 53.3% bf16 MFU | 206961 tok/s +step 14609/19560 | loss 3.354127 (+0.57z)| norm 0.2800 (+0.47z)| lr 9.64e-05 | 2532.07 ms | 53.3% bf16 MFU | 206966 tok/s +step 14610/19560 | loss 3.334478 (+0.08z)| norm 0.2699 (-0.24z)| lr 9.64e-05 | 2533.68 ms | 53.3% bf16 MFU | 206964 tok/s +step 14611/19560 | loss 3.314879 (-0.41z)| norm 0.2679 (-0.39z)| lr 9.63e-05 | 2533.87 ms | 53.3% bf16 MFU | 206961 tok/s +step 14612/19560 | loss 3.331688 (+0.00z)| norm 0.2745 (+0.08z)| lr 9.63e-05 | 2534.02 ms | 53.3% bf16 MFU | 206958 tok/s +step 14613/19560 | loss 3.354755 (+0.59z)| norm 0.2858 (+0.89z)| lr 9.63e-05 | 2532.34 ms | 53.3% bf16 MFU | 206962 tok/s +step 14614/19560 | loss 3.364589 (+0.84z)| norm 0.2732 (-0.03z)| lr 9.62e-05 | 2533.06 ms | 53.3% bf16 MFU | 206963 tok/s +step 14615/19560 | loss 3.379801 (+1.20z)| norm 0.2648 (-0.63z)| lr 9.62e-05 | 2535.10 ms | 53.3% bf16 MFU | 206955 tok/s +step 14616/19560 | loss 3.347043 (+0.39z)| norm 0.2688 (-0.33z)| lr 9.61e-05 | 2534.04 ms | 53.3% bf16 MFU | 206952 tok/s +step 14617/19560 | loss 3.365097 (+0.84z)| norm 0.3104 (+2.61z)| lr 9.61e-05 | 2533.00 ms | 53.3% bf16 MFU | 206954 tok/s +step 14618/19560 | loss 3.383477 (+1.28z)| norm 0.2792 (+0.38z)| lr 9.61e-05 | 2534.35 ms | 53.3% bf16 MFU | 206950 tok/s +step 14619/19560 | loss 3.383758 (+1.30z)| norm 0.2701 (-0.25z)| lr 9.60e-05 | 2534.30 ms | 53.3% bf16 MFU | 206946 tok/s +step 14620/19560 | loss 3.275040 (-1.43z)| norm 0.2574 (-1.22z)| lr 9.60e-05 | 2535.59 ms | 53.2% bf16 MFU | 206938 tok/s +step 14621/19560 | loss 3.360353 (+0.76z)| norm 0.2774 (+0.36z)| lr 9.60e-05 | 2534.13 ms | 53.3% bf16 MFU | 206935 tok/s +step 14622/19560 | loss 3.325945 (-0.13z)| norm 0.2951 (+1.74z)| lr 9.59e-05 | 2534.41 ms | 53.3% bf16 MFU | 206932 tok/s +step 14623/19560 | loss 3.303926 (-0.69z)| norm 0.2679 (-0.39z)| lr 9.59e-05 | 2533.86 ms | 53.3% bf16 MFU | 206931 tok/s +step 14624/19560 | loss 3.297170 (-0.86z)| norm 0.2851 (+0.96z)| lr 9.59e-05 | 2536.26 ms | 53.2% bf16 MFU | 206920 tok/s +step 14625/19560 | loss 3.304712 (-0.65z)| norm 0.2899 (+1.30z)| lr 9.58e-05 | 2533.31 ms | 53.3% bf16 MFU | 206922 tok/s +step 14626/19560 | loss 3.273945 (-1.42z)| norm 0.2753 (+0.17z)| lr 9.58e-05 | 2534.98 ms | 53.3% bf16 MFU | 206917 tok/s +step 14627/19560 | loss 3.305771 (-0.61z)| norm 0.2855 (+0.95z)| lr 9.57e-05 | 2533.35 ms | 53.3% bf16 MFU | 206919 tok/s +step 14628/19560 | loss 3.347719 (+0.49z)| norm 0.2647 (-0.67z)| lr 9.57e-05 | 2532.25 ms | 53.3% bf16 MFU | 206925 tok/s +step 14629/19560 | loss 3.325718 (-0.08z)| norm 0.2744 (+0.11z)| lr 9.57e-05 | 2531.36 ms | 53.3% bf16 MFU | 206935 tok/s +step 14630/19560 | loss 3.336766 (+0.21z)| norm 0.2776 (+0.37z)| lr 9.56e-05 | 2534.04 ms | 53.3% bf16 MFU | 206933 tok/s +step 14631/19560 | loss 3.304011 (-0.67z)| norm 0.2673 (-0.46z)| lr 9.56e-05 | 2531.81 ms | 53.3% bf16 MFU | 206940 tok/s +step 14632/19560 | loss 3.317157 (-0.32z)| norm 0.2794 (+0.50z)| lr 9.56e-05 | 2533.50 ms | 53.3% bf16 MFU | 206940 tok/s +step 14633/19560 | loss 3.291877 (-0.98z)| norm 0.2777 (+0.37z)| lr 9.55e-05 | 2531.95 ms | 53.3% bf16 MFU | 206947 tok/s +step 14634/19560 | loss 3.350578 (+0.57z)| norm 0.2645 (-0.69z)| lr 9.55e-05 | 2533.92 ms | 53.3% bf16 MFU | 206945 tok/s +step 14635/19560 | loss 3.357892 (+0.75z)| norm 0.2827 (+0.77z)| lr 9.55e-05 | 2535.17 ms | 53.3% bf16 MFU | 206938 tok/s +step 14636/19560 | loss 3.306974 (-0.61z)| norm 0.2814 (+0.67z)| lr 9.54e-05 | 2533.54 ms | 53.3% bf16 MFU | 206938 tok/s +step 14637/19560 | loss 3.358096 (+0.74z)| norm 0.2639 (-0.73z)| lr 9.54e-05 | 2533.91 ms | 53.3% bf16 MFU | 206936 tok/s +step 14638/19560 | loss 3.317620 (-0.33z)| norm 0.2674 (-0.43z)| lr 9.53e-05 | 2534.31 ms | 53.3% bf16 MFU | 206933 tok/s +step 14639/19560 | loss 3.317917 (-0.34z)| norm 0.2668 (-0.48z)| lr 9.53e-05 | 2533.14 ms | 53.3% bf16 MFU | 206935 tok/s +step 14640/19560 | loss 3.529430 (+4.92z)| norm 0.2964 (+1.89z)| lr 9.53e-05 | 2533.08 ms | 53.3% bf16 MFU | 206937 tok/s +step 14641/19560 | loss 3.394768 (+1.55z)| norm 0.2812 (+0.67z)| lr 9.52e-05 | 2532.84 ms | 53.3% bf16 MFU | 206940 tok/s +step 14642/19560 | loss 3.289204 (-1.04z)| norm 0.2689 (-0.32z)| lr 9.52e-05 | 2533.39 ms | 53.3% bf16 MFU | 206941 tok/s +step 14643/19560 | loss 3.280509 (-1.24z)| norm 0.2792 (+0.50z)| lr 9.52e-05 | 2534.17 ms | 53.3% bf16 MFU | 206938 tok/s +step 14644/19560 | loss 3.329597 (-0.04z)| norm 0.2586 (-1.14z)| lr 9.51e-05 | 2533.16 ms | 53.3% bf16 MFU | 206940 tok/s +step 14645/19560 | loss 3.370593 (+0.95z)| norm 0.2799 (+0.55z)| lr 9.51e-05 | 2532.50 ms | 53.3% bf16 MFU | 206944 tok/s +step 14646/19560 | loss 3.315959 (-0.37z)| norm 0.2809 (+0.63z)| lr 9.51e-05 | 2533.21 ms | 53.3% bf16 MFU | 206945 tok/s +step 14647/19560 | loss 3.276645 (-1.32z)| norm 0.2739 (+0.07z)| lr 9.50e-05 | 2533.15 ms | 53.3% bf16 MFU | 206946 tok/s +step 14648/19560 | loss 3.352352 (+0.52z)| norm 0.2757 (+0.21z)| lr 9.50e-05 | 2533.66 ms | 53.3% bf16 MFU | 206945 tok/s +step 14649/19560 | loss 3.369460 (+0.92z)| norm 0.2924 (+1.51z)| lr 9.49e-05 | 2536.28 ms | 53.2% bf16 MFU | 206934 tok/s +step 14650/19560 | loss 3.306904 (-0.60z)| norm 0.2925 (+1.50z)| lr 9.49e-05 | 2534.23 ms | 53.3% bf16 MFU | 206931 tok/s +step 14651/19560 | loss 3.401146 (+1.67z)| norm 0.2597 (-1.09z)| lr 9.49e-05 | 2534.34 ms | 53.3% bf16 MFU | 206928 tok/s +step 14652/19560 | loss 3.344382 (+0.30z)| norm 0.2769 (+0.26z)| lr 9.48e-05 | 2533.43 ms | 53.3% bf16 MFU | 206929 tok/s +step 14653/19560 | loss 3.349635 (+0.41z)| norm 0.2722 (-0.12z)| lr 9.48e-05 | 2533.28 ms | 53.3% bf16 MFU | 206931 tok/s +step 14654/19560 | loss 3.325172 (-0.17z)| norm 0.2658 (-0.63z)| lr 9.48e-05 | 2532.33 ms | 53.3% bf16 MFU | 206936 tok/s +step 14655/19560 | loss 3.357501 (+0.61z)| norm 0.2882 (+1.14z)| lr 9.47e-05 | 2533.99 ms | 53.3% bf16 MFU | 206935 tok/s +step 14656/19560 | loss 3.326873 (-0.13z)| norm 0.2735 (-0.04z)| lr 9.47e-05 | 2532.42 ms | 53.3% bf16 MFU | 206939 tok/s +step 14657/19560 | loss 3.336500 (+0.09z)| norm 0.2989 (+1.95z)| lr 9.47e-05 | 2533.11 ms | 53.3% bf16 MFU | 206941 tok/s +step 14658/19560 | loss 3.348224 (+0.37z)| norm 0.3223 (+3.57z)| lr 9.46e-05 | 2531.51 ms | 53.3% bf16 MFU | 206949 tok/s +step 14659/19560 | loss 3.409100 (+1.82z)| norm 0.2915 (+1.24z)| lr 9.46e-05 | 2531.64 ms | 53.3% bf16 MFU | 206957 tok/s +step 14660/19560 | loss 3.437217 (+2.45z)| norm 0.2844 (+0.70z)| lr 9.45e-05 | 2531.45 ms | 53.3% bf16 MFU | 206964 tok/s +step 14661/19560 | loss 3.406447 (+1.69z)| norm 0.2893 (+1.06z)| lr 9.45e-05 | 2531.84 ms | 53.3% bf16 MFU | 206970 tok/s +step 14662/19560 | loss 3.357773 (+0.54z)| norm 0.2743 (-0.07z)| lr 9.45e-05 | 2532.76 ms | 53.3% bf16 MFU | 206972 tok/s +step 14663/19560 | loss 3.284954 (-1.15z)| norm 0.2824 (+0.52z)| lr 9.44e-05 | 2532.30 ms | 53.3% bf16 MFU | 206975 tok/s +step 14664/19560 | loss 3.309783 (-0.58z)| norm 0.2856 (+0.80z)| lr 9.44e-05 | 2534.35 ms | 53.3% bf16 MFU | 206970 tok/s +step 14665/19560 | loss 3.278102 (-1.31z)| norm 0.2782 (+0.21z)| lr 9.44e-05 | 2531.50 ms | 53.3% bf16 MFU | 206977 tok/s +step 14666/19560 | loss 3.323827 (-0.24z)| norm 0.2899 (+1.12z)| lr 9.43e-05 | 2532.85 ms | 53.3% bf16 MFU | 206978 tok/s +step 14667/19560 | loss 3.344900 (+0.26z)| norm 0.2646 (-0.87z)| lr 9.43e-05 | 2532.33 ms | 53.3% bf16 MFU | 206981 tok/s +step 14668/19560 | loss 3.291905 (-0.99z)| norm 0.2847 (+0.70z)| lr 9.43e-05 | 2531.08 ms | 53.3% bf16 MFU | 206989 tok/s +step 14669/19560 | loss 3.293427 (-0.94z)| norm 0.2718 (-0.33z)| lr 9.42e-05 | 2532.39 ms | 53.3% bf16 MFU | 206991 tok/s +step 14670/19560 | loss 3.379187 (+1.06z)| norm 0.2678 (-0.63z)| lr 9.42e-05 | 2531.29 ms | 53.3% bf16 MFU | 206997 tok/s +step 14671/19560 | loss 3.331717 (-0.05z)| norm 0.2782 (+0.19z)| lr 9.41e-05 | 2533.07 ms | 53.3% bf16 MFU | 206996 tok/s +step 14672/19560 | loss 3.316279 (-0.40z)| norm 0.2869 (+0.88z)| lr 9.41e-05 | 2533.80 ms | 53.3% bf16 MFU | 206992 tok/s +step 14673/19560 | loss 3.369870 (+0.85z)| norm 0.2686 (-0.57z)| lr 9.41e-05 | 2532.57 ms | 53.3% bf16 MFU | 206994 tok/s +step 14674/19560 | loss 3.300813 (-0.79z)| norm 0.2684 (-0.59z)| lr 9.40e-05 | 2533.14 ms | 53.3% bf16 MFU | 206993 tok/s +step 14675/19560 | loss 3.353293 (+0.45z)| norm 0.2617 (-1.11z)| lr 9.40e-05 | 2534.41 ms | 53.3% bf16 MFU | 206986 tok/s +step 14676/19560 | loss 3.276455 (-1.38z)| norm 0.2532 (-1.77z)| lr 9.40e-05 | 2532.40 ms | 53.3% bf16 MFU | 206989 tok/s +step 14677/19560 | loss 3.326076 (-0.19z)| norm 0.2688 (-0.53z)| lr 9.39e-05 | 2533.17 ms | 53.3% bf16 MFU | 206988 tok/s +step 14678/19560 | loss 3.308598 (-0.60z)| norm 0.2520 (-1.81z)| lr 9.39e-05 | 2533.88 ms | 53.3% bf16 MFU | 206984 tok/s +step 14679/19560 | loss 3.384118 (+1.19z)| norm 0.2692 (-0.48z)| lr 9.39e-05 | 2531.53 ms | 53.3% bf16 MFU | 206990 tok/s +step 14680/19560 | loss 3.292737 (-0.98z)| norm 0.2614 (-1.07z)| lr 9.38e-05 | 2532.75 ms | 53.3% bf16 MFU | 206990 tok/s +step 14681/19560 | loss 3.316309 (-0.41z)| norm 0.2515 (-1.80z)| lr 9.38e-05 | 2534.03 ms | 53.3% bf16 MFU | 206986 tok/s +step 14682/19560 | loss 3.366596 (+0.78z)| norm 0.2594 (-1.17z)| lr 9.37e-05 | 2533.19 ms | 53.3% bf16 MFU | 206985 tok/s +step 14683/19560 | loss 3.321197 (-0.30z)| norm 0.2612 (-1.02z)| lr 9.37e-05 | 2535.61 ms | 53.2% bf16 MFU | 206974 tok/s +step 14684/19560 | loss 3.276972 (-1.36z)| norm 0.2573 (-1.30z)| lr 9.37e-05 | 2532.99 ms | 53.3% bf16 MFU | 206975 tok/s +step 14685/19560 | loss 3.345713 (+0.30z)| norm 0.2522 (-1.69z)| lr 9.36e-05 | 2532.53 ms | 53.3% bf16 MFU | 206977 tok/s +step 14686/19560 | loss 3.303267 (-0.75z)| norm 0.2538 (-1.55z)| lr 9.36e-05 | 2532.12 ms | 53.3% bf16 MFU | 206981 tok/s +step 14687/19560 | loss 3.324168 (-0.23z)| norm 0.2548 (-1.44z)| lr 9.36e-05 | 2533.15 ms | 53.3% bf16 MFU | 206980 tok/s +step 14688/19560 | loss 3.315486 (-0.44z)| norm 0.2586 (-1.13z)| lr 9.35e-05 | 2533.13 ms | 53.3% bf16 MFU | 206980 tok/s +step 14689/19560 | loss 3.329884 (-0.10z)| norm 0.2549 (-1.42z)| lr 9.35e-05 | 2531.72 ms | 53.3% bf16 MFU | 206985 tok/s +step 14690/19560 | loss 3.330516 (-0.08z)| norm 0.2725 (-0.04z)| lr 9.35e-05 | 2532.55 ms | 53.3% bf16 MFU | 206987 tok/s +step 14691/19560 | loss 3.361733 (+0.68z)| norm 0.2690 (-0.30z)| lr 9.34e-05 | 2533.61 ms | 53.3% bf16 MFU | 206984 tok/s +step 14692/19560 | loss 3.317234 (-0.42z)| norm 0.2483 (-1.91z)| lr 9.34e-05 | 2533.97 ms | 53.3% bf16 MFU | 206980 tok/s +step 14693/19560 | loss 3.298001 (-0.89z)| norm 0.2858 (+1.00z)| lr 9.33e-05 | 2532.62 ms | 53.3% bf16 MFU | 206982 tok/s +step 14694/19560 | loss 3.290370 (-1.07z)| norm 0.2859 (+1.00z)| lr 9.33e-05 | 2531.95 ms | 53.3% bf16 MFU | 206986 tok/s +step 14695/19560 | loss 3.347705 (+0.35z)| norm 0.2766 (+0.27z)| lr 9.33e-05 | 2532.37 ms | 53.3% bf16 MFU | 206989 tok/s +step 14696/19560 | loss 3.291428 (-1.03z)| norm 0.2658 (-0.58z)| lr 9.32e-05 | 2532.63 ms | 53.3% bf16 MFU | 206990 tok/s +step 14697/19560 | loss 3.357189 (+0.59z)| norm 0.2842 (+0.86z)| lr 9.32e-05 | 2531.80 ms | 53.3% bf16 MFU | 206995 tok/s +step 14698/19560 | loss 3.334443 (+0.03z)| norm 0.2709 (-0.18z)| lr 9.32e-05 | 2533.22 ms | 53.3% bf16 MFU | 206993 tok/s +step 14699/19560 | loss 3.364855 (+0.77z)| norm 0.2768 (+0.27z)| lr 9.31e-05 | 2533.37 ms | 53.3% bf16 MFU | 206991 tok/s +step 14700/19560 | loss 3.413824 (+1.95z)| norm 0.2819 (+0.68z)| lr 9.31e-05 | 2533.12 ms | 53.3% bf16 MFU | 206990 tok/s +step 14701/19560 | loss 3.392716 (+1.43z)| norm 0.2768 (+0.26z)| lr 9.31e-05 | 2532.78 ms | 53.3% bf16 MFU | 206991 tok/s +step 14702/19560 | loss 3.351709 (+0.41z)| norm 0.2680 (-0.44z)| lr 9.30e-05 | 2535.22 ms | 53.3% bf16 MFU | 206981 tok/s +step 14703/19560 | loss 3.373765 (+0.95z)| norm 0.2738 (+0.02z)| lr 9.30e-05 | 2531.77 ms | 53.3% bf16 MFU | 206986 tok/s +step 14704/19560 | loss 3.340691 (+0.12z)| norm 0.2652 (-0.66z)| lr 9.29e-05 | 2533.02 ms | 53.3% bf16 MFU | 206986 tok/s +step 14705/19560 | loss 3.307683 (-0.72z)| norm 0.2682 (-0.43z)| lr 9.29e-05 | 2533.57 ms | 53.3% bf16 MFU | 206984 tok/s +step 14706/19560 | loss 3.331478 (-0.10z)| norm 0.2841 (+0.84z)| lr 9.29e-05 | 2535.06 ms | 53.3% bf16 MFU | 206975 tok/s +step 14707/19560 | loss 3.283523 (-1.31z)| norm 0.2687 (-0.39z)| lr 9.28e-05 | 2534.02 ms | 53.3% bf16 MFU | 206971 tok/s +step 14708/19560 | loss 3.355083 (+0.50z)| norm 0.2800 (+0.50z)| lr 9.28e-05 | 2533.68 ms | 53.3% bf16 MFU | 206969 tok/s +step 14709/19560 | loss 3.330490 (-0.11z)| norm 0.2773 (+0.30z)| lr 9.28e-05 | 2535.40 ms | 53.3% bf16 MFU | 206960 tok/s +step 14710/19560 | loss 3.322864 (-0.31z)| norm 0.2518 (-1.75z)| lr 9.27e-05 | 2535.45 ms | 53.3% bf16 MFU | 206951 tok/s +step 14711/19560 | loss 3.346353 (+0.29z)| norm 0.2671 (-0.52z)| lr 9.27e-05 | 2532.87 ms | 53.3% bf16 MFU | 206953 tok/s +step 14712/19560 | loss 3.380495 (+1.18z)| norm 0.2745 (+0.07z)| lr 9.27e-05 | 2533.78 ms | 53.3% bf16 MFU | 206952 tok/s +step 14713/19560 | loss 3.366707 (+0.81z)| norm 0.2679 (-0.46z)| lr 9.26e-05 | 2534.71 ms | 53.3% bf16 MFU | 206946 tok/s +step 14714/19560 | loss 3.329241 (-0.17z)| norm 0.2725 (-0.08z)| lr 9.26e-05 | 2531.68 ms | 53.3% bf16 MFU | 206954 tok/s +step 14715/19560 | loss 3.342865 (+0.18z)| norm 0.2542 (-1.53z)| lr 9.25e-05 | 2535.03 ms | 53.3% bf16 MFU | 206947 tok/s +step 14716/19560 | loss 3.241360 (-2.40z)| norm 0.2775 (+0.32z)| lr 9.25e-05 | 2533.04 ms | 53.3% bf16 MFU | 206948 tok/s +step 14717/19560 | loss 3.355453 (+0.52z)| norm 0.2644 (-0.72z)| lr 9.25e-05 | 2531.97 ms | 53.3% bf16 MFU | 206954 tok/s +step 14718/19560 | loss 3.432222 (+2.42z)| norm 0.2905 (+1.37z)| lr 9.24e-05 | 2531.42 ms | 53.3% bf16 MFU | 206962 tok/s +step 14719/19560 | loss 3.338816 (+0.06z)| norm 0.2799 (+0.51z)| lr 9.24e-05 | 2531.63 ms | 53.3% bf16 MFU | 206969 tok/s +step 14720/19560 | loss 3.303275 (-0.82z)| norm 0.2644 (-0.73z)| lr 9.24e-05 | 2531.91 ms | 53.3% bf16 MFU | 206974 tok/s +step 14721/19560 | loss 3.331630 (-0.12z)| norm 0.2618 (-0.95z)| lr 9.23e-05 | 2531.46 ms | 53.3% bf16 MFU | 206981 tok/s +step 14722/19560 | loss 3.302209 (-0.87z)| norm 0.2986 (+1.98z)| lr 9.23e-05 | 2532.72 ms | 53.3% bf16 MFU | 206982 tok/s +step 14723/19560 | loss 3.342900 (+0.16z)| norm 0.2710 (-0.22z)| lr 9.23e-05 | 2533.24 ms | 53.3% bf16 MFU | 206981 tok/s +step 14724/19560 | loss 3.319585 (-0.43z)| norm 0.2709 (-0.24z)| lr 9.22e-05 | 2532.04 ms | 53.3% bf16 MFU | 206985 tok/s +step 14725/19560 | loss 3.387367 (+1.28z)| norm 0.2837 (+0.79z)| lr 9.22e-05 | 2533.59 ms | 53.3% bf16 MFU | 206983 tok/s +step 14726/19560 | loss 3.324159 (-0.33z)| norm 0.2509 (-1.82z)| lr 9.22e-05 | 2533.01 ms | 53.3% bf16 MFU | 206983 tok/s +step 14727/19560 | loss 3.343825 (+0.17z)| norm 0.2584 (-1.20z)| lr 9.21e-05 | 2531.13 ms | 53.3% bf16 MFU | 206990 tok/s +step 14728/19560 | loss 3.362610 (+0.67z)| norm 0.2776 (+0.34z)| lr 9.21e-05 | 2531.12 ms | 53.3% bf16 MFU | 206997 tok/s +step 14729/19560 | loss 3.309387 (-0.70z)| norm 0.2461 (-2.14z)| lr 9.20e-05 | 2530.42 ms | 53.4% bf16 MFU | 207007 tok/s +step 14730/19560 | loss 3.386612 (+1.28z)| norm 0.2552 (-1.40z)| lr 9.20e-05 | 2532.64 ms | 53.3% bf16 MFU | 207008 tok/s +step 14731/19560 | loss 3.320848 (-0.41z)| norm 0.2586 (-1.11z)| lr 9.20e-05 | 2533.45 ms | 53.3% bf16 MFU | 207004 tok/s +step 14732/19560 | loss 3.343902 (+0.18z)| norm 0.2724 (-0.05z)| lr 9.19e-05 | 2533.61 ms | 53.3% bf16 MFU | 207001 tok/s +step 14733/19560 | loss 3.310915 (-0.66z)| norm 0.2713 (-0.13z)| lr 9.19e-05 | 2531.25 ms | 53.3% bf16 MFU | 207007 tok/s +step 14734/19560 | loss 3.397848 (+1.54z)| norm 0.2760 (+0.23z)| lr 9.19e-05 | 2534.03 ms | 53.3% bf16 MFU | 207002 tok/s +step 14735/19560 | loss 3.329496 (-0.20z)| norm 0.2693 (-0.30z)| lr 9.18e-05 | 2533.73 ms | 53.3% bf16 MFU | 206998 tok/s +step 14736/19560 | loss 3.334698 (-0.06z)| norm 0.2566 (-1.29z)| lr 9.18e-05 | 2535.36 ms | 53.3% bf16 MFU | 206987 tok/s +step 14737/19560 | loss 3.429340 (+2.28z)| norm 0.2886 (+1.22z)| lr 9.18e-05 | 2532.07 ms | 53.3% bf16 MFU | 206991 tok/s +step 14738/19560 | loss 3.372609 (+0.86z)| norm 0.3045 (+2.38z)| lr 9.17e-05 | 2532.84 ms | 53.3% bf16 MFU | 206991 tok/s +step 14739/19560 | loss 3.311381 (-0.66z)| norm 0.2666 (-0.51z)| lr 9.17e-05 | 2532.25 ms | 53.3% bf16 MFU | 206994 tok/s +step 14740/19560 | loss 3.278704 (-1.45z)| norm 0.2638 (-0.72z)| lr 9.16e-05 | 2534.27 ms | 53.3% bf16 MFU | 206988 tok/s +step 14741/19560 | loss 3.430209 (+2.22z)| norm 0.3022 (+2.16z)| lr 9.16e-05 | 2535.30 ms | 53.3% bf16 MFU | 206979 tok/s +step 14742/19560 | loss 3.466749 (+2.98z)| norm 0.2803 (+0.51z)| lr 9.16e-05 | 2533.69 ms | 53.3% bf16 MFU | 206976 tok/s +step 14743/19560 | loss 3.307860 (-0.71z)| norm 0.2732 (-0.02z)| lr 9.15e-05 | 2535.19 ms | 53.3% bf16 MFU | 206967 tok/s +step 14744/19560 | loss 3.381043 (+0.99z)| norm 0.2726 (-0.07z)| lr 9.15e-05 | 2534.63 ms | 53.3% bf16 MFU | 206962 tok/s +step 14745/19560 | loss 3.316659 (-0.50z)| norm 0.2677 (-0.43z)| lr 9.15e-05 | 2534.24 ms | 53.3% bf16 MFU | 206958 tok/s +step 14746/19560 | loss 3.354183 (+0.38z)| norm 0.2769 (+0.29z)| lr 9.14e-05 | 2532.50 ms | 53.3% bf16 MFU | 206961 tok/s +step 14747/19560 | loss 3.245301 (-2.12z)| norm 0.2780 (+0.37z)| lr 9.14e-05 | 2533.58 ms | 53.3% bf16 MFU | 206960 tok/s +step 14748/19560 | loss 3.357475 (+0.46z)| norm 0.3170 (+3.23z)| lr 9.14e-05 | 2533.84 ms | 53.3% bf16 MFU | 206957 tok/s +step 14749/19560 | loss 3.358060 (+0.48z)| norm 0.2705 (-0.23z)| lr 9.13e-05 | 2532.73 ms | 53.3% bf16 MFU | 206960 tok/s +step 14750/19560 | loss 3.283236 (-1.25z)| norm 0.2827 (+0.69z)| lr 9.13e-05 | 2532.91 ms | 53.3% bf16 MFU | 206961 tok/s +val loss 3.317486 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3022/10042 = 0.300936 +step 14751/19560 | loss 3.376992 (+0.90z)| norm 0.2627 (-0.81z)| lr 9.13e-05 | 2534.60 ms | 53.3% bf16 MFU | 206956 tok/s +step 14752/19560 | loss 3.303753 (-0.79z)| norm 0.2597 (-1.02z)| lr 9.12e-05 | 2533.32 ms | 53.3% bf16 MFU | 206956 tok/s +step 14753/19560 | loss 3.329260 (-0.21z)| norm 0.2815 (+0.62z)| lr 9.12e-05 | 2533.00 ms | 53.3% bf16 MFU | 206957 tok/s +step 14754/19560 | loss 3.289731 (-1.13z)| norm 0.2661 (-0.53z)| lr 9.11e-05 | 2531.81 ms | 53.3% bf16 MFU | 206963 tok/s +step 14755/19560 | loss 3.242692 (-2.17z)| norm 0.2679 (-0.39z)| lr 9.11e-05 | 2532.41 ms | 53.3% bf16 MFU | 206967 tok/s +step 14756/19560 | loss 3.414748 (+1.73z)| norm 0.2667 (-0.48z)| lr 9.11e-05 | 2533.15 ms | 53.3% bf16 MFU | 206967 tok/s +step 14757/19560 | loss 3.349525 (+0.25z)| norm 0.2641 (-0.67z)| lr 9.10e-05 | 2531.48 ms | 53.3% bf16 MFU | 206974 tok/s +step 14758/19560 | loss 3.265125 (-1.63z)| norm 0.2527 (-1.50z)| lr 9.10e-05 | 2534.57 ms | 53.3% bf16 MFU | 206968 tok/s +step 14759/19560 | loss 3.354108 (+0.35z)| norm 0.2714 (-0.10z)| lr 9.10e-05 | 2531.94 ms | 53.3% bf16 MFU | 206973 tok/s +step 14760/19560 | loss 3.302584 (-0.80z)| norm 0.2580 (-1.09z)| lr 9.09e-05 | 2534.95 ms | 53.3% bf16 MFU | 206966 tok/s +step 14761/19560 | loss 3.283350 (-1.22z)| norm 0.4382 (+8.30z)| lr 9.09e-05 | 2533.44 ms | 53.3% bf16 MFU | 206965 tok/s +step 14762/19560 | loss 3.314275 (-0.53z)| norm 0.2730 (-0.05z)| lr 9.09e-05 | 2533.74 ms | 53.3% bf16 MFU | 206963 tok/s +step 14763/19560 | loss 3.331623 (-0.13z)| norm 0.2752 (+0.07z)| lr 9.08e-05 | 2530.96 ms | 53.3% bf16 MFU | 206972 tok/s +step 14764/19560 | loss 3.247983 (-1.96z)| norm 0.2590 (-0.74z)| lr 9.08e-05 | 2534.14 ms | 53.3% bf16 MFU | 206968 tok/s +step 14765/19560 | loss 3.331881 (-0.11z)| norm 0.2607 (-0.66z)| lr 9.07e-05 | 2533.35 ms | 53.3% bf16 MFU | 206967 tok/s +step 14766/19560 | loss 3.403629 (+1.44z)| norm 0.2685 (-0.26z)| lr 9.07e-05 | 2533.61 ms | 53.3% bf16 MFU | 206965 tok/s +step 14767/19560 | loss 3.321831 (-0.35z)| norm 0.2786 (+0.24z)| lr 9.07e-05 | 2532.86 ms | 53.3% bf16 MFU | 206967 tok/s +step 14768/19560 | loss 3.335186 (-0.02z)| norm 0.2657 (-0.40z)| lr 9.06e-05 | 2532.66 ms | 53.3% bf16 MFU | 206969 tok/s +step 14769/19560 | loss 3.422562 (+2.02z)| norm 0.2616 (-0.60z)| lr 9.06e-05 | 2533.36 ms | 53.3% bf16 MFU | 206968 tok/s +step 14770/19560 | loss 3.278429 (-1.35z)| norm 0.2888 (+0.77z)| lr 9.06e-05 | 2533.01 ms | 53.3% bf16 MFU | 206969 tok/s +step 14771/19560 | loss 3.388933 (+1.21z)| norm 0.2628 (-0.54z)| lr 9.05e-05 | 2535.38 ms | 53.3% bf16 MFU | 206960 tok/s +step 14772/19560 | loss 3.314494 (-0.52z)| norm 0.2773 (+0.19z)| lr 9.05e-05 | 2533.73 ms | 53.3% bf16 MFU | 206958 tok/s +step 14773/19560 | loss 3.372409 (+0.83z)| norm 0.2778 (+0.21z)| lr 9.05e-05 | 2536.17 ms | 53.2% bf16 MFU | 206946 tok/s +step 14774/19560 | loss 3.341246 (+0.09z)| norm 0.2739 (+0.02z)| lr 9.04e-05 | 2534.70 ms | 53.3% bf16 MFU | 206941 tok/s +step 14775/19560 | loss 3.301332 (-0.85z)| norm 0.2822 (+0.44z)| lr 9.04e-05 | 2533.61 ms | 53.3% bf16 MFU | 206941 tok/s +step 14776/19560 | loss 3.304111 (-0.77z)| norm 0.2589 (-0.73z)| lr 9.04e-05 | 2533.89 ms | 53.3% bf16 MFU | 206939 tok/s +step 14777/19560 | loss 3.341230 (+0.10z)| norm 0.2756 (+0.12z)| lr 9.03e-05 | 2534.43 ms | 53.3% bf16 MFU | 206936 tok/s +step 14778/19560 | loss 3.393605 (+1.31z)| norm 0.2916 (+0.93z)| lr 9.03e-05 | 2534.42 ms | 53.3% bf16 MFU | 206932 tok/s +step 14779/19560 | loss 3.355001 (+0.42z)| norm 0.2760 (+0.13z)| lr 9.02e-05 | 2533.50 ms | 53.3% bf16 MFU | 206933 tok/s +step 14780/19560 | loss 3.307583 (-0.69z)| norm 0.2972 (+1.19z)| lr 9.02e-05 | 2533.35 ms | 53.3% bf16 MFU | 206934 tok/s +step 14781/19560 | loss 3.323395 (-0.31z)| norm 0.2870 (+0.67z)| lr 9.02e-05 | 2533.71 ms | 53.3% bf16 MFU | 206933 tok/s +step 14782/19560 | loss 3.311801 (-0.58z)| norm 0.2638 (-0.50z)| lr 9.01e-05 | 2534.19 ms | 53.3% bf16 MFU | 206931 tok/s +step 14783/19560 | loss 3.427810 (+2.10z)| norm 0.2734 (-0.01z)| lr 9.01e-05 | 2532.07 ms | 53.3% bf16 MFU | 206937 tok/s +step 14784/19560 | loss 3.272716 (-1.46z)| norm 0.2795 (+0.30z)| lr 9.01e-05 | 2532.76 ms | 53.3% bf16 MFU | 206941 tok/s +step 14785/19560 | loss 3.264780 (-1.62z)| norm 0.2595 (-0.70z)| lr 9.00e-05 | 2534.29 ms | 53.3% bf16 MFU | 206937 tok/s +step 14786/19560 | loss 3.302536 (-0.75z)| norm 0.2833 (+0.54z)| lr 9.00e-05 | 2534.15 ms | 53.3% bf16 MFU | 206935 tok/s +step 14787/19560 | loss 3.290938 (-1.00z)| norm 0.2691 (-0.19z)| lr 9.00e-05 | 2534.89 ms | 53.3% bf16 MFU | 206930 tok/s +step 14788/19560 | loss 3.357114 (+0.53z)| norm 0.2609 (-0.61z)| lr 8.99e-05 | 2533.98 ms | 53.3% bf16 MFU | 206928 tok/s +step 14789/19560 | loss 3.352128 (+0.43z)| norm 0.2746 (+0.11z)| lr 8.99e-05 | 2533.89 ms | 53.3% bf16 MFU | 206927 tok/s +step 14790/19560 | loss 3.326777 (-0.16z)| norm 0.2755 (+0.15z)| lr 8.99e-05 | 2532.82 ms | 53.3% bf16 MFU | 206931 tok/s +step 14791/19560 | loss 3.261127 (-1.69z)| norm 0.2748 (+0.12z)| lr 8.98e-05 | 2534.31 ms | 53.3% bf16 MFU | 206928 tok/s +step 14792/19560 | loss 3.270741 (-1.45z)| norm 0.2674 (-0.26z)| lr 8.98e-05 | 2532.01 ms | 53.3% bf16 MFU | 206935 tok/s +step 14793/19560 | loss 3.271096 (-1.43z)| norm 0.2984 (+1.35z)| lr 8.97e-05 | 2533.08 ms | 53.3% bf16 MFU | 206937 tok/s +step 14794/19560 | loss 3.319329 (-0.32z)| norm 0.2577 (-0.76z)| lr 8.97e-05 | 2532.10 ms | 53.3% bf16 MFU | 206943 tok/s +step 14795/19560 | loss 3.335952 (+0.07z)| norm 0.2498 (-1.16z)| lr 8.97e-05 | 2533.04 ms | 53.3% bf16 MFU | 206945 tok/s +step 14796/19560 | loss 3.317964 (-0.35z)| norm 0.2770 (+0.25z)| lr 8.96e-05 | 2531.60 ms | 53.3% bf16 MFU | 206952 tok/s +step 14797/19560 | loss 3.368073 (+0.80z)| norm 0.2625 (-0.49z)| lr 8.96e-05 | 2532.01 ms | 53.3% bf16 MFU | 206958 tok/s +step 14798/19560 | loss 3.303230 (-0.70z)| norm 0.2662 (-0.30z)| lr 8.96e-05 | 2531.70 ms | 53.3% bf16 MFU | 206965 tok/s +step 14799/19560 | loss 3.469888 (+3.05z)| norm 0.2788 (+0.35z)| lr 8.95e-05 | 2531.17 ms | 53.3% bf16 MFU | 206973 tok/s +step 14800/19560 | loss 3.316156 (-0.40z)| norm 0.2695 (-0.12z)| lr 8.95e-05 | 2533.37 ms | 53.3% bf16 MFU | 206972 tok/s +step 14801/19560 | loss 3.309189 (-0.55z)| norm 0.2691 (-0.14z)| lr 8.95e-05 | 2531.86 ms | 53.3% bf16 MFU | 206977 tok/s +step 14802/19560 | loss 3.345493 (+0.26z)| norm 0.2514 (-1.05z)| lr 8.94e-05 | 2533.28 ms | 53.3% bf16 MFU | 206976 tok/s +step 14803/19560 | loss 3.402084 (+1.51z)| norm 0.3890 (+5.31z)| lr 8.94e-05 | 2533.42 ms | 53.3% bf16 MFU | 206975 tok/s +step 14804/19560 | loss 3.399913 (+1.44z)| norm 0.3093 (+1.64z)| lr 8.94e-05 | 2533.52 ms | 53.3% bf16 MFU | 206973 tok/s +step 14805/19560 | loss 3.290822 (-0.98z)| norm 0.2855 (+0.55z)| lr 8.93e-05 | 2533.97 ms | 53.3% bf16 MFU | 206970 tok/s +step 14806/19560 | loss 3.372828 (+0.83z)| norm 0.2863 (+0.58z)| lr 8.93e-05 | 2532.63 ms | 53.3% bf16 MFU | 206972 tok/s +step 14807/19560 | loss 3.290754 (-0.98z)| norm 0.2725 (-0.05z)| lr 8.93e-05 | 2534.49 ms | 53.3% bf16 MFU | 206966 tok/s +step 14808/19560 | loss 3.331831 (-0.07z)| norm 0.2874 (+0.62z)| lr 8.92e-05 | 2533.16 ms | 53.3% bf16 MFU | 206967 tok/s +step 14809/19560 | loss 3.336471 (+0.03z)| norm 0.2664 (-0.34z)| lr 8.92e-05 | 2535.81 ms | 53.2% bf16 MFU | 206956 tok/s +step 14810/19560 | loss 3.334178 (-0.02z)| norm 0.2870 (+0.59z)| lr 8.91e-05 | 2532.23 ms | 53.3% bf16 MFU | 206960 tok/s +step 14811/19560 | loss 3.349658 (+0.32z)| norm 0.2786 (+0.20z)| lr 8.91e-05 | 2534.02 ms | 53.3% bf16 MFU | 206957 tok/s +step 14812/19560 | loss 3.313615 (-0.49z)| norm 0.2503 (-1.09z)| lr 8.91e-05 | 2533.42 ms | 53.3% bf16 MFU | 206957 tok/s +step 14813/19560 | loss 3.357259 (+0.49z)| norm 0.2800 (+0.25z)| lr 8.90e-05 | 2533.70 ms | 53.3% bf16 MFU | 206955 tok/s +step 14814/19560 | loss 3.330251 (-0.13z)| norm 0.2596 (-0.68z)| lr 8.90e-05 | 2532.87 ms | 53.3% bf16 MFU | 206957 tok/s +step 14815/19560 | loss 3.228811 (-2.34z)| norm 0.2483 (-1.19z)| lr 8.90e-05 | 2533.99 ms | 53.3% bf16 MFU | 206955 tok/s +step 14816/19560 | loss 3.334780 (-0.01z)| norm 0.2751 (+0.03z)| lr 8.89e-05 | 2535.48 ms | 53.3% bf16 MFU | 206946 tok/s +step 14817/19560 | loss 3.306476 (-0.63z)| norm 0.2702 (-0.20z)| lr 8.89e-05 | 2531.33 ms | 53.3% bf16 MFU | 206955 tok/s +step 14818/19560 | loss 3.316267 (-0.41z)| norm 0.2725 (-0.10z)| lr 8.89e-05 | 2534.86 ms | 53.3% bf16 MFU | 206948 tok/s +step 14819/19560 | loss 3.393492 (+1.27z)| norm 0.2702 (-0.20z)| lr 8.88e-05 | 2531.77 ms | 53.3% bf16 MFU | 206955 tok/s +step 14820/19560 | loss 3.355403 (+0.43z)| norm 0.2787 (+0.18z)| lr 8.88e-05 | 2532.13 ms | 53.3% bf16 MFU | 206960 tok/s +step 14821/19560 | loss 3.332037 (-0.08z)| norm 0.2787 (+0.18z)| lr 8.88e-05 | 2534.38 ms | 53.3% bf16 MFU | 206956 tok/s +step 14822/19560 | loss 3.339997 (+0.08z)| norm 0.2804 (+0.26z)| lr 8.87e-05 | 2534.13 ms | 53.3% bf16 MFU | 206952 tok/s +step 14823/19560 | loss 3.276389 (-1.30z)| norm 0.2780 (+0.15z)| lr 8.87e-05 | 2532.90 ms | 53.3% bf16 MFU | 206954 tok/s +step 14824/19560 | loss 3.300410 (-0.77z)| norm 0.2890 (+0.65z)| lr 8.86e-05 | 2533.51 ms | 53.3% bf16 MFU | 206954 tok/s +step 14825/19560 | loss 3.358811 (+0.51z)| norm 0.2731 (-0.08z)| lr 8.86e-05 | 2533.93 ms | 53.3% bf16 MFU | 206951 tok/s +step 14826/19560 | loss 3.298559 (-0.81z)| norm 0.2719 (-0.14z)| lr 8.86e-05 | 2534.39 ms | 53.3% bf16 MFU | 206947 tok/s +step 14827/19560 | loss 3.348211 (+0.28z)| norm 0.2779 (+0.14z)| lr 8.85e-05 | 2533.02 ms | 53.3% bf16 MFU | 206949 tok/s +step 14828/19560 | loss 3.219062 (-2.48z)| norm 0.2819 (+0.32z)| lr 8.85e-05 | 2534.93 ms | 53.3% bf16 MFU | 206943 tok/s +step 14829/19560 | loss 3.330789 (-0.05z)| norm 0.2634 (-0.53z)| lr 8.85e-05 | 2534.35 ms | 53.3% bf16 MFU | 206939 tok/s +step 14830/19560 | loss 3.298969 (-0.74z)| norm 0.2860 (+0.51z)| lr 8.84e-05 | 2532.57 ms | 53.3% bf16 MFU | 206943 tok/s +step 14831/19560 | loss 3.353115 (+0.44z)| norm 0.2831 (+0.37z)| lr 8.84e-05 | 2533.93 ms | 53.3% bf16 MFU | 206941 tok/s +step 14832/19560 | loss 3.303265 (-0.63z)| norm 0.2855 (+0.48z)| lr 8.84e-05 | 2532.12 ms | 53.3% bf16 MFU | 206947 tok/s +step 14833/19560 | loss 3.292291 (-0.87z)| norm 0.2601 (-0.69z)| lr 8.83e-05 | 2533.74 ms | 53.3% bf16 MFU | 206946 tok/s +step 14834/19560 | loss 3.238092 (-2.00z)| norm 0.2800 (+0.23z)| lr 8.83e-05 | 2533.15 ms | 53.3% bf16 MFU | 206947 tok/s +step 14835/19560 | loss 3.292297 (-0.84z)| norm 0.2672 (-0.36z)| lr 8.83e-05 | 2532.77 ms | 53.3% bf16 MFU | 206950 tok/s +step 14836/19560 | loss 3.340072 (+0.18z)| norm 0.2780 (+0.14z)| lr 8.82e-05 | 2534.02 ms | 53.3% bf16 MFU | 206947 tok/s +step 14837/19560 | loss 3.244051 (-1.83z)| norm 0.3087 (+1.53z)| lr 8.82e-05 | 2532.64 ms | 53.3% bf16 MFU | 206951 tok/s +step 14838/19560 | loss 3.377099 (+0.96z)| norm 0.2753 (-0.01z)| lr 8.82e-05 | 2533.27 ms | 53.3% bf16 MFU | 206951 tok/s +step 14839/19560 | loss 3.310229 (-0.44z)| norm 0.2926 (+0.78z)| lr 8.81e-05 | 2532.06 ms | 53.3% bf16 MFU | 206956 tok/s +step 14840/19560 | loss 3.228150 (-2.11z)| norm 0.2896 (+0.63z)| lr 8.81e-05 | 2533.72 ms | 53.3% bf16 MFU | 206955 tok/s +step 14841/19560 | loss 3.321776 (-0.16z)| norm 0.2659 (-0.45z)| lr 8.80e-05 | 2532.71 ms | 53.3% bf16 MFU | 206957 tok/s +step 14842/19560 | loss 3.331589 (+0.04z)| norm 0.2935 (+0.80z)| lr 8.80e-05 | 2532.69 ms | 53.3% bf16 MFU | 206960 tok/s +step 14843/19560 | loss 3.258114 (-1.46z)| norm 0.2757 (-0.02z)| lr 8.80e-05 | 2532.20 ms | 53.3% bf16 MFU | 206964 tok/s +step 14844/19560 | loss 3.296577 (-0.68z)| norm 0.2680 (-0.37z)| lr 8.79e-05 | 2533.56 ms | 53.3% bf16 MFU | 206963 tok/s +step 14845/19560 | loss 3.290824 (-0.79z)| norm 0.2771 (+0.04z)| lr 8.79e-05 | 2533.18 ms | 53.3% bf16 MFU | 206963 tok/s +step 14846/19560 | loss 3.335777 (+0.17z)| norm 0.2728 (-0.15z)| lr 8.79e-05 | 2532.85 ms | 53.3% bf16 MFU | 206965 tok/s +step 14847/19560 | loss 3.352825 (+0.52z)| norm 0.2902 (+0.65z)| lr 8.78e-05 | 2532.78 ms | 53.3% bf16 MFU | 206967 tok/s +step 14848/19560 | loss 3.227457 (-2.09z)| norm 0.2955 (+0.88z)| lr 8.78e-05 | 2530.85 ms | 53.3% bf16 MFU | 206976 tok/s +step 14849/19560 | loss 3.304719 (-0.47z)| norm 0.2707 (-0.26z)| lr 8.78e-05 | 2533.13 ms | 53.3% bf16 MFU | 206976 tok/s +step 14850/19560 | loss 3.336285 (+0.18z)| norm 0.2700 (-0.29z)| lr 8.77e-05 | 2531.07 ms | 53.3% bf16 MFU | 206984 tok/s +step 14851/19560 | loss 3.297795 (-0.61z)| norm 0.2716 (-0.21z)| lr 8.77e-05 | 2533.93 ms | 53.3% bf16 MFU | 206980 tok/s +step 14852/19560 | loss 3.230988 (-1.96z)| norm 0.2645 (-0.53z)| lr 8.77e-05 | 2532.85 ms | 53.3% bf16 MFU | 206981 tok/s +step 14853/19560 | loss 3.243682 (-1.67z)| norm 0.2881 (+0.55z)| lr 8.76e-05 | 2533.58 ms | 53.3% bf16 MFU | 206979 tok/s +step 14854/19560 | loss 3.325331 (-0.00z)| norm 0.2725 (-0.18z)| lr 8.76e-05 | 2532.65 ms | 53.3% bf16 MFU | 206981 tok/s +step 14855/19560 | loss 3.305160 (-0.41z)| norm 0.2551 (-0.98z)| lr 8.76e-05 | 2532.80 ms | 53.3% bf16 MFU | 206982 tok/s +step 14856/19560 | loss 3.297556 (-0.55z)| norm 0.2654 (-0.50z)| lr 8.75e-05 | 2534.58 ms | 53.3% bf16 MFU | 206975 tok/s +step 14857/19560 | loss 3.301382 (-0.47z)| norm 0.2643 (-0.56z)| lr 8.75e-05 | 2532.73 ms | 53.3% bf16 MFU | 206977 tok/s +step 14858/19560 | loss 3.279032 (-0.91z)| norm 0.2672 (-0.43z)| lr 8.74e-05 | 2531.78 ms | 53.3% bf16 MFU | 206982 tok/s +step 14859/19560 | loss 3.338964 (+0.31z)| norm 0.2990 (+1.03z)| lr 8.74e-05 | 2533.31 ms | 53.3% bf16 MFU | 206981 tok/s +step 14860/19560 | loss 3.337904 (+0.29z)| norm 0.2612 (-0.72z)| lr 8.74e-05 | 2532.02 ms | 53.3% bf16 MFU | 206985 tok/s +step 14861/19560 | loss 3.281139 (-0.87z)| norm 0.2685 (-0.38z)| lr 8.73e-05 | 2533.52 ms | 53.3% bf16 MFU | 206983 tok/s +step 14862/19560 | loss 3.333762 (+0.22z)| norm 0.2754 (-0.06z)| lr 8.73e-05 | 2532.54 ms | 53.3% bf16 MFU | 206985 tok/s +step 14863/19560 | loss 3.277661 (-0.92z)| norm 0.2482 (-1.30z)| lr 8.73e-05 | 2532.92 ms | 53.3% bf16 MFU | 206985 tok/s +step 14864/19560 | loss 3.332078 (+0.19z)| norm 0.2619 (-0.67z)| lr 8.72e-05 | 2530.55 ms | 53.4% bf16 MFU | 206995 tok/s +step 14865/19560 | loss 3.341734 (+0.41z)| norm 0.2571 (-0.88z)| lr 8.72e-05 | 2533.69 ms | 53.3% bf16 MFU | 206991 tok/s +step 14866/19560 | loss 3.276328 (-0.94z)| norm 0.2561 (-0.92z)| lr 8.72e-05 | 2533.50 ms | 53.3% bf16 MFU | 206989 tok/s +step 14867/19560 | loss 3.321310 (+0.00z)| norm 0.2976 (+0.99z)| lr 8.71e-05 | 2533.75 ms | 53.3% bf16 MFU | 206986 tok/s +step 14868/19560 | loss 3.376771 (+1.15z)| norm 0.3991 (+5.02z)| lr 8.71e-05 | 2531.27 ms | 53.3% bf16 MFU | 206992 tok/s +step 14869/19560 | loss 3.294557 (-0.56z)| norm 0.2683 (-0.36z)| lr 8.71e-05 | 2533.25 ms | 53.3% bf16 MFU | 206991 tok/s +step 14870/19560 | loss 3.316579 (-0.07z)| norm 0.2724 (-0.18z)| lr 8.70e-05 | 2532.35 ms | 53.3% bf16 MFU | 206993 tok/s +step 14871/19560 | loss 3.228852 (-1.97z)| norm 0.2577 (-0.79z)| lr 8.70e-05 | 2532.49 ms | 53.3% bf16 MFU | 206995 tok/s +step 14872/19560 | loss 3.320973 (+0.05z)| norm 0.2766 (-0.01z)| lr 8.70e-05 | 2534.32 ms | 53.3% bf16 MFU | 206989 tok/s +step 14873/19560 | loss 3.620893 (+5.69z)| norm 0.3376 (+2.43z)| lr 8.69e-05 | 2531.16 ms | 53.3% bf16 MFU | 206996 tok/s +step 14874/19560 | loss 3.302706 (-0.34z)| norm 0.2779 (+0.02z)| lr 8.69e-05 | 2534.44 ms | 53.3% bf16 MFU | 206989 tok/s +step 14875/19560 | loss 3.324448 (+0.06z)| norm 0.2690 (-0.33z)| lr 8.68e-05 | 2532.93 ms | 53.3% bf16 MFU | 206989 tok/s +step 14876/19560 | loss 3.323825 (+0.05z)| norm 0.2642 (-0.52z)| lr 8.68e-05 | 2533.29 ms | 53.3% bf16 MFU | 206988 tok/s +step 14877/19560 | loss 3.351860 (+0.59z)| norm 0.2871 (+0.41z)| lr 8.68e-05 | 2532.30 ms | 53.3% bf16 MFU | 206991 tok/s +step 14878/19560 | loss 3.262246 (-1.12z)| norm 0.2571 (-0.80z)| lr 8.67e-05 | 2532.77 ms | 53.3% bf16 MFU | 206991 tok/s +step 14879/19560 | loss 3.284434 (-0.69z)| norm 0.2742 (-0.11z)| lr 8.67e-05 | 2531.57 ms | 53.3% bf16 MFU | 206996 tok/s +step 14880/19560 | loss 3.265013 (-1.05z)| norm 0.2685 (-0.34z)| lr 8.67e-05 | 2531.83 ms | 53.3% bf16 MFU | 207001 tok/s +step 14881/19560 | loss 3.253551 (-1.25z)| norm 0.2559 (-0.84z)| lr 8.66e-05 | 2533.42 ms | 53.3% bf16 MFU | 206998 tok/s +step 14882/19560 | loss 3.313730 (-0.11z)| norm 0.2663 (-0.42z)| lr 8.66e-05 | 2533.02 ms | 53.3% bf16 MFU | 206997 tok/s +step 14883/19560 | loss 3.389818 (+1.32z)| norm 0.2875 (+0.43z)| lr 8.66e-05 | 2532.39 ms | 53.3% bf16 MFU | 206999 tok/s +step 14884/19560 | loss 3.248385 (-1.36z)| norm 0.2623 (-0.59z)| lr 8.65e-05 | 2533.54 ms | 53.3% bf16 MFU | 206996 tok/s +step 14885/19560 | loss 3.289361 (-0.56z)| norm 0.2582 (-0.76z)| lr 8.65e-05 | 2534.17 ms | 53.3% bf16 MFU | 206991 tok/s +step 14886/19560 | loss 3.252936 (-1.26z)| norm 0.2547 (-0.90z)| lr 8.65e-05 | 2532.82 ms | 53.3% bf16 MFU | 206991 tok/s +step 14887/19560 | loss 3.323283 (+0.09z)| norm 0.2596 (-0.69z)| lr 8.64e-05 | 2531.68 ms | 53.3% bf16 MFU | 206996 tok/s +step 14888/19560 | loss 3.297475 (-0.40z)| norm 0.2762 (-0.03z)| lr 8.64e-05 | 2532.90 ms | 53.3% bf16 MFU | 206996 tok/s +step 14889/19560 | loss 3.318059 (-0.01z)| norm 0.2613 (-0.70z)| lr 8.64e-05 | 2532.98 ms | 53.3% bf16 MFU | 206995 tok/s +step 14890/19560 | loss 3.294080 (-0.47z)| norm 0.2485 (-1.32z)| lr 8.63e-05 | 2531.37 ms | 53.3% bf16 MFU | 207001 tok/s +step 14891/19560 | loss 3.301824 (-0.32z)| norm 0.3115 (+1.75z)| lr 8.63e-05 | 2533.20 ms | 53.3% bf16 MFU | 206999 tok/s +step 14892/19560 | loss 3.331949 (+0.25z)| norm 0.2682 (-0.36z)| lr 8.62e-05 | 2534.43 ms | 53.3% bf16 MFU | 206993 tok/s +step 14893/19560 | loss 3.242495 (-1.45z)| norm 0.2656 (-0.49z)| lr 8.62e-05 | 2535.12 ms | 53.3% bf16 MFU | 206984 tok/s +step 14894/19560 | loss 3.304578 (-0.25z)| norm 0.2833 (+0.37z)| lr 8.62e-05 | 2533.52 ms | 53.3% bf16 MFU | 206981 tok/s +step 14895/19560 | loss 3.365663 (+0.92z)| norm 0.2580 (-0.86z)| lr 8.61e-05 | 2531.80 ms | 53.3% bf16 MFU | 206986 tok/s +step 14896/19560 | loss 3.206886 (-2.09z)| norm 0.2593 (-0.80z)| lr 8.61e-05 | 2535.27 ms | 53.3% bf16 MFU | 206977 tok/s +step 14897/19560 | loss 3.333380 (+0.33z)| norm 0.2583 (-0.84z)| lr 8.61e-05 | 2533.60 ms | 53.3% bf16 MFU | 206975 tok/s +step 14898/19560 | loss 3.290324 (-0.50z)| norm 0.2759 (+0.02z)| lr 8.60e-05 | 2532.68 ms | 53.3% bf16 MFU | 206977 tok/s +step 14899/19560 | loss 3.354465 (+0.75z)| norm 0.2739 (-0.08z)| lr 8.60e-05 | 2533.02 ms | 53.3% bf16 MFU | 206977 tok/s +step 14900/19560 | loss 3.336596 (+0.40z)| norm 0.2765 (+0.04z)| lr 8.60e-05 | 2532.40 ms | 53.3% bf16 MFU | 206980 tok/s +step 14901/19560 | loss 3.303693 (-0.23z)| norm 0.2632 (-0.60z)| lr 8.59e-05 | 2533.33 ms | 53.3% bf16 MFU | 206978 tok/s +step 14902/19560 | loss 3.359590 (+0.85z)| norm 0.2768 (+0.06z)| lr 8.59e-05 | 2532.80 ms | 53.3% bf16 MFU | 206979 tok/s +step 14903/19560 | loss 3.291783 (-0.46z)| norm 0.2633 (-0.59z)| lr 8.59e-05 | 2532.33 ms | 53.3% bf16 MFU | 206982 tok/s +step 14904/19560 | loss 3.303588 (-0.23z)| norm 0.2797 (+0.20z)| lr 8.58e-05 | 2532.62 ms | 53.3% bf16 MFU | 206984 tok/s +step 14905/19560 | loss 3.336473 (+0.41z)| norm 0.2572 (-0.88z)| lr 8.58e-05 | 2533.66 ms | 53.3% bf16 MFU | 206981 tok/s +step 14906/19560 | loss 3.222443 (-1.78z)| norm 0.2786 (+0.16z)| lr 8.58e-05 | 2533.74 ms | 53.3% bf16 MFU | 206978 tok/s +step 14907/19560 | loss 3.350508 (+0.70z)| norm 0.2748 (-0.02z)| lr 8.57e-05 | 2533.19 ms | 53.3% bf16 MFU | 206978 tok/s +step 14908/19560 | loss 3.332814 (+0.36z)| norm 0.2531 (-1.06z)| lr 8.57e-05 | 2533.48 ms | 53.3% bf16 MFU | 206976 tok/s +step 14909/19560 | loss 3.324052 (+0.19z)| norm 0.2698 (-0.24z)| lr 8.57e-05 | 2532.41 ms | 53.3% bf16 MFU | 206979 tok/s +step 14910/19560 | loss 3.298875 (-0.30z)| norm 0.2485 (-1.27z)| lr 8.56e-05 | 2531.99 ms | 53.3% bf16 MFU | 206983 tok/s +step 14911/19560 | loss 3.256951 (-1.10z)| norm 0.2733 (-0.07z)| lr 8.56e-05 | 2534.14 ms | 53.3% bf16 MFU | 206978 tok/s +step 14912/19560 | loss 3.299801 (-0.26z)| norm 0.2457 (-1.38z)| lr 8.55e-05 | 2532.40 ms | 53.3% bf16 MFU | 206981 tok/s +step 14913/19560 | loss 3.323705 (+0.20z)| norm 0.2544 (-0.96z)| lr 8.55e-05 | 2534.10 ms | 53.3% bf16 MFU | 206977 tok/s +step 14914/19560 | loss 3.281116 (-0.64z)| norm 0.2570 (-0.82z)| lr 8.55e-05 | 2531.52 ms | 53.3% bf16 MFU | 206983 tok/s +step 14915/19560 | loss 3.331681 (+0.35z)| norm 0.2513 (-1.08z)| lr 8.54e-05 | 2533.79 ms | 53.3% bf16 MFU | 206980 tok/s +step 14916/19560 | loss 3.402841 (+1.73z)| norm 0.2777 (+0.17z)| lr 8.54e-05 | 2533.98 ms | 53.3% bf16 MFU | 206976 tok/s +step 14917/19560 | loss 3.308075 (-0.11z)| norm 0.2819 (+0.37z)| lr 8.54e-05 | 2531.84 ms | 53.3% bf16 MFU | 206981 tok/s +step 14918/19560 | loss 3.277222 (-0.71z)| norm 0.2481 (-1.23z)| lr 8.53e-05 | 2533.98 ms | 53.3% bf16 MFU | 206977 tok/s +step 14919/19560 | loss 3.344323 (+0.59z)| norm 0.2689 (-0.24z)| lr 8.53e-05 | 2533.75 ms | 53.3% bf16 MFU | 206974 tok/s +step 14920/19560 | loss 3.251837 (-1.21z)| norm 0.2641 (-0.46z)| lr 8.53e-05 | 2531.79 ms | 53.3% bf16 MFU | 206980 tok/s +step 14921/19560 | loss 3.293628 (-0.40z)| norm 0.2622 (-0.54z)| lr 8.52e-05 | 2533.24 ms | 53.3% bf16 MFU | 206979 tok/s +step 14922/19560 | loss 3.283600 (-0.59z)| norm 0.2533 (-0.97z)| lr 8.52e-05 | 2532.58 ms | 53.3% bf16 MFU | 206981 tok/s +step 14923/19560 | loss 3.343717 (+0.58z)| norm 0.2609 (-0.61z)| lr 8.52e-05 | 2533.31 ms | 53.3% bf16 MFU | 206980 tok/s +step 14924/19560 | loss 3.273363 (-0.78z)| norm 0.2495 (-1.14z)| lr 8.51e-05 | 2532.16 ms | 53.3% bf16 MFU | 206983 tok/s +step 14925/19560 | loss 3.287503 (-0.50z)| norm 0.2909 (+0.82z)| lr 8.51e-05 | 2532.34 ms | 53.3% bf16 MFU | 206986 tok/s +step 14926/19560 | loss 3.280966 (-0.62z)| norm 0.2709 (-0.13z)| lr 8.51e-05 | 2531.57 ms | 53.3% bf16 MFU | 206992 tok/s +step 14927/19560 | loss 3.349267 (+0.76z)| norm 0.2681 (-0.26z)| lr 8.50e-05 | 2531.74 ms | 53.3% bf16 MFU | 206996 tok/s +step 14928/19560 | loss 3.299807 (-0.24z)| norm 0.2747 (+0.05z)| lr 8.50e-05 | 2532.67 ms | 53.3% bf16 MFU | 206997 tok/s +step 14929/19560 | loss 3.292637 (-0.38z)| norm 0.2475 (-1.23z)| lr 8.50e-05 | 2532.98 ms | 53.3% bf16 MFU | 206996 tok/s +step 14930/19560 | loss 3.309324 (-0.04z)| norm 0.2649 (-0.41z)| lr 8.49e-05 | 2533.37 ms | 53.3% bf16 MFU | 206994 tok/s +step 14931/19560 | loss 3.307052 (-0.07z)| norm 0.2678 (-0.26z)| lr 8.49e-05 | 2531.32 ms | 53.3% bf16 MFU | 207001 tok/s +step 14932/19560 | loss 3.316079 (+0.13z)| norm 0.2731 (+0.04z)| lr 8.49e-05 | 2533.92 ms | 53.3% bf16 MFU | 206996 tok/s +step 14933/19560 | loss 3.278421 (-0.65z)| norm 0.2488 (-1.28z)| lr 8.48e-05 | 2534.50 ms | 53.3% bf16 MFU | 206989 tok/s +step 14934/19560 | loss 3.362017 (+1.09z)| norm 0.2706 (-0.08z)| lr 8.48e-05 | 2533.05 ms | 53.3% bf16 MFU | 206989 tok/s +step 14935/19560 | loss 3.364714 (+1.13z)| norm 0.2953 (+1.27z)| lr 8.47e-05 | 2533.52 ms | 53.3% bf16 MFU | 206986 tok/s +step 14936/19560 | loss 3.304020 (-0.13z)| norm 0.2508 (-1.15z)| lr 8.47e-05 | 2534.02 ms | 53.3% bf16 MFU | 206982 tok/s +step 14937/19560 | loss 3.282590 (-0.56z)| norm 0.2725 (+0.03z)| lr 8.47e-05 | 2534.11 ms | 53.3% bf16 MFU | 206977 tok/s +step 14938/19560 | loss 3.288344 (-0.44z)| norm 0.2784 (+0.36z)| lr 8.46e-05 | 2531.76 ms | 53.3% bf16 MFU | 206983 tok/s +step 14939/19560 | loss 3.301486 (-0.15z)| norm 0.2518 (-1.08z)| lr 8.46e-05 | 2532.73 ms | 53.3% bf16 MFU | 206984 tok/s +step 14940/19560 | loss 3.292260 (-0.34z)| norm 0.2676 (-0.23z)| lr 8.46e-05 | 2533.99 ms | 53.3% bf16 MFU | 206980 tok/s +step 14941/19560 | loss 3.366054 (+1.19z)| norm 0.2800 (+0.45z)| lr 8.45e-05 | 2533.71 ms | 53.3% bf16 MFU | 206977 tok/s +step 14942/19560 | loss 3.331579 (+0.47z)| norm 0.2872 (+0.83z)| lr 8.45e-05 | 2533.21 ms | 53.3% bf16 MFU | 206976 tok/s +step 14943/19560 | loss 3.234533 (-1.55z)| norm 0.2499 (-1.21z)| lr 8.45e-05 | 2533.14 ms | 53.3% bf16 MFU | 206976 tok/s +step 14944/19560 | loss 3.274188 (-0.71z)| norm 0.2991 (+1.46z)| lr 8.44e-05 | 2532.14 ms | 53.3% bf16 MFU | 206980 tok/s +step 14945/19560 | loss 3.326454 (+0.37z)| norm 0.2706 (-0.08z)| lr 8.44e-05 | 2532.61 ms | 53.3% bf16 MFU | 206982 tok/s +step 14946/19560 | loss 3.287447 (-0.43z)| norm 0.2608 (-0.61z)| lr 8.44e-05 | 2531.82 ms | 53.3% bf16 MFU | 206987 tok/s +step 14947/19560 | loss 3.315164 (+0.16z)| norm 0.2894 (+0.93z)| lr 8.43e-05 | 2532.23 ms | 53.3% bf16 MFU | 206990 tok/s +step 14948/19560 | loss 3.309087 (+0.04z)| norm 0.2633 (-0.48z)| lr 8.43e-05 | 2530.80 ms | 53.3% bf16 MFU | 206998 tok/s +step 14949/19560 | loss 3.359448 (+1.09z)| norm 0.2654 (-0.36z)| lr 8.43e-05 | 2532.07 ms | 53.3% bf16 MFU | 207001 tok/s +step 14950/19560 | loss 3.374928 (+1.41z)| norm 0.2670 (-0.26z)| lr 8.42e-05 | 2531.77 ms | 53.3% bf16 MFU | 207005 tok/s +step 14951/19560 | loss 3.280832 (-0.57z)| norm 0.2561 (-0.84z)| lr 8.42e-05 | 2532.23 ms | 53.3% bf16 MFU | 207008 tok/s +step 14952/19560 | loss 3.291152 (-0.35z)| norm 0.2853 (+0.73z)| lr 8.42e-05 | 2533.89 ms | 53.3% bf16 MFU | 207003 tok/s +step 14953/19560 | loss 3.278532 (-0.60z)| norm 0.2556 (-0.86z)| lr 8.41e-05 | 2531.12 ms | 53.3% bf16 MFU | 207009 tok/s +step 14954/19560 | loss 3.295473 (-0.24z)| norm 0.2687 (-0.15z)| lr 8.41e-05 | 2533.23 ms | 53.3% bf16 MFU | 207007 tok/s +step 14955/19560 | loss 3.318492 (+0.24z)| norm 0.2762 (+0.25z)| lr 8.41e-05 | 2533.33 ms | 53.3% bf16 MFU | 207005 tok/s +step 14956/19560 | loss 3.333112 (+0.54z)| norm 0.2449 (-1.41z)| lr 8.40e-05 | 2533.14 ms | 53.3% bf16 MFU | 207003 tok/s +step 14957/19560 | loss 3.369985 (+1.31z)| norm 0.2711 (-0.01z)| lr 8.40e-05 | 2531.99 ms | 53.3% bf16 MFU | 207006 tok/s +step 14958/19560 | loss 3.285934 (-0.47z)| norm 0.2551 (-0.85z)| lr 8.39e-05 | 2532.47 ms | 53.3% bf16 MFU | 207007 tok/s +step 14959/19560 | loss 3.373218 (+1.37z)| norm 0.2601 (-0.58z)| lr 8.39e-05 | 2532.89 ms | 53.3% bf16 MFU | 207006 tok/s +step 14960/19560 | loss 3.309978 (+0.04z)| norm 0.2631 (-0.41z)| lr 8.39e-05 | 2531.92 ms | 53.3% bf16 MFU | 207010 tok/s +step 14961/19560 | loss 3.309861 (+0.03z)| norm 0.2712 (+0.02z)| lr 8.38e-05 | 2534.67 ms | 53.3% bf16 MFU | 207001 tok/s +step 14962/19560 | loss 3.354482 (+0.96z)| norm 0.2661 (-0.25z)| lr 8.38e-05 | 2534.62 ms | 53.3% bf16 MFU | 206994 tok/s +step 14963/19560 | loss 3.346074 (+0.77z)| norm 0.2766 (+0.32z)| lr 8.38e-05 | 2534.61 ms | 53.3% bf16 MFU | 206987 tok/s +step 14964/19560 | loss 3.314045 (+0.10z)| norm 0.2832 (+0.66z)| lr 8.37e-05 | 2536.28 ms | 53.2% bf16 MFU | 206973 tok/s +step 14965/19560 | loss 3.343589 (+0.71z)| norm 0.2782 (+0.42z)| lr 8.37e-05 | 2531.93 ms | 53.3% bf16 MFU | 206978 tok/s +step 14966/19560 | loss 3.327132 (+0.37z)| norm 0.2836 (+0.70z)| lr 8.37e-05 | 2534.57 ms | 53.3% bf16 MFU | 206972 tok/s +step 14967/19560 | loss 3.286843 (-0.49z)| norm 0.2454 (-1.35z)| lr 8.36e-05 | 2533.24 ms | 53.3% bf16 MFU | 206971 tok/s +step 14968/19560 | loss 3.275695 (-0.75z)| norm 0.2955 (+1.36z)| lr 8.36e-05 | 2532.87 ms | 53.3% bf16 MFU | 206973 tok/s +step 14969/19560 | loss 3.354849 (+0.96z)| norm 0.3188 (+2.54z)| lr 8.36e-05 | 2534.65 ms | 53.3% bf16 MFU | 206966 tok/s +step 14970/19560 | loss 3.310951 (+0.02z)| norm 0.2797 (+0.48z)| lr 8.35e-05 | 2534.06 ms | 53.3% bf16 MFU | 206963 tok/s +step 14971/19560 | loss 3.261536 (-1.05z)| norm 0.2764 (+0.31z)| lr 8.35e-05 | 2533.18 ms | 53.3% bf16 MFU | 206963 tok/s +step 14972/19560 | loss 3.311845 (+0.03z)| norm 0.2666 (-0.21z)| lr 8.35e-05 | 2533.44 ms | 53.3% bf16 MFU | 206962 tok/s +step 14973/19560 | loss 3.212060 (-2.08z)| norm 0.2731 (+0.13z)| lr 8.34e-05 | 2535.18 ms | 53.3% bf16 MFU | 206954 tok/s +step 14974/19560 | loss 3.333886 (+0.52z)| norm 0.2948 (+1.27z)| lr 8.34e-05 | 2534.95 ms | 53.3% bf16 MFU | 206948 tok/s +step 14975/19560 | loss 3.265868 (-0.92z)| norm 0.2772 (+0.35z)| lr 8.34e-05 | 2534.60 ms | 53.3% bf16 MFU | 206943 tok/s +step 14976/19560 | loss 3.336607 (+0.58z)| norm 0.2767 (+0.33z)| lr 8.33e-05 | 2532.78 ms | 53.3% bf16 MFU | 206946 tok/s +step 14977/19560 | loss 3.287195 (-0.48z)| norm 0.2779 (+0.39z)| lr 8.33e-05 | 2532.46 ms | 53.3% bf16 MFU | 206950 tok/s +step 14978/19560 | loss 3.254401 (-1.17z)| norm 0.2668 (-0.20z)| lr 8.33e-05 | 2533.58 ms | 53.3% bf16 MFU | 206949 tok/s +step 14979/19560 | loss 3.286227 (-0.49z)| norm 0.2672 (-0.17z)| lr 8.32e-05 | 2533.03 ms | 53.3% bf16 MFU | 206951 tok/s +step 14980/19560 | loss 3.276875 (-0.70z)| norm 0.2674 (-0.17z)| lr 8.32e-05 | 2533.89 ms | 53.3% bf16 MFU | 206949 tok/s +step 14981/19560 | loss 3.364184 (+1.17z)| norm 0.2857 (+0.81z)| lr 8.32e-05 | 2533.56 ms | 53.3% bf16 MFU | 206948 tok/s +step 14982/19560 | loss 3.272658 (-0.80z)| norm 0.2854 (+0.79z)| lr 8.31e-05 | 2531.91 ms | 53.3% bf16 MFU | 206954 tok/s +step 14983/19560 | loss 3.275431 (-0.74z)| norm 0.2620 (-0.46z)| lr 8.31e-05 | 2533.36 ms | 53.3% bf16 MFU | 206954 tok/s +step 14984/19560 | loss 3.287967 (-0.47z)| norm 0.2712 (+0.03z)| lr 8.30e-05 | 2533.25 ms | 53.3% bf16 MFU | 206955 tok/s +step 14985/19560 | loss 3.265913 (-0.93z)| norm 0.2621 (-0.46z)| lr 8.30e-05 | 2533.86 ms | 53.3% bf16 MFU | 206953 tok/s +step 14986/19560 | loss 3.318469 (+0.19z)| norm 0.2754 (+0.25z)| lr 8.30e-05 | 2532.07 ms | 53.3% bf16 MFU | 206958 tok/s +step 14987/19560 | loss 3.256226 (-1.13z)| norm 0.2513 (-1.02z)| lr 8.29e-05 | 2531.27 ms | 53.3% bf16 MFU | 206966 tok/s +step 14988/19560 | loss 3.306335 (-0.05z)| norm 0.2712 (+0.04z)| lr 8.29e-05 | 2531.47 ms | 53.3% bf16 MFU | 206974 tok/s +step 14989/19560 | loss 3.268413 (-0.86z)| norm 0.2572 (-0.70z)| lr 8.29e-05 | 2534.47 ms | 53.3% bf16 MFU | 206968 tok/s +step 14990/19560 | loss 3.268026 (-0.86z)| norm 0.2580 (-0.65z)| lr 8.28e-05 | 2534.13 ms | 53.3% bf16 MFU | 206964 tok/s +step 14991/19560 | loss 3.245827 (-1.32z)| norm 0.2760 (+0.30z)| lr 8.28e-05 | 2535.24 ms | 53.3% bf16 MFU | 206956 tok/s +step 14992/19560 | loss 3.227142 (-1.68z)| norm 0.2595 (-0.59z)| lr 8.28e-05 | 2534.75 ms | 53.3% bf16 MFU | 206950 tok/s +step 14993/19560 | loss 3.331482 (+0.52z)| norm 0.2591 (-0.61z)| lr 8.27e-05 | 2533.59 ms | 53.3% bf16 MFU | 206949 tok/s +step 14994/19560 | loss 3.299792 (-0.15z)| norm 0.2610 (-0.51z)| lr 8.27e-05 | 2535.04 ms | 53.3% bf16 MFU | 206943 tok/s +step 14995/19560 | loss 3.252937 (-1.13z)| norm 0.2586 (-0.63z)| lr 8.27e-05 | 2531.85 ms | 53.3% bf16 MFU | 206949 tok/s +step 14996/19560 | loss 3.322012 (+0.34z)| norm 0.2778 (+0.59z)| lr 8.26e-05 | 2535.72 ms | 53.2% bf16 MFU | 206940 tok/s +step 14997/19560 | loss 3.310646 (+0.09z)| norm 0.2739 (+0.31z)| lr 8.26e-05 | 2534.61 ms | 53.3% bf16 MFU | 206936 tok/s +step 14998/19560 | loss 3.288845 (-0.37z)| norm 0.2706 (+0.09z)| lr 8.26e-05 | 2536.30 ms | 53.2% bf16 MFU | 206925 tok/s +step 14999/19560 | loss 3.383182 (+1.61z)| norm 0.2694 (+0.01z)| lr 8.25e-05 | 2536.27 ms | 53.2% bf16 MFU | 206914 tok/s +step 15000/19560 | loss 3.319881 (+0.27z)| norm 0.2580 (-0.77z)| lr 8.25e-05 | 2536.17 ms | 53.2% bf16 MFU | 206905 tok/s +val loss 3.314786 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3031/10042 = 0.301832 +Writing checkpoint at step 15000 +Writing model to log124M/model_00015000.bin +Writing state to log124M/state_00015000_00000.bin +step 15001/19560 | loss 3.336721 (+0.83z)| norm 0.2865 (+1.33z)| lr 8.25e-05 | 2550.57 ms | 52.9% bf16 MFU | 206837 tok/s +step 15002/19560 | loss 3.259503 (-1.18z)| norm 0.2537 (-1.12z)| lr 8.24e-05 | 2530.35 ms | 53.4% bf16 MFU | 206855 tok/s +step 15003/19560 | loss 3.321805 (+0.45z)| norm 0.2606 (-0.60z)| lr 8.24e-05 | 2529.07 ms | 53.4% bf16 MFU | 206878 tok/s +step 15004/19560 | loss 3.300843 (-0.10z)| norm 0.2593 (-0.69z)| lr 8.24e-05 | 2532.32 ms | 53.3% bf16 MFU | 206886 tok/s +step 15005/19560 | loss 3.364043 (+1.55z)| norm 0.2594 (-0.67z)| lr 8.23e-05 | 2532.22 ms | 53.3% bf16 MFU | 206894 tok/s +step 15006/19560 | loss 3.322769 (+0.46z)| norm 0.2810 (+0.94z)| lr 8.23e-05 | 2531.68 ms | 53.3% bf16 MFU | 206904 tok/s +step 15007/19560 | loss 3.193949 (-2.80z)| norm 0.2723 (+0.29z)| lr 8.23e-05 | 2533.62 ms | 53.3% bf16 MFU | 206905 tok/s +step 15008/19560 | loss 3.351885 (+1.19z)| norm 0.2787 (+0.76z)| lr 8.22e-05 | 2533.22 ms | 53.3% bf16 MFU | 206908 tok/s +step 15009/19560 | loss 3.425447 (+2.93z)| norm 0.2958 (+1.99z)| lr 8.22e-05 | 2532.70 ms | 53.3% bf16 MFU | 206913 tok/s +step 15010/19560 | loss 3.318633 (+0.30z)| norm 0.2567 (-0.89z)| lr 8.22e-05 | 2535.89 ms | 53.2% bf16 MFU | 206905 tok/s +step 15011/19560 | loss 3.308352 (+0.06z)| norm 0.2641 (-0.33z)| lr 8.21e-05 | 2533.29 ms | 53.3% bf16 MFU | 206908 tok/s +step 15012/19560 | loss 3.256478 (-1.24z)| norm 0.2793 (+0.78z)| lr 8.21e-05 | 2531.30 ms | 53.3% bf16 MFU | 206918 tok/s +step 15013/19560 | loss 3.253398 (-1.30z)| norm 0.2519 (-1.24z)| lr 8.20e-05 | 2534.37 ms | 53.3% bf16 MFU | 206916 tok/s +step 15014/19560 | loss 3.292160 (-0.34z)| norm 0.2752 (+0.47z)| lr 8.20e-05 | 2533.95 ms | 53.3% bf16 MFU | 206915 tok/s +step 15015/19560 | loss 3.324878 (+0.48z)| norm 0.2914 (+1.64z)| lr 8.20e-05 | 2533.20 ms | 53.3% bf16 MFU | 206918 tok/s +step 15016/19560 | loss 3.292977 (-0.32z)| norm 0.2570 (-0.88z)| lr 8.19e-05 | 2532.44 ms | 53.3% bf16 MFU | 206923 tok/s +step 15017/19560 | loss 3.325331 (+0.49z)| norm 0.2881 (+1.38z)| lr 8.19e-05 | 2533.60 ms | 53.3% bf16 MFU | 206924 tok/s +step 15018/19560 | loss 3.267883 (-0.94z)| norm 0.2691 (-0.02z)| lr 8.19e-05 | 2532.61 ms | 53.3% bf16 MFU | 206929 tok/s +step 15019/19560 | loss 3.312786 (+0.18z)| norm 0.2642 (-0.37z)| lr 8.18e-05 | 2531.82 ms | 53.3% bf16 MFU | 206936 tok/s +step 15020/19560 | loss 3.294428 (-0.28z)| norm 0.2789 (+0.75z)| lr 8.18e-05 | 2534.15 ms | 53.3% bf16 MFU | 206934 tok/s +step 15021/19560 | loss 3.290842 (-0.38z)| norm 0.2757 (+0.51z)| lr 8.18e-05 | 2533.25 ms | 53.3% bf16 MFU | 206935 tok/s +step 15022/19560 | loss 3.354854 (+1.22z)| norm 0.2743 (+0.41z)| lr 8.17e-05 | 2534.74 ms | 53.3% bf16 MFU | 206930 tok/s +step 15023/19560 | loss 3.309128 (+0.08z)| norm 0.2713 (+0.16z)| lr 8.17e-05 | 2534.91 ms | 53.3% bf16 MFU | 206925 tok/s +step 15024/19560 | loss 3.256823 (-1.27z)| norm 0.2560 (-1.01z)| lr 8.17e-05 | 2535.39 ms | 53.3% bf16 MFU | 206918 tok/s +step 15025/19560 | loss 3.264697 (-1.05z)| norm 0.2701 (+0.07z)| lr 8.16e-05 | 2534.99 ms | 53.3% bf16 MFU | 206914 tok/s +step 15026/19560 | loss 3.293622 (-0.31z)| norm 0.2761 (+0.53z)| lr 8.16e-05 | 2534.31 ms | 53.3% bf16 MFU | 206912 tok/s +step 15027/19560 | loss 3.375115 (+1.77z)| norm 0.2692 (+0.01z)| lr 8.16e-05 | 2533.41 ms | 53.3% bf16 MFU | 206914 tok/s +step 15028/19560 | loss 3.268546 (-0.94z)| norm 0.2602 (-0.68z)| lr 8.15e-05 | 2535.29 ms | 53.3% bf16 MFU | 206908 tok/s +step 15029/19560 | loss 3.340223 (+0.88z)| norm 0.2693 (+0.02z)| lr 8.15e-05 | 2535.36 ms | 53.3% bf16 MFU | 206902 tok/s +step 15030/19560 | loss 3.225780 (-1.99z)| norm 0.2694 (+0.03z)| lr 8.15e-05 | 2537.00 ms | 53.2% bf16 MFU | 206890 tok/s +step 15031/19560 | loss 3.434558 (+3.13z)| norm 0.2763 (+0.55z)| lr 8.14e-05 | 2534.80 ms | 53.3% bf16 MFU | 206887 tok/s +step 15032/19560 | loss 3.321856 (+0.39z)| norm 0.2726 (+0.27z)| lr 8.14e-05 | 2532.98 ms | 53.3% bf16 MFU | 206892 tok/s +step 15033/19560 | loss 3.324850 (+0.46z)| norm 0.2734 (+0.33z)| lr 8.14e-05 | 2532.11 ms | 53.3% bf16 MFU | 206900 tok/s +step 15034/19560 | loss 3.311623 (+0.13z)| norm 0.2745 (+0.41z)| lr 8.13e-05 | 2533.47 ms | 53.3% bf16 MFU | 206902 tok/s +step 15035/19560 | loss 3.226705 (-1.93z)| norm 0.2804 (+0.86z)| lr 8.13e-05 | 2531.70 ms | 53.3% bf16 MFU | 206912 tok/s +step 15036/19560 | loss 3.263081 (-1.02z)| norm 0.2711 (+0.13z)| lr 8.13e-05 | 2532.09 ms | 53.3% bf16 MFU | 206919 tok/s +step 15037/19560 | loss 3.344783 (+0.97z)| norm 0.2799 (+0.82z)| lr 8.12e-05 | 2532.66 ms | 53.3% bf16 MFU | 206923 tok/s +step 15038/19560 | loss 3.320742 (+0.38z)| norm 0.3073 (+2.84z)| lr 8.12e-05 | 2534.79 ms | 53.3% bf16 MFU | 206919 tok/s +step 15039/19560 | loss 3.304171 (-0.04z)| norm 0.2575 (-0.92z)| lr 8.12e-05 | 2533.38 ms | 53.3% bf16 MFU | 206921 tok/s +step 15040/19560 | loss 3.262958 (-1.03z)| norm 0.2863 (+1.24z)| lr 8.11e-05 | 2533.53 ms | 53.3% bf16 MFU | 206922 tok/s +step 15041/19560 | loss 3.269907 (-0.85z)| norm 0.2800 (+0.75z)| lr 8.11e-05 | 2532.89 ms | 53.3% bf16 MFU | 206925 tok/s +step 15042/19560 | loss 3.243981 (-1.46z)| norm 0.2580 (-0.94z)| lr 8.11e-05 | 2534.82 ms | 53.3% bf16 MFU | 206921 tok/s +step 15043/19560 | loss 3.270368 (-0.81z)| norm 0.2700 (-0.03z)| lr 8.10e-05 | 2534.45 ms | 53.3% bf16 MFU | 206918 tok/s +step 15044/19560 | loss 3.269468 (-0.83z)| norm 0.2742 (+0.29z)| lr 8.10e-05 | 2535.73 ms | 53.2% bf16 MFU | 206910 tok/s +step 15045/19560 | loss 3.296758 (-0.15z)| norm 0.2642 (-0.47z)| lr 8.10e-05 | 2534.89 ms | 53.3% bf16 MFU | 206906 tok/s +step 15046/19560 | loss 3.323499 (+0.50z)| norm 0.2740 (+0.27z)| lr 8.09e-05 | 2533.71 ms | 53.3% bf16 MFU | 206907 tok/s +step 15047/19560 | loss 3.288089 (-0.37z)| norm 0.2817 (+0.87z)| lr 8.09e-05 | 2533.93 ms | 53.3% bf16 MFU | 206907 tok/s +step 15048/19560 | loss 3.343200 (+0.98z)| norm 0.2675 (-0.24z)| lr 8.09e-05 | 2532.51 ms | 53.3% bf16 MFU | 206913 tok/s +step 15049/19560 | loss 3.296890 (-0.17z)| norm 0.2714 (+0.06z)| lr 8.08e-05 | 2534.48 ms | 53.3% bf16 MFU | 206910 tok/s +step 15050/19560 | loss 3.287634 (-0.40z)| norm 0.2813 (+0.82z)| lr 8.08e-05 | 2532.14 ms | 53.3% bf16 MFU | 206917 tok/s +step 15051/19560 | loss 3.342936 (+0.97z)| norm 0.2641 (-0.53z)| lr 8.07e-05 | 2530.86 ms | 53.3% bf16 MFU | 206929 tok/s +step 15052/19560 | loss 3.313995 (+0.25z)| norm 0.2789 (+0.61z)| lr 8.07e-05 | 2532.84 ms | 53.3% bf16 MFU | 206933 tok/s +step 15053/19560 | loss 3.306168 (+0.05z)| norm 0.2768 (+0.47z)| lr 8.07e-05 | 2531.18 ms | 53.3% bf16 MFU | 206943 tok/s +step 15054/19560 | loss 3.265167 (-0.96z)| norm 0.2524 (-1.47z)| lr 8.06e-05 | 2530.95 ms | 53.3% bf16 MFU | 206953 tok/s +step 15055/19560 | loss 3.378139 (+1.82z)| norm 0.2919 (+1.64z)| lr 8.06e-05 | 2532.83 ms | 53.3% bf16 MFU | 206955 tok/s +step 15056/19560 | loss 3.317863 (+0.33z)| norm 0.2706 (-0.04z)| lr 8.06e-05 | 2531.77 ms | 53.3% bf16 MFU | 206962 tok/s +step 15057/19560 | loss 3.289728 (-0.36z)| norm 0.2864 (+1.20z)| lr 8.05e-05 | 2532.54 ms | 53.3% bf16 MFU | 206965 tok/s +step 15058/19560 | loss 3.338073 (+0.82z)| norm 0.2895 (+1.41z)| lr 8.05e-05 | 2534.34 ms | 53.3% bf16 MFU | 206960 tok/s +step 15059/19560 | loss 3.240819 (-1.54z)| norm 0.2793 (+0.60z)| lr 8.05e-05 | 2532.23 ms | 53.3% bf16 MFU | 206964 tok/s +step 15060/19560 | loss 3.274953 (-0.70z)| norm 0.2747 (+0.24z)| lr 8.04e-05 | 2534.04 ms | 53.3% bf16 MFU | 206961 tok/s +step 15061/19560 | loss 3.229088 (-1.78z)| norm 0.2631 (-0.69z)| lr 8.04e-05 | 2532.07 ms | 53.3% bf16 MFU | 206966 tok/s +step 15062/19560 | loss 3.314810 (+0.28z)| norm 0.2752 (+0.27z)| lr 8.04e-05 | 2531.44 ms | 53.3% bf16 MFU | 206973 tok/s +step 15063/19560 | loss 3.257519 (-1.08z)| norm 0.2688 (-0.22z)| lr 8.03e-05 | 2532.70 ms | 53.3% bf16 MFU | 206975 tok/s +step 15064/19560 | loss 3.279537 (-0.54z)| norm 0.2609 (-0.88z)| lr 8.03e-05 | 2533.03 ms | 53.3% bf16 MFU | 206975 tok/s +step 15065/19560 | loss 3.301109 (-0.02z)| norm 0.2695 (-0.18z)| lr 8.03e-05 | 2531.53 ms | 53.3% bf16 MFU | 206982 tok/s +step 15066/19560 | loss 3.340550 (+0.92z)| norm 0.2837 (+0.98z)| lr 8.02e-05 | 2533.62 ms | 53.3% bf16 MFU | 206979 tok/s +step 15067/19560 | loss 3.379276 (+1.82z)| norm 0.3076 (+2.82z)| lr 8.02e-05 | 2532.04 ms | 53.3% bf16 MFU | 206983 tok/s +step 15068/19560 | loss 3.285969 (-0.41z)| norm 0.2761 (+0.31z)| lr 8.02e-05 | 2532.37 ms | 53.3% bf16 MFU | 206986 tok/s +step 15069/19560 | loss 3.318702 (+0.38z)| norm 0.2666 (-0.44z)| lr 8.01e-05 | 2534.47 ms | 53.3% bf16 MFU | 206980 tok/s +step 15070/19560 | loss 3.265068 (-0.89z)| norm 0.2648 (-0.56z)| lr 8.01e-05 | 2532.75 ms | 53.3% bf16 MFU | 206981 tok/s +step 15071/19560 | loss 3.280232 (-0.54z)| norm 0.2723 (+0.02z)| lr 8.01e-05 | 2533.28 ms | 53.3% bf16 MFU | 206980 tok/s +step 15072/19560 | loss 3.403363 (+2.37z)| norm 0.2761 (+0.35z)| lr 8.00e-05 | 2531.10 ms | 53.3% bf16 MFU | 206988 tok/s +step 15073/19560 | loss 3.246825 (-1.32z)| norm 0.2841 (+0.99z)| lr 8.00e-05 | 2532.41 ms | 53.3% bf16 MFU | 206990 tok/s +step 15074/19560 | loss 3.294888 (-0.19z)| norm 0.2822 (+0.82z)| lr 8.00e-05 | 2533.30 ms | 53.3% bf16 MFU | 206988 tok/s +step 15075/19560 | loss 3.355328 (+1.22z)| norm 0.2417 (-2.43z)| lr 7.99e-05 | 2532.91 ms | 53.3% bf16 MFU | 206988 tok/s +step 15076/19560 | loss 3.301955 (-0.03z)| norm 0.2686 (-0.27z)| lr 7.99e-05 | 2531.95 ms | 53.3% bf16 MFU | 206992 tok/s +step 15077/19560 | loss 3.268178 (-0.81z)| norm 0.2569 (-1.20z)| lr 7.99e-05 | 2532.19 ms | 53.3% bf16 MFU | 206995 tok/s +step 15078/19560 | loss 3.348268 (+1.09z)| norm 0.2602 (-0.92z)| lr 7.98e-05 | 2532.22 ms | 53.3% bf16 MFU | 206998 tok/s +step 15079/19560 | loss 3.302300 (-0.00z)| norm 0.2548 (-1.36z)| lr 7.98e-05 | 2534.43 ms | 53.3% bf16 MFU | 206991 tok/s +step 15080/19560 | loss 3.296247 (-0.15z)| norm 0.2875 (+1.26z)| lr 7.98e-05 | 2533.26 ms | 53.3% bf16 MFU | 206990 tok/s +step 15081/19560 | loss 3.296873 (-0.14z)| norm 0.2514 (-1.62z)| lr 7.97e-05 | 2532.27 ms | 53.3% bf16 MFU | 206992 tok/s +step 15082/19560 | loss 3.286498 (-0.38z)| norm 0.2584 (-1.05z)| lr 7.97e-05 | 2533.46 ms | 53.3% bf16 MFU | 206990 tok/s +step 15083/19560 | loss 3.253684 (-1.15z)| norm 0.2634 (-0.65z)| lr 7.97e-05 | 2536.46 ms | 53.2% bf16 MFU | 206976 tok/s +step 15084/19560 | loss 3.310532 (+0.20z)| norm 0.2524 (-1.53z)| lr 7.96e-05 | 2531.69 ms | 53.3% bf16 MFU | 206981 tok/s +step 15085/19560 | loss 3.312642 (+0.27z)| norm 0.2615 (-0.80z)| lr 7.96e-05 | 2533.42 ms | 53.3% bf16 MFU | 206980 tok/s +step 15086/19560 | loss 3.258051 (-1.03z)| norm 0.2500 (-1.70z)| lr 7.96e-05 | 2534.04 ms | 53.3% bf16 MFU | 206976 tok/s +step 15087/19560 | loss 3.365941 (+1.55z)| norm 0.2699 (-0.13z)| lr 7.95e-05 | 2532.17 ms | 53.3% bf16 MFU | 206979 tok/s +step 15088/19560 | loss 3.234109 (-1.58z)| norm 0.2551 (-1.30z)| lr 7.95e-05 | 2533.30 ms | 53.3% bf16 MFU | 206978 tok/s +step 15089/19560 | loss 3.219197 (-1.89z)| norm 0.2410 (-2.35z)| lr 7.95e-05 | 2532.87 ms | 53.3% bf16 MFU | 206979 tok/s +step 15090/19560 | loss 3.243191 (-1.31z)| norm 0.2473 (-1.82z)| lr 7.94e-05 | 2531.72 ms | 53.3% bf16 MFU | 206985 tok/s +step 15091/19560 | loss 3.404595 (+2.41z)| norm 0.3673 (+6.14z)| lr 7.94e-05 | 2531.45 ms | 53.3% bf16 MFU | 206991 tok/s +step 15092/19560 | loss 3.260628 (-0.88z)| norm 0.2523 (-1.23z)| lr 7.94e-05 | 2535.87 ms | 53.2% bf16 MFU | 206979 tok/s +step 15093/19560 | loss 3.325235 (+0.60z)| norm 0.2491 (-1.41z)| lr 7.93e-05 | 2533.11 ms | 53.3% bf16 MFU | 206978 tok/s +step 15094/19560 | loss 3.226165 (-1.64z)| norm 0.2585 (-0.80z)| lr 7.93e-05 | 2533.09 ms | 53.3% bf16 MFU | 206978 tok/s +step 15095/19560 | loss 3.304975 (+0.15z)| norm 0.2585 (-0.81z)| lr 7.93e-05 | 2531.90 ms | 53.3% bf16 MFU | 206983 tok/s +step 15096/19560 | loss 3.293895 (-0.10z)| norm 0.2580 (-0.83z)| lr 7.92e-05 | 2533.31 ms | 53.3% bf16 MFU | 206982 tok/s +step 15097/19560 | loss 3.332879 (+0.79z)| norm 0.2630 (-0.50z)| lr 7.92e-05 | 2532.45 ms | 53.3% bf16 MFU | 206984 tok/s +step 15098/19560 | loss 3.348987 (+1.15z)| norm 0.2652 (-0.35z)| lr 7.92e-05 | 2534.88 ms | 53.3% bf16 MFU | 206976 tok/s +step 15099/19560 | loss 3.337889 (+0.88z)| norm 0.2578 (-0.83z)| lr 7.91e-05 | 2531.58 ms | 53.3% bf16 MFU | 206982 tok/s +step 15100/19560 | loss 3.296373 (-0.06z)| norm 0.2620 (-0.55z)| lr 7.91e-05 | 2532.62 ms | 53.3% bf16 MFU | 206984 tok/s +step 15101/19560 | loss 3.239894 (-1.36z)| norm 0.2482 (-1.44z)| lr 7.91e-05 | 2532.70 ms | 53.3% bf16 MFU | 206985 tok/s +step 15102/19560 | loss 3.221201 (-1.75z)| norm 0.2568 (-0.86z)| lr 7.90e-05 | 2533.87 ms | 53.3% bf16 MFU | 206982 tok/s +step 15103/19560 | loss 3.334902 (+0.82z)| norm 0.2708 (+0.07z)| lr 7.90e-05 | 2532.49 ms | 53.3% bf16 MFU | 206984 tok/s +step 15104/19560 | loss 3.318542 (+0.45z)| norm 0.2706 (+0.06z)| lr 7.90e-05 | 2533.00 ms | 53.3% bf16 MFU | 206984 tok/s +step 15105/19560 | loss 3.337043 (+0.86z)| norm 0.2599 (-0.64z)| lr 7.89e-05 | 2532.34 ms | 53.3% bf16 MFU | 206986 tok/s +step 15106/19560 | loss 3.314487 (+0.34z)| norm 0.2867 (+1.13z)| lr 7.89e-05 | 2532.39 ms | 53.3% bf16 MFU | 206989 tok/s +step 15107/19560 | loss 3.288882 (-0.25z)| norm 0.2497 (-1.31z)| lr 7.88e-05 | 2533.32 ms | 53.3% bf16 MFU | 206987 tok/s +step 15108/19560 | loss 3.328087 (+0.64z)| norm 0.2758 (+0.41z)| lr 7.88e-05 | 2533.70 ms | 53.3% bf16 MFU | 206984 tok/s +step 15109/19560 | loss 3.353335 (+1.22z)| norm 0.2749 (+0.36z)| lr 7.88e-05 | 2531.67 ms | 53.3% bf16 MFU | 206989 tok/s +step 15110/19560 | loss 3.330150 (+0.68z)| norm 0.3150 (+2.92z)| lr 7.87e-05 | 2534.77 ms | 53.3% bf16 MFU | 206982 tok/s +step 15111/19560 | loss 3.266596 (-0.77z)| norm 0.2824 (+0.80z)| lr 7.87e-05 | 2533.04 ms | 53.3% bf16 MFU | 206982 tok/s +step 15112/19560 | loss 3.355241 (+1.23z)| norm 0.2604 (-0.60z)| lr 7.87e-05 | 2533.64 ms | 53.3% bf16 MFU | 206979 tok/s +step 15113/19560 | loss 3.323282 (+0.50z)| norm 0.2710 (+0.07z)| lr 7.86e-05 | 2533.20 ms | 53.3% bf16 MFU | 206979 tok/s +step 15114/19560 | loss 3.245122 (-1.26z)| norm 0.2840 (+0.90z)| lr 7.86e-05 | 2533.27 ms | 53.3% bf16 MFU | 206978 tok/s +step 15115/19560 | loss 3.305193 (+0.09z)| norm 0.2856 (+0.99z)| lr 7.86e-05 | 2533.79 ms | 53.3% bf16 MFU | 206975 tok/s +step 15116/19560 | loss 3.475885 (+3.71z)| norm 0.3096 (+2.45z)| lr 7.85e-05 | 2534.25 ms | 53.3% bf16 MFU | 206970 tok/s +step 15117/19560 | loss 3.267632 (-0.74z)| norm 0.2789 (+0.51z)| lr 7.85e-05 | 2535.02 ms | 53.3% bf16 MFU | 206962 tok/s +step 15118/19560 | loss 3.543460 (+4.66z)| norm 0.3512 (+4.59z)| lr 7.85e-05 | 2533.63 ms | 53.3% bf16 MFU | 206961 tok/s +step 15119/19560 | loss 3.294556 (-0.20z)| norm 0.3251 (+2.96z)| lr 7.84e-05 | 2532.42 ms | 53.3% bf16 MFU | 206964 tok/s +step 15120/19560 | loss 3.264333 (-0.81z)| norm 0.2853 (+0.74z)| lr 7.84e-05 | 2532.05 ms | 53.3% bf16 MFU | 206969 tok/s +step 15121/19560 | loss 3.326379 (+0.42z)| norm 0.2876 (+0.85z)| lr 7.84e-05 | 2532.05 ms | 53.3% bf16 MFU | 206974 tok/s +step 15122/19560 | loss 3.274270 (-0.61z)| norm 0.2881 (+0.87z)| lr 7.83e-05 | 2531.74 ms | 53.3% bf16 MFU | 206979 tok/s +step 15123/19560 | loss 3.375943 (+1.38z)| norm 0.2699 (-0.14z)| lr 7.83e-05 | 2532.58 ms | 53.3% bf16 MFU | 206981 tok/s +step 15124/19560 | loss 3.274268 (-0.62z)| norm 0.2676 (-0.27z)| lr 7.83e-05 | 2534.06 ms | 53.3% bf16 MFU | 206977 tok/s +step 15125/19560 | loss 3.299064 (-0.13z)| norm 0.2792 (+0.37z)| lr 7.82e-05 | 2533.60 ms | 53.3% bf16 MFU | 206975 tok/s +step 15126/19560 | loss 3.361828 (+1.09z)| norm 0.2748 (+0.13z)| lr 7.82e-05 | 2531.95 ms | 53.3% bf16 MFU | 206980 tok/s +step 15127/19560 | loss 3.326504 (+0.41z)| norm 0.2661 (-0.35z)| lr 7.82e-05 | 2531.82 ms | 53.3% bf16 MFU | 206985 tok/s +step 15128/19560 | loss 3.304461 (-0.02z)| norm 0.2496 (-1.26z)| lr 7.81e-05 | 2531.52 ms | 53.3% bf16 MFU | 206990 tok/s +step 15129/19560 | loss 3.304594 (-0.01z)| norm 0.2690 (-0.18z)| lr 7.81e-05 | 2532.91 ms | 53.3% bf16 MFU | 206990 tok/s +step 15130/19560 | loss 3.256903 (-0.95z)| norm 0.2711 (-0.07z)| lr 7.81e-05 | 2534.33 ms | 53.3% bf16 MFU | 206985 tok/s +step 15131/19560 | loss 3.221053 (-1.63z)| norm 0.2628 (-0.53z)| lr 7.80e-05 | 2532.19 ms | 53.3% bf16 MFU | 206988 tok/s +step 15132/19560 | loss 3.281957 (-0.44z)| norm 0.2812 (+0.48z)| lr 7.80e-05 | 2532.87 ms | 53.3% bf16 MFU | 206988 tok/s +step 15133/19560 | loss 3.253853 (-0.97z)| norm 0.2744 (+0.10z)| lr 7.80e-05 | 2533.54 ms | 53.3% bf16 MFU | 206986 tok/s +step 15134/19560 | loss 3.309264 (+0.12z)| norm 0.2636 (-0.50z)| lr 7.79e-05 | 2533.26 ms | 53.3% bf16 MFU | 206984 tok/s +step 15135/19560 | loss 3.322058 (+0.35z)| norm 0.2699 (-0.15z)| lr 7.79e-05 | 2534.81 ms | 53.3% bf16 MFU | 206977 tok/s +step 15136/19560 | loss 3.295044 (-0.18z)| norm 0.2531 (-1.07z)| lr 7.79e-05 | 2532.35 ms | 53.3% bf16 MFU | 206980 tok/s +step 15137/19560 | loss 3.283629 (-0.39z)| norm 0.2629 (-0.51z)| lr 7.78e-05 | 2534.35 ms | 53.3% bf16 MFU | 206975 tok/s +step 15138/19560 | loss 3.276962 (-0.52z)| norm 0.2565 (-0.87z)| lr 7.78e-05 | 2535.59 ms | 53.2% bf16 MFU | 206964 tok/s +step 15139/19560 | loss 3.283034 (-0.39z)| norm 0.2651 (-0.39z)| lr 7.78e-05 | 2533.45 ms | 53.3% bf16 MFU | 206964 tok/s +step 15140/19560 | loss 3.339950 (+0.76z)| norm 0.2498 (-1.22z)| lr 7.77e-05 | 2533.91 ms | 53.3% bf16 MFU | 206961 tok/s +step 15141/19560 | loss 3.451666 (+2.92z)| norm 1.9272 (+11.19z)| lr 7.77e-05 | 2532.02 ms | 53.3% bf16 MFU | 206966 tok/s +step 15142/19560 | loss 3.314219 (+0.19z)| norm 0.3120 (+0.18z)| lr 7.77e-05 | 2532.79 ms | 53.3% bf16 MFU | 206968 tok/s +step 15143/19560 | loss 3.351223 (+0.92z)| norm 0.3532 (+0.46z)| lr 7.76e-05 | 2533.31 ms | 53.3% bf16 MFU | 206967 tok/s +step 15144/19560 | loss 3.270918 (-0.67z)| norm 0.3025 (+0.11z)| lr 7.76e-05 | 2533.92 ms | 53.3% bf16 MFU | 206964 tok/s +step 15145/19560 | loss 3.253786 (-0.99z)| norm 0.3063 (+0.14z)| lr 7.76e-05 | 2531.81 ms | 53.3% bf16 MFU | 206970 tok/s +step 15146/19560 | loss 3.192133 (-2.16z)| norm 0.3041 (+0.12z)| lr 7.75e-05 | 2532.68 ms | 53.3% bf16 MFU | 206972 tok/s +step 15147/19560 | loss 3.283733 (-0.38z)| norm 0.2787 (-0.05z)| lr 7.75e-05 | 2533.33 ms | 53.3% bf16 MFU | 206971 tok/s +step 15148/19560 | loss 3.279332 (-0.46z)| norm 0.2841 (-0.02z)| lr 7.75e-05 | 2532.68 ms | 53.3% bf16 MFU | 206973 tok/s +step 15149/19560 | loss 3.267099 (-0.69z)| norm 0.2707 (-0.11z)| lr 7.74e-05 | 2532.75 ms | 53.3% bf16 MFU | 206975 tok/s +step 15150/19560 | loss 3.278663 (-0.46z)| norm 0.2862 (-0.00z)| lr 7.74e-05 | 2531.44 ms | 53.3% bf16 MFU | 206981 tok/s +step 15151/19560 | loss 3.380353 (+1.49z)| norm 0.3010 (+0.10z)| lr 7.74e-05 | 2532.91 ms | 53.3% bf16 MFU | 206982 tok/s +step 15152/19560 | loss 3.267419 (-0.69z)| norm 0.2799 (-0.05z)| lr 7.73e-05 | 2534.95 ms | 53.3% bf16 MFU | 206974 tok/s +step 15153/19560 | loss 3.301353 (-0.04z)| norm 0.3039 (+0.11z)| lr 7.73e-05 | 2533.87 ms | 53.3% bf16 MFU | 206971 tok/s +step 15154/19560 | loss 3.281707 (-0.42z)| norm 0.2787 (-0.06z)| lr 7.73e-05 | 2533.49 ms | 53.3% bf16 MFU | 206969 tok/s +step 15155/19560 | loss 3.313162 (+0.20z)| norm 0.2805 (-0.05z)| lr 7.72e-05 | 2533.28 ms | 53.3% bf16 MFU | 206969 tok/s +step 15156/19560 | loss 3.313978 (+0.21z)| norm 0.2702 (-0.12z)| lr 7.72e-05 | 2535.21 ms | 53.3% bf16 MFU | 206961 tok/s +step 15157/19560 | loss 3.380174 (+1.48z)| norm 0.2759 (-0.08z)| lr 7.72e-05 | 2535.20 ms | 53.3% bf16 MFU | 206953 tok/s +step 15158/19560 | loss 3.252982 (-0.98z)| norm 0.2712 (-0.11z)| lr 7.71e-05 | 2534.71 ms | 53.3% bf16 MFU | 206947 tok/s +step 15159/19560 | loss 3.333095 (+0.60z)| norm 0.2843 (-0.02z)| lr 7.71e-05 | 2533.40 ms | 53.3% bf16 MFU | 206947 tok/s +step 15160/19560 | loss 3.272587 (-0.59z)| norm 0.2544 (-0.23z)| lr 7.71e-05 | 2533.57 ms | 53.3% bf16 MFU | 206947 tok/s +step 15161/19560 | loss 3.248471 (-1.06z)| norm 0.2657 (-0.15z)| lr 7.70e-05 | 2533.12 ms | 53.3% bf16 MFU | 206948 tok/s +step 15162/19560 | loss 3.279053 (-0.45z)| norm 0.2572 (-0.21z)| lr 7.70e-05 | 2532.51 ms | 53.3% bf16 MFU | 206952 tok/s +step 15163/19560 | loss 3.351519 (+0.97z)| norm 0.2547 (-0.22z)| lr 7.70e-05 | 2532.88 ms | 53.3% bf16 MFU | 206954 tok/s +step 15164/19560 | loss 3.344006 (+0.81z)| norm 0.2548 (-0.22z)| lr 7.69e-05 | 2534.93 ms | 53.3% bf16 MFU | 206948 tok/s +step 15165/19560 | loss 3.335499 (+0.64z)| norm 0.2579 (-0.20z)| lr 7.69e-05 | 2532.45 ms | 53.3% bf16 MFU | 206952 tok/s +step 15166/19560 | loss 3.324880 (+0.43z)| norm 0.2533 (-0.23z)| lr 7.69e-05 | 2534.66 ms | 53.3% bf16 MFU | 206946 tok/s +step 15167/19560 | loss 3.374259 (+1.39z)| norm 0.2583 (-0.19z)| lr 7.68e-05 | 2533.21 ms | 53.3% bf16 MFU | 206947 tok/s +step 15168/19560 | loss 3.282351 (-0.43z)| norm 0.2495 (-0.25z)| lr 7.68e-05 | 2532.34 ms | 53.3% bf16 MFU | 206952 tok/s +step 15169/19560 | loss 3.313351 (+0.18z)| norm 0.2596 (-0.18z)| lr 7.68e-05 | 2533.57 ms | 53.3% bf16 MFU | 206951 tok/s +step 15170/19560 | loss 3.348483 (+0.86z)| norm 0.2456 (-0.27z)| lr 7.67e-05 | 2532.78 ms | 53.3% bf16 MFU | 206954 tok/s +step 15171/19560 | loss 3.398947 (+1.83z)| norm 0.2765 (-0.06z)| lr 7.67e-05 | 2532.90 ms | 53.3% bf16 MFU | 206955 tok/s +step 15172/19560 | loss 3.351046 (+0.87z)| norm 0.2659 (-0.14z)| lr 7.67e-05 | 2532.10 ms | 53.3% bf16 MFU | 206961 tok/s +step 15173/19560 | loss 3.299318 (-0.15z)| norm 0.2481 (-0.26z)| lr 7.66e-05 | 2534.66 ms | 53.3% bf16 MFU | 206955 tok/s +step 15174/19560 | loss 3.260458 (-0.90z)| norm 0.2923 (+0.04z)| lr 7.66e-05 | 2532.75 ms | 53.3% bf16 MFU | 206957 tok/s +step 15175/19560 | loss 3.363905 (+1.11z)| norm 0.2749 (-0.07z)| lr 7.66e-05 | 2533.12 ms | 53.3% bf16 MFU | 206958 tok/s +step 15176/19560 | loss 3.348987 (+0.82z)| norm 0.2591 (-0.18z)| lr 7.65e-05 | 2532.42 ms | 53.3% bf16 MFU | 206962 tok/s +step 15177/19560 | loss 3.379107 (+1.38z)| norm 0.2886 (+0.02z)| lr 7.65e-05 | 2531.84 ms | 53.3% bf16 MFU | 206968 tok/s +step 15178/19560 | loss 3.304170 (-0.07z)| norm 0.2539 (-0.22z)| lr 7.65e-05 | 2532.94 ms | 53.3% bf16 MFU | 206969 tok/s +step 15179/19560 | loss 3.308992 (+0.03z)| norm 0.2646 (-0.14z)| lr 7.64e-05 | 2535.87 ms | 53.2% bf16 MFU | 206958 tok/s +step 15180/19560 | loss 3.288837 (-0.35z)| norm 0.2693 (-0.11z)| lr 7.64e-05 | 2532.61 ms | 53.3% bf16 MFU | 206960 tok/s +step 15181/19560 | loss 3.317082 (+0.19z)| norm 0.2596 (-0.18z)| lr 7.64e-05 | 2533.30 ms | 53.3% bf16 MFU | 206960 tok/s +step 15182/19560 | loss 3.329731 (+0.43z)| norm 0.2474 (-0.26z)| lr 7.63e-05 | 2534.41 ms | 53.3% bf16 MFU | 206956 tok/s +step 15183/19560 | loss 3.273565 (-0.65z)| norm 0.2873 (+0.01z)| lr 7.63e-05 | 2533.84 ms | 53.3% bf16 MFU | 206954 tok/s +step 15184/19560 | loss 3.351117 (+0.86z)| norm 0.2689 (-0.11z)| lr 7.63e-05 | 2533.05 ms | 53.3% bf16 MFU | 206955 tok/s +step 15185/19560 | loss 3.393647 (+1.65z)| norm 0.2820 (-0.02z)| lr 7.62e-05 | 2534.02 ms | 53.3% bf16 MFU | 206952 tok/s +step 15186/19560 | loss 3.334750 (+0.52z)| norm 0.2638 (-0.15z)| lr 7.62e-05 | 2533.19 ms | 53.3% bf16 MFU | 206953 tok/s +step 15187/19560 | loss 3.298927 (-0.18z)| norm 0.2590 (-0.18z)| lr 7.62e-05 | 2534.37 ms | 53.3% bf16 MFU | 206949 tok/s +step 15188/19560 | loss 3.279540 (-0.56z)| norm 0.2661 (-0.13z)| lr 7.61e-05 | 2534.74 ms | 53.3% bf16 MFU | 206943 tok/s +step 15189/19560 | loss 3.360573 (+1.00z)| norm 0.2530 (-0.22z)| lr 7.61e-05 | 2534.40 ms | 53.3% bf16 MFU | 206940 tok/s +step 15190/19560 | loss 3.340896 (+0.61z)| norm 0.2817 (-0.02z)| lr 7.61e-05 | 2531.56 ms | 53.3% bf16 MFU | 206948 tok/s +step 15191/19560 | loss 3.339218 (+0.56z)| norm 0.2573 (-0.19z)| lr 7.60e-05 | 2532.40 ms | 53.3% bf16 MFU | 206952 tok/s +step 15192/19560 | loss 3.465756 (+2.91z)| norm 0.2809 (-0.03z)| lr 7.60e-05 | 2532.77 ms | 53.3% bf16 MFU | 206954 tok/s +step 15193/19560 | loss 3.324121 (+0.23z)| norm 0.2602 (-0.17z)| lr 7.60e-05 | 2535.41 ms | 53.3% bf16 MFU | 206946 tok/s +step 15194/19560 | loss 3.304451 (-0.14z)| norm 0.2486 (-0.24z)| lr 7.59e-05 | 2534.08 ms | 53.3% bf16 MFU | 206943 tok/s +step 15195/19560 | loss 3.295482 (-0.30z)| norm 0.2788 (-0.04z)| lr 7.59e-05 | 2534.62 ms | 53.3% bf16 MFU | 206939 tok/s +step 15196/19560 | loss 3.252357 (-1.11z)| norm 0.2641 (-0.14z)| lr 7.59e-05 | 2533.87 ms | 53.3% bf16 MFU | 206937 tok/s +step 15197/19560 | loss 3.365180 (+1.02z)| norm 0.2648 (-0.13z)| lr 7.58e-05 | 2533.90 ms | 53.3% bf16 MFU | 206936 tok/s +step 15198/19560 | loss 3.329732 (+0.34z)| norm 0.2803 (-0.03z)| lr 7.58e-05 | 2530.78 ms | 53.4% bf16 MFU | 206948 tok/s +step 15199/19560 | loss 3.361978 (+0.94z)| norm 0.2630 (-0.15z)| lr 7.58e-05 | 2533.18 ms | 53.3% bf16 MFU | 206949 tok/s +step 15200/19560 | loss 3.277228 (-0.65z)| norm 0.2806 (-0.03z)| lr 7.57e-05 | 2533.08 ms | 53.3% bf16 MFU | 206950 tok/s +step 15201/19560 | loss 3.380510 (+1.30z)| norm 0.2657 (-0.13z)| lr 7.57e-05 | 2534.11 ms | 53.3% bf16 MFU | 206947 tok/s +step 15202/19560 | loss 3.360751 (+0.91z)| norm 0.2646 (-0.13z)| lr 7.57e-05 | 2534.84 ms | 53.3% bf16 MFU | 206941 tok/s +step 15203/19560 | loss 3.262810 (-0.94z)| norm 0.2774 (-0.05z)| lr 7.56e-05 | 2533.61 ms | 53.3% bf16 MFU | 206941 tok/s +step 15204/19560 | loss 3.336324 (+0.45z)| norm 0.2556 (-0.19z)| lr 7.56e-05 | 2532.68 ms | 53.3% bf16 MFU | 206944 tok/s +step 15205/19560 | loss 3.316483 (+0.07z)| norm 0.2469 (-0.25z)| lr 7.56e-05 | 2533.37 ms | 53.3% bf16 MFU | 206945 tok/s +step 15206/19560 | loss 3.468226 (+2.85z)| norm 0.3207 (+0.24z)| lr 7.55e-05 | 2535.71 ms | 53.2% bf16 MFU | 206936 tok/s +step 15207/19560 | loss 3.316119 (+0.04z)| norm 0.2754 (-0.06z)| lr 7.55e-05 | 2533.70 ms | 53.3% bf16 MFU | 206935 tok/s +step 15208/19560 | loss 3.237123 (-1.40z)| norm 0.2786 (-0.04z)| lr 7.55e-05 | 2533.11 ms | 53.3% bf16 MFU | 206937 tok/s +step 15209/19560 | loss 3.350973 (+0.68z)| norm 0.2498 (-0.24z)| lr 7.54e-05 | 2532.74 ms | 53.3% bf16 MFU | 206940 tok/s +step 15210/19560 | loss 3.337056 (+0.42z)| norm 0.2634 (-0.15z)| lr 7.54e-05 | 2532.19 ms | 53.3% bf16 MFU | 206946 tok/s +step 15211/19560 | loss 3.325307 (+0.19z)| norm 0.2868 (+0.01z)| lr 7.54e-05 | 2534.16 ms | 53.3% bf16 MFU | 206943 tok/s +step 15212/19560 | loss 3.335279 (+0.37z)| norm 0.2485 (-0.25z)| lr 7.53e-05 | 2534.08 ms | 53.3% bf16 MFU | 206941 tok/s +step 15213/19560 | loss 3.308197 (-0.12z)| norm 0.2595 (-0.17z)| lr 7.53e-05 | 2532.60 ms | 53.3% bf16 MFU | 206944 tok/s +step 15214/19560 | loss 3.354260 (+0.71z)| norm 0.2613 (-0.16z)| lr 7.53e-05 | 2534.46 ms | 53.3% bf16 MFU | 206940 tok/s +step 15215/19560 | loss 3.343659 (+0.52z)| norm 0.2703 (-0.10z)| lr 7.52e-05 | 2531.83 ms | 53.3% bf16 MFU | 206947 tok/s +step 15216/19560 | loss 3.311559 (-0.08z)| norm 0.2776 (-0.05z)| lr 7.52e-05 | 2533.38 ms | 53.3% bf16 MFU | 206947 tok/s +step 15217/19560 | loss 3.344743 (+0.52z)| norm 0.2790 (-0.04z)| lr 7.52e-05 | 2531.90 ms | 53.3% bf16 MFU | 206954 tok/s +step 15218/19560 | loss 3.286648 (-0.58z)| norm 0.2466 (-0.26z)| lr 7.51e-05 | 2534.17 ms | 53.3% bf16 MFU | 206950 tok/s +step 15219/19560 | loss 3.246780 (-1.32z)| norm 0.2814 (-0.02z)| lr 7.51e-05 | 2531.02 ms | 53.3% bf16 MFU | 206960 tok/s +step 15220/19560 | loss 3.368505 (+0.98z)| norm 0.2665 (-0.13z)| lr 7.51e-05 | 2535.48 ms | 53.3% bf16 MFU | 206951 tok/s +step 15221/19560 | loss 3.389079 (+1.35z)| norm 0.2999 (+0.10z)| lr 7.50e-05 | 2533.04 ms | 53.3% bf16 MFU | 206953 tok/s +step 15222/19560 | loss 3.282932 (-0.67z)| norm 0.2897 (+0.03z)| lr 7.50e-05 | 2533.49 ms | 53.3% bf16 MFU | 206952 tok/s +step 15223/19560 | loss 3.407781 (+1.68z)| norm 0.2742 (-0.08z)| lr 7.50e-05 | 2535.02 ms | 53.3% bf16 MFU | 206945 tok/s +step 15224/19560 | loss 3.284319 (-0.65z)| norm 0.2748 (-0.07z)| lr 7.49e-05 | 2533.50 ms | 53.3% bf16 MFU | 206945 tok/s +step 15225/19560 | loss 3.326017 (+0.14z)| norm 0.2661 (-0.13z)| lr 7.49e-05 | 2535.25 ms | 53.3% bf16 MFU | 206938 tok/s +step 15226/19560 | loss 3.365174 (+0.88z)| norm 0.2980 (+0.08z)| lr 7.49e-05 | 2535.83 ms | 53.2% bf16 MFU | 206929 tok/s +step 15227/19560 | loss 3.271585 (-0.88z)| norm 0.2556 (-0.21z)| lr 7.48e-05 | 2533.58 ms | 53.3% bf16 MFU | 206929 tok/s +step 15228/19560 | loss 3.296395 (-0.41z)| norm 0.2552 (-0.21z)| lr 7.48e-05 | 2534.76 ms | 53.3% bf16 MFU | 206924 tok/s +step 15229/19560 | loss 3.281425 (-0.70z)| norm 0.2753 (-0.07z)| lr 7.48e-05 | 2534.13 ms | 53.3% bf16 MFU | 206923 tok/s +step 15230/19560 | loss 3.302742 (-0.32z)| norm 0.2676 (-0.13z)| lr 7.47e-05 | 2533.56 ms | 53.3% bf16 MFU | 206924 tok/s +step 15231/19560 | loss 3.289990 (-0.55z)| norm 0.2733 (-0.09z)| lr 7.47e-05 | 2533.64 ms | 53.3% bf16 MFU | 206924 tok/s +step 15232/19560 | loss 3.276135 (-0.81z)| norm 0.2671 (-0.13z)| lr 7.47e-05 | 2534.39 ms | 53.3% bf16 MFU | 206921 tok/s +step 15233/19560 | loss 3.330480 (+0.23z)| norm 0.2675 (-0.13z)| lr 7.46e-05 | 2534.44 ms | 53.3% bf16 MFU | 206918 tok/s +step 15234/19560 | loss 3.313129 (-0.10z)| norm 0.2826 (-0.03z)| lr 7.46e-05 | 2533.15 ms | 53.3% bf16 MFU | 206921 tok/s +step 15235/19560 | loss 3.385993 (+1.27z)| norm 0.2827 (-0.03z)| lr 7.46e-05 | 2533.96 ms | 53.3% bf16 MFU | 206920 tok/s +step 15236/19560 | loss 3.250295 (-1.29z)| norm 0.2749 (-0.08z)| lr 7.45e-05 | 2532.74 ms | 53.3% bf16 MFU | 206924 tok/s +step 15237/19560 | loss 3.389619 (+1.33z)| norm 0.2905 (+0.03z)| lr 7.45e-05 | 2532.94 ms | 53.3% bf16 MFU | 206928 tok/s +step 15238/19560 | loss 3.320026 (+0.02z)| norm 0.2589 (-0.19z)| lr 7.45e-05 | 2532.02 ms | 53.3% bf16 MFU | 206934 tok/s +step 15239/19560 | loss 3.329471 (+0.19z)| norm 0.2651 (-0.14z)| lr 7.44e-05 | 2531.69 ms | 53.3% bf16 MFU | 206942 tok/s +step 15240/19560 | loss 3.275176 (-0.82z)| norm 0.2795 (-0.05z)| lr 7.44e-05 | 2534.34 ms | 53.3% bf16 MFU | 206939 tok/s +step 15241/19560 | loss 3.306455 (-0.23z)| norm 0.2809 (-0.04z)| lr 7.44e-05 | 2532.81 ms | 53.3% bf16 MFU | 206942 tok/s +step 15242/19560 | loss 3.339969 (+0.39z)| norm 0.2554 (-0.21z)| lr 7.43e-05 | 2531.73 ms | 53.3% bf16 MFU | 206949 tok/s +step 15243/19560 | loss 3.343077 (+0.45z)| norm 0.2962 (+0.07z)| lr 7.43e-05 | 2532.35 ms | 53.3% bf16 MFU | 206953 tok/s +step 15244/19560 | loss 3.367705 (+0.96z)| norm 0.2875 (+0.01z)| lr 7.43e-05 | 2535.50 ms | 53.3% bf16 MFU | 206945 tok/s +step 15245/19560 | loss 3.341485 (+0.44z)| norm 0.2631 (-0.16z)| lr 7.42e-05 | 2532.98 ms | 53.3% bf16 MFU | 206947 tok/s +step 15246/19560 | loss 3.246814 (-1.49z)| norm 0.2742 (-0.08z)| lr 7.42e-05 | 2532.67 ms | 53.3% bf16 MFU | 206950 tok/s +step 15247/19560 | loss 3.294995 (-0.47z)| norm 0.2715 (-0.09z)| lr 7.42e-05 | 2532.94 ms | 53.3% bf16 MFU | 206952 tok/s +step 15248/19560 | loss 3.364200 (+0.98z)| norm 0.2602 (-0.17z)| lr 7.41e-05 | 2533.57 ms | 53.3% bf16 MFU | 206951 tok/s +step 15249/19560 | loss 3.333946 (+0.34z)| norm 0.2657 (-0.13z)| lr 7.41e-05 | 2533.98 ms | 53.3% bf16 MFU | 206948 tok/s +step 15250/19560 | loss 3.348121 (+0.63z)| norm 0.2667 (-0.12z)| lr 7.41e-05 | 2533.86 ms | 53.3% bf16 MFU | 206947 tok/s +val loss 3.311177 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3025/10042 = 0.301235 +step 15251/19560 | loss 3.296416 (-0.46z)| norm 0.2749 (-0.06z)| lr 7.41e-05 | 2533.03 ms | 53.3% bf16 MFU | 206948 tok/s +step 15252/19560 | loss 3.303844 (-0.30z)| norm 0.2589 (-0.17z)| lr 7.40e-05 | 2533.30 ms | 53.3% bf16 MFU | 206949 tok/s +step 15253/19560 | loss 3.312408 (-0.12z)| norm 0.2709 (-0.09z)| lr 7.40e-05 | 2533.11 ms | 53.3% bf16 MFU | 206950 tok/s +step 15254/19560 | loss 3.398808 (+1.71z)| norm 0.2647 (-0.13z)| lr 7.40e-05 | 2533.77 ms | 53.3% bf16 MFU | 206949 tok/s +step 15255/19560 | loss 3.287684 (-0.65z)| norm 0.2513 (-0.22z)| lr 7.39e-05 | 2533.20 ms | 53.3% bf16 MFU | 206949 tok/s +step 15256/19560 | loss 3.246480 (-1.50z)| norm 0.2447 (-0.27z)| lr 7.39e-05 | 2533.33 ms | 53.3% bf16 MFU | 206950 tok/s +step 15257/19560 | loss 3.354247 (+0.76z)| norm 0.2570 (-0.18z)| lr 7.39e-05 | 2533.17 ms | 53.3% bf16 MFU | 206951 tok/s +step 15258/19560 | loss 3.283631 (-0.73z)| norm 0.2513 (-0.22z)| lr 7.38e-05 | 2535.21 ms | 53.3% bf16 MFU | 206943 tok/s +step 15259/19560 | loss 3.315326 (-0.08z)| norm 0.2600 (-0.16z)| lr 7.38e-05 | 2531.81 ms | 53.3% bf16 MFU | 206950 tok/s +step 15260/19560 | loss 3.299884 (-0.41z)| norm 0.2513 (-0.22z)| lr 7.38e-05 | 2533.95 ms | 53.3% bf16 MFU | 206948 tok/s +step 15261/19560 | loss 3.294122 (-0.55z)| norm 0.2666 (-0.12z)| lr 7.37e-05 | 2534.69 ms | 53.3% bf16 MFU | 206943 tok/s +step 15262/19560 | loss 3.335210 (+0.34z)| norm 0.2491 (-0.23z)| lr 7.37e-05 | 2535.00 ms | 53.3% bf16 MFU | 206937 tok/s +step 15263/19560 | loss 3.206069 (-2.38z)| norm 0.2757 (-0.05z)| lr 7.37e-05 | 2533.18 ms | 53.3% bf16 MFU | 206938 tok/s +step 15264/19560 | loss 3.392231 (+1.53z)| norm 0.2520 (-0.21z)| lr 7.36e-05 | 2531.99 ms | 53.3% bf16 MFU | 206945 tok/s +step 15265/19560 | loss 3.251501 (-1.41z)| norm 0.2480 (-0.24z)| lr 7.36e-05 | 2532.81 ms | 53.3% bf16 MFU | 206947 tok/s +step 15266/19560 | loss 3.363716 (+0.91z)| norm 0.2560 (-0.19z)| lr 7.36e-05 | 2532.65 ms | 53.3% bf16 MFU | 206950 tok/s +step 15267/19560 | loss 3.386275 (+1.36z)| norm 0.2467 (-0.25z)| lr 7.35e-05 | 2534.42 ms | 53.3% bf16 MFU | 206946 tok/s +step 15268/19560 | loss 3.369353 (+1.00z)| norm 0.2913 (+0.05z)| lr 7.35e-05 | 2531.99 ms | 53.3% bf16 MFU | 206952 tok/s +step 15269/19560 | loss 3.300621 (-0.41z)| norm 0.2878 (+0.99z)| lr 7.35e-05 | 2532.53 ms | 53.3% bf16 MFU | 206956 tok/s +step 15270/19560 | loss 3.258420 (-1.29z)| norm 0.2674 (-0.17z)| lr 7.34e-05 | 2535.64 ms | 53.2% bf16 MFU | 206946 tok/s +step 15271/19560 | loss 3.358068 (+0.82z)| norm 0.2618 (-0.51z)| lr 7.34e-05 | 2533.59 ms | 53.3% bf16 MFU | 206946 tok/s +step 15272/19560 | loss 3.315325 (-0.09z)| norm 0.2607 (-0.57z)| lr 7.34e-05 | 2534.97 ms | 53.3% bf16 MFU | 206940 tok/s +step 15273/19560 | loss 3.320228 (-0.00z)| norm 0.2776 (+0.58z)| lr 7.33e-05 | 2533.97 ms | 53.3% bf16 MFU | 206938 tok/s +step 15274/19560 | loss 3.287434 (-0.74z)| norm 0.2679 (-0.07z)| lr 7.33e-05 | 2531.95 ms | 53.3% bf16 MFU | 206944 tok/s +step 15275/19560 | loss 3.297523 (-0.52z)| norm 0.2571 (-0.81z)| lr 7.33e-05 | 2533.21 ms | 53.3% bf16 MFU | 206945 tok/s +step 15276/19560 | loss 3.327435 (+0.13z)| norm 0.2808 (+0.85z)| lr 7.32e-05 | 2532.51 ms | 53.3% bf16 MFU | 206949 tok/s +step 15277/19560 | loss 3.345246 (+0.51z)| norm 0.2673 (-0.09z)| lr 7.32e-05 | 2534.77 ms | 53.3% bf16 MFU | 206944 tok/s +step 15278/19560 | loss 3.337063 (+0.32z)| norm 0.2887 (+1.40z)| lr 7.32e-05 | 2534.13 ms | 53.3% bf16 MFU | 206941 tok/s +step 15279/19560 | loss 3.253029 (-1.52z)| norm 0.2900 (+1.51z)| lr 7.31e-05 | 2532.30 ms | 53.3% bf16 MFU | 206946 tok/s +step 15280/19560 | loss 3.350791 (+0.64z)| norm 0.2693 (+0.06z)| lr 7.31e-05 | 2534.59 ms | 53.3% bf16 MFU | 206941 tok/s +step 15281/19560 | loss 3.296443 (-0.57z)| norm 0.2774 (+0.66z)| lr 7.31e-05 | 2534.03 ms | 53.3% bf16 MFU | 206939 tok/s +step 15282/19560 | loss 3.317600 (-0.11z)| norm 0.2606 (-0.54z)| lr 7.30e-05 | 2534.64 ms | 53.3% bf16 MFU | 206935 tok/s +step 15283/19560 | loss 3.355008 (+0.72z)| norm 0.2858 (+1.28z)| lr 7.30e-05 | 2533.60 ms | 53.3% bf16 MFU | 206935 tok/s +step 15284/19560 | loss 3.310452 (-0.27z)| norm 0.2602 (-0.57z)| lr 7.30e-05 | 2533.49 ms | 53.3% bf16 MFU | 206935 tok/s +step 15285/19560 | loss 3.238681 (-1.84z)| norm 0.2685 (+0.04z)| lr 7.29e-05 | 2534.18 ms | 53.3% bf16 MFU | 206933 tok/s +step 15286/19560 | loss 3.250129 (-1.58z)| norm 0.2763 (+0.60z)| lr 7.29e-05 | 2535.66 ms | 53.2% bf16 MFU | 206924 tok/s +step 15287/19560 | loss 3.276397 (-0.99z)| norm 0.2547 (-0.95z)| lr 7.29e-05 | 2533.32 ms | 53.3% bf16 MFU | 206926 tok/s +step 15288/19560 | loss 3.301674 (-0.44z)| norm 0.2792 (+0.81z)| lr 7.28e-05 | 2534.38 ms | 53.3% bf16 MFU | 206923 tok/s +step 15289/19560 | loss 3.379304 (+1.26z)| norm 0.2660 (-0.14z)| lr 7.28e-05 | 2535.37 ms | 53.3% bf16 MFU | 206917 tok/s +step 15290/19560 | loss 3.292731 (-0.66z)| norm 0.2599 (-0.59z)| lr 7.28e-05 | 2532.86 ms | 53.3% bf16 MFU | 206920 tok/s +step 15291/19560 | loss 3.380157 (+1.27z)| norm 0.2995 (+2.22z)| lr 7.27e-05 | 2532.90 ms | 53.3% bf16 MFU | 206924 tok/s +step 15292/19560 | loss 3.472477 (+3.17z)| norm 0.2994 (+2.15z)| lr 7.27e-05 | 2532.48 ms | 53.3% bf16 MFU | 206929 tok/s +step 15293/19560 | loss 3.327644 (+0.08z)| norm 0.2703 (+0.10z)| lr 7.27e-05 | 2533.95 ms | 53.3% bf16 MFU | 206928 tok/s +step 15294/19560 | loss 3.283966 (-0.84z)| norm 0.2809 (+0.84z)| lr 7.26e-05 | 2534.87 ms | 53.3% bf16 MFU | 206923 tok/s +step 15295/19560 | loss 3.240224 (-1.73z)| norm 0.2678 (-0.09z)| lr 7.26e-05 | 2535.77 ms | 53.2% bf16 MFU | 206915 tok/s +step 15296/19560 | loss 3.321148 (-0.03z)| norm 0.2718 (+0.18z)| lr 7.26e-05 | 2534.98 ms | 53.3% bf16 MFU | 206910 tok/s +step 15297/19560 | loss 3.346469 (+0.50z)| norm 0.2807 (+0.80z)| lr 7.25e-05 | 2534.84 ms | 53.3% bf16 MFU | 206906 tok/s +step 15298/19560 | loss 3.225983 (-2.00z)| norm 0.2680 (-0.11z)| lr 7.25e-05 | 2534.02 ms | 53.3% bf16 MFU | 206906 tok/s +step 15299/19560 | loss 3.392322 (+1.47z)| norm 0.2717 (+0.15z)| lr 7.25e-05 | 2534.41 ms | 53.3% bf16 MFU | 206904 tok/s +step 15300/19560 | loss 3.358145 (+0.75z)| norm 0.2740 (+0.31z)| lr 7.24e-05 | 2534.75 ms | 53.3% bf16 MFU | 206901 tok/s +step 15301/19560 | loss 3.314246 (-0.16z)| norm 0.2698 (-0.00z)| lr 7.24e-05 | 2533.33 ms | 53.3% bf16 MFU | 206903 tok/s +step 15302/19560 | loss 3.255466 (-1.39z)| norm 0.2588 (-0.78z)| lr 7.24e-05 | 2533.55 ms | 53.3% bf16 MFU | 206905 tok/s +step 15303/19560 | loss 3.329826 (+0.17z)| norm 0.2752 (+0.42z)| lr 7.23e-05 | 2535.52 ms | 53.3% bf16 MFU | 206899 tok/s +step 15304/19560 | loss 3.386637 (+1.34z)| norm 0.2790 (+0.69z)| lr 7.23e-05 | 2533.65 ms | 53.3% bf16 MFU | 206900 tok/s +step 15305/19560 | loss 3.412309 (+1.85z)| norm 0.3054 (+2.56z)| lr 7.23e-05 | 2533.88 ms | 53.3% bf16 MFU | 206901 tok/s +step 15306/19560 | loss 3.323765 (+0.03z)| norm 0.2987 (+2.03z)| lr 7.23e-05 | 2532.94 ms | 53.3% bf16 MFU | 206905 tok/s +step 15307/19560 | loss 3.262489 (-1.22z)| norm 0.2932 (+1.61z)| lr 7.22e-05 | 2533.82 ms | 53.3% bf16 MFU | 206906 tok/s +step 15308/19560 | loss 3.325653 (+0.07z)| norm 0.2673 (-0.22z)| lr 7.22e-05 | 2532.98 ms | 53.3% bf16 MFU | 206910 tok/s +step 15309/19560 | loss 3.250503 (-1.45z)| norm 0.2931 (+1.57z)| lr 7.22e-05 | 2532.86 ms | 53.3% bf16 MFU | 206914 tok/s +step 15310/19560 | loss 3.285694 (-0.73z)| norm 0.2772 (+0.45z)| lr 7.21e-05 | 2533.36 ms | 53.3% bf16 MFU | 206916 tok/s +step 15311/19560 | loss 3.326169 (+0.09z)| norm 0.2788 (+0.56z)| lr 7.21e-05 | 2533.34 ms | 53.3% bf16 MFU | 206918 tok/s +step 15312/19560 | loss 3.283461 (-0.77z)| norm 0.2906 (+1.38z)| lr 7.21e-05 | 2533.23 ms | 53.3% bf16 MFU | 206920 tok/s +step 15313/19560 | loss 3.317747 (-0.06z)| norm 0.2670 (-0.27z)| lr 7.20e-05 | 2533.56 ms | 53.3% bf16 MFU | 206921 tok/s +step 15314/19560 | loss 3.304456 (-0.33z)| norm 0.2646 (-0.44z)| lr 7.20e-05 | 2534.84 ms | 53.3% bf16 MFU | 206917 tok/s +step 15315/19560 | loss 3.342344 (+0.44z)| norm 0.2757 (+0.33z)| lr 7.20e-05 | 2535.30 ms | 53.3% bf16 MFU | 206911 tok/s +step 15316/19560 | loss 3.424381 (+2.07z)| norm 0.2899 (+1.32z)| lr 7.19e-05 | 2532.62 ms | 53.3% bf16 MFU | 206916 tok/s +step 15317/19560 | loss 3.290457 (-0.63z)| norm 0.2546 (-1.17z)| lr 7.19e-05 | 2532.33 ms | 53.3% bf16 MFU | 206922 tok/s +step 15318/19560 | loss 3.338608 (+0.35z)| norm 0.2697 (-0.10z)| lr 7.19e-05 | 2534.58 ms | 53.3% bf16 MFU | 206918 tok/s +step 15319/19560 | loss 3.272361 (-0.98z)| norm 0.2714 (+0.02z)| lr 7.18e-05 | 2534.21 ms | 53.3% bf16 MFU | 206917 tok/s +step 15320/19560 | loss 3.257707 (-1.28z)| norm 0.2548 (-1.15z)| lr 7.18e-05 | 2531.95 ms | 53.3% bf16 MFU | 206924 tok/s +step 15321/19560 | loss 3.341746 (+0.46z)| norm 0.2659 (-0.37z)| lr 7.18e-05 | 2532.64 ms | 53.3% bf16 MFU | 206929 tok/s +step 15322/19560 | loss 3.315762 (-0.08z)| norm 0.2466 (-1.72z)| lr 7.17e-05 | 2532.83 ms | 53.3% bf16 MFU | 206932 tok/s +step 15323/19560 | loss 3.253111 (-1.37z)| norm 0.2521 (-1.32z)| lr 7.17e-05 | 2532.64 ms | 53.3% bf16 MFU | 206936 tok/s +step 15324/19560 | loss 3.278270 (-0.85z)| norm 0.2581 (-0.89z)| lr 7.17e-05 | 2533.28 ms | 53.3% bf16 MFU | 206937 tok/s +step 15325/19560 | loss 3.293133 (-0.54z)| norm 0.2459 (-1.71z)| lr 7.16e-05 | 2535.69 ms | 53.2% bf16 MFU | 206929 tok/s +step 15326/19560 | loss 3.302975 (-0.33z)| norm 0.2893 (+1.28z)| lr 7.16e-05 | 2531.31 ms | 53.3% bf16 MFU | 206938 tok/s +step 15327/19560 | loss 3.324951 (+0.14z)| norm 0.2630 (-0.53z)| lr 7.16e-05 | 2533.42 ms | 53.3% bf16 MFU | 206939 tok/s +step 15328/19560 | loss 3.339831 (+0.44z)| norm 0.2867 (+1.09z)| lr 7.15e-05 | 2532.77 ms | 53.3% bf16 MFU | 206942 tok/s +step 15329/19560 | loss 3.312984 (-0.11z)| norm 0.2685 (-0.16z)| lr 7.15e-05 | 2534.17 ms | 53.3% bf16 MFU | 206939 tok/s +step 15330/19560 | loss 3.283859 (-0.71z)| norm 0.2551 (-1.07z)| lr 7.15e-05 | 2533.69 ms | 53.3% bf16 MFU | 206939 tok/s +step 15331/19560 | loss 3.275817 (-0.89z)| norm 0.2683 (-0.16z)| lr 7.14e-05 | 2534.95 ms | 53.3% bf16 MFU | 206933 tok/s +step 15332/19560 | loss 3.429469 (+2.29z)| norm 0.2863 (+1.05z)| lr 7.14e-05 | 2533.89 ms | 53.3% bf16 MFU | 206932 tok/s +step 15333/19560 | loss 3.307075 (-0.24z)| norm 0.2731 (+0.14z)| lr 7.14e-05 | 2534.72 ms | 53.3% bf16 MFU | 206927 tok/s +step 15334/19560 | loss 3.392172 (+1.58z)| norm 0.2620 (-0.63z)| lr 7.13e-05 | 2533.23 ms | 53.3% bf16 MFU | 206929 tok/s +step 15335/19560 | loss 3.318286 (+0.01z)| norm 0.2584 (-0.88z)| lr 7.13e-05 | 2533.11 ms | 53.3% bf16 MFU | 206931 tok/s +step 15336/19560 | loss 3.330843 (+0.26z)| norm 0.2776 (+0.52z)| lr 7.13e-05 | 2533.62 ms | 53.3% bf16 MFU | 206931 tok/s +step 15337/19560 | loss 3.292927 (-0.54z)| norm 0.2597 (-0.79z)| lr 7.12e-05 | 2532.80 ms | 53.3% bf16 MFU | 206935 tok/s +step 15338/19560 | loss 3.415911 (+2.06z)| norm 0.3418 (+4.69z)| lr 7.12e-05 | 2532.89 ms | 53.3% bf16 MFU | 206938 tok/s +step 15339/19560 | loss 3.298261 (-0.43z)| norm 0.2732 (+0.14z)| lr 7.12e-05 | 2534.40 ms | 53.3% bf16 MFU | 206934 tok/s +step 15340/19560 | loss 3.334733 (+0.34z)| norm 0.2836 (+0.82z)| lr 7.11e-05 | 2532.77 ms | 53.3% bf16 MFU | 206938 tok/s +step 15341/19560 | loss 3.298765 (-0.42z)| norm 0.2712 (-0.02z)| lr 7.11e-05 | 2532.68 ms | 53.3% bf16 MFU | 206941 tok/s +step 15342/19560 | loss 3.273763 (-0.93z)| norm 0.2617 (-0.65z)| lr 7.11e-05 | 2532.65 ms | 53.3% bf16 MFU | 206945 tok/s +step 15343/19560 | loss 3.351034 (+0.70z)| norm 0.2710 (-0.03z)| lr 7.11e-05 | 2532.80 ms | 53.3% bf16 MFU | 206947 tok/s +step 15344/19560 | loss 3.276661 (-0.87z)| norm 0.2628 (-0.57z)| lr 7.10e-05 | 2533.07 ms | 53.3% bf16 MFU | 206949 tok/s +step 15345/19560 | loss 3.362610 (+0.94z)| norm 0.2780 (+0.45z)| lr 7.10e-05 | 2533.52 ms | 53.3% bf16 MFU | 206948 tok/s +step 15346/19560 | loss 3.390704 (+1.51z)| norm 0.2716 (+0.00z)| lr 7.10e-05 | 2534.96 ms | 53.3% bf16 MFU | 206942 tok/s +step 15347/19560 | loss 3.353354 (+0.71z)| norm 0.2634 (-0.54z)| lr 7.09e-05 | 2531.72 ms | 53.3% bf16 MFU | 206949 tok/s +step 15348/19560 | loss 3.325564 (+0.14z)| norm 0.2815 (+0.68z)| lr 7.09e-05 | 2532.71 ms | 53.3% bf16 MFU | 206952 tok/s +step 15349/19560 | loss 3.362704 (+0.93z)| norm 0.2674 (-0.26z)| lr 7.09e-05 | 2533.57 ms | 53.3% bf16 MFU | 206951 tok/s +step 15350/19560 | loss 3.354927 (+0.75z)| norm 0.2664 (-0.32z)| lr 7.08e-05 | 2532.57 ms | 53.3% bf16 MFU | 206955 tok/s +step 15351/19560 | loss 3.352114 (+0.71z)| norm 0.2661 (-0.34z)| lr 7.08e-05 | 2534.27 ms | 53.3% bf16 MFU | 206951 tok/s +step 15352/19560 | loss 3.425618 (+2.23z)| norm 0.2989 (+1.90z)| lr 7.08e-05 | 2534.07 ms | 53.3% bf16 MFU | 206948 tok/s +step 15353/19560 | loss 3.249613 (-1.46z)| norm 0.2987 (+1.84z)| lr 7.07e-05 | 2532.40 ms | 53.3% bf16 MFU | 206952 tok/s +step 15354/19560 | loss 3.295721 (-0.49z)| norm 0.2727 (+0.10z)| lr 7.07e-05 | 2531.94 ms | 53.3% bf16 MFU | 206958 tok/s +step 15355/19560 | loss 3.303487 (-0.33z)| norm 0.2666 (-0.32z)| lr 7.07e-05 | 2533.50 ms | 53.3% bf16 MFU | 206958 tok/s +step 15356/19560 | loss 3.312663 (-0.14z)| norm 0.2827 (+0.77z)| lr 7.06e-05 | 2532.81 ms | 53.3% bf16 MFU | 206960 tok/s +step 15357/19560 | loss 3.271865 (-1.00z)| norm 0.2619 (-0.65z)| lr 7.06e-05 | 2533.71 ms | 53.3% bf16 MFU | 206958 tok/s +step 15358/19560 | loss 3.333929 (+0.30z)| norm 0.2896 (+1.23z)| lr 7.06e-05 | 2534.26 ms | 53.3% bf16 MFU | 206954 tok/s +step 15359/19560 | loss 3.346140 (+0.55z)| norm 0.2646 (-0.47z)| lr 7.05e-05 | 2533.53 ms | 53.3% bf16 MFU | 206953 tok/s +step 15360/19560 | loss 3.286800 (-0.70z)| norm 0.2700 (-0.11z)| lr 7.05e-05 | 2534.60 ms | 53.3% bf16 MFU | 206948 tok/s +step 15361/19560 | loss 3.256153 (-1.33z)| norm 0.2692 (-0.16z)| lr 7.05e-05 | 2533.80 ms | 53.3% bf16 MFU | 206947 tok/s +step 15362/19560 | loss 3.421051 (+2.07z)| norm 0.2920 (+1.38z)| lr 7.04e-05 | 2534.97 ms | 53.3% bf16 MFU | 206940 tok/s +step 15363/19560 | loss 3.363580 (+0.90z)| norm 0.2689 (-0.18z)| lr 7.04e-05 | 2534.00 ms | 53.3% bf16 MFU | 206938 tok/s +step 15364/19560 | loss 3.339401 (+0.39z)| norm 0.2709 (-0.04z)| lr 7.04e-05 | 2533.71 ms | 53.3% bf16 MFU | 206938 tok/s +step 15365/19560 | loss 3.278880 (-0.86z)| norm 0.2760 (+0.32z)| lr 7.03e-05 | 2537.92 ms | 53.2% bf16 MFU | 206920 tok/s +step 15366/19560 | loss 3.288800 (-0.65z)| norm 0.2579 (-0.92z)| lr 7.03e-05 | 2534.75 ms | 53.3% bf16 MFU | 206916 tok/s +step 15367/19560 | loss 3.324719 (+0.10z)| norm 0.2826 (+0.76z)| lr 7.03e-05 | 2535.04 ms | 53.3% bf16 MFU | 206911 tok/s +step 15368/19560 | loss 3.288271 (-0.66z)| norm 0.2859 (+0.97z)| lr 7.02e-05 | 2534.06 ms | 53.3% bf16 MFU | 206910 tok/s +step 15369/19560 | loss 3.310586 (-0.19z)| norm 0.2512 (-1.37z)| lr 7.02e-05 | 2531.22 ms | 53.3% bf16 MFU | 206921 tok/s +step 15370/19560 | loss 3.290912 (-0.60z)| norm 0.2640 (-0.50z)| lr 7.02e-05 | 2534.41 ms | 53.3% bf16 MFU | 206918 tok/s +step 15371/19560 | loss 3.266720 (-1.09z)| norm 0.2641 (-0.49z)| lr 7.02e-05 | 2533.34 ms | 53.3% bf16 MFU | 206920 tok/s +step 15372/19560 | loss 3.300108 (-0.38z)| norm 0.2755 (+0.31z)| lr 7.01e-05 | 2533.09 ms | 53.3% bf16 MFU | 206923 tok/s +step 15373/19560 | loss 3.328235 (+0.21z)| norm 0.2819 (+0.74z)| lr 7.01e-05 | 2532.05 ms | 53.3% bf16 MFU | 206930 tok/s +step 15374/19560 | loss 3.334393 (+0.33z)| norm 0.2580 (-0.90z)| lr 7.01e-05 | 2535.20 ms | 53.3% bf16 MFU | 206924 tok/s +step 15375/19560 | loss 3.283941 (-0.74z)| norm 0.2837 (+0.86z)| lr 7.00e-05 | 2534.79 ms | 53.3% bf16 MFU | 206919 tok/s +step 15376/19560 | loss 3.282392 (-0.76z)| norm 0.2727 (+0.10z)| lr 7.00e-05 | 2532.26 ms | 53.3% bf16 MFU | 206926 tok/s +step 15377/19560 | loss 3.360619 (+0.89z)| norm 0.2540 (-1.18z)| lr 7.00e-05 | 2532.66 ms | 53.3% bf16 MFU | 206930 tok/s +step 15378/19560 | loss 3.347591 (+0.62z)| norm 0.2623 (-0.60z)| lr 6.99e-05 | 2531.59 ms | 53.3% bf16 MFU | 206938 tok/s +step 15379/19560 | loss 3.347196 (+0.60z)| norm 0.2949 (+1.59z)| lr 6.99e-05 | 2533.34 ms | 53.3% bf16 MFU | 206939 tok/s +step 15380/19560 | loss 3.302817 (-0.34z)| norm 0.2589 (-0.84z)| lr 6.99e-05 | 2530.26 ms | 53.4% bf16 MFU | 206952 tok/s +step 15381/19560 | loss 3.318004 (-0.02z)| norm 0.2646 (-0.45z)| lr 6.98e-05 | 2532.73 ms | 53.3% bf16 MFU | 206955 tok/s +step 15382/19560 | loss 3.292372 (-0.55z)| norm 0.2711 (-0.02z)| lr 6.98e-05 | 2533.58 ms | 53.3% bf16 MFU | 206954 tok/s +step 15383/19560 | loss 3.237457 (-1.69z)| norm 0.2613 (-0.69z)| lr 6.98e-05 | 2533.77 ms | 53.3% bf16 MFU | 206952 tok/s +step 15384/19560 | loss 3.220159 (-2.03z)| norm 0.2569 (-1.00z)| lr 6.97e-05 | 2531.09 ms | 53.3% bf16 MFU | 206962 tok/s +step 15385/19560 | loss 3.388131 (+1.46z)| norm 0.2653 (-0.43z)| lr 6.97e-05 | 2530.52 ms | 53.4% bf16 MFU | 206973 tok/s +step 15386/19560 | loss 3.301779 (-0.33z)| norm 0.2647 (-0.48z)| lr 6.97e-05 | 2532.23 ms | 53.3% bf16 MFU | 206977 tok/s +step 15387/19560 | loss 3.291897 (-0.54z)| norm 0.2438 (-1.90z)| lr 6.96e-05 | 2532.88 ms | 53.3% bf16 MFU | 206977 tok/s +step 15388/19560 | loss 3.288235 (-0.61z)| norm 0.2540 (-1.20z)| lr 6.96e-05 | 2533.36 ms | 53.3% bf16 MFU | 206976 tok/s +step 15389/19560 | loss 3.321669 (+0.08z)| norm 0.2592 (-0.84z)| lr 6.96e-05 | 2534.49 ms | 53.3% bf16 MFU | 206970 tok/s +step 15390/19560 | loss 3.234555 (-1.70z)| norm 0.2743 (+0.18z)| lr 6.95e-05 | 2532.03 ms | 53.3% bf16 MFU | 206975 tok/s +step 15391/19560 | loss 3.294128 (-0.49z)| norm 0.2665 (-0.35z)| lr 6.95e-05 | 2532.37 ms | 53.3% bf16 MFU | 206978 tok/s +step 15392/19560 | loss 3.302778 (-0.30z)| norm 0.2538 (-1.23z)| lr 6.95e-05 | 2531.60 ms | 53.3% bf16 MFU | 206984 tok/s +step 15393/19560 | loss 3.324141 (+0.14z)| norm 0.2644 (-0.51z)| lr 6.94e-05 | 2533.04 ms | 53.3% bf16 MFU | 206984 tok/s +step 15394/19560 | loss 3.278400 (-0.82z)| norm 0.2669 (-0.35z)| lr 6.94e-05 | 2532.76 ms | 53.3% bf16 MFU | 206985 tok/s +step 15395/19560 | loss 3.342776 (+0.57z)| norm 0.2529 (-1.34z)| lr 6.94e-05 | 2534.12 ms | 53.3% bf16 MFU | 206980 tok/s +step 15396/19560 | loss 3.303254 (-0.28z)| norm 0.2593 (-0.87z)| lr 6.94e-05 | 2533.46 ms | 53.3% bf16 MFU | 206978 tok/s +step 15397/19560 | loss 3.293776 (-0.48z)| norm 0.2552 (-1.15z)| lr 6.93e-05 | 2533.66 ms | 53.3% bf16 MFU | 206976 tok/s +step 15398/19560 | loss 3.290358 (-0.56z)| norm 0.2650 (-0.45z)| lr 6.93e-05 | 2535.60 ms | 53.2% bf16 MFU | 206966 tok/s +step 15399/19560 | loss 3.301105 (-0.32z)| norm 0.2527 (-1.31z)| lr 6.93e-05 | 2534.79 ms | 53.3% bf16 MFU | 206959 tok/s +step 15400/19560 | loss 3.237041 (-1.68z)| norm 0.2565 (-1.03z)| lr 6.92e-05 | 2533.36 ms | 53.3% bf16 MFU | 206959 tok/s +step 15401/19560 | loss 3.303699 (-0.24z)| norm 0.2718 (+0.04z)| lr 6.92e-05 | 2533.29 ms | 53.3% bf16 MFU | 206959 tok/s +step 15402/19560 | loss 3.301908 (-0.29z)| norm 0.2625 (-0.61z)| lr 6.92e-05 | 2534.74 ms | 53.3% bf16 MFU | 206953 tok/s +step 15403/19560 | loss 3.298098 (-0.37z)| norm 0.2633 (-0.56z)| lr 6.91e-05 | 2534.78 ms | 53.3% bf16 MFU | 206947 tok/s +step 15404/19560 | loss 3.333693 (+0.40z)| norm 0.2668 (-0.30z)| lr 6.91e-05 | 2533.02 ms | 53.3% bf16 MFU | 206949 tok/s +step 15405/19560 | loss 3.242334 (-1.54z)| norm 0.2826 (+0.80z)| lr 6.91e-05 | 2533.40 ms | 53.3% bf16 MFU | 206949 tok/s +step 15406/19560 | loss 3.325033 (+0.23z)| norm 0.2536 (-1.22z)| lr 6.90e-05 | 2532.98 ms | 53.3% bf16 MFU | 206951 tok/s +step 15407/19560 | loss 3.243841 (-1.50z)| norm 0.2770 (+0.44z)| lr 6.90e-05 | 2532.99 ms | 53.3% bf16 MFU | 206952 tok/s +step 15408/19560 | loss 3.279624 (-0.73z)| norm 0.2603 (-0.74z)| lr 6.90e-05 | 2533.18 ms | 53.3% bf16 MFU | 206953 tok/s +step 15409/19560 | loss 3.325874 (+0.26z)| norm 0.2584 (-0.87z)| lr 6.89e-05 | 2533.60 ms | 53.3% bf16 MFU | 206952 tok/s +step 15410/19560 | loss 3.324006 (+0.22z)| norm 0.2602 (-0.74z)| lr 6.89e-05 | 2533.86 ms | 53.3% bf16 MFU | 206950 tok/s +step 15411/19560 | loss 3.299727 (-0.30z)| norm 0.2604 (-0.71z)| lr 6.89e-05 | 2533.25 ms | 53.3% bf16 MFU | 206951 tok/s +step 15412/19560 | loss 3.312831 (-0.01z)| norm 0.2952 (+1.71z)| lr 6.88e-05 | 2535.59 ms | 53.2% bf16 MFU | 206942 tok/s +step 15413/19560 | loss 3.287531 (-0.57z)| norm 0.2841 (+0.93z)| lr 6.88e-05 | 2534.86 ms | 53.3% bf16 MFU | 206936 tok/s +step 15414/19560 | loss 3.332593 (+0.39z)| norm 0.2634 (-0.51z)| lr 6.88e-05 | 2534.97 ms | 53.3% bf16 MFU | 206931 tok/s +step 15415/19560 | loss 3.282655 (-0.70z)| norm 0.2840 (+0.91z)| lr 6.87e-05 | 2533.54 ms | 53.3% bf16 MFU | 206931 tok/s +step 15416/19560 | loss 3.317430 (+0.06z)| norm 0.2607 (-0.71z)| lr 6.87e-05 | 2535.32 ms | 53.3% bf16 MFU | 206924 tok/s +step 15417/19560 | loss 3.271192 (-0.93z)| norm 0.2518 (-1.31z)| lr 6.87e-05 | 2534.87 ms | 53.3% bf16 MFU | 206920 tok/s +step 15418/19560 | loss 3.273532 (-0.88z)| norm 0.2771 (+0.43z)| lr 6.86e-05 | 2534.46 ms | 53.3% bf16 MFU | 206917 tok/s +step 15419/19560 | loss 3.314443 (+0.03z)| norm 0.2622 (-0.59z)| lr 6.86e-05 | 2531.52 ms | 53.3% bf16 MFU | 206926 tok/s +step 15420/19560 | loss 3.299989 (-0.28z)| norm 0.2555 (-1.05z)| lr 6.86e-05 | 2532.00 ms | 53.3% bf16 MFU | 206933 tok/s +step 15421/19560 | loss 3.243436 (-1.56z)| norm 0.2727 (+0.18z)| lr 6.86e-05 | 2531.52 ms | 53.3% bf16 MFU | 206942 tok/s +step 15422/19560 | loss 3.294305 (-0.39z)| norm 0.2583 (-0.84z)| lr 6.85e-05 | 2531.26 ms | 53.3% bf16 MFU | 206951 tok/s +step 15423/19560 | loss 3.246460 (-1.49z)| norm 0.2739 (+0.27z)| lr 6.85e-05 | 2534.04 ms | 53.3% bf16 MFU | 206948 tok/s +step 15424/19560 | loss 3.337057 (+0.59z)| norm 0.2521 (-1.27z)| lr 6.85e-05 | 2532.36 ms | 53.3% bf16 MFU | 206952 tok/s +step 15425/19560 | loss 3.292311 (-0.43z)| norm 0.2506 (-1.35z)| lr 6.84e-05 | 2532.73 ms | 53.3% bf16 MFU | 206955 tok/s +step 15426/19560 | loss 3.318449 (+0.16z)| norm 0.2711 (+0.10z)| lr 6.84e-05 | 2531.73 ms | 53.3% bf16 MFU | 206962 tok/s +step 15427/19560 | loss 3.343041 (+0.75z)| norm 0.2473 (-1.56z)| lr 6.84e-05 | 2531.14 ms | 53.3% bf16 MFU | 206970 tok/s +step 15428/19560 | loss 3.284468 (-0.63z)| norm 0.2617 (-0.54z)| lr 6.83e-05 | 2533.29 ms | 53.3% bf16 MFU | 206970 tok/s +step 15429/19560 | loss 3.339510 (+0.68z)| norm 0.2880 (+1.28z)| lr 6.83e-05 | 2532.45 ms | 53.3% bf16 MFU | 206973 tok/s +step 15430/19560 | loss 3.240541 (-1.66z)| norm 0.2720 (+0.16z)| lr 6.83e-05 | 2533.40 ms | 53.3% bf16 MFU | 206972 tok/s +step 15431/19560 | loss 3.303446 (-0.17z)| norm 0.2592 (-0.72z)| lr 6.82e-05 | 2533.02 ms | 53.3% bf16 MFU | 206972 tok/s +step 15432/19560 | loss 3.278373 (-0.75z)| norm 0.2590 (-0.72z)| lr 6.82e-05 | 2531.21 ms | 53.3% bf16 MFU | 206980 tok/s +step 15433/19560 | loss 3.356470 (+1.15z)| norm 0.2731 (+0.28z)| lr 6.82e-05 | 2533.46 ms | 53.3% bf16 MFU | 206978 tok/s +step 15434/19560 | loss 3.259902 (-1.19z)| norm 0.2550 (-0.99z)| lr 6.81e-05 | 2530.34 ms | 53.4% bf16 MFU | 206989 tok/s +step 15435/19560 | loss 3.289912 (-0.47z)| norm 0.2532 (-1.11z)| lr 6.81e-05 | 2531.76 ms | 53.3% bf16 MFU | 206994 tok/s +step 15436/19560 | loss 3.340732 (+0.76z)| norm 0.2701 (+0.12z)| lr 6.81e-05 | 2533.33 ms | 53.3% bf16 MFU | 206992 tok/s +step 15437/19560 | loss 3.262180 (-1.15z)| norm 0.2514 (-1.23z)| lr 6.80e-05 | 2532.99 ms | 53.3% bf16 MFU | 206992 tok/s +step 15438/19560 | loss 3.327354 (+0.43z)| norm 0.2601 (-0.58z)| lr 6.80e-05 | 2531.27 ms | 53.3% bf16 MFU | 206998 tok/s +step 15439/19560 | loss 3.376804 (+1.61z)| norm 0.2988 (+2.20z)| lr 6.80e-05 | 2531.76 ms | 53.3% bf16 MFU | 207003 tok/s +step 15440/19560 | loss 3.316603 (+0.15z)| norm 0.2824 (+1.03z)| lr 6.80e-05 | 2532.65 ms | 53.3% bf16 MFU | 207003 tok/s +step 15441/19560 | loss 3.320682 (+0.25z)| norm 0.2540 (-1.01z)| lr 6.79e-05 | 2533.77 ms | 53.3% bf16 MFU | 206999 tok/s +step 15442/19560 | loss 3.311115 (+0.02z)| norm 0.2774 (+0.66z)| lr 6.79e-05 | 2531.71 ms | 53.3% bf16 MFU | 207003 tok/s +step 15443/19560 | loss 3.360972 (+1.21z)| norm 0.2610 (-0.50z)| lr 6.79e-05 | 2532.98 ms | 53.3% bf16 MFU | 207002 tok/s +step 15444/19560 | loss 3.236196 (-1.79z)| norm 0.2631 (-0.34z)| lr 6.78e-05 | 2532.31 ms | 53.3% bf16 MFU | 207004 tok/s +step 15445/19560 | loss 3.319442 (+0.25z)| norm 0.2603 (-0.55z)| lr 6.78e-05 | 2533.09 ms | 53.3% bf16 MFU | 207003 tok/s +step 15446/19560 | loss 3.313794 (+0.12z)| norm 0.2555 (-0.89z)| lr 6.78e-05 | 2531.86 ms | 53.3% bf16 MFU | 207007 tok/s +step 15447/19560 | loss 3.346015 (+0.89z)| norm 0.2564 (-0.82z)| lr 6.77e-05 | 2534.71 ms | 53.3% bf16 MFU | 206998 tok/s +step 15448/19560 | loss 3.274974 (-0.86z)| norm 0.2498 (-1.28z)| lr 6.77e-05 | 2533.98 ms | 53.3% bf16 MFU | 206994 tok/s +step 15449/19560 | loss 3.311252 (+0.04z)| norm 0.2542 (-0.96z)| lr 6.77e-05 | 2533.97 ms | 53.3% bf16 MFU | 206989 tok/s +step 15450/19560 | loss 3.262877 (-1.14z)| norm 0.2643 (-0.24z)| lr 6.76e-05 | 2533.85 ms | 53.3% bf16 MFU | 206985 tok/s +step 15451/19560 | loss 3.289471 (-0.49z)| norm 0.2624 (-0.39z)| lr 6.76e-05 | 2532.61 ms | 53.3% bf16 MFU | 206987 tok/s +step 15452/19560 | loss 3.407928 (+2.37z)| norm 0.2403 (-1.96z)| lr 6.76e-05 | 2533.92 ms | 53.3% bf16 MFU | 206983 tok/s +step 15453/19560 | loss 3.322922 (+0.30z)| norm 0.2501 (-1.26z)| lr 6.75e-05 | 2532.76 ms | 53.3% bf16 MFU | 206984 tok/s +step 15454/19560 | loss 3.280671 (-0.72z)| norm 0.2515 (-1.14z)| lr 6.75e-05 | 2534.22 ms | 53.3% bf16 MFU | 206979 tok/s +step 15455/19560 | loss 3.262636 (-1.14z)| norm 0.2478 (-1.40z)| lr 6.75e-05 | 2532.67 ms | 53.3% bf16 MFU | 206980 tok/s +step 15456/19560 | loss 3.252700 (-1.36z)| norm 0.2427 (-1.73z)| lr 6.74e-05 | 2531.87 ms | 53.3% bf16 MFU | 206985 tok/s +step 15457/19560 | loss 3.268448 (-0.97z)| norm 0.2602 (-0.48z)| lr 6.74e-05 | 2533.69 ms | 53.3% bf16 MFU | 206982 tok/s +step 15458/19560 | loss 3.261569 (-1.12z)| norm 0.2364 (-2.13z)| lr 6.74e-05 | 2534.83 ms | 53.3% bf16 MFU | 206975 tok/s +step 15459/19560 | loss 3.320895 (+0.28z)| norm 0.2573 (-0.65z)| lr 6.73e-05 | 2533.27 ms | 53.3% bf16 MFU | 206974 tok/s +step 15460/19560 | loss 3.342805 (+0.84z)| norm 0.2485 (-1.25z)| lr 6.73e-05 | 2534.28 ms | 53.3% bf16 MFU | 206969 tok/s +step 15461/19560 | loss 3.275433 (-0.80z)| norm 0.2480 (-1.27z)| lr 6.73e-05 | 2533.03 ms | 53.3% bf16 MFU | 206970 tok/s +step 15462/19560 | loss 3.298905 (-0.21z)| norm 0.2512 (-1.04z)| lr 6.73e-05 | 2532.39 ms | 53.3% bf16 MFU | 206973 tok/s +step 15463/19560 | loss 3.288065 (-0.48z)| norm 0.2473 (-1.29z)| lr 6.72e-05 | 2534.89 ms | 53.3% bf16 MFU | 206966 tok/s +step 15464/19560 | loss 3.268945 (-0.94z)| norm 0.3034 (+2.53z)| lr 6.72e-05 | 2533.07 ms | 53.3% bf16 MFU | 206966 tok/s +step 15465/19560 | loss 3.280632 (-0.65z)| norm 0.2483 (-1.20z)| lr 6.72e-05 | 2533.71 ms | 53.3% bf16 MFU | 206964 tok/s +step 15466/19560 | loss 3.311839 (+0.15z)| norm 0.2696 (+0.32z)| lr 6.71e-05 | 2532.88 ms | 53.3% bf16 MFU | 206966 tok/s +step 15467/19560 | loss 3.300766 (-0.13z)| norm 0.2559 (-0.72z)| lr 6.71e-05 | 2532.02 ms | 53.3% bf16 MFU | 206971 tok/s +step 15468/19560 | loss 3.295647 (-0.26z)| norm 0.2596 (-0.42z)| lr 6.71e-05 | 2534.50 ms | 53.3% bf16 MFU | 206965 tok/s +step 15469/19560 | loss 3.328408 (+0.58z)| norm 0.2665 (+0.11z)| lr 6.70e-05 | 2533.46 ms | 53.3% bf16 MFU | 206964 tok/s +step 15470/19560 | loss 3.307750 (+0.04z)| norm 0.2576 (-0.57z)| lr 6.70e-05 | 2531.07 ms | 53.3% bf16 MFU | 206973 tok/s +step 15471/19560 | loss 3.281076 (-0.63z)| norm 0.2759 (+0.82z)| lr 6.70e-05 | 2532.16 ms | 53.3% bf16 MFU | 206977 tok/s +step 15472/19560 | loss 3.323616 (+0.46z)| norm 0.2652 (+0.01z)| lr 6.69e-05 | 2531.56 ms | 53.3% bf16 MFU | 206983 tok/s +step 15473/19560 | loss 3.345909 (+1.04z)| norm 0.2753 (+0.78z)| lr 6.69e-05 | 2530.67 ms | 53.4% bf16 MFU | 206993 tok/s +step 15474/19560 | loss 3.269997 (-0.92z)| norm 0.2651 (+0.00z)| lr 6.69e-05 | 2533.00 ms | 53.3% bf16 MFU | 206992 tok/s +step 15475/19560 | loss 3.322062 (+0.46z)| norm 0.2724 (+0.55z)| lr 6.68e-05 | 2532.68 ms | 53.3% bf16 MFU | 206993 tok/s +step 15476/19560 | loss 3.297967 (-0.17z)| norm 0.2689 (+0.30z)| lr 6.68e-05 | 2532.81 ms | 53.3% bf16 MFU | 206993 tok/s +step 15477/19560 | loss 3.306038 (+0.06z)| norm 0.2842 (+1.45z)| lr 6.68e-05 | 2533.40 ms | 53.3% bf16 MFU | 206991 tok/s +step 15478/19560 | loss 3.299515 (-0.11z)| norm 0.2579 (-0.54z)| lr 6.68e-05 | 2532.40 ms | 53.3% bf16 MFU | 206993 tok/s +step 15479/19560 | loss 3.281042 (-0.59z)| norm 0.2683 (+0.24z)| lr 6.67e-05 | 2534.97 ms | 53.3% bf16 MFU | 206985 tok/s +step 15480/19560 | loss 3.295729 (-0.18z)| norm 0.2795 (+1.13z)| lr 6.67e-05 | 2531.86 ms | 53.3% bf16 MFU | 206989 tok/s +step 15481/19560 | loss 3.308135 (+0.16z)| norm 0.2693 (+0.36z)| lr 6.67e-05 | 2534.94 ms | 53.3% bf16 MFU | 206981 tok/s +step 15482/19560 | loss 3.326380 (+0.68z)| norm 0.2738 (+0.72z)| lr 6.66e-05 | 2531.22 ms | 53.3% bf16 MFU | 206988 tok/s +step 15483/19560 | loss 3.279948 (-0.64z)| norm 0.2818 (+1.34z)| lr 6.66e-05 | 2533.36 ms | 53.3% bf16 MFU | 206987 tok/s +step 15484/19560 | loss 3.293950 (-0.24z)| norm 0.2721 (+0.59z)| lr 6.66e-05 | 2533.35 ms | 53.3% bf16 MFU | 206985 tok/s +step 15485/19560 | loss 3.284328 (-0.52z)| norm 0.2540 (-0.85z)| lr 6.65e-05 | 2531.92 ms | 53.3% bf16 MFU | 206989 tok/s +step 15486/19560 | loss 3.309791 (+0.22z)| norm 0.2620 (-0.20z)| lr 6.65e-05 | 2533.61 ms | 53.3% bf16 MFU | 206986 tok/s +step 15487/19560 | loss 3.275435 (-0.76z)| norm 0.2525 (-0.96z)| lr 6.65e-05 | 2533.00 ms | 53.3% bf16 MFU | 206986 tok/s +step 15488/19560 | loss 3.324755 (+0.66z)| norm 0.2749 (+0.84z)| lr 6.64e-05 | 2533.74 ms | 53.3% bf16 MFU | 206983 tok/s +step 15489/19560 | loss 3.273264 (-0.83z)| norm 0.2459 (-1.46z)| lr 6.64e-05 | 2533.18 ms | 53.3% bf16 MFU | 206982 tok/s +step 15490/19560 | loss 3.275270 (-0.78z)| norm 0.2555 (-0.68z)| lr 6.64e-05 | 2531.84 ms | 53.3% bf16 MFU | 206987 tok/s +step 15491/19560 | loss 3.283950 (-0.50z)| norm 0.2573 (-0.53z)| lr 6.63e-05 | 2534.09 ms | 53.3% bf16 MFU | 206982 tok/s +step 15492/19560 | loss 3.334331 (+1.05z)| norm 0.2797 (+1.27z)| lr 6.63e-05 | 2532.82 ms | 53.3% bf16 MFU | 206983 tok/s +step 15493/19560 | loss 3.298851 (-0.05z)| norm 0.2641 (+0.02z)| lr 6.63e-05 | 2532.32 ms | 53.3% bf16 MFU | 206986 tok/s +step 15494/19560 | loss 3.263312 (-1.13z)| norm 0.2629 (-0.08z)| lr 6.62e-05 | 2532.63 ms | 53.3% bf16 MFU | 206987 tok/s +step 15495/19560 | loss 3.335922 (+1.09z)| norm 0.2852 (+1.72z)| lr 6.62e-05 | 2534.43 ms | 53.3% bf16 MFU | 206981 tok/s +step 15496/19560 | loss 3.283772 (-0.50z)| norm 0.2910 (+2.17z)| lr 6.62e-05 | 2532.76 ms | 53.3% bf16 MFU | 206982 tok/s +step 15497/19560 | loss 3.321192 (+0.64z)| norm 0.2530 (-0.88z)| lr 6.62e-05 | 2534.79 ms | 53.3% bf16 MFU | 206975 tok/s +step 15498/19560 | loss 3.324740 (+0.73z)| norm 0.2529 (-0.88z)| lr 6.61e-05 | 2532.22 ms | 53.3% bf16 MFU | 206979 tok/s +step 15499/19560 | loss 3.291152 (-0.30z)| norm 0.2745 (+0.84z)| lr 6.61e-05 | 2532.93 ms | 53.3% bf16 MFU | 206979 tok/s +step 15500/19560 | loss 3.340813 (+1.21z)| norm 0.2672 (+0.26z)| lr 6.61e-05 | 2533.55 ms | 53.3% bf16 MFU | 206977 tok/s +val loss 3.308500 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3025/10042 = 0.301235 +step 15501/19560 | loss 3.346529 (+1.37z)| norm 0.2481 (-1.24z)| lr 6.60e-05 | 2530.80 ms | 53.3% bf16 MFU | 206986 tok/s +step 15502/19560 | loss 3.310744 (+0.29z)| norm 0.3017 (+2.93z)| lr 6.60e-05 | 2533.19 ms | 53.3% bf16 MFU | 206986 tok/s +step 15503/19560 | loss 3.311378 (+0.31z)| norm 0.2571 (-0.52z)| lr 6.60e-05 | 2532.25 ms | 53.3% bf16 MFU | 206988 tok/s +step 15504/19560 | loss 3.489721 (+5.08z)| norm 0.2744 (+0.83z)| lr 6.59e-05 | 2532.97 ms | 53.3% bf16 MFU | 206988 tok/s +step 15505/19560 | loss 3.362027 (+1.60z)| norm 0.2553 (-0.66z)| lr 6.59e-05 | 2531.43 ms | 53.3% bf16 MFU | 206994 tok/s +step 15506/19560 | loss 3.304070 (+0.04z)| norm 0.2481 (-1.21z)| lr 6.59e-05 | 2532.84 ms | 53.3% bf16 MFU | 206995 tok/s +step 15507/19560 | loss 3.373162 (+1.91z)| norm 0.2552 (-0.65z)| lr 6.58e-05 | 2531.66 ms | 53.3% bf16 MFU | 206999 tok/s +step 15508/19560 | loss 3.288443 (-0.39z)| norm 0.2658 (+0.19z)| lr 6.58e-05 | 2533.55 ms | 53.3% bf16 MFU | 206996 tok/s +step 15509/19560 | loss 3.372012 (+1.84z)| norm 0.2579 (-0.43z)| lr 6.58e-05 | 2533.47 ms | 53.3% bf16 MFU | 206994 tok/s +step 15510/19560 | loss 3.338831 (+0.94z)| norm 0.2607 (-0.20z)| lr 6.57e-05 | 2532.52 ms | 53.3% bf16 MFU | 206995 tok/s +step 15511/19560 | loss 3.307275 (+0.09z)| norm 0.2802 (+1.32z)| lr 6.57e-05 | 2532.19 ms | 53.3% bf16 MFU | 206998 tok/s +step 15512/19560 | loss 3.266995 (-1.03z)| norm 0.2622 (-0.10z)| lr 6.57e-05 | 2533.31 ms | 53.3% bf16 MFU | 206996 tok/s +step 15513/19560 | loss 3.299141 (-0.13z)| norm 0.2630 (-0.04z)| lr 6.57e-05 | 2533.03 ms | 53.3% bf16 MFU | 206995 tok/s +step 15514/19560 | loss 3.251726 (-1.43z)| norm 0.2578 (-0.44z)| lr 6.56e-05 | 2532.80 ms | 53.3% bf16 MFU | 206995 tok/s +step 15515/19560 | loss 3.284400 (-0.52z)| norm 0.2642 (+0.05z)| lr 6.56e-05 | 2534.26 ms | 53.3% bf16 MFU | 206990 tok/s +step 15516/19560 | loss 3.301327 (-0.06z)| norm 0.2676 (+0.31z)| lr 6.56e-05 | 2531.99 ms | 53.3% bf16 MFU | 206993 tok/s +step 15517/19560 | loss 3.295100 (-0.22z)| norm 0.2489 (-1.17z)| lr 6.55e-05 | 2533.78 ms | 53.3% bf16 MFU | 206990 tok/s +step 15518/19560 | loss 3.333974 (+0.85z)| norm 0.2638 (+0.03z)| lr 6.55e-05 | 2532.66 ms | 53.3% bf16 MFU | 206991 tok/s +step 15519/19560 | loss 3.372198 (+1.88z)| norm 0.2693 (+0.46z)| lr 6.55e-05 | 2532.34 ms | 53.3% bf16 MFU | 206993 tok/s +step 15520/19560 | loss 3.304016 (-0.01z)| norm 0.2578 (-0.46z)| lr 6.54e-05 | 2532.01 ms | 53.3% bf16 MFU | 206997 tok/s +step 15521/19560 | loss 3.411335 (+2.85z)| norm 0.2496 (-1.10z)| lr 6.54e-05 | 2532.61 ms | 53.3% bf16 MFU | 206998 tok/s +step 15522/19560 | loss 3.291576 (-0.37z)| norm 0.2752 (+0.93z)| lr 6.54e-05 | 2531.99 ms | 53.3% bf16 MFU | 207001 tok/s +step 15523/19560 | loss 3.358196 (+1.41z)| norm 0.2796 (+1.26z)| lr 6.53e-05 | 2531.69 ms | 53.3% bf16 MFU | 207005 tok/s +step 15524/19560 | loss 3.299082 (-0.17z)| norm 0.2395 (-1.88z)| lr 6.53e-05 | 2532.15 ms | 53.3% bf16 MFU | 207008 tok/s +step 15525/19560 | loss 3.310808 (+0.14z)| norm 0.2744 (+0.84z)| lr 6.53e-05 | 2532.44 ms | 53.3% bf16 MFU | 207009 tok/s +step 15526/19560 | loss 3.263013 (-1.13z)| norm 0.2654 (+0.13z)| lr 6.53e-05 | 2532.01 ms | 53.3% bf16 MFU | 207012 tok/s +step 15527/19560 | loss 3.286227 (-0.51z)| norm 0.2784 (+1.12z)| lr 6.52e-05 | 2532.58 ms | 53.3% bf16 MFU | 207012 tok/s +step 15528/19560 | loss 3.301925 (-0.10z)| norm 0.2766 (+0.97z)| lr 6.52e-05 | 2535.28 ms | 53.3% bf16 MFU | 207001 tok/s +step 15529/19560 | loss 3.360018 (+1.44z)| norm 0.2686 (+0.35z)| lr 6.52e-05 | 2534.36 ms | 53.3% bf16 MFU | 206995 tok/s +step 15530/19560 | loss 3.291969 (-0.38z)| norm 0.2885 (+1.85z)| lr 6.51e-05 | 2533.57 ms | 53.3% bf16 MFU | 206992 tok/s +step 15531/19560 | loss 3.383615 (+2.03z)| norm 0.2777 (+1.01z)| lr 6.51e-05 | 2531.75 ms | 53.3% bf16 MFU | 206996 tok/s +step 15532/19560 | loss 3.313852 (+0.19z)| norm 0.2859 (+1.62z)| lr 6.51e-05 | 2533.56 ms | 53.3% bf16 MFU | 206993 tok/s +step 15533/19560 | loss 3.289164 (-0.47z)| norm 0.2634 (-0.07z)| lr 6.50e-05 | 2533.32 ms | 53.3% bf16 MFU | 206992 tok/s +step 15534/19560 | loss 3.247210 (-1.56z)| norm 0.2658 (+0.10z)| lr 6.50e-05 | 2531.65 ms | 53.3% bf16 MFU | 206997 tok/s +step 15535/19560 | loss 3.283281 (-0.62z)| norm 0.2952 (+2.29z)| lr 6.50e-05 | 2531.43 ms | 53.3% bf16 MFU | 207002 tok/s +step 15536/19560 | loss 3.297425 (-0.25z)| norm 0.2889 (+1.78z)| lr 6.49e-05 | 2533.48 ms | 53.3% bf16 MFU | 207000 tok/s +step 15537/19560 | loss 3.312701 (+0.16z)| norm 0.2835 (+1.35z)| lr 6.49e-05 | 2531.94 ms | 53.3% bf16 MFU | 207003 tok/s +step 15538/19560 | loss 3.325294 (+0.50z)| norm 0.3159 (+3.52z)| lr 6.49e-05 | 2534.01 ms | 53.3% bf16 MFU | 206998 tok/s +step 15539/19560 | loss 3.306711 (-0.00z)| norm 0.2844 (+1.30z)| lr 6.48e-05 | 2532.55 ms | 53.3% bf16 MFU | 206999 tok/s +step 15540/19560 | loss 3.246552 (-1.58z)| norm 0.2746 (+0.64z)| lr 6.48e-05 | 2533.52 ms | 53.3% bf16 MFU | 206996 tok/s +step 15541/19560 | loss 3.288239 (-0.48z)| norm 0.2779 (+0.88z)| lr 6.48e-05 | 2532.50 ms | 53.3% bf16 MFU | 206997 tok/s +step 15542/19560 | loss 3.328358 (+0.59z)| norm 0.2861 (+1.44z)| lr 6.48e-05 | 2533.06 ms | 53.3% bf16 MFU | 206996 tok/s +step 15543/19560 | loss 3.277153 (-0.77z)| norm 0.2524 (-0.91z)| lr 6.47e-05 | 2533.74 ms | 53.3% bf16 MFU | 206993 tok/s +step 15544/19560 | loss 3.275897 (-0.79z)| norm 0.2705 (+0.36z)| lr 6.47e-05 | 2532.34 ms | 53.3% bf16 MFU | 206995 tok/s +step 15545/19560 | loss 3.337721 (+0.83z)| norm 0.2752 (+0.67z)| lr 6.47e-05 | 2531.87 ms | 53.3% bf16 MFU | 206999 tok/s +step 15546/19560 | loss 3.261351 (-1.19z)| norm 0.2552 (-0.73z)| lr 6.46e-05 | 2532.39 ms | 53.3% bf16 MFU | 207001 tok/s +step 15547/19560 | loss 3.340378 (+0.89z)| norm 0.2599 (-0.39z)| lr 6.46e-05 | 2532.77 ms | 53.3% bf16 MFU | 207001 tok/s +step 15548/19560 | loss 3.217470 (-2.28z)| norm 0.2553 (-0.72z)| lr 6.46e-05 | 2533.02 ms | 53.3% bf16 MFU | 207000 tok/s +step 15549/19560 | loss 3.343383 (+0.95z)| norm 0.2590 (-0.45z)| lr 6.45e-05 | 2533.45 ms | 53.3% bf16 MFU | 206997 tok/s +step 15550/19560 | loss 3.340918 (+0.88z)| norm 0.2646 (-0.05z)| lr 6.45e-05 | 2531.87 ms | 53.3% bf16 MFU | 207001 tok/s +step 15551/19560 | loss 3.390071 (+2.10z)| norm 0.2763 (+0.77z)| lr 6.45e-05 | 2531.30 ms | 53.3% bf16 MFU | 207007 tok/s +step 15552/19560 | loss 3.343159 (+0.90z)| norm 0.2753 (+0.69z)| lr 6.44e-05 | 2533.88 ms | 53.3% bf16 MFU | 207002 tok/s +step 15553/19560 | loss 3.311677 (+0.09z)| norm 0.2551 (-0.75z)| lr 6.44e-05 | 2531.84 ms | 53.3% bf16 MFU | 207006 tok/s +step 15554/19560 | loss 3.268600 (-1.01z)| norm 0.2818 (+1.13z)| lr 6.44e-05 | 2531.54 ms | 53.3% bf16 MFU | 207011 tok/s +step 15555/19560 | loss 3.400073 (+2.30z)| norm 0.2762 (+0.73z)| lr 6.44e-05 | 2535.41 ms | 53.3% bf16 MFU | 207000 tok/s +step 15556/19560 | loss 3.414857 (+2.58z)| norm 0.2748 (+0.62z)| lr 6.43e-05 | 2533.75 ms | 53.3% bf16 MFU | 206996 tok/s +step 15557/19560 | loss 3.288929 (-0.49z)| norm 0.3086 (+2.92z)| lr 6.43e-05 | 2531.94 ms | 53.3% bf16 MFU | 206999 tok/s +step 15558/19560 | loss 3.315369 (+0.14z)| norm 0.2569 (-0.63z)| lr 6.43e-05 | 2532.93 ms | 53.3% bf16 MFU | 206999 tok/s +step 15559/19560 | loss 3.274475 (-0.86z)| norm 0.2577 (-0.58z)| lr 6.42e-05 | 2533.52 ms | 53.3% bf16 MFU | 206996 tok/s +step 15560/19560 | loss 3.412857 (+2.48z)| norm 0.2913 (+1.71z)| lr 6.42e-05 | 2532.42 ms | 53.3% bf16 MFU | 206998 tok/s +step 15561/19560 | loss 3.386182 (+1.81z)| norm 0.2584 (-0.53z)| lr 6.42e-05 | 2532.26 ms | 53.3% bf16 MFU | 207000 tok/s +step 15562/19560 | loss 3.303043 (-0.19z)| norm 0.2640 (-0.15z)| lr 6.41e-05 | 2533.57 ms | 53.3% bf16 MFU | 206997 tok/s +step 15563/19560 | loss 3.290955 (-0.48z)| norm 0.2517 (-0.99z)| lr 6.41e-05 | 2532.85 ms | 53.3% bf16 MFU | 206997 tok/s +step 15564/19560 | loss 3.379117 (+1.62z)| norm 0.2598 (-0.43z)| lr 6.41e-05 | 2532.64 ms | 53.3% bf16 MFU | 206997 tok/s +step 15565/19560 | loss 3.281064 (-0.73z)| norm 0.2634 (-0.20z)| lr 6.40e-05 | 2532.39 ms | 53.3% bf16 MFU | 206999 tok/s +step 15566/19560 | loss 3.313298 (+0.05z)| norm 0.2641 (-0.15z)| lr 6.40e-05 | 2532.11 ms | 53.3% bf16 MFU | 207002 tok/s +step 15567/19560 | loss 3.309549 (-0.03z)| norm 0.2433 (-1.56z)| lr 6.40e-05 | 2531.83 ms | 53.3% bf16 MFU | 207006 tok/s +step 15568/19560 | loss 3.315134 (+0.11z)| norm 0.2568 (-0.62z)| lr 6.39e-05 | 2533.10 ms | 53.3% bf16 MFU | 207004 tok/s +step 15569/19560 | loss 3.239905 (-1.68z)| norm 0.2467 (-1.31z)| lr 6.39e-05 | 2532.62 ms | 53.3% bf16 MFU | 207005 tok/s +step 15570/19560 | loss 3.347929 (+0.90z)| norm 0.2552 (-0.71z)| lr 6.39e-05 | 2530.41 ms | 53.4% bf16 MFU | 207014 tok/s +step 15571/19560 | loss 3.295777 (-0.34z)| norm 0.2632 (-0.15z)| lr 6.39e-05 | 2531.75 ms | 53.3% bf16 MFU | 207018 tok/s +step 15572/19560 | loss 3.352803 (+1.02z)| norm 0.2541 (-0.78z)| lr 6.38e-05 | 2534.67 ms | 53.3% bf16 MFU | 207009 tok/s +step 15573/19560 | loss 3.273953 (-0.88z)| norm 0.2683 (+0.20z)| lr 6.38e-05 | 2531.21 ms | 53.3% bf16 MFU | 207015 tok/s +step 15574/19560 | loss 3.386937 (+1.81z)| norm 0.2594 (-0.42z)| lr 6.38e-05 | 2531.03 ms | 53.3% bf16 MFU | 207022 tok/s +step 15575/19560 | loss 3.339815 (+0.69z)| norm 0.2710 (+0.38z)| lr 6.37e-05 | 2532.30 ms | 53.3% bf16 MFU | 207023 tok/s +step 15576/19560 | loss 3.336519 (+0.60z)| norm 0.2753 (+0.66z)| lr 6.37e-05 | 2536.03 ms | 53.2% bf16 MFU | 207008 tok/s +step 15577/19560 | loss 3.281317 (-0.71z)| norm 0.2671 (+0.08z)| lr 6.37e-05 | 2532.85 ms | 53.3% bf16 MFU | 207008 tok/s +step 15578/19560 | loss 3.322966 (+0.27z)| norm 0.2752 (+0.64z)| lr 6.36e-05 | 2533.69 ms | 53.3% bf16 MFU | 207004 tok/s +step 15579/19560 | loss 3.323267 (+0.27z)| norm 0.2606 (-0.37z)| lr 6.36e-05 | 2533.15 ms | 53.3% bf16 MFU | 207002 tok/s +step 15580/19560 | loss 3.312766 (+0.04z)| norm 0.2757 (+0.66z)| lr 6.36e-05 | 2534.68 ms | 53.3% bf16 MFU | 206994 tok/s +step 15581/19560 | loss 3.264210 (-1.14z)| norm 0.2576 (-0.61z)| lr 6.35e-05 | 2532.00 ms | 53.3% bf16 MFU | 206998 tok/s +step 15582/19560 | loss 3.308492 (-0.06z)| norm 0.2543 (-0.85z)| lr 6.35e-05 | 2534.41 ms | 53.3% bf16 MFU | 206991 tok/s +step 15583/19560 | loss 3.305954 (-0.13z)| norm 0.2560 (-0.73z)| lr 6.35e-05 | 2532.76 ms | 53.3% bf16 MFU | 206992 tok/s +step 15584/19560 | loss 3.338110 (+0.65z)| norm 0.2628 (-0.27z)| lr 6.35e-05 | 2532.90 ms | 53.3% bf16 MFU | 206992 tok/s +step 15585/19560 | loss 3.252848 (-1.45z)| norm 0.2607 (-0.42z)| lr 6.34e-05 | 2534.95 ms | 53.3% bf16 MFU | 206983 tok/s +step 15586/19560 | loss 3.420285 (+2.59z)| norm 0.2959 (+2.07z)| lr 6.34e-05 | 2535.04 ms | 53.3% bf16 MFU | 206975 tok/s +step 15587/19560 | loss 3.303625 (-0.23z)| norm 0.2612 (-0.42z)| lr 6.34e-05 | 2534.84 ms | 53.3% bf16 MFU | 206968 tok/s +step 15588/19560 | loss 3.284263 (-0.68z)| norm 0.2664 (-0.05z)| lr 6.33e-05 | 2531.76 ms | 53.3% bf16 MFU | 206974 tok/s +step 15589/19560 | loss 3.271346 (-0.99z)| norm 0.2566 (-0.77z)| lr 6.33e-05 | 2534.27 ms | 53.3% bf16 MFU | 206969 tok/s +step 15590/19560 | loss 3.356664 (+1.05z)| norm 0.2609 (-0.47z)| lr 6.33e-05 | 2533.01 ms | 53.3% bf16 MFU | 206970 tok/s +step 15591/19560 | loss 3.302864 (-0.25z)| norm 0.2519 (-1.14z)| lr 6.32e-05 | 2534.05 ms | 53.3% bf16 MFU | 206966 tok/s +step 15592/19560 | loss 3.281816 (-0.76z)| norm 0.2558 (-0.84z)| lr 6.32e-05 | 2533.61 ms | 53.3% bf16 MFU | 206964 tok/s +step 15593/19560 | loss 3.266822 (-1.11z)| norm 0.2524 (-1.11z)| lr 6.32e-05 | 2531.94 ms | 53.3% bf16 MFU | 206970 tok/s +step 15594/19560 | loss 3.341887 (+0.69z)| norm 0.2706 (+0.27z)| lr 6.31e-05 | 2532.40 ms | 53.3% bf16 MFU | 206973 tok/s +step 15595/19560 | loss 3.356303 (+1.02z)| norm 0.2628 (-0.32z)| lr 6.31e-05 | 2531.45 ms | 53.3% bf16 MFU | 206980 tok/s +step 15596/19560 | loss 3.305245 (-0.20z)| norm 0.2698 (+0.20z)| lr 6.31e-05 | 2532.74 ms | 53.3% bf16 MFU | 206981 tok/s +step 15597/19560 | loss 3.241472 (-1.69z)| norm 0.2706 (+0.26z)| lr 6.31e-05 | 2532.47 ms | 53.3% bf16 MFU | 206983 tok/s +step 15598/19560 | loss 3.368050 (+1.28z)| norm 0.2722 (+0.37z)| lr 6.30e-05 | 2531.37 ms | 53.3% bf16 MFU | 206990 tok/s +step 15599/19560 | loss 3.316270 (+0.06z)| norm 0.2619 (-0.40z)| lr 6.30e-05 | 2531.88 ms | 53.3% bf16 MFU | 206994 tok/s +step 15600/19560 | loss 3.329499 (+0.37z)| norm 0.2812 (+1.05z)| lr 6.30e-05 | 2531.81 ms | 53.3% bf16 MFU | 206998 tok/s +step 15601/19560 | loss 3.342473 (+0.67z)| norm 0.2756 (+0.63z)| lr 6.29e-05 | 2533.22 ms | 53.3% bf16 MFU | 206997 tok/s +step 15602/19560 | loss 3.334509 (+0.48z)| norm 0.2529 (-1.08z)| lr 6.29e-05 | 2534.14 ms | 53.3% bf16 MFU | 206991 tok/s +step 15603/19560 | loss 3.373203 (+1.37z)| norm 0.2919 (+1.82z)| lr 6.29e-05 | 2532.71 ms | 53.3% bf16 MFU | 206992 tok/s +step 15604/19560 | loss 3.365994 (+1.18z)| norm 0.2766 (+0.68z)| lr 6.28e-05 | 2533.36 ms | 53.3% bf16 MFU | 206990 tok/s +step 15605/19560 | loss 3.325436 (+0.23z)| norm 0.2629 (-0.32z)| lr 6.28e-05 | 2533.10 ms | 53.3% bf16 MFU | 206989 tok/s +step 15606/19560 | loss 3.329873 (+0.33z)| norm 0.2678 (+0.03z)| lr 6.28e-05 | 2530.59 ms | 53.4% bf16 MFU | 206999 tok/s +step 15607/19560 | loss 3.353079 (+0.86z)| norm 0.2677 (+0.03z)| lr 6.28e-05 | 2532.69 ms | 53.3% bf16 MFU | 206999 tok/s +step 15608/19560 | loss 3.238065 (-1.79z)| norm 0.2658 (-0.11z)| lr 6.27e-05 | 2531.80 ms | 53.3% bf16 MFU | 207003 tok/s +step 15609/19560 | loss 3.239125 (-1.74z)| norm 0.2677 (+0.04z)| lr 6.27e-05 | 2534.10 ms | 53.3% bf16 MFU | 206998 tok/s +step 15610/19560 | loss 3.320048 (+0.11z)| norm 0.2954 (+2.07z)| lr 6.27e-05 | 2533.98 ms | 53.3% bf16 MFU | 206993 tok/s +step 15611/19560 | loss 3.309319 (-0.14z)| norm 0.2579 (-0.69z)| lr 6.26e-05 | 2533.02 ms | 53.3% bf16 MFU | 206993 tok/s +step 15612/19560 | loss 3.269826 (-1.04z)| norm 0.2583 (-0.65z)| lr 6.26e-05 | 2533.36 ms | 53.3% bf16 MFU | 206991 tok/s +step 15613/19560 | loss 3.297873 (-0.40z)| norm 0.2677 (+0.04z)| lr 6.26e-05 | 2531.72 ms | 53.3% bf16 MFU | 206995 tok/s +step 15614/19560 | loss 3.432318 (+2.58z)| norm 0.2721 (+0.35z)| lr 6.25e-05 | 2533.54 ms | 53.3% bf16 MFU | 206993 tok/s +step 15615/19560 | loss 3.305849 (-0.24z)| norm 0.2624 (-0.37z)| lr 6.25e-05 | 2532.11 ms | 53.3% bf16 MFU | 206996 tok/s +step 15616/19560 | loss 3.314846 (-0.04z)| norm 0.2763 (+0.66z)| lr 6.25e-05 | 2532.12 ms | 53.3% bf16 MFU | 206999 tok/s +step 15617/19560 | loss 3.292495 (-0.54z)| norm 0.2655 (-0.16z)| lr 6.24e-05 | 2533.58 ms | 53.3% bf16 MFU | 206996 tok/s +step 15618/19560 | loss 3.318662 (+0.04z)| norm 0.2663 (-0.10z)| lr 6.24e-05 | 2532.95 ms | 53.3% bf16 MFU | 206995 tok/s +step 15619/19560 | loss 3.321398 (+0.09z)| norm 0.2845 (+1.26z)| lr 6.24e-05 | 2533.60 ms | 53.3% bf16 MFU | 206992 tok/s +step 15620/19560 | loss 3.389060 (+1.59z)| norm 0.2811 (+0.99z)| lr 6.24e-05 | 2532.84 ms | 53.3% bf16 MFU | 206992 tok/s +step 15621/19560 | loss 3.420181 (+2.22z)| norm 0.2582 (-0.72z)| lr 6.23e-05 | 2533.46 ms | 53.3% bf16 MFU | 206990 tok/s +step 15622/19560 | loss 3.259368 (-1.29z)| norm 0.2566 (-0.83z)| lr 6.23e-05 | 2533.17 ms | 53.3% bf16 MFU | 206989 tok/s +step 15623/19560 | loss 3.282704 (-0.77z)| norm 0.2806 (+0.96z)| lr 6.23e-05 | 2533.21 ms | 53.3% bf16 MFU | 206988 tok/s +step 15624/19560 | loss 3.300828 (-0.38z)| norm 0.2651 (-0.18z)| lr 6.22e-05 | 2533.65 ms | 53.3% bf16 MFU | 206985 tok/s +step 15625/19560 | loss 3.329961 (+0.25z)| norm 0.2864 (+1.41z)| lr 6.22e-05 | 2535.26 ms | 53.3% bf16 MFU | 206976 tok/s +step 15626/19560 | loss 3.326916 (+0.19z)| norm 0.2723 (+0.33z)| lr 6.22e-05 | 2532.80 ms | 53.3% bf16 MFU | 206977 tok/s +step 15627/19560 | loss 3.369305 (+1.09z)| norm 0.2606 (-0.55z)| lr 6.21e-05 | 2532.59 ms | 53.3% bf16 MFU | 206979 tok/s +step 15628/19560 | loss 3.319828 (+0.02z)| norm 0.2966 (+2.13z)| lr 6.21e-05 | 2533.97 ms | 53.3% bf16 MFU | 206975 tok/s +step 15629/19560 | loss 3.319747 (+0.02z)| norm 0.2741 (+0.44z)| lr 6.21e-05 | 2533.13 ms | 53.3% bf16 MFU | 206975 tok/s +step 15630/19560 | loss 3.277000 (-0.90z)| norm 0.2687 (+0.05z)| lr 6.20e-05 | 2533.07 ms | 53.3% bf16 MFU | 206975 tok/s +step 15631/19560 | loss 3.432297 (+2.40z)| norm 0.2739 (+0.45z)| lr 6.20e-05 | 2532.74 ms | 53.3% bf16 MFU | 206976 tok/s +step 15632/19560 | loss 3.326132 (+0.18z)| norm 0.2695 (+0.11z)| lr 6.20e-05 | 2532.52 ms | 53.3% bf16 MFU | 206979 tok/s +step 15633/19560 | loss 3.315471 (-0.05z)| norm 0.2796 (+0.88z)| lr 6.20e-05 | 2534.09 ms | 53.3% bf16 MFU | 206974 tok/s +step 15634/19560 | loss 3.318680 (+0.02z)| norm 0.2565 (-0.92z)| lr 6.19e-05 | 2533.09 ms | 53.3% bf16 MFU | 206975 tok/s +step 15635/19560 | loss 3.292747 (-0.55z)| norm 0.2586 (-0.76z)| lr 6.19e-05 | 2535.25 ms | 53.3% bf16 MFU | 206966 tok/s +step 15636/19560 | loss 3.290959 (-0.59z)| norm 0.2541 (-1.10z)| lr 6.19e-05 | 2534.05 ms | 53.3% bf16 MFU | 206962 tok/s +step 15637/19560 | loss 3.300044 (-0.38z)| norm 0.2568 (-0.89z)| lr 6.18e-05 | 2532.06 ms | 53.3% bf16 MFU | 206967 tok/s +step 15638/19560 | loss 3.352779 (+0.82z)| norm 0.2666 (-0.13z)| lr 6.18e-05 | 2532.23 ms | 53.3% bf16 MFU | 206971 tok/s +step 15639/19560 | loss 3.318256 (+0.03z)| norm 0.2619 (-0.49z)| lr 6.18e-05 | 2533.66 ms | 53.3% bf16 MFU | 206969 tok/s +step 15640/19560 | loss 3.293218 (-0.54z)| norm 0.2500 (-1.40z)| lr 6.17e-05 | 2531.94 ms | 53.3% bf16 MFU | 206974 tok/s +step 15641/19560 | loss 3.345013 (+0.63z)| norm 0.2706 (+0.19z)| lr 6.17e-05 | 2532.57 ms | 53.3% bf16 MFU | 206976 tok/s +step 15642/19560 | loss 3.294837 (-0.53z)| norm 0.2485 (-1.51z)| lr 6.17e-05 | 2531.82 ms | 53.3% bf16 MFU | 206981 tok/s +step 15643/19560 | loss 3.404961 (+1.95z)| norm 0.2431 (-1.88z)| lr 6.17e-05 | 2533.03 ms | 53.3% bf16 MFU | 206981 tok/s +step 15644/19560 | loss 3.287595 (-0.70z)| norm 0.2687 (+0.06z)| lr 6.16e-05 | 2533.09 ms | 53.3% bf16 MFU | 206981 tok/s +step 15645/19560 | loss 3.457350 (+3.00z)| norm 0.2750 (+0.53z)| lr 6.16e-05 | 2535.15 ms | 53.3% bf16 MFU | 206972 tok/s +step 15646/19560 | loss 3.276476 (-0.93z)| norm 0.2589 (-0.70z)| lr 6.16e-05 | 2531.69 ms | 53.3% bf16 MFU | 206978 tok/s +step 15647/19560 | loss 3.391750 (+1.56z)| norm 0.2602 (-0.59z)| lr 6.15e-05 | 2532.01 ms | 53.3% bf16 MFU | 206983 tok/s +step 15648/19560 | loss 3.324848 (+0.11z)| norm 0.2745 (+0.49z)| lr 6.15e-05 | 2533.74 ms | 53.3% bf16 MFU | 206980 tok/s +step 15649/19560 | loss 3.386223 (+1.46z)| norm 0.2890 (+1.57z)| lr 6.15e-05 | 2533.33 ms | 53.3% bf16 MFU | 206978 tok/s +step 15650/19560 | loss 3.297131 (-0.49z)| norm 0.2620 (-0.48z)| lr 6.14e-05 | 2532.84 ms | 53.3% bf16 MFU | 206979 tok/s +step 15651/19560 | loss 3.313196 (-0.13z)| norm 0.2604 (-0.59z)| lr 6.14e-05 | 2533.72 ms | 53.3% bf16 MFU | 206976 tok/s +step 15652/19560 | loss 3.297273 (-0.48z)| norm 0.2889 (+1.56z)| lr 6.14e-05 | 2533.83 ms | 53.3% bf16 MFU | 206973 tok/s +step 15653/19560 | loss 3.379482 (+1.30z)| norm 0.2422 (-1.98z)| lr 6.14e-05 | 2534.23 ms | 53.3% bf16 MFU | 206969 tok/s +step 15654/19560 | loss 3.262051 (-1.25z)| norm 0.2679 (-0.03z)| lr 6.13e-05 | 2534.26 ms | 53.3% bf16 MFU | 206964 tok/s +step 15655/19560 | loss 3.303508 (-0.36z)| norm 0.2729 (+0.35z)| lr 6.13e-05 | 2533.13 ms | 53.3% bf16 MFU | 206965 tok/s +step 15656/19560 | loss 3.341372 (+0.46z)| norm 0.2514 (-1.26z)| lr 6.13e-05 | 2535.78 ms | 53.2% bf16 MFU | 206954 tok/s +step 15657/19560 | loss 3.319170 (-0.01z)| norm 0.2619 (-0.46z)| lr 6.12e-05 | 2533.58 ms | 53.3% bf16 MFU | 206953 tok/s +step 15658/19560 | loss 3.332359 (+0.27z)| norm 0.2629 (-0.38z)| lr 6.12e-05 | 2534.08 ms | 53.3% bf16 MFU | 206950 tok/s +step 15659/19560 | loss 3.393952 (+1.61z)| norm 0.2551 (-0.96z)| lr 6.12e-05 | 2532.33 ms | 53.3% bf16 MFU | 206955 tok/s +step 15660/19560 | loss 3.324201 (+0.08z)| norm 0.2638 (-0.29z)| lr 6.11e-05 | 2531.13 ms | 53.3% bf16 MFU | 206964 tok/s +step 15661/19560 | loss 3.276442 (-0.95z)| norm 0.2773 (+0.74z)| lr 6.11e-05 | 2536.43 ms | 53.2% bf16 MFU | 206951 tok/s +step 15662/19560 | loss 3.306697 (-0.31z)| norm 0.2631 (-0.34z)| lr 6.11e-05 | 2535.65 ms | 53.2% bf16 MFU | 206942 tok/s +step 15663/19560 | loss 3.353238 (+0.70z)| norm 0.2642 (-0.24z)| lr 6.10e-05 | 2534.42 ms | 53.3% bf16 MFU | 206938 tok/s +step 15664/19560 | loss 3.293588 (-0.61z)| norm 0.2703 (+0.25z)| lr 6.10e-05 | 2533.06 ms | 53.3% bf16 MFU | 206940 tok/s +step 15665/19560 | loss 3.331739 (+0.23z)| norm 0.2562 (-0.85z)| lr 6.10e-05 | 2532.29 ms | 53.3% bf16 MFU | 206945 tok/s +step 15666/19560 | loss 3.281110 (-0.87z)| norm 0.2758 (+0.77z)| lr 6.10e-05 | 2530.38 ms | 53.4% bf16 MFU | 206958 tok/s +step 15667/19560 | loss 3.262228 (-1.27z)| norm 0.2602 (-0.53z)| lr 6.09e-05 | 2535.07 ms | 53.3% bf16 MFU | 206950 tok/s +step 15668/19560 | loss 3.330804 (+0.21z)| norm 0.2491 (-1.44z)| lr 6.09e-05 | 2531.25 ms | 53.3% bf16 MFU | 206959 tok/s +step 15669/19560 | loss 3.329506 (+0.17z)| norm 0.2727 (+0.55z)| lr 6.09e-05 | 2533.52 ms | 53.3% bf16 MFU | 206958 tok/s +step 15670/19560 | loss 3.355761 (+0.75z)| norm 0.2526 (-1.14z)| lr 6.08e-05 | 2534.46 ms | 53.3% bf16 MFU | 206954 tok/s +step 15671/19560 | loss 3.249564 (-1.58z)| norm 0.2387 (-2.27z)| lr 6.08e-05 | 2531.96 ms | 53.3% bf16 MFU | 206959 tok/s +step 15672/19560 | loss 3.358791 (+0.80z)| norm 0.2802 (+1.19z)| lr 6.08e-05 | 2531.48 ms | 53.3% bf16 MFU | 206967 tok/s +step 15673/19560 | loss 3.382438 (+1.30z)| norm 0.2567 (-0.76z)| lr 6.07e-05 | 2533.67 ms | 53.3% bf16 MFU | 206965 tok/s +step 15674/19560 | loss 3.276380 (-1.01z)| norm 0.2669 (+0.08z)| lr 6.07e-05 | 2534.26 ms | 53.3% bf16 MFU | 206960 tok/s +step 15675/19560 | loss 3.310965 (-0.25z)| norm 0.2557 (-0.85z)| lr 6.07e-05 | 2531.71 ms | 53.3% bf16 MFU | 206967 tok/s +step 15676/19560 | loss 3.330025 (+0.15z)| norm 0.2490 (-1.40z)| lr 6.07e-05 | 2533.30 ms | 53.3% bf16 MFU | 206966 tok/s +step 15677/19560 | loss 3.312176 (-0.25z)| norm 0.2648 (-0.09z)| lr 6.06e-05 | 2532.90 ms | 53.3% bf16 MFU | 206968 tok/s +step 15678/19560 | loss 3.356856 (+0.75z)| norm 0.2774 (+0.95z)| lr 6.06e-05 | 2532.00 ms | 53.3% bf16 MFU | 206973 tok/s +step 15679/19560 | loss 3.248033 (-1.65z)| norm 0.2682 (+0.20z)| lr 6.06e-05 | 2532.08 ms | 53.3% bf16 MFU | 206977 tok/s +step 15680/19560 | loss 3.336748 (+0.33z)| norm 0.2695 (+0.30z)| lr 6.05e-05 | 2533.55 ms | 53.3% bf16 MFU | 206975 tok/s +step 15681/19560 | loss 3.267936 (-1.19z)| norm 0.2484 (-1.44z)| lr 6.05e-05 | 2532.06 ms | 53.3% bf16 MFU | 206979 tok/s +step 15682/19560 | loss 3.241433 (-1.76z)| norm 0.2509 (-1.21z)| lr 6.05e-05 | 2533.21 ms | 53.3% bf16 MFU | 206978 tok/s +step 15683/19560 | loss 3.285907 (-0.77z)| norm 0.2737 (+0.68z)| lr 6.04e-05 | 2534.78 ms | 53.3% bf16 MFU | 206971 tok/s +step 15684/19560 | loss 3.354465 (+0.77z)| norm 0.2661 (+0.05z)| lr 6.04e-05 | 2531.08 ms | 53.3% bf16 MFU | 206980 tok/s +step 15685/19560 | loss 3.314152 (-0.14z)| norm 0.2738 (+0.75z)| lr 6.04e-05 | 2534.35 ms | 53.3% bf16 MFU | 206974 tok/s +step 15686/19560 | loss 3.298056 (-0.50z)| norm 0.2888 (+2.02z)| lr 6.04e-05 | 2532.89 ms | 53.3% bf16 MFU | 206975 tok/s +step 15687/19560 | loss 3.292799 (-0.62z)| norm 0.2647 (-0.07z)| lr 6.03e-05 | 2532.80 ms | 53.3% bf16 MFU | 206977 tok/s +step 15688/19560 | loss 3.424070 (+2.33z)| norm 0.2801 (+1.29z)| lr 6.03e-05 | 2532.13 ms | 53.3% bf16 MFU | 206980 tok/s +step 15689/19560 | loss 3.321441 (+0.03z)| norm 0.2636 (-0.16z)| lr 6.03e-05 | 2532.24 ms | 53.3% bf16 MFU | 206984 tok/s +step 15690/19560 | loss 3.306994 (-0.30z)| norm 0.2590 (-0.56z)| lr 6.02e-05 | 2534.91 ms | 53.3% bf16 MFU | 206976 tok/s +step 15691/19560 | loss 3.331266 (+0.25z)| norm 0.2584 (-0.62z)| lr 6.02e-05 | 2532.69 ms | 53.3% bf16 MFU | 206977 tok/s +step 15692/19560 | loss 3.283249 (-0.83z)| norm 0.2882 (+1.96z)| lr 6.02e-05 | 2533.99 ms | 53.3% bf16 MFU | 206974 tok/s +step 15693/19560 | loss 3.297904 (-0.50z)| norm 0.2541 (-1.00z)| lr 6.01e-05 | 2533.86 ms | 53.3% bf16 MFU | 206971 tok/s +step 15694/19560 | loss 3.321295 (+0.03z)| norm 0.2482 (-1.48z)| lr 6.01e-05 | 2534.14 ms | 53.3% bf16 MFU | 206967 tok/s +step 15695/19560 | loss 3.263297 (-1.28z)| norm 0.2635 (-0.19z)| lr 6.01e-05 | 2533.50 ms | 53.3% bf16 MFU | 206965 tok/s +step 15696/19560 | loss 3.279236 (-0.91z)| norm 0.2672 (+0.13z)| lr 6.01e-05 | 2535.84 ms | 53.2% bf16 MFU | 206955 tok/s +step 15697/19560 | loss 3.323094 (+0.08z)| norm 0.2412 (-2.12z)| lr 6.00e-05 | 2534.18 ms | 53.3% bf16 MFU | 206951 tok/s +step 15698/19560 | loss 3.376147 (+1.28z)| norm 0.2621 (-0.31z)| lr 6.00e-05 | 2532.62 ms | 53.3% bf16 MFU | 206954 tok/s +step 15699/19560 | loss 3.271410 (-1.10z)| norm 0.2733 (+0.65z)| lr 6.00e-05 | 2532.17 ms | 53.3% bf16 MFU | 206959 tok/s +step 15700/19560 | loss 3.347274 (+0.63z)| norm 0.2479 (-1.54z)| lr 5.99e-05 | 2532.58 ms | 53.3% bf16 MFU | 206962 tok/s +step 15701/19560 | loss 3.323654 (+0.08z)| norm 0.2651 (-0.05z)| lr 5.99e-05 | 2534.35 ms | 53.3% bf16 MFU | 206958 tok/s +step 15702/19560 | loss 3.281942 (-0.86z)| norm 0.2713 (+0.47z)| lr 5.99e-05 | 2534.57 ms | 53.3% bf16 MFU | 206953 tok/s +step 15703/19560 | loss 3.370494 (+1.17z)| norm 0.2668 (+0.09z)| lr 5.98e-05 | 2536.00 ms | 53.2% bf16 MFU | 206942 tok/s +step 15704/19560 | loss 3.298716 (-0.47z)| norm 0.2591 (-0.57z)| lr 5.98e-05 | 2533.77 ms | 53.3% bf16 MFU | 206941 tok/s +step 15705/19560 | loss 3.350228 (+0.70z)| norm 0.2768 (+0.95z)| lr 5.98e-05 | 2534.30 ms | 53.3% bf16 MFU | 206938 tok/s +step 15706/19560 | loss 3.355597 (+0.81z)| norm 0.2948 (+2.44z)| lr 5.98e-05 | 2532.28 ms | 53.3% bf16 MFU | 206943 tok/s +step 15707/19560 | loss 3.351216 (+0.71z)| norm 0.2704 (+0.37z)| lr 5.97e-05 | 2532.70 ms | 53.3% bf16 MFU | 206946 tok/s +step 15708/19560 | loss 3.316997 (-0.08z)| norm 0.2885 (+1.87z)| lr 5.97e-05 | 2533.40 ms | 53.3% bf16 MFU | 206946 tok/s +step 15709/19560 | loss 3.276257 (-1.01z)| norm 0.2612 (-0.41z)| lr 5.97e-05 | 2534.91 ms | 53.3% bf16 MFU | 206940 tok/s +step 15710/19560 | loss 3.358019 (+0.85z)| norm 0.2692 (+0.25z)| lr 5.96e-05 | 2531.33 ms | 53.3% bf16 MFU | 206949 tok/s +step 15711/19560 | loss 3.337519 (+0.38z)| norm 0.2667 (+0.04z)| lr 5.96e-05 | 2534.61 ms | 53.3% bf16 MFU | 206944 tok/s +step 15712/19560 | loss 3.335215 (+0.32z)| norm 0.2725 (+0.52z)| lr 5.96e-05 | 2535.67 ms | 53.2% bf16 MFU | 206935 tok/s +step 15713/19560 | loss 3.302382 (-0.44z)| norm 0.2622 (-0.35z)| lr 5.95e-05 | 2532.74 ms | 53.3% bf16 MFU | 206939 tok/s +step 15714/19560 | loss 3.278117 (-0.99z)| norm 0.2668 (+0.05z)| lr 5.95e-05 | 2534.86 ms | 53.3% bf16 MFU | 206933 tok/s +step 15715/19560 | loss 3.330681 (+0.24z)| norm 0.2682 (+0.17z)| lr 5.95e-05 | 2533.82 ms | 53.3% bf16 MFU | 206933 tok/s +step 15716/19560 | loss 3.260393 (-1.40z)| norm 0.2649 (-0.11z)| lr 5.95e-05 | 2532.56 ms | 53.3% bf16 MFU | 206937 tok/s +step 15717/19560 | loss 3.278618 (-0.98z)| norm 0.2641 (-0.19z)| lr 5.94e-05 | 2534.22 ms | 53.3% bf16 MFU | 206934 tok/s +step 15718/19560 | loss 3.336867 (+0.39z)| norm 0.2712 (+0.42z)| lr 5.94e-05 | 2534.65 ms | 53.3% bf16 MFU | 206930 tok/s +step 15719/19560 | loss 3.284345 (-0.84z)| norm 0.2847 (+1.56z)| lr 5.94e-05 | 2531.38 ms | 53.3% bf16 MFU | 206939 tok/s +step 15720/19560 | loss 3.339008 (+0.43z)| norm 0.2677 (+0.09z)| lr 5.93e-05 | 2534.01 ms | 53.3% bf16 MFU | 206937 tok/s +step 15721/19560 | loss 3.395909 (+1.73z)| norm 0.2904 (+2.00z)| lr 5.93e-05 | 2533.43 ms | 53.3% bf16 MFU | 206938 tok/s +step 15722/19560 | loss 3.207300 (-2.57z)| norm 0.2532 (-1.16z)| lr 5.93e-05 | 2535.89 ms | 53.2% bf16 MFU | 206928 tok/s +step 15723/19560 | loss 3.322953 (+0.06z)| norm 0.2558 (-0.93z)| lr 5.92e-05 | 2532.68 ms | 53.3% bf16 MFU | 206932 tok/s +step 15724/19560 | loss 3.363093 (+0.96z)| norm 0.2804 (+1.14z)| lr 5.92e-05 | 2533.08 ms | 53.3% bf16 MFU | 206934 tok/s +step 15725/19560 | loss 3.280746 (-0.92z)| norm 0.2477 (-1.59z)| lr 5.92e-05 | 2533.16 ms | 53.3% bf16 MFU | 206936 tok/s +step 15726/19560 | loss 3.344073 (+0.54z)| norm 0.2731 (+0.54z)| lr 5.92e-05 | 2532.93 ms | 53.3% bf16 MFU | 206939 tok/s +step 15727/19560 | loss 3.440569 (+2.66z)| norm 0.2864 (+1.61z)| lr 5.91e-05 | 2533.52 ms | 53.3% bf16 MFU | 206939 tok/s +step 15728/19560 | loss 3.218603 (-2.24z)| norm 0.2736 (+0.57z)| lr 5.91e-05 | 2531.50 ms | 53.3% bf16 MFU | 206947 tok/s +step 15729/19560 | loss 3.285207 (-0.77z)| norm 0.2791 (+1.01z)| lr 5.91e-05 | 2531.86 ms | 53.3% bf16 MFU | 206954 tok/s +step 15730/19560 | loss 3.343872 (+0.51z)| norm 0.2725 (+0.45z)| lr 5.90e-05 | 2536.90 ms | 53.2% bf16 MFU | 206939 tok/s +step 15731/19560 | loss 3.251391 (-1.49z)| norm 0.2661 (-0.06z)| lr 5.90e-05 | 2533.69 ms | 53.3% bf16 MFU | 206939 tok/s +step 15732/19560 | loss 3.306998 (-0.27z)| norm 0.2699 (+0.26z)| lr 5.90e-05 | 2531.50 ms | 53.3% bf16 MFU | 206947 tok/s +step 15733/19560 | loss 3.347781 (+0.62z)| norm 0.2519 (-1.25z)| lr 5.90e-05 | 2531.92 ms | 53.3% bf16 MFU | 206953 tok/s +step 15734/19560 | loss 3.294317 (-0.54z)| norm 0.2458 (-1.73z)| lr 5.89e-05 | 2534.25 ms | 53.3% bf16 MFU | 206950 tok/s +step 15735/19560 | loss 3.322271 (+0.08z)| norm 0.2714 (+0.41z)| lr 5.89e-05 | 2533.20 ms | 53.3% bf16 MFU | 206950 tok/s +step 15736/19560 | loss 3.296442 (-0.50z)| norm 0.2764 (+0.81z)| lr 5.89e-05 | 2534.66 ms | 53.3% bf16 MFU | 206945 tok/s +step 15737/19560 | loss 3.320561 (+0.02z)| norm 0.2614 (-0.43z)| lr 5.88e-05 | 2533.56 ms | 53.3% bf16 MFU | 206945 tok/s +step 15738/19560 | loss 3.260978 (-1.30z)| norm 0.2622 (-0.35z)| lr 5.88e-05 | 2534.17 ms | 53.3% bf16 MFU | 206942 tok/s +step 15739/19560 | loss 3.321289 (+0.04z)| norm 0.2643 (-0.17z)| lr 5.88e-05 | 2533.26 ms | 53.3% bf16 MFU | 206943 tok/s +step 15740/19560 | loss 3.273250 (-1.03z)| norm 0.2599 (-0.56z)| lr 5.87e-05 | 2532.60 ms | 53.3% bf16 MFU | 206947 tok/s +step 15741/19560 | loss 3.334092 (+0.32z)| norm 0.2664 (+0.01z)| lr 5.87e-05 | 2533.30 ms | 53.3% bf16 MFU | 206947 tok/s +step 15742/19560 | loss 3.336320 (+0.39z)| norm 0.2649 (-0.12z)| lr 5.87e-05 | 2530.91 ms | 53.3% bf16 MFU | 206958 tok/s +step 15743/19560 | loss 3.316066 (-0.07z)| norm 0.2458 (-1.72z)| lr 5.87e-05 | 2533.46 ms | 53.3% bf16 MFU | 206957 tok/s +step 15744/19560 | loss 3.396897 (+1.74z)| norm 0.2566 (-0.79z)| lr 5.86e-05 | 2531.84 ms | 53.3% bf16 MFU | 206963 tok/s +step 15745/19560 | loss 3.392598 (+1.61z)| norm 0.2596 (-0.54z)| lr 5.86e-05 | 2532.49 ms | 53.3% bf16 MFU | 206966 tok/s +step 15746/19560 | loss 3.321950 (+0.03z)| norm 0.2618 (-0.35z)| lr 5.86e-05 | 2534.74 ms | 53.3% bf16 MFU | 206960 tok/s +step 15747/19560 | loss 3.245142 (-1.66z)| norm 0.2678 (+0.17z)| lr 5.85e-05 | 2532.79 ms | 53.3% bf16 MFU | 206962 tok/s +step 15748/19560 | loss 3.337847 (+0.41z)| norm 0.2663 (+0.05z)| lr 5.85e-05 | 2532.38 ms | 53.3% bf16 MFU | 206965 tok/s +step 15749/19560 | loss 3.318792 (-0.00z)| norm 0.2826 (+1.42z)| lr 5.85e-05 | 2530.97 ms | 53.3% bf16 MFU | 206975 tok/s +step 15750/19560 | loss 3.296189 (-0.52z)| norm 0.2575 (-0.71z)| lr 5.84e-05 | 2533.21 ms | 53.3% bf16 MFU | 206974 tok/s +val loss 3.305140 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3004/10042 = 0.299144 +step 15751/19560 | loss 3.503080 (+3.93z)| norm 0.3336 (+5.14z)| lr 5.84e-05 | 2531.85 ms | 53.3% bf16 MFU | 206979 tok/s +step 15752/19560 | loss 3.288787 (-0.69z)| norm 0.3016 (+2.60z)| lr 5.84e-05 | 2532.16 ms | 53.3% bf16 MFU | 206983 tok/s +step 15753/19560 | loss 3.271481 (-1.05z)| norm 0.2765 (+0.75z)| lr 5.84e-05 | 2531.19 ms | 53.3% bf16 MFU | 206990 tok/s +step 15754/19560 | loss 3.299124 (-0.45z)| norm 0.2785 (+0.89z)| lr 5.83e-05 | 2532.60 ms | 53.3% bf16 MFU | 206992 tok/s +step 15755/19560 | loss 3.298391 (-0.45z)| norm 0.3233 (+3.93z)| lr 5.83e-05 | 2533.33 ms | 53.3% bf16 MFU | 206990 tok/s +step 15756/19560 | loss 3.306010 (-0.29z)| norm 0.2870 (+1.41z)| lr 5.83e-05 | 2533.57 ms | 53.3% bf16 MFU | 206987 tok/s +step 15757/19560 | loss 3.283167 (-0.77z)| norm 0.2669 (+0.00z)| lr 5.82e-05 | 2533.48 ms | 53.3% bf16 MFU | 206985 tok/s +step 15758/19560 | loss 3.292223 (-0.58z)| norm 0.2581 (-0.61z)| lr 5.82e-05 | 2533.96 ms | 53.3% bf16 MFU | 206981 tok/s +step 15759/19560 | loss 3.266552 (-1.12z)| norm 0.2696 (+0.20z)| lr 5.82e-05 | 2533.81 ms | 53.3% bf16 MFU | 206978 tok/s +step 15760/19560 | loss 3.325429 (+0.16z)| norm 0.2826 (+1.10z)| lr 5.81e-05 | 2533.66 ms | 53.3% bf16 MFU | 206975 tok/s +step 15761/19560 | loss 3.317422 (-0.01z)| norm 0.2501 (-1.16z)| lr 5.81e-05 | 2531.79 ms | 53.3% bf16 MFU | 206981 tok/s +step 15762/19560 | loss 3.313203 (-0.10z)| norm 0.2994 (+2.23z)| lr 5.81e-05 | 2534.35 ms | 53.3% bf16 MFU | 206975 tok/s +step 15763/19560 | loss 3.321647 (+0.08z)| norm 0.2650 (-0.14z)| lr 5.81e-05 | 2533.94 ms | 53.3% bf16 MFU | 206972 tok/s +step 15764/19560 | loss 3.271180 (-1.02z)| norm 0.2697 (+0.17z)| lr 5.80e-05 | 2534.21 ms | 53.3% bf16 MFU | 206967 tok/s +step 15765/19560 | loss 3.342169 (+0.52z)| norm 0.2759 (+0.59z)| lr 5.80e-05 | 2532.72 ms | 53.3% bf16 MFU | 206969 tok/s +step 15766/19560 | loss 3.280624 (-0.81z)| norm 0.2572 (-0.70z)| lr 5.80e-05 | 2533.86 ms | 53.3% bf16 MFU | 206966 tok/s +step 15767/19560 | loss 3.312962 (-0.10z)| norm 0.2586 (-0.60z)| lr 5.79e-05 | 2533.41 ms | 53.3% bf16 MFU | 206966 tok/s +step 15768/19560 | loss 3.237077 (-1.73z)| norm 0.2635 (-0.27z)| lr 5.79e-05 | 2532.26 ms | 53.3% bf16 MFU | 206970 tok/s +step 15769/19560 | loss 3.411720 (+2.00z)| norm 0.2653 (-0.14z)| lr 5.79e-05 | 2532.00 ms | 53.3% bf16 MFU | 206974 tok/s +step 15770/19560 | loss 3.369462 (+1.08z)| norm 0.2817 (+0.99z)| lr 5.79e-05 | 2533.01 ms | 53.3% bf16 MFU | 206975 tok/s +step 15771/19560 | loss 3.363119 (+0.97z)| norm 0.2571 (-0.74z)| lr 5.78e-05 | 2533.16 ms | 53.3% bf16 MFU | 206974 tok/s +step 15772/19560 | loss 3.290949 (-0.58z)| norm 0.2787 (+0.77z)| lr 5.78e-05 | 2532.06 ms | 53.3% bf16 MFU | 206979 tok/s +step 15773/19560 | loss 3.263500 (-1.17z)| norm 0.2752 (+0.52z)| lr 5.78e-05 | 2532.79 ms | 53.3% bf16 MFU | 206980 tok/s +step 15774/19560 | loss 3.455676 (+2.95z)| norm 0.2674 (-0.03z)| lr 5.77e-05 | 2532.90 ms | 53.3% bf16 MFU | 206980 tok/s +step 15775/19560 | loss 3.312198 (-0.11z)| norm 0.2828 (+1.04z)| lr 5.77e-05 | 2533.93 ms | 53.3% bf16 MFU | 206977 tok/s +step 15776/19560 | loss 3.288425 (-0.62z)| norm 0.2659 (-0.14z)| lr 5.77e-05 | 2532.92 ms | 53.3% bf16 MFU | 206977 tok/s +step 15777/19560 | loss 3.236888 (-1.70z)| norm 0.3929 (+6.92z)| lr 5.76e-05 | 2534.23 ms | 53.3% bf16 MFU | 206973 tok/s +step 15778/19560 | loss 3.287782 (-0.60z)| norm 0.3196 (+2.73z)| lr 5.76e-05 | 2535.67 ms | 53.2% bf16 MFU | 206962 tok/s +step 15779/19560 | loss 3.390654 (+1.58z)| norm 0.3154 (+2.42z)| lr 5.76e-05 | 2534.70 ms | 53.3% bf16 MFU | 206956 tok/s +step 15780/19560 | loss 3.247814 (-1.44z)| norm 0.2779 (+0.44z)| lr 5.76e-05 | 2533.90 ms | 53.3% bf16 MFU | 206954 tok/s +step 15781/19560 | loss 3.261997 (-1.12z)| norm 0.2857 (+0.84z)| lr 5.75e-05 | 2531.17 ms | 53.3% bf16 MFU | 206963 tok/s +step 15782/19560 | loss 3.360771 (+0.95z)| norm 0.2752 (+0.28z)| lr 5.75e-05 | 2532.98 ms | 53.3% bf16 MFU | 206964 tok/s +step 15783/19560 | loss 3.275027 (-0.86z)| norm 0.2720 (+0.11z)| lr 5.75e-05 | 2531.54 ms | 53.3% bf16 MFU | 206971 tok/s +step 15784/19560 | loss 3.286533 (-0.61z)| norm 0.2869 (+0.89z)| lr 5.74e-05 | 2533.81 ms | 53.3% bf16 MFU | 206968 tok/s +step 15785/19560 | loss 3.305550 (-0.20z)| norm 0.2700 (-0.01z)| lr 5.74e-05 | 2532.44 ms | 53.3% bf16 MFU | 206971 tok/s +step 15786/19560 | loss 3.270329 (-0.93z)| norm 0.2696 (-0.04z)| lr 5.74e-05 | 2533.71 ms | 53.3% bf16 MFU | 206969 tok/s +step 15787/19560 | loss 3.302467 (-0.24z)| norm 0.2720 (+0.08z)| lr 5.74e-05 | 2534.74 ms | 53.3% bf16 MFU | 206963 tok/s +step 15788/19560 | loss 3.333771 (+0.42z)| norm 0.2654 (-0.27z)| lr 5.73e-05 | 2531.80 ms | 53.3% bf16 MFU | 206968 tok/s +step 15789/19560 | loss 3.292388 (-0.46z)| norm 0.2605 (-0.53z)| lr 5.73e-05 | 2531.97 ms | 53.3% bf16 MFU | 206973 tok/s +step 15790/19560 | loss 3.352272 (+0.80z)| norm 0.2817 (+0.60z)| lr 5.73e-05 | 2532.39 ms | 53.3% bf16 MFU | 206976 tok/s +step 15791/19560 | loss 3.344109 (+0.63z)| norm 0.2520 (-0.98z)| lr 5.72e-05 | 2532.09 ms | 53.3% bf16 MFU | 206980 tok/s +step 15792/19560 | loss 3.292364 (-0.47z)| norm 0.2571 (-0.70z)| lr 5.72e-05 | 2533.10 ms | 53.3% bf16 MFU | 206980 tok/s +step 15793/19560 | loss 3.309916 (-0.09z)| norm 0.3307 (+3.08z)| lr 5.72e-05 | 2533.06 ms | 53.3% bf16 MFU | 206980 tok/s +step 15794/19560 | loss 3.317784 (+0.07z)| norm 0.2579 (-0.66z)| lr 5.71e-05 | 2534.27 ms | 53.3% bf16 MFU | 206975 tok/s +step 15795/19560 | loss 3.356497 (+0.88z)| norm 0.2734 (+0.13z)| lr 5.71e-05 | 2532.33 ms | 53.3% bf16 MFU | 206978 tok/s +step 15796/19560 | loss 3.291550 (-0.50z)| norm 0.2691 (-0.10z)| lr 5.71e-05 | 2531.87 ms | 53.3% bf16 MFU | 206983 tok/s +step 15797/19560 | loss 3.323945 (+0.19z)| norm 0.2606 (-0.53z)| lr 5.71e-05 | 2533.54 ms | 53.3% bf16 MFU | 206981 tok/s +step 15798/19560 | loss 3.337962 (+0.50z)| norm 0.2586 (-0.64z)| lr 5.70e-05 | 2533.44 ms | 53.3% bf16 MFU | 206979 tok/s +step 15799/19560 | loss 3.289594 (-0.55z)| norm 0.2623 (-0.46z)| lr 5.70e-05 | 2531.17 ms | 53.3% bf16 MFU | 206987 tok/s +step 15800/19560 | loss 3.319987 (+0.11z)| norm 0.2889 (+0.92z)| lr 5.70e-05 | 2532.65 ms | 53.3% bf16 MFU | 206988 tok/s +step 15801/19560 | loss 3.311630 (-0.06z)| norm 0.2634 (-0.41z)| lr 5.69e-05 | 2531.83 ms | 53.3% bf16 MFU | 206992 tok/s +step 15802/19560 | loss 3.304533 (-0.22z)| norm 0.2697 (-0.08z)| lr 5.69e-05 | 2531.89 ms | 53.3% bf16 MFU | 206997 tok/s +step 15803/19560 | loss 3.376553 (+1.34z)| norm 0.2788 (+0.38z)| lr 5.69e-05 | 2533.31 ms | 53.3% bf16 MFU | 206995 tok/s +step 15804/19560 | loss 3.297861 (-0.37z)| norm 0.2671 (-0.24z)| lr 5.69e-05 | 2531.39 ms | 53.3% bf16 MFU | 207001 tok/s +step 15805/19560 | loss 3.365571 (+1.09z)| norm 0.3334 (+3.10z)| lr 5.68e-05 | 2531.89 ms | 53.3% bf16 MFU | 207004 tok/s +step 15806/19560 | loss 3.304046 (-0.23z)| norm 0.2675 (-0.23z)| lr 5.68e-05 | 2533.45 ms | 53.3% bf16 MFU | 207001 tok/s +step 15807/19560 | loss 3.292728 (-0.49z)| norm 0.2544 (-0.89z)| lr 5.68e-05 | 2531.91 ms | 53.3% bf16 MFU | 207005 tok/s +step 15808/19560 | loss 3.285407 (-0.64z)| norm 0.2876 (+0.78z)| lr 5.67e-05 | 2531.87 ms | 53.3% bf16 MFU | 207008 tok/s +step 15809/19560 | loss 3.327678 (+0.28z)| norm 0.2746 (+0.12z)| lr 5.67e-05 | 2532.29 ms | 53.3% bf16 MFU | 207010 tok/s +step 15810/19560 | loss 3.365414 (+1.09z)| norm 0.2745 (+0.10z)| lr 5.67e-05 | 2531.88 ms | 53.3% bf16 MFU | 207013 tok/s +step 15811/19560 | loss 3.322839 (+0.14z)| norm 0.2664 (-0.31z)| lr 5.67e-05 | 2532.64 ms | 53.3% bf16 MFU | 207013 tok/s +step 15812/19560 | loss 3.428632 (+2.42z)| norm 0.2890 (+0.83z)| lr 5.66e-05 | 2534.05 ms | 53.3% bf16 MFU | 207007 tok/s +step 15813/19560 | loss 3.311020 (-0.13z)| norm 0.2652 (-0.37z)| lr 5.66e-05 | 2533.79 ms | 53.3% bf16 MFU | 207003 tok/s +step 15814/19560 | loss 3.324377 (+0.16z)| norm 0.2569 (-0.78z)| lr 5.66e-05 | 2533.70 ms | 53.3% bf16 MFU | 206999 tok/s +step 15815/19560 | loss 3.273348 (-0.94z)| norm 0.2563 (-0.80z)| lr 5.65e-05 | 2533.05 ms | 53.3% bf16 MFU | 206998 tok/s +step 15816/19560 | loss 3.286867 (-0.64z)| norm 0.2670 (-0.26z)| lr 5.65e-05 | 2534.08 ms | 53.3% bf16 MFU | 206993 tok/s +step 15817/19560 | loss 3.275100 (-0.89z)| norm 0.2854 (+0.66z)| lr 5.65e-05 | 2534.46 ms | 53.3% bf16 MFU | 206986 tok/s +step 15818/19560 | loss 3.331955 (+0.36z)| norm 0.2532 (-0.96z)| lr 5.64e-05 | 2534.64 ms | 53.3% bf16 MFU | 206980 tok/s +step 15819/19560 | loss 3.408104 (+1.98z)| norm 0.2472 (-1.25z)| lr 5.64e-05 | 2531.16 ms | 53.3% bf16 MFU | 206987 tok/s +step 15820/19560 | loss 3.295285 (-0.46z)| norm 0.2636 (-0.42z)| lr 5.64e-05 | 2533.61 ms | 53.3% bf16 MFU | 206985 tok/s +step 15821/19560 | loss 3.291109 (-0.55z)| norm 0.2769 (+0.24z)| lr 5.64e-05 | 2534.44 ms | 53.3% bf16 MFU | 206979 tok/s +step 15822/19560 | loss 3.273823 (-0.91z)| norm 0.2346 (-1.87z)| lr 5.63e-05 | 2532.77 ms | 53.3% bf16 MFU | 206980 tok/s +step 15823/19560 | loss 3.321086 (+0.10z)| norm 0.2495 (-1.12z)| lr 5.63e-05 | 2530.73 ms | 53.4% bf16 MFU | 206989 tok/s +step 15824/19560 | loss 3.280819 (-0.77z)| norm 0.2685 (-0.17z)| lr 5.63e-05 | 2533.96 ms | 53.3% bf16 MFU | 206985 tok/s +step 15825/19560 | loss 3.222421 (-1.99z)| norm 0.2527 (-0.96z)| lr 5.62e-05 | 2531.99 ms | 53.3% bf16 MFU | 206989 tok/s +step 15826/19560 | loss 3.415345 (+2.10z)| norm 0.2790 (+0.34z)| lr 5.62e-05 | 2532.58 ms | 53.3% bf16 MFU | 206990 tok/s +step 15827/19560 | loss 3.298727 (-0.37z)| norm 0.2572 (-0.74z)| lr 5.62e-05 | 2531.47 ms | 53.3% bf16 MFU | 206996 tok/s +step 15828/19560 | loss 3.358299 (+0.89z)| norm 0.2510 (-1.05z)| lr 5.62e-05 | 2531.87 ms | 53.3% bf16 MFU | 207000 tok/s +step 15829/19560 | loss 3.321662 (+0.11z)| norm 0.2474 (-1.22z)| lr 5.61e-05 | 2532.67 ms | 53.3% bf16 MFU | 207001 tok/s +step 15830/19560 | loss 3.301634 (-0.31z)| norm 0.2630 (-0.44z)| lr 5.61e-05 | 2532.77 ms | 53.3% bf16 MFU | 207001 tok/s +step 15831/19560 | loss 3.304754 (-0.24z)| norm 0.2519 (-0.98z)| lr 5.61e-05 | 2532.90 ms | 53.3% bf16 MFU | 207000 tok/s +step 15832/19560 | loss 3.365432 (+1.04z)| norm 0.2534 (-0.90z)| lr 5.60e-05 | 2532.39 ms | 53.3% bf16 MFU | 207002 tok/s +step 15833/19560 | loss 3.369113 (+1.11z)| norm 0.2764 (+0.23z)| lr 5.60e-05 | 2535.14 ms | 53.3% bf16 MFU | 206992 tok/s +step 15834/19560 | loss 3.302195 (-0.30z)| norm 0.2412 (-1.48z)| lr 5.60e-05 | 2532.75 ms | 53.3% bf16 MFU | 206993 tok/s +step 15835/19560 | loss 3.308793 (-0.15z)| norm 0.2376 (-1.63z)| lr 5.60e-05 | 2531.80 ms | 53.3% bf16 MFU | 206997 tok/s +step 15836/19560 | loss 3.372666 (+1.19z)| norm 0.2546 (-0.79z)| lr 5.59e-05 | 2533.00 ms | 53.3% bf16 MFU | 206997 tok/s +step 15837/19560 | loss 3.308083 (-0.18z)| norm 0.2568 (-0.68z)| lr 5.59e-05 | 2534.12 ms | 53.3% bf16 MFU | 206991 tok/s +step 15838/19560 | loss 3.276430 (-0.83z)| norm 0.2499 (-1.00z)| lr 5.59e-05 | 2535.40 ms | 53.3% bf16 MFU | 206981 tok/s +step 15839/19560 | loss 3.335862 (+0.42z)| norm 0.2592 (-0.55z)| lr 5.58e-05 | 2534.15 ms | 53.3% bf16 MFU | 206977 tok/s +step 15840/19560 | loss 3.284334 (-0.66z)| norm 0.2816 (+0.54z)| lr 5.58e-05 | 2533.64 ms | 53.3% bf16 MFU | 206974 tok/s +step 15841/19560 | loss 3.356480 (+0.86z)| norm 0.2535 (-0.82z)| lr 5.58e-05 | 2532.03 ms | 53.3% bf16 MFU | 206979 tok/s +step 15842/19560 | loss 3.310817 (-0.11z)| norm 0.2685 (-0.10z)| lr 5.57e-05 | 2532.17 ms | 53.3% bf16 MFU | 206982 tok/s +step 15843/19560 | loss 3.286378 (-0.62z)| norm 0.2764 (+0.28z)| lr 5.57e-05 | 2535.30 ms | 53.3% bf16 MFU | 206973 tok/s +step 15844/19560 | loss 3.292652 (-0.50z)| norm 0.2489 (-1.04z)| lr 5.57e-05 | 2532.41 ms | 53.3% bf16 MFU | 206976 tok/s +step 15845/19560 | loss 3.303086 (-0.28z)| norm 0.2678 (-0.13z)| lr 5.57e-05 | 2533.56 ms | 53.3% bf16 MFU | 206974 tok/s +step 15846/19560 | loss 3.261989 (-1.14z)| norm 0.2679 (-0.12z)| lr 5.56e-05 | 2531.80 ms | 53.3% bf16 MFU | 206979 tok/s +step 15847/19560 | loss 3.261475 (-1.14z)| norm 0.2633 (-0.34z)| lr 5.56e-05 | 2531.59 ms | 53.3% bf16 MFU | 206985 tok/s +step 15848/19560 | loss 3.297637 (-0.37z)| norm 0.2763 (+0.29z)| lr 5.56e-05 | 2531.57 ms | 53.3% bf16 MFU | 206991 tok/s +step 15849/19560 | loss 3.314306 (-0.00z)| norm 0.2661 (-0.20z)| lr 5.55e-05 | 2532.53 ms | 53.3% bf16 MFU | 206992 tok/s +step 15850/19560 | loss 3.297811 (-0.38z)| norm 0.2976 (+1.31z)| lr 5.55e-05 | 2531.15 ms | 53.3% bf16 MFU | 207000 tok/s +step 15851/19560 | loss 3.306604 (-0.18z)| norm 0.2583 (-0.59z)| lr 5.55e-05 | 2532.39 ms | 53.3% bf16 MFU | 207001 tok/s +step 15852/19560 | loss 3.292962 (-0.47z)| norm 0.2559 (-0.70z)| lr 5.55e-05 | 2531.82 ms | 53.3% bf16 MFU | 207005 tok/s +step 15853/19560 | loss 3.352605 (+0.82z)| norm 0.2650 (-0.26z)| lr 5.54e-05 | 2531.24 ms | 53.3% bf16 MFU | 207011 tok/s +step 15854/19560 | loss 3.309622 (-0.11z)| norm 0.2610 (-0.45z)| lr 5.54e-05 | 2530.80 ms | 53.3% bf16 MFU | 207019 tok/s +step 15855/19560 | loss 3.368060 (+1.21z)| norm 0.2712 (+0.05z)| lr 5.54e-05 | 2530.98 ms | 53.3% bf16 MFU | 207025 tok/s +step 15856/19560 | loss 3.328453 (+0.30z)| norm 0.2673 (-0.14z)| lr 5.53e-05 | 2532.18 ms | 53.3% bf16 MFU | 207026 tok/s +step 15857/19560 | loss 3.300124 (-0.35z)| norm 0.2694 (-0.04z)| lr 5.53e-05 | 2531.98 ms | 53.3% bf16 MFU | 207028 tok/s +step 15858/19560 | loss 3.333525 (+0.42z)| norm 0.2585 (-0.56z)| lr 5.53e-05 | 2531.14 ms | 53.3% bf16 MFU | 207034 tok/s +step 15859/19560 | loss 3.250703 (-1.48z)| norm 0.2751 (+0.24z)| lr 5.53e-05 | 2534.72 ms | 53.3% bf16 MFU | 207024 tok/s +step 15860/19560 | loss 3.333860 (+0.42z)| norm 0.2717 (+0.08z)| lr 5.52e-05 | 2533.09 ms | 53.3% bf16 MFU | 207022 tok/s +step 15861/19560 | loss 3.259296 (-1.26z)| norm 0.2791 (+0.43z)| lr 5.52e-05 | 2534.34 ms | 53.3% bf16 MFU | 207014 tok/s +step 15862/19560 | loss 3.364797 (+1.13z)| norm 0.2690 (-0.07z)| lr 5.52e-05 | 2535.38 ms | 53.3% bf16 MFU | 207003 tok/s +step 15863/19560 | loss 3.303263 (-0.27z)| norm 0.2707 (+0.01z)| lr 5.51e-05 | 2533.62 ms | 53.3% bf16 MFU | 207000 tok/s +step 15864/19560 | loss 3.297556 (-0.40z)| norm 0.2647 (-0.28z)| lr 5.51e-05 | 2534.34 ms | 53.3% bf16 MFU | 206993 tok/s +step 15865/19560 | loss 3.340963 (+0.58z)| norm 0.2793 (+0.43z)| lr 5.51e-05 | 2532.14 ms | 53.3% bf16 MFU | 206996 tok/s +step 15866/19560 | loss 3.392524 (+1.72z)| norm 0.2907 (+0.97z)| lr 5.51e-05 | 2534.49 ms | 53.3% bf16 MFU | 206990 tok/s +step 15867/19560 | loss 3.348796 (+0.73z)| norm 0.2586 (-0.59z)| lr 5.50e-05 | 2531.50 ms | 53.3% bf16 MFU | 206995 tok/s +step 15868/19560 | loss 3.296012 (-0.47z)| norm 0.2854 (+0.70z)| lr 5.50e-05 | 2535.69 ms | 53.2% bf16 MFU | 206984 tok/s +step 15869/19560 | loss 3.257312 (-1.32z)| norm 0.2812 (+0.49z)| lr 5.50e-05 | 2534.53 ms | 53.3% bf16 MFU | 206977 tok/s +step 15870/19560 | loss 3.318899 (+0.07z)| norm 0.2622 (-0.43z)| lr 5.49e-05 | 2532.73 ms | 53.3% bf16 MFU | 206979 tok/s +step 15871/19560 | loss 3.249332 (-1.47z)| norm 0.2758 (+0.22z)| lr 5.49e-05 | 2533.91 ms | 53.3% bf16 MFU | 206975 tok/s +step 15872/19560 | loss 3.328390 (+0.30z)| norm 0.2674 (-0.19z)| lr 5.49e-05 | 2532.47 ms | 53.3% bf16 MFU | 206978 tok/s +step 15873/19560 | loss 3.323570 (+0.21z)| norm 0.2643 (-0.35z)| lr 5.49e-05 | 2532.81 ms | 53.3% bf16 MFU | 206979 tok/s +step 15874/19560 | loss 3.398434 (+1.88z)| norm 0.2766 (+0.25z)| lr 5.48e-05 | 2533.11 ms | 53.3% bf16 MFU | 206979 tok/s +step 15875/19560 | loss 3.301622 (-0.31z)| norm 0.2748 (+0.16z)| lr 5.48e-05 | 2535.13 ms | 53.3% bf16 MFU | 206970 tok/s +step 15876/19560 | loss 3.325558 (+0.23z)| norm 0.2702 (-0.06z)| lr 5.48e-05 | 2534.45 ms | 53.3% bf16 MFU | 206965 tok/s +step 15877/19560 | loss 3.301854 (-0.30z)| norm 0.2560 (-0.75z)| lr 5.47e-05 | 2532.53 ms | 53.3% bf16 MFU | 206968 tok/s +step 15878/19560 | loss 3.281423 (-0.76z)| norm 0.2610 (-0.51z)| lr 5.47e-05 | 2534.20 ms | 53.3% bf16 MFU | 206964 tok/s +step 15879/19560 | loss 3.371799 (+1.40z)| norm 0.2631 (-0.39z)| lr 5.47e-05 | 2533.73 ms | 53.3% bf16 MFU | 206962 tok/s +step 15880/19560 | loss 3.268811 (-1.09z)| norm 0.2751 (+0.23z)| lr 5.46e-05 | 2534.09 ms | 53.3% bf16 MFU | 206958 tok/s +step 15881/19560 | loss 3.403613 (+2.12z)| norm 0.2753 (+0.24z)| lr 5.46e-05 | 2532.20 ms | 53.3% bf16 MFU | 206963 tok/s +step 15882/19560 | loss 3.333054 (+0.43z)| norm 0.2549 (-0.79z)| lr 5.46e-05 | 2532.42 ms | 53.3% bf16 MFU | 206966 tok/s +step 15883/19560 | loss 3.271935 (-1.02z)| norm 0.2765 (+0.34z)| lr 5.46e-05 | 2532.80 ms | 53.3% bf16 MFU | 206968 tok/s +step 15884/19560 | loss 3.375468 (+1.42z)| norm 0.2937 (+1.24z)| lr 5.45e-05 | 2533.23 ms | 53.3% bf16 MFU | 206968 tok/s +step 15885/19560 | loss 3.277611 (-0.89z)| norm 0.2501 (-1.04z)| lr 5.45e-05 | 2534.53 ms | 53.3% bf16 MFU | 206962 tok/s +step 15886/19560 | loss 3.293540 (-0.52z)| norm 0.2681 (-0.10z)| lr 5.45e-05 | 2533.04 ms | 53.3% bf16 MFU | 206963 tok/s +step 15887/19560 | loss 3.316547 (+0.02z)| norm 0.2736 (+0.18z)| lr 5.44e-05 | 2532.59 ms | 53.3% bf16 MFU | 206966 tok/s +step 15888/19560 | loss 3.242047 (-1.72z)| norm 0.2778 (+0.41z)| lr 5.44e-05 | 2532.52 ms | 53.3% bf16 MFU | 206968 tok/s +step 15889/19560 | loss 3.283520 (-0.74z)| norm 0.2655 (-0.24z)| lr 5.44e-05 | 2532.96 ms | 53.3% bf16 MFU | 206969 tok/s +step 15890/19560 | loss 3.305474 (-0.22z)| norm 0.2793 (+0.49z)| lr 5.44e-05 | 2534.88 ms | 53.3% bf16 MFU | 206962 tok/s +step 15891/19560 | loss 3.272480 (-0.98z)| norm 0.2730 (+0.16z)| lr 5.43e-05 | 2533.13 ms | 53.3% bf16 MFU | 206963 tok/s +step 15892/19560 | loss 3.273662 (-0.95z)| norm 0.2629 (-0.38z)| lr 5.43e-05 | 2531.81 ms | 53.3% bf16 MFU | 206969 tok/s +step 15893/19560 | loss 3.333549 (+0.45z)| norm 0.2654 (-0.24z)| lr 5.43e-05 | 2532.29 ms | 53.3% bf16 MFU | 206972 tok/s +step 15894/19560 | loss 3.261956 (-1.22z)| norm 0.2852 (+0.80z)| lr 5.42e-05 | 2535.04 ms | 53.3% bf16 MFU | 206965 tok/s +step 15895/19560 | loss 3.325745 (+0.26z)| norm 0.3435 (+3.66z)| lr 5.42e-05 | 2531.67 ms | 53.3% bf16 MFU | 206971 tok/s +step 15896/19560 | loss 3.278481 (-0.85z)| norm 0.2731 (+0.11z)| lr 5.42e-05 | 2532.25 ms | 53.3% bf16 MFU | 206975 tok/s +step 15897/19560 | loss 3.342986 (+0.69z)| norm 0.2619 (-0.45z)| lr 5.42e-05 | 2531.51 ms | 53.3% bf16 MFU | 206981 tok/s +step 15898/19560 | loss 3.304878 (-0.21z)| norm 0.2525 (-0.91z)| lr 5.41e-05 | 2532.95 ms | 53.3% bf16 MFU | 206981 tok/s +step 15899/19560 | loss 3.313911 (+0.02z)| norm 0.2700 (-0.04z)| lr 5.41e-05 | 2533.43 ms | 53.3% bf16 MFU | 206980 tok/s +step 15900/19560 | loss 3.308944 (-0.11z)| norm 0.2542 (-0.82z)| lr 5.41e-05 | 2531.17 ms | 53.3% bf16 MFU | 206987 tok/s +step 15901/19560 | loss 3.265283 (-1.17z)| norm 0.2503 (-1.00z)| lr 5.40e-05 | 2531.82 ms | 53.3% bf16 MFU | 206992 tok/s +step 15902/19560 | loss 3.314237 (+0.05z)| norm 0.2719 (+0.08z)| lr 5.40e-05 | 2533.58 ms | 53.3% bf16 MFU | 206989 tok/s +step 15903/19560 | loss 3.317056 (+0.12z)| norm 0.2471 (-1.15z)| lr 5.40e-05 | 2534.60 ms | 53.3% bf16 MFU | 206982 tok/s +step 15904/19560 | loss 3.292160 (-0.52z)| norm 0.2543 (-0.78z)| lr 5.40e-05 | 2533.24 ms | 53.3% bf16 MFU | 206981 tok/s +step 15905/19560 | loss 3.273588 (-1.01z)| norm 0.2553 (-0.81z)| lr 5.39e-05 | 2531.41 ms | 53.3% bf16 MFU | 206988 tok/s +step 15906/19560 | loss 3.376554 (+1.62z)| norm 0.2592 (-0.57z)| lr 5.39e-05 | 2533.16 ms | 53.3% bf16 MFU | 206987 tok/s +step 15907/19560 | loss 3.324693 (+0.31z)| norm 0.2608 (-0.46z)| lr 5.39e-05 | 2533.75 ms | 53.3% bf16 MFU | 206984 tok/s +step 15908/19560 | loss 3.291221 (-0.58z)| norm 0.2484 (-1.22z)| lr 5.38e-05 | 2532.70 ms | 53.3% bf16 MFU | 206985 tok/s +step 15909/19560 | loss 3.324966 (+0.30z)| norm 0.2582 (-0.59z)| lr 5.38e-05 | 2532.38 ms | 53.3% bf16 MFU | 206987 tok/s +step 15910/19560 | loss 3.314348 (+0.03z)| norm 0.2769 (+0.59z)| lr 5.38e-05 | 2532.02 ms | 53.3% bf16 MFU | 206991 tok/s +step 15911/19560 | loss 3.230896 (-2.15z)| norm 0.2482 (-1.21z)| lr 5.38e-05 | 2534.80 ms | 53.3% bf16 MFU | 206983 tok/s +step 15912/19560 | loss 3.304717 (-0.22z)| norm 0.2683 (+0.07z)| lr 5.37e-05 | 2532.10 ms | 53.3% bf16 MFU | 206987 tok/s +step 15913/19560 | loss 3.312922 (-0.01z)| norm 0.2644 (-0.18z)| lr 5.37e-05 | 2531.52 ms | 53.3% bf16 MFU | 206993 tok/s +step 15914/19560 | loss 3.283981 (-0.77z)| norm 0.2518 (-0.97z)| lr 5.37e-05 | 2532.92 ms | 53.3% bf16 MFU | 206993 tok/s +step 15915/19560 | loss 3.306931 (-0.17z)| norm 0.2674 (+0.02z)| lr 5.36e-05 | 2532.79 ms | 53.3% bf16 MFU | 206993 tok/s +step 15916/19560 | loss 3.277007 (-0.94z)| norm 0.2776 (+0.66z)| lr 5.36e-05 | 2530.87 ms | 53.3% bf16 MFU | 207001 tok/s +step 15917/19560 | loss 3.314561 (+0.04z)| norm 0.2445 (-1.41z)| lr 5.36e-05 | 2530.65 ms | 53.4% bf16 MFU | 207010 tok/s +step 15918/19560 | loss 3.371266 (+1.52z)| norm 0.2884 (+1.33z)| lr 5.36e-05 | 2530.91 ms | 53.3% bf16 MFU | 207017 tok/s +step 15919/19560 | loss 3.312419 (-0.01z)| norm 0.2746 (+0.46z)| lr 5.35e-05 | 2530.65 ms | 53.4% bf16 MFU | 207025 tok/s +step 15920/19560 | loss 3.282546 (-0.79z)| norm 0.2569 (-0.64z)| lr 5.35e-05 | 2531.91 ms | 53.3% bf16 MFU | 207027 tok/s +step 15921/19560 | loss 3.445920 (+3.30z)| norm 0.2771 (+0.68z)| lr 5.35e-05 | 2532.22 ms | 53.3% bf16 MFU | 207028 tok/s +step 15922/19560 | loss 3.302942 (-0.27z)| norm 0.2653 (-0.10z)| lr 5.34e-05 | 2532.17 ms | 53.3% bf16 MFU | 207030 tok/s +step 15923/19560 | loss 3.304816 (-0.22z)| norm 0.2789 (+0.79z)| lr 5.34e-05 | 2533.65 ms | 53.3% bf16 MFU | 207025 tok/s +step 15924/19560 | loss 3.281236 (-0.81z)| norm 0.2695 (+0.17z)| lr 5.34e-05 | 2533.36 ms | 53.3% bf16 MFU | 207021 tok/s +step 15925/19560 | loss 3.320038 (+0.17z)| norm 0.2639 (-0.21z)| lr 5.34e-05 | 2532.56 ms | 53.3% bf16 MFU | 207021 tok/s +step 15926/19560 | loss 3.307204 (-0.15z)| norm 0.2651 (-0.13z)| lr 5.33e-05 | 2530.14 ms | 53.4% bf16 MFU | 207031 tok/s +step 15927/19560 | loss 3.325848 (+0.31z)| norm 0.2692 (+0.14z)| lr 5.33e-05 | 2531.92 ms | 53.3% bf16 MFU | 207033 tok/s +step 15928/19560 | loss 3.302429 (-0.27z)| norm 0.2423 (-1.62z)| lr 5.33e-05 | 2531.29 ms | 53.3% bf16 MFU | 207037 tok/s +step 15929/19560 | loss 3.357771 (+1.11z)| norm 0.2554 (-0.75z)| lr 5.32e-05 | 2530.81 ms | 53.3% bf16 MFU | 207043 tok/s +step 15930/19560 | loss 3.317644 (+0.10z)| norm 0.2797 (+0.85z)| lr 5.32e-05 | 2532.02 ms | 53.3% bf16 MFU | 207044 tok/s +step 15931/19560 | loss 3.237322 (-1.88z)| norm 0.2582 (-0.56z)| lr 5.32e-05 | 2532.64 ms | 53.3% bf16 MFU | 207043 tok/s +step 15932/19560 | loss 3.324746 (+0.30z)| norm 0.2791 (+0.82z)| lr 5.32e-05 | 2530.06 ms | 53.4% bf16 MFU | 207052 tok/s +step 15933/19560 | loss 3.280484 (-0.79z)| norm 0.2566 (-0.68z)| lr 5.31e-05 | 2531.18 ms | 53.3% bf16 MFU | 207056 tok/s +step 15934/19560 | loss 3.246685 (-1.61z)| norm 0.2721 (+0.43z)| lr 5.31e-05 | 2533.86 ms | 53.3% bf16 MFU | 207049 tok/s +step 15935/19560 | loss 3.272142 (-0.97z)| norm 0.2625 (-0.27z)| lr 5.31e-05 | 2532.91 ms | 53.3% bf16 MFU | 207046 tok/s +step 15936/19560 | loss 3.321149 (+0.23z)| norm 0.2566 (-0.67z)| lr 5.31e-05 | 2533.99 ms | 53.3% bf16 MFU | 207039 tok/s +step 15937/19560 | loss 3.327147 (+0.38z)| norm 0.2478 (-1.29z)| lr 5.30e-05 | 2533.87 ms | 53.3% bf16 MFU | 207032 tok/s +step 15938/19560 | loss 3.350131 (+0.96z)| norm 0.2671 (+0.10z)| lr 5.30e-05 | 2532.66 ms | 53.3% bf16 MFU | 207031 tok/s +step 15939/19560 | loss 3.263083 (-1.19z)| norm 0.2555 (-0.73z)| lr 5.30e-05 | 2533.27 ms | 53.3% bf16 MFU | 207028 tok/s +step 15940/19560 | loss 3.368513 (+1.47z)| norm 0.2509 (-1.04z)| lr 5.29e-05 | 2534.39 ms | 53.3% bf16 MFU | 207020 tok/s +step 15941/19560 | loss 3.224898 (-2.12z)| norm 0.2587 (-0.47z)| lr 5.29e-05 | 2533.81 ms | 53.3% bf16 MFU | 207015 tok/s +step 15942/19560 | loss 3.314031 (+0.10z)| norm 0.2699 (+0.33z)| lr 5.29e-05 | 2532.92 ms | 53.3% bf16 MFU | 207013 tok/s +step 15943/19560 | loss 3.356025 (+1.13z)| norm 0.2576 (-0.56z)| lr 5.29e-05 | 2532.81 ms | 53.3% bf16 MFU | 207013 tok/s +step 15944/19560 | loss 3.310193 (-0.02z)| norm 0.2449 (-1.45z)| lr 5.28e-05 | 2532.71 ms | 53.3% bf16 MFU | 207012 tok/s +step 15945/19560 | loss 3.277307 (-0.84z)| norm 0.2483 (-1.19z)| lr 5.28e-05 | 2533.48 ms | 53.3% bf16 MFU | 207009 tok/s +step 15946/19560 | loss 3.281350 (-0.72z)| norm 0.2766 (+0.83z)| lr 5.28e-05 | 2533.65 ms | 53.3% bf16 MFU | 207005 tok/s +step 15947/19560 | loss 3.246693 (-1.58z)| norm 0.2552 (-0.72z)| lr 5.27e-05 | 2533.89 ms | 53.3% bf16 MFU | 207000 tok/s +step 15948/19560 | loss 3.273659 (-0.89z)| norm 0.2445 (-1.47z)| lr 5.27e-05 | 2532.96 ms | 53.3% bf16 MFU | 207000 tok/s +step 15949/19560 | loss 3.297666 (-0.29z)| norm 0.3527 (+5.48z)| lr 5.27e-05 | 2532.72 ms | 53.3% bf16 MFU | 207000 tok/s +step 15950/19560 | loss 3.395762 (+2.13z)| norm 0.2765 (+0.67z)| lr 5.27e-05 | 2531.62 ms | 53.3% bf16 MFU | 207005 tok/s +step 15951/19560 | loss 3.314633 (+0.12z)| norm 0.3054 (+2.44z)| lr 5.26e-05 | 2531.25 ms | 53.3% bf16 MFU | 207011 tok/s +step 15952/19560 | loss 3.306636 (-0.09z)| norm 0.2676 (+0.08z)| lr 5.26e-05 | 2533.31 ms | 53.3% bf16 MFU | 207008 tok/s +step 15953/19560 | loss 3.248195 (-1.56z)| norm 0.2669 (+0.03z)| lr 5.26e-05 | 2532.89 ms | 53.3% bf16 MFU | 207007 tok/s +step 15954/19560 | loss 3.349826 (+1.03z)| norm 0.2845 (+1.12z)| lr 5.25e-05 | 2533.65 ms | 53.3% bf16 MFU | 207003 tok/s +step 15955/19560 | loss 3.281992 (-0.71z)| norm 0.2592 (-0.45z)| lr 5.25e-05 | 2534.21 ms | 53.3% bf16 MFU | 206997 tok/s +step 15956/19560 | loss 3.308622 (-0.02z)| norm 0.2550 (-0.72z)| lr 5.25e-05 | 2532.51 ms | 53.3% bf16 MFU | 206999 tok/s +step 15957/19560 | loss 3.311616 (+0.06z)| norm 0.2757 (+0.56z)| lr 5.25e-05 | 2533.67 ms | 53.3% bf16 MFU | 206995 tok/s +step 15958/19560 | loss 3.316072 (+0.17z)| norm 0.2577 (-0.56z)| lr 5.24e-05 | 2531.91 ms | 53.3% bf16 MFU | 206999 tok/s +step 15959/19560 | loss 3.285249 (-0.62z)| norm 0.2540 (-0.80z)| lr 5.24e-05 | 2531.66 ms | 53.3% bf16 MFU | 207004 tok/s +step 15960/19560 | loss 3.293944 (-0.38z)| norm 0.2472 (-1.22z)| lr 5.24e-05 | 2532.09 ms | 53.3% bf16 MFU | 207006 tok/s +step 15961/19560 | loss 3.327716 (+0.51z)| norm 0.2606 (-0.37z)| lr 5.23e-05 | 2530.67 ms | 53.4% bf16 MFU | 207015 tok/s +step 15962/19560 | loss 3.320550 (+0.32z)| norm 0.2548 (-0.75z)| lr 5.23e-05 | 2531.55 ms | 53.3% bf16 MFU | 207019 tok/s +step 15963/19560 | loss 3.363677 (+1.42z)| norm 0.2552 (-0.74z)| lr 5.23e-05 | 2532.12 ms | 53.3% bf16 MFU | 207021 tok/s +step 15964/19560 | loss 3.277343 (-0.81z)| norm 0.2741 (+0.46z)| lr 5.23e-05 | 2532.47 ms | 53.3% bf16 MFU | 207021 tok/s +step 15965/19560 | loss 3.301724 (-0.17z)| norm 0.2654 (-0.10z)| lr 5.22e-05 | 2532.43 ms | 53.3% bf16 MFU | 207022 tok/s +step 15966/19560 | loss 3.278374 (-0.78z)| norm 0.2542 (-0.83z)| lr 5.22e-05 | 2533.27 ms | 53.3% bf16 MFU | 207019 tok/s +step 15967/19560 | loss 3.319184 (+0.29z)| norm 0.2499 (-1.10z)| lr 5.22e-05 | 2531.81 ms | 53.3% bf16 MFU | 207022 tok/s +step 15968/19560 | loss 3.301192 (-0.18z)| norm 0.2607 (-0.39z)| lr 5.21e-05 | 2532.50 ms | 53.3% bf16 MFU | 207022 tok/s +step 15969/19560 | loss 3.299798 (-0.21z)| norm 0.2789 (+0.76z)| lr 5.21e-05 | 2532.97 ms | 53.3% bf16 MFU | 207020 tok/s +step 15970/19560 | loss 3.285830 (-0.57z)| norm 0.2594 (-0.49z)| lr 5.21e-05 | 2532.17 ms | 53.3% bf16 MFU | 207021 tok/s +step 15971/19560 | loss 3.265872 (-1.09z)| norm 0.2621 (-0.30z)| lr 5.21e-05 | 2530.89 ms | 53.3% bf16 MFU | 207028 tok/s +step 15972/19560 | loss 3.258335 (-1.28z)| norm 0.3052 (+2.39z)| lr 5.20e-05 | 2533.73 ms | 53.3% bf16 MFU | 207023 tok/s +step 15973/19560 | loss 3.267257 (-1.03z)| norm 0.2550 (-0.77z)| lr 5.20e-05 | 2532.53 ms | 53.3% bf16 MFU | 207023 tok/s +step 15974/19560 | loss 3.279498 (-0.72z)| norm 0.2559 (-0.70z)| lr 5.20e-05 | 2534.03 ms | 53.3% bf16 MFU | 207017 tok/s +step 15975/19560 | loss 3.338343 (+0.81z)| norm 0.2527 (-0.90z)| lr 5.19e-05 | 2535.20 ms | 53.3% bf16 MFU | 207006 tok/s +step 15976/19560 | loss 3.314952 (+0.19z)| norm 0.2705 (+0.22z)| lr 5.19e-05 | 2533.44 ms | 53.3% bf16 MFU | 207003 tok/s +step 15977/19560 | loss 3.283106 (-0.64z)| norm 0.2621 (-0.30z)| lr 5.19e-05 | 2534.91 ms | 53.3% bf16 MFU | 206994 tok/s +step 15978/19560 | loss 3.314752 (+0.19z)| norm 0.2607 (-0.38z)| lr 5.19e-05 | 2533.20 ms | 53.3% bf16 MFU | 206993 tok/s +step 15979/19560 | loss 3.275395 (-0.83z)| norm 0.2534 (-0.84z)| lr 5.18e-05 | 2534.06 ms | 53.3% bf16 MFU | 206988 tok/s +step 15980/19560 | loss 3.288974 (-0.48z)| norm 0.3020 (+2.19z)| lr 5.18e-05 | 2530.81 ms | 53.3% bf16 MFU | 206997 tok/s +step 15981/19560 | loss 3.258272 (-1.26z)| norm 0.2541 (-0.79z)| lr 5.18e-05 | 2531.41 ms | 53.3% bf16 MFU | 207003 tok/s +step 15982/19560 | loss 3.286858 (-0.51z)| norm 0.2631 (-0.23z)| lr 5.18e-05 | 2534.28 ms | 53.3% bf16 MFU | 206996 tok/s +step 15983/19560 | loss 3.292863 (-0.34z)| norm 0.2718 (+0.31z)| lr 5.17e-05 | 2533.72 ms | 53.3% bf16 MFU | 206993 tok/s +step 15984/19560 | loss 3.292394 (-0.35z)| norm 0.2470 (-1.22z)| lr 5.17e-05 | 2533.13 ms | 53.3% bf16 MFU | 206992 tok/s +step 15985/19560 | loss 3.325173 (+0.51z)| norm 0.2523 (-0.88z)| lr 5.17e-05 | 2531.80 ms | 53.3% bf16 MFU | 206996 tok/s +step 15986/19560 | loss 3.282342 (-0.61z)| norm 0.2485 (-1.11z)| lr 5.16e-05 | 2534.26 ms | 53.3% bf16 MFU | 206990 tok/s +step 15987/19560 | loss 3.269944 (-0.94z)| norm 0.2576 (-0.54z)| lr 5.16e-05 | 2532.41 ms | 53.3% bf16 MFU | 206992 tok/s +step 15988/19560 | loss 3.336171 (+0.81z)| norm 0.2665 (+0.01z)| lr 5.16e-05 | 2532.60 ms | 53.3% bf16 MFU | 206994 tok/s +step 15989/19560 | loss 3.320564 (+0.39z)| norm 0.2478 (-1.12z)| lr 5.16e-05 | 2532.76 ms | 53.3% bf16 MFU | 206994 tok/s +step 15990/19560 | loss 3.300839 (-0.12z)| norm 0.2494 (-1.01z)| lr 5.15e-05 | 2530.49 ms | 53.4% bf16 MFU | 207004 tok/s +step 15991/19560 | loss 3.322467 (+0.45z)| norm 0.2460 (-1.20z)| lr 5.15e-05 | 2533.06 ms | 53.3% bf16 MFU | 207002 tok/s +step 15992/19560 | loss 3.379740 (+1.95z)| norm 0.2628 (-0.18z)| lr 5.15e-05 | 2534.54 ms | 53.3% bf16 MFU | 206995 tok/s +step 15993/19560 | loss 3.225898 (-2.07z)| norm 0.2632 (-0.15z)| lr 5.14e-05 | 2531.23 ms | 53.3% bf16 MFU | 207002 tok/s +step 15994/19560 | loss 3.263274 (-1.09z)| norm 0.2532 (-0.75z)| lr 5.14e-05 | 2531.70 ms | 53.3% bf16 MFU | 207006 tok/s +step 15995/19560 | loss 3.290199 (-0.36z)| norm 0.2555 (-0.60z)| lr 5.14e-05 | 2532.56 ms | 53.3% bf16 MFU | 207007 tok/s +step 15996/19560 | loss 3.335058 (+0.82z)| norm 0.2482 (-1.03z)| lr 5.14e-05 | 2532.46 ms | 53.3% bf16 MFU | 207008 tok/s +step 15997/19560 | loss 3.272627 (-0.85z)| norm 0.2601 (-0.29z)| lr 5.13e-05 | 2532.67 ms | 53.3% bf16 MFU | 207008 tok/s +step 15998/19560 | loss 3.329652 (+0.67z)| norm 0.2647 (-0.01z)| lr 5.13e-05 | 2533.23 ms | 53.3% bf16 MFU | 207006 tok/s +step 15999/19560 | loss 3.279907 (-0.66z)| norm 0.2457 (-1.17z)| lr 5.13e-05 | 2534.11 ms | 53.3% bf16 MFU | 207000 tok/s +step 16000/19560 | loss 3.297193 (-0.19z)| norm 0.2622 (-0.15z)| lr 5.12e-05 | 2533.40 ms | 53.3% bf16 MFU | 206998 tok/s +val loss 3.302220 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3036/10042 = 0.302330 +step 16001/19560 | loss 3.257781 (-1.23z)| norm 0.2455 (-1.16z)| lr 5.12e-05 | 2534.75 ms | 53.3% bf16 MFU | 206990 tok/s +step 16002/19560 | loss 3.272552 (-0.83z)| norm 0.2533 (-0.67z)| lr 5.12e-05 | 2532.04 ms | 53.3% bf16 MFU | 206993 tok/s +step 16003/19560 | loss 3.326855 (+0.65z)| norm 0.2501 (-0.85z)| lr 5.12e-05 | 2530.39 ms | 53.4% bf16 MFU | 207003 tok/s +step 16004/19560 | loss 3.295998 (-0.19z)| norm 0.2567 (-0.44z)| lr 5.11e-05 | 2532.85 ms | 53.3% bf16 MFU | 207003 tok/s +step 16005/19560 | loss 3.274904 (-0.76z)| norm 0.2663 (+0.14z)| lr 5.11e-05 | 2533.34 ms | 53.3% bf16 MFU | 207001 tok/s +step 16006/19560 | loss 3.266530 (-0.98z)| norm 0.2569 (-0.43z)| lr 5.11e-05 | 2532.74 ms | 53.3% bf16 MFU | 207001 tok/s +step 16007/19560 | loss 3.285840 (-0.44z)| norm 0.2522 (-0.72z)| lr 5.11e-05 | 2530.56 ms | 53.4% bf16 MFU | 207010 tok/s +step 16008/19560 | loss 3.270461 (-0.87z)| norm 0.2622 (-0.10z)| lr 5.10e-05 | 2532.19 ms | 53.3% bf16 MFU | 207012 tok/s +step 16009/19560 | loss 3.321418 (+0.58z)| norm 0.2535 (-0.62z)| lr 5.10e-05 | 2532.46 ms | 53.3% bf16 MFU | 207013 tok/s +step 16010/19560 | loss 3.312464 (+0.33z)| norm 0.2608 (-0.18z)| lr 5.10e-05 | 2530.50 ms | 53.4% bf16 MFU | 207021 tok/s +step 16011/19560 | loss 3.357008 (+1.57z)| norm 0.2571 (-0.40z)| lr 5.09e-05 | 2533.27 ms | 53.3% bf16 MFU | 207018 tok/s +step 16012/19560 | loss 3.291634 (-0.27z)| norm 0.2571 (-0.38z)| lr 5.09e-05 | 2533.27 ms | 53.3% bf16 MFU | 207015 tok/s +step 16013/19560 | loss 3.303847 (+0.07z)| norm 0.2706 (+0.45z)| lr 5.09e-05 | 2533.02 ms | 53.3% bf16 MFU | 207014 tok/s +step 16014/19560 | loss 3.373537 (+2.04z)| norm 0.2724 (+0.56z)| lr 5.09e-05 | 2532.69 ms | 53.3% bf16 MFU | 207013 tok/s +step 16015/19560 | loss 3.372915 (+1.98z)| norm 0.2666 (+0.20z)| lr 5.08e-05 | 2532.49 ms | 53.3% bf16 MFU | 207014 tok/s +step 16016/19560 | loss 3.282992 (-0.56z)| norm 0.2567 (-0.41z)| lr 5.08e-05 | 2533.15 ms | 53.3% bf16 MFU | 207012 tok/s +step 16017/19560 | loss 3.352971 (+1.40z)| norm 0.2673 (+0.25z)| lr 5.08e-05 | 2534.20 ms | 53.3% bf16 MFU | 207005 tok/s +step 16018/19560 | loss 3.269065 (-0.95z)| norm 0.2885 (+1.56z)| lr 5.07e-05 | 2533.25 ms | 53.3% bf16 MFU | 207003 tok/s +step 16019/19560 | loss 3.282455 (-0.58z)| norm 0.2489 (-0.88z)| lr 5.07e-05 | 2533.29 ms | 53.3% bf16 MFU | 207001 tok/s +step 16020/19560 | loss 3.300041 (-0.09z)| norm 0.2541 (-0.55z)| lr 5.07e-05 | 2535.21 ms | 53.3% bf16 MFU | 206991 tok/s +step 16021/19560 | loss 3.349201 (+1.29z)| norm 0.2642 (+0.07z)| lr 5.07e-05 | 2533.41 ms | 53.3% bf16 MFU | 206989 tok/s +step 16022/19560 | loss 3.457689 (+4.02z)| norm 0.4079 (+7.02z)| lr 5.06e-05 | 2532.81 ms | 53.3% bf16 MFU | 206990 tok/s +step 16023/19560 | loss 3.279271 (-0.66z)| norm 0.2633 (-0.00z)| lr 5.06e-05 | 2534.57 ms | 53.3% bf16 MFU | 206983 tok/s +step 16024/19560 | loss 3.315428 (+0.28z)| norm 0.2730 (+0.50z)| lr 5.06e-05 | 2533.66 ms | 53.3% bf16 MFU | 206980 tok/s +step 16025/19560 | loss 3.372238 (+1.76z)| norm 0.2535 (-0.51z)| lr 5.06e-05 | 2534.54 ms | 53.3% bf16 MFU | 206974 tok/s +step 16026/19560 | loss 3.349841 (+1.16z)| norm 0.2948 (+1.60z)| lr 5.05e-05 | 2535.23 ms | 53.3% bf16 MFU | 206965 tok/s +step 16027/19560 | loss 3.282146 (-0.60z)| norm 0.2612 (-0.12z)| lr 5.05e-05 | 2534.30 ms | 53.3% bf16 MFU | 206961 tok/s +step 16028/19560 | loss 3.292397 (-0.33z)| norm 0.2595 (-0.21z)| lr 5.05e-05 | 2532.21 ms | 53.3% bf16 MFU | 206965 tok/s +step 16029/19560 | loss 3.277007 (-0.73z)| norm 0.2505 (-0.67z)| lr 5.04e-05 | 2534.30 ms | 53.3% bf16 MFU | 206961 tok/s +step 16030/19560 | loss 3.422416 (+2.93z)| norm 0.2644 (+0.05z)| lr 5.04e-05 | 2532.16 ms | 53.3% bf16 MFU | 206965 tok/s +step 16031/19560 | loss 3.329046 (+0.58z)| norm 0.2721 (+0.43z)| lr 5.04e-05 | 2532.22 ms | 53.3% bf16 MFU | 206969 tok/s +step 16032/19560 | loss 3.315677 (+0.24z)| norm 0.2593 (-0.23z)| lr 5.04e-05 | 2532.38 ms | 53.3% bf16 MFU | 206973 tok/s +step 16033/19560 | loss 3.335701 (+0.73z)| norm 0.2576 (-0.32z)| lr 5.03e-05 | 2532.69 ms | 53.3% bf16 MFU | 206974 tok/s +step 16034/19560 | loss 3.261939 (-1.11z)| norm 0.2648 (+0.05z)| lr 5.03e-05 | 2532.86 ms | 53.3% bf16 MFU | 206975 tok/s +step 16035/19560 | loss 3.419944 (+2.78z)| norm 0.2567 (-0.37z)| lr 5.03e-05 | 2531.93 ms | 53.3% bf16 MFU | 206980 tok/s +step 16036/19560 | loss 3.226587 (-1.92z)| norm 0.2862 (+1.14z)| lr 5.02e-05 | 2533.61 ms | 53.3% bf16 MFU | 206978 tok/s +step 16037/19560 | loss 3.271600 (-0.82z)| norm 0.2635 (-0.03z)| lr 5.02e-05 | 2531.08 ms | 53.3% bf16 MFU | 206986 tok/s +step 16038/19560 | loss 3.275987 (-0.71z)| norm 0.2676 (+0.18z)| lr 5.02e-05 | 2531.94 ms | 53.3% bf16 MFU | 206990 tok/s +step 16039/19560 | loss 3.272942 (-0.80z)| norm 0.2521 (-0.62z)| lr 5.02e-05 | 2533.61 ms | 53.3% bf16 MFU | 206987 tok/s +step 16040/19560 | loss 3.310843 (+0.13z)| norm 0.2467 (-0.89z)| lr 5.01e-05 | 2533.37 ms | 53.3% bf16 MFU | 206986 tok/s +step 16041/19560 | loss 3.305974 (+0.01z)| norm 0.2694 (+0.28z)| lr 5.01e-05 | 2531.99 ms | 53.3% bf16 MFU | 206990 tok/s +step 16042/19560 | loss 3.323194 (+0.42z)| norm 0.2559 (-0.42z)| lr 5.01e-05 | 2531.75 ms | 53.3% bf16 MFU | 206994 tok/s +step 16043/19560 | loss 3.326720 (+0.50z)| norm 0.2570 (-0.36z)| lr 5.01e-05 | 2534.55 ms | 53.3% bf16 MFU | 206987 tok/s +step 16044/19560 | loss 3.339799 (+0.81z)| norm 0.2557 (-0.41z)| lr 5.00e-05 | 2532.72 ms | 53.3% bf16 MFU | 206988 tok/s +step 16045/19560 | loss 3.354733 (+1.16z)| norm 0.2526 (-0.58z)| lr 5.00e-05 | 2532.73 ms | 53.3% bf16 MFU | 206989 tok/s +step 16046/19560 | loss 3.285561 (-0.50z)| norm 0.2594 (-0.22z)| lr 5.00e-05 | 2533.90 ms | 53.3% bf16 MFU | 206985 tok/s +step 16047/19560 | loss 3.315003 (+0.21z)| norm 0.2510 (-0.65z)| lr 4.99e-05 | 2533.72 ms | 53.3% bf16 MFU | 206982 tok/s +step 16048/19560 | loss 3.330932 (+0.59z)| norm 0.2581 (-0.28z)| lr 4.99e-05 | 2533.49 ms | 53.3% bf16 MFU | 206980 tok/s +step 16049/19560 | loss 3.343030 (+0.95z)| norm 0.2422 (-1.09z)| lr 4.99e-05 | 2534.78 ms | 53.3% bf16 MFU | 206973 tok/s +step 16050/19560 | loss 3.376870 (+1.78z)| norm 0.2495 (-0.70z)| lr 4.99e-05 | 2532.80 ms | 53.3% bf16 MFU | 206974 tok/s +step 16051/19560 | loss 3.341765 (+0.88z)| norm 0.2811 (+0.93z)| lr 4.98e-05 | 2532.69 ms | 53.3% bf16 MFU | 206976 tok/s +step 16052/19560 | loss 3.239489 (-1.66z)| norm 0.2651 (+0.11z)| lr 4.98e-05 | 2532.55 ms | 53.3% bf16 MFU | 206978 tok/s +step 16053/19560 | loss 3.302219 (-0.10z)| norm 0.2542 (-0.45z)| lr 4.98e-05 | 2534.05 ms | 53.3% bf16 MFU | 206974 tok/s +step 16054/19560 | loss 3.289909 (-0.40z)| norm 0.2477 (-0.78z)| lr 4.97e-05 | 2532.34 ms | 53.3% bf16 MFU | 206977 tok/s +step 16055/19560 | loss 3.322060 (+0.40z)| norm 0.2469 (-0.81z)| lr 4.97e-05 | 2533.41 ms | 53.3% bf16 MFU | 206976 tok/s +step 16056/19560 | loss 3.350506 (+1.09z)| norm 0.2737 (+0.56z)| lr 4.97e-05 | 2532.21 ms | 53.3% bf16 MFU | 206979 tok/s +step 16057/19560 | loss 3.217154 (-2.16z)| norm 0.2664 (+0.18z)| lr 4.97e-05 | 2532.78 ms | 53.3% bf16 MFU | 206981 tok/s +step 16058/19560 | loss 3.281600 (-0.57z)| norm 0.2641 (+0.06z)| lr 4.96e-05 | 2533.65 ms | 53.3% bf16 MFU | 206978 tok/s +step 16059/19560 | loss 3.312941 (+0.18z)| norm 0.2615 (-0.07z)| lr 4.96e-05 | 2533.56 ms | 53.3% bf16 MFU | 206976 tok/s +step 16060/19560 | loss 3.330974 (+0.62z)| norm 0.2539 (-0.46z)| lr 4.96e-05 | 2532.31 ms | 53.3% bf16 MFU | 206979 tok/s +step 16061/19560 | loss 3.350421 (+1.09z)| norm 0.2702 (+0.39z)| lr 4.96e-05 | 2533.40 ms | 53.3% bf16 MFU | 206978 tok/s +step 16062/19560 | loss 3.336316 (+0.73z)| norm 0.2562 (-0.34z)| lr 4.95e-05 | 2535.11 ms | 53.3% bf16 MFU | 206969 tok/s +step 16063/19560 | loss 3.362683 (+1.36z)| norm 0.2720 (+0.48z)| lr 4.95e-05 | 2530.69 ms | 53.4% bf16 MFU | 206980 tok/s +step 16064/19560 | loss 3.330056 (+0.55z)| norm 0.2715 (+0.45z)| lr 4.95e-05 | 2533.50 ms | 53.3% bf16 MFU | 206978 tok/s +step 16065/19560 | loss 3.375147 (+1.64z)| norm 0.2669 (+0.20z)| lr 4.94e-05 | 2532.02 ms | 53.3% bf16 MFU | 206982 tok/s +step 16066/19560 | loss 3.275934 (-0.77z)| norm 0.2582 (-0.25z)| lr 4.94e-05 | 2531.46 ms | 53.3% bf16 MFU | 206988 tok/s +step 16067/19560 | loss 3.318978 (+0.27z)| norm 0.2652 (+0.11z)| lr 4.94e-05 | 2534.39 ms | 53.3% bf16 MFU | 206982 tok/s +step 16068/19560 | loss 3.392669 (+2.06z)| norm 0.2910 (+1.43z)| lr 4.94e-05 | 2534.46 ms | 53.3% bf16 MFU | 206976 tok/s +step 16069/19560 | loss 3.344168 (+0.87z)| norm 0.2547 (-0.44z)| lr 4.93e-05 | 2532.31 ms | 53.3% bf16 MFU | 206980 tok/s +step 16070/19560 | loss 3.363628 (+1.33z)| norm 0.2828 (+1.00z)| lr 4.93e-05 | 2531.58 ms | 53.3% bf16 MFU | 206985 tok/s +step 16071/19560 | loss 3.322761 (+0.33z)| norm 0.2706 (+0.37z)| lr 4.93e-05 | 2531.65 ms | 53.3% bf16 MFU | 206991 tok/s +step 16072/19560 | loss 3.277709 (-0.77z)| norm 0.2486 (-0.77z)| lr 4.93e-05 | 2532.40 ms | 53.3% bf16 MFU | 206993 tok/s +step 16073/19560 | loss 3.306358 (-0.07z)| norm 0.2760 (+0.63z)| lr 4.92e-05 | 2531.85 ms | 53.3% bf16 MFU | 206997 tok/s +step 16074/19560 | loss 3.283911 (-0.62z)| norm 0.2558 (-0.40z)| lr 4.92e-05 | 2534.75 ms | 53.3% bf16 MFU | 206989 tok/s +step 16075/19560 | loss 3.343050 (+0.82z)| norm 0.2618 (-0.10z)| lr 4.92e-05 | 2533.70 ms | 53.3% bf16 MFU | 206986 tok/s +step 16076/19560 | loss 3.260762 (-1.21z)| norm 0.2786 (+0.76z)| lr 4.91e-05 | 2531.40 ms | 53.3% bf16 MFU | 206992 tok/s +step 16077/19560 | loss 3.349043 (+0.96z)| norm 0.2530 (-0.57z)| lr 4.91e-05 | 2532.19 ms | 53.3% bf16 MFU | 206995 tok/s +step 16078/19560 | loss 3.293030 (-0.41z)| norm 0.2621 (-0.05z)| lr 4.91e-05 | 2531.82 ms | 53.3% bf16 MFU | 207000 tok/s +step 16079/19560 | loss 3.256813 (-1.30z)| norm 0.2569 (-0.33z)| lr 4.91e-05 | 2533.66 ms | 53.3% bf16 MFU | 206996 tok/s +step 16080/19560 | loss 3.333313 (+0.60z)| norm 0.2711 (+0.49z)| lr 4.90e-05 | 2533.04 ms | 53.3% bf16 MFU | 206995 tok/s +step 16081/19560 | loss 3.314894 (+0.13z)| norm 0.2537 (-0.51z)| lr 4.90e-05 | 2532.64 ms | 53.3% bf16 MFU | 206996 tok/s +step 16082/19560 | loss 3.288112 (-0.53z)| norm 0.2598 (-0.15z)| lr 4.90e-05 | 2534.99 ms | 53.3% bf16 MFU | 206987 tok/s +step 16083/19560 | loss 3.330393 (+0.52z)| norm 0.2719 (+0.55z)| lr 4.90e-05 | 2533.06 ms | 53.3% bf16 MFU | 206987 tok/s +step 16084/19560 | loss 3.330151 (+0.51z)| norm 0.2553 (-0.42z)| lr 4.89e-05 | 2533.21 ms | 53.3% bf16 MFU | 206986 tok/s +step 16085/19560 | loss 3.331081 (+0.53z)| norm 0.2671 (+0.28z)| lr 4.89e-05 | 2531.53 ms | 53.3% bf16 MFU | 206992 tok/s +step 16086/19560 | loss 3.310289 (+0.01z)| norm 0.2678 (+0.31z)| lr 4.89e-05 | 2534.90 ms | 53.3% bf16 MFU | 206983 tok/s +step 16087/19560 | loss 3.360790 (+1.26z)| norm 0.2549 (-0.44z)| lr 4.88e-05 | 2531.41 ms | 53.3% bf16 MFU | 206990 tok/s +step 16088/19560 | loss 3.353549 (+1.06z)| norm 0.2861 (+1.36z)| lr 4.88e-05 | 2532.35 ms | 53.3% bf16 MFU | 206992 tok/s +step 16089/19560 | loss 3.290318 (-0.51z)| norm 0.2686 (+0.34z)| lr 4.88e-05 | 2533.90 ms | 53.3% bf16 MFU | 206988 tok/s +step 16090/19560 | loss 3.328331 (+0.44z)| norm 0.2672 (+0.25z)| lr 4.88e-05 | 2534.25 ms | 53.3% bf16 MFU | 206983 tok/s +step 16091/19560 | loss 3.323689 (+0.33z)| norm 0.2539 (-0.52z)| lr 4.87e-05 | 2531.99 ms | 53.3% bf16 MFU | 206987 tok/s +step 16092/19560 | loss 3.325312 (+0.37z)| norm 0.2547 (-0.47z)| lr 4.87e-05 | 2532.64 ms | 53.3% bf16 MFU | 206988 tok/s +step 16093/19560 | loss 3.313873 (+0.08z)| norm 0.2413 (-1.23z)| lr 4.87e-05 | 2534.54 ms | 53.3% bf16 MFU | 206982 tok/s +step 16094/19560 | loss 3.299761 (-0.28z)| norm 0.2782 (+0.89z)| lr 4.87e-05 | 2533.47 ms | 53.3% bf16 MFU | 206980 tok/s +step 16095/19560 | loss 3.272497 (-0.96z)| norm 0.2540 (-0.51z)| lr 4.86e-05 | 2531.88 ms | 53.3% bf16 MFU | 206984 tok/s +step 16096/19560 | loss 3.282173 (-0.71z)| norm 0.2408 (-1.25z)| lr 4.86e-05 | 2533.68 ms | 53.3% bf16 MFU | 206982 tok/s +step 16097/19560 | loss 3.312644 (+0.05z)| norm 0.2731 (+0.60z)| lr 4.86e-05 | 2535.40 ms | 53.3% bf16 MFU | 206972 tok/s +step 16098/19560 | loss 3.257118 (-1.33z)| norm 0.2722 (+0.55z)| lr 4.85e-05 | 2534.05 ms | 53.3% bf16 MFU | 206968 tok/s +step 16099/19560 | loss 3.257482 (-1.32z)| norm 0.2547 (-0.46z)| lr 4.85e-05 | 2531.21 ms | 53.3% bf16 MFU | 206976 tok/s +step 16100/19560 | loss 3.330478 (+0.49z)| norm 0.2590 (-0.19z)| lr 4.85e-05 | 2533.84 ms | 53.3% bf16 MFU | 206973 tok/s +step 16101/19560 | loss 3.303663 (-0.19z)| norm 0.2738 (+0.67z)| lr 4.85e-05 | 2532.50 ms | 53.3% bf16 MFU | 206976 tok/s +step 16102/19560 | loss 3.312213 (+0.02z)| norm 0.2542 (-0.48z)| lr 4.84e-05 | 2535.71 ms | 53.2% bf16 MFU | 206965 tok/s +step 16103/19560 | loss 3.311644 (+0.01z)| norm 0.2712 (+0.51z)| lr 4.84e-05 | 2535.08 ms | 53.3% bf16 MFU | 206957 tok/s +step 16104/19560 | loss 3.288481 (-0.57z)| norm 0.2708 (+0.49z)| lr 4.84e-05 | 2533.24 ms | 53.3% bf16 MFU | 206958 tok/s +step 16105/19560 | loss 3.319826 (+0.22z)| norm 0.2613 (-0.08z)| lr 4.84e-05 | 2533.47 ms | 53.3% bf16 MFU | 206957 tok/s +step 16106/19560 | loss 3.299474 (-0.30z)| norm 0.2517 (-0.63z)| lr 4.83e-05 | 2532.59 ms | 53.3% bf16 MFU | 206960 tok/s +step 16107/19560 | loss 3.283641 (-0.70z)| norm 0.2598 (-0.16z)| lr 4.83e-05 | 2534.18 ms | 53.3% bf16 MFU | 206956 tok/s +step 16108/19560 | loss 3.291454 (-0.50z)| norm 0.2563 (-0.35z)| lr 4.83e-05 | 2534.66 ms | 53.3% bf16 MFU | 206951 tok/s +step 16109/19560 | loss 3.312521 (+0.02z)| norm 0.2651 (+0.17z)| lr 4.82e-05 | 2535.33 ms | 53.3% bf16 MFU | 206943 tok/s +step 16110/19560 | loss 3.302425 (-0.24z)| norm 0.2441 (-1.08z)| lr 4.82e-05 | 2532.17 ms | 53.3% bf16 MFU | 206948 tok/s +step 16111/19560 | loss 3.300726 (-0.28z)| norm 0.2418 (-1.20z)| lr 4.82e-05 | 2533.16 ms | 53.3% bf16 MFU | 206950 tok/s +step 16112/19560 | loss 3.250324 (-1.55z)| norm 0.2563 (-0.34z)| lr 4.82e-05 | 2535.61 ms | 53.2% bf16 MFU | 206940 tok/s +step 16113/19560 | loss 3.231178 (-1.99z)| norm 0.2546 (-0.44z)| lr 4.81e-05 | 2533.23 ms | 53.3% bf16 MFU | 206942 tok/s +step 16114/19560 | loss 3.305992 (-0.12z)| norm 0.2461 (-0.94z)| lr 4.81e-05 | 2534.50 ms | 53.3% bf16 MFU | 206938 tok/s +step 16115/19560 | loss 3.249845 (-1.52z)| norm 0.2682 (+0.37z)| lr 4.81e-05 | 2532.80 ms | 53.3% bf16 MFU | 206941 tok/s +step 16116/19560 | loss 3.342144 (+0.78z)| norm 0.2571 (-0.29z)| lr 4.81e-05 | 2532.33 ms | 53.3% bf16 MFU | 206946 tok/s +step 16117/19560 | loss 3.286808 (-0.59z)| norm 0.2506 (-0.68z)| lr 4.80e-05 | 2534.15 ms | 53.3% bf16 MFU | 206943 tok/s +step 16118/19560 | loss 3.382035 (+1.74z)| norm 0.2469 (-0.90z)| lr 4.80e-05 | 2530.05 ms | 53.4% bf16 MFU | 206957 tok/s +step 16119/19560 | loss 3.299000 (-0.30z)| norm 0.2586 (-0.21z)| lr 4.80e-05 | 2532.38 ms | 53.3% bf16 MFU | 206961 tok/s +step 16120/19560 | loss 3.362302 (+1.27z)| norm 0.2448 (-1.02z)| lr 4.79e-05 | 2533.10 ms | 53.3% bf16 MFU | 206961 tok/s +step 16121/19560 | loss 3.317335 (+0.14z)| norm 0.2462 (-0.93z)| lr 4.79e-05 | 2532.46 ms | 53.3% bf16 MFU | 206965 tok/s +step 16122/19560 | loss 3.333567 (+0.54z)| norm 0.2795 (+1.04z)| lr 4.79e-05 | 2532.54 ms | 53.3% bf16 MFU | 206967 tok/s +step 16123/19560 | loss 3.294775 (-0.44z)| norm 0.2524 (-0.56z)| lr 4.79e-05 | 2533.88 ms | 53.3% bf16 MFU | 206965 tok/s +step 16124/19560 | loss 3.265933 (-1.15z)| norm 0.2541 (-0.47z)| lr 4.78e-05 | 2533.38 ms | 53.3% bf16 MFU | 206964 tok/s +step 16125/19560 | loss 3.325124 (+0.33z)| norm 0.2469 (-0.89z)| lr 4.78e-05 | 2533.02 ms | 53.3% bf16 MFU | 206965 tok/s +step 16126/19560 | loss 3.328890 (+0.42z)| norm 0.2510 (-0.64z)| lr 4.78e-05 | 2534.33 ms | 53.3% bf16 MFU | 206960 tok/s +step 16127/19560 | loss 3.321388 (+0.23z)| norm 0.2511 (-0.64z)| lr 4.78e-05 | 2534.03 ms | 53.3% bf16 MFU | 206957 tok/s +step 16128/19560 | loss 3.273939 (-0.96z)| norm 0.2644 (+0.15z)| lr 4.77e-05 | 2532.07 ms | 53.3% bf16 MFU | 206962 tok/s +step 16129/19560 | loss 3.316423 (+0.09z)| norm 0.3007 (+2.24z)| lr 4.77e-05 | 2534.76 ms | 53.3% bf16 MFU | 206956 tok/s +step 16130/19560 | loss 3.496128 (+4.28z)| norm 0.3535 (+4.78z)| lr 4.77e-05 | 2533.41 ms | 53.3% bf16 MFU | 206956 tok/s +step 16131/19560 | loss 3.252940 (-1.42z)| norm 0.2602 (-0.16z)| lr 4.76e-05 | 2534.07 ms | 53.3% bf16 MFU | 206953 tok/s +step 16132/19560 | loss 3.247944 (-1.52z)| norm 0.2709 (+0.40z)| lr 4.76e-05 | 2535.24 ms | 53.3% bf16 MFU | 206945 tok/s +step 16133/19560 | loss 3.324622 (+0.25z)| norm 0.2685 (+0.27z)| lr 4.76e-05 | 2533.76 ms | 53.3% bf16 MFU | 206944 tok/s +step 16134/19560 | loss 3.734299 (+7.37z)| norm 0.3031 (+2.05z)| lr 4.76e-05 | 2533.52 ms | 53.3% bf16 MFU | 206944 tok/s +step 16135/19560 | loss 3.306883 (-0.19z)| norm 0.2751 (+0.58z)| lr 4.75e-05 | 2534.45 ms | 53.3% bf16 MFU | 206940 tok/s +step 16136/19560 | loss 3.368328 (+0.88z)| norm 0.2914 (+1.41z)| lr 4.75e-05 | 2532.92 ms | 53.3% bf16 MFU | 206942 tok/s +step 16137/19560 | loss 3.322461 (+0.07z)| norm 0.2657 (+0.08z)| lr 4.75e-05 | 2533.58 ms | 53.3% bf16 MFU | 206942 tok/s +step 16138/19560 | loss 3.215498 (-1.79z)| norm 0.2602 (-0.21z)| lr 4.75e-05 | 2535.56 ms | 53.2% bf16 MFU | 206934 tok/s +step 16139/19560 | loss 3.236399 (-1.40z)| norm 0.2701 (+0.30z)| lr 4.74e-05 | 2534.52 ms | 53.3% bf16 MFU | 206930 tok/s +step 16140/19560 | loss 3.304542 (-0.21z)| norm 0.2757 (+0.59z)| lr 4.74e-05 | 2532.62 ms | 53.3% bf16 MFU | 206934 tok/s +step 16141/19560 | loss 3.319294 (+0.04z)| norm 0.2808 (+0.84z)| lr 4.74e-05 | 2534.32 ms | 53.3% bf16 MFU | 206931 tok/s +step 16142/19560 | loss 3.378515 (+1.07z)| norm 0.2735 (+0.47z)| lr 4.74e-05 | 2530.51 ms | 53.4% bf16 MFU | 206944 tok/s +step 16143/19560 | loss 3.303746 (-0.22z)| norm 0.2575 (-0.36z)| lr 4.73e-05 | 2534.30 ms | 53.3% bf16 MFU | 206940 tok/s +step 16144/19560 | loss 3.336815 (+0.35z)| norm 0.3065 (+2.11z)| lr 4.73e-05 | 2532.41 ms | 53.3% bf16 MFU | 206945 tok/s +step 16145/19560 | loss 3.305310 (-0.20z)| norm 0.2562 (-0.43z)| lr 4.73e-05 | 2533.21 ms | 53.3% bf16 MFU | 206946 tok/s +step 16146/19560 | loss 3.390355 (+1.27z)| norm 0.2641 (-0.02z)| lr 4.72e-05 | 2533.71 ms | 53.3% bf16 MFU | 206945 tok/s +step 16147/19560 | loss 3.293369 (-0.42z)| norm 0.2691 (+0.23z)| lr 4.72e-05 | 2533.40 ms | 53.3% bf16 MFU | 206945 tok/s +step 16148/19560 | loss 3.266260 (-0.89z)| norm 0.2735 (+0.44z)| lr 4.72e-05 | 2532.54 ms | 53.3% bf16 MFU | 206949 tok/s +step 16149/19560 | loss 3.326738 (+0.17z)| norm 0.2581 (-0.34z)| lr 4.72e-05 | 2532.34 ms | 53.3% bf16 MFU | 206953 tok/s +step 16150/19560 | loss 3.319915 (+0.07z)| norm 0.2640 (+0.02z)| lr 4.71e-05 | 2533.86 ms | 53.3% bf16 MFU | 206951 tok/s +step 16151/19560 | loss 3.318954 (+0.05z)| norm 0.2651 (+0.10z)| lr 4.71e-05 | 2532.66 ms | 53.3% bf16 MFU | 206954 tok/s +step 16152/19560 | loss 3.296871 (-0.34z)| norm 0.2597 (-0.26z)| lr 4.71e-05 | 2535.03 ms | 53.3% bf16 MFU | 206948 tok/s +step 16153/19560 | loss 3.349434 (+0.60z)| norm 0.2547 (-0.60z)| lr 4.71e-05 | 2532.67 ms | 53.3% bf16 MFU | 206951 tok/s +step 16154/19560 | loss 3.278304 (-0.66z)| norm 0.2519 (-0.77z)| lr 4.70e-05 | 2532.79 ms | 53.3% bf16 MFU | 206953 tok/s +step 16155/19560 | loss 3.367805 (+0.92z)| norm 0.2467 (-1.11z)| lr 4.70e-05 | 2533.57 ms | 53.3% bf16 MFU | 206952 tok/s +step 16156/19560 | loss 3.437160 (+2.10z)| norm 0.2583 (-0.33z)| lr 4.70e-05 | 2534.15 ms | 53.3% bf16 MFU | 206949 tok/s +step 16157/19560 | loss 3.355184 (+0.65z)| norm 0.2526 (-0.71z)| lr 4.69e-05 | 2532.48 ms | 53.3% bf16 MFU | 206953 tok/s +step 16158/19560 | loss 3.315923 (-0.02z)| norm 0.2458 (-1.16z)| lr 4.69e-05 | 2532.68 ms | 53.3% bf16 MFU | 206956 tok/s +step 16159/19560 | loss 3.265475 (-0.91z)| norm 0.2678 (+0.33z)| lr 4.69e-05 | 2533.94 ms | 53.3% bf16 MFU | 206953 tok/s +step 16160/19560 | loss 3.256763 (-1.05z)| norm 0.2655 (+0.17z)| lr 4.69e-05 | 2532.70 ms | 53.3% bf16 MFU | 206956 tok/s +step 16161/19560 | loss 3.270184 (-0.80z)| norm 0.2544 (-0.58z)| lr 4.68e-05 | 2534.66 ms | 53.3% bf16 MFU | 206951 tok/s +step 16162/19560 | loss 3.318736 (+0.05z)| norm 0.2499 (-0.87z)| lr 4.68e-05 | 2532.55 ms | 53.3% bf16 MFU | 206954 tok/s +step 16163/19560 | loss 3.238262 (-1.36z)| norm 0.2484 (-0.97z)| lr 4.68e-05 | 2533.80 ms | 53.3% bf16 MFU | 206952 tok/s +step 16164/19560 | loss 3.286821 (-0.51z)| norm 0.2621 (-0.03z)| lr 4.68e-05 | 2532.64 ms | 53.3% bf16 MFU | 206955 tok/s +step 16165/19560 | loss 3.222032 (-1.65z)| norm 0.2635 (+0.06z)| lr 4.67e-05 | 2534.01 ms | 53.3% bf16 MFU | 206952 tok/s +step 16166/19560 | loss 3.282358 (-0.58z)| norm 0.2628 (+0.01z)| lr 4.67e-05 | 2532.05 ms | 53.3% bf16 MFU | 206958 tok/s +step 16167/19560 | loss 3.290050 (-0.44z)| norm 0.2561 (-0.44z)| lr 4.67e-05 | 2533.41 ms | 53.3% bf16 MFU | 206957 tok/s +step 16168/19560 | loss 3.349343 (+0.61z)| norm 0.2542 (-0.57z)| lr 4.67e-05 | 2531.27 ms | 53.3% bf16 MFU | 206966 tok/s +step 16169/19560 | loss 3.357408 (+0.74z)| norm 0.2800 (+1.17z)| lr 4.66e-05 | 2534.03 ms | 53.3% bf16 MFU | 206962 tok/s +step 16170/19560 | loss 3.312199 (-0.06z)| norm 0.2572 (-0.37z)| lr 4.66e-05 | 2532.11 ms | 53.3% bf16 MFU | 206967 tok/s +step 16171/19560 | loss 3.366287 (+0.89z)| norm 0.2686 (+0.39z)| lr 4.66e-05 | 2533.00 ms | 53.3% bf16 MFU | 206968 tok/s +step 16172/19560 | loss 3.331804 (+0.28z)| norm 0.2628 (-0.01z)| lr 4.65e-05 | 2533.02 ms | 53.3% bf16 MFU | 206969 tok/s +step 16173/19560 | loss 3.251849 (-1.11z)| norm 0.2545 (-0.57z)| lr 4.65e-05 | 2532.91 ms | 53.3% bf16 MFU | 206970 tok/s +step 16174/19560 | loss 3.359886 (+0.78z)| norm 0.2530 (-0.67z)| lr 4.65e-05 | 2533.01 ms | 53.3% bf16 MFU | 206970 tok/s +step 16175/19560 | loss 3.300797 (-0.26z)| norm 0.2532 (-0.66z)| lr 4.65e-05 | 2533.04 ms | 53.3% bf16 MFU | 206971 tok/s +step 16176/19560 | loss 3.269636 (-0.80z)| norm 0.2662 (+0.22z)| lr 4.64e-05 | 2532.11 ms | 53.3% bf16 MFU | 206975 tok/s +step 16177/19560 | loss 3.300615 (-0.25z)| norm 0.2494 (-0.93z)| lr 4.64e-05 | 2532.25 ms | 53.3% bf16 MFU | 206978 tok/s +step 16178/19560 | loss 3.323580 (+0.17z)| norm 0.2637 (+0.04z)| lr 4.64e-05 | 2535.83 ms | 53.2% bf16 MFU | 206967 tok/s +step 16179/19560 | loss 3.247548 (-1.16z)| norm 0.2500 (-0.88z)| lr 4.64e-05 | 2532.81 ms | 53.3% bf16 MFU | 206969 tok/s +step 16180/19560 | loss 3.310413 (-0.06z)| norm 0.2697 (+0.46z)| lr 4.63e-05 | 2533.69 ms | 53.3% bf16 MFU | 206967 tok/s +step 16181/19560 | loss 3.337548 (+0.41z)| norm 0.2611 (-0.13z)| lr 4.63e-05 | 2531.73 ms | 53.3% bf16 MFU | 206973 tok/s +step 16182/19560 | loss 3.301735 (-0.22z)| norm 0.2568 (-0.43z)| lr 4.63e-05 | 2533.64 ms | 53.3% bf16 MFU | 206971 tok/s +step 16183/19560 | loss 3.273063 (-0.72z)| norm 0.2674 (+0.29z)| lr 4.63e-05 | 2531.65 ms | 53.3% bf16 MFU | 206977 tok/s +step 16184/19560 | loss 3.295930 (-0.31z)| norm 0.2704 (+0.50z)| lr 4.62e-05 | 2535.38 ms | 53.3% bf16 MFU | 206967 tok/s +step 16185/19560 | loss 3.325727 (+0.20z)| norm 0.2688 (+0.39z)| lr 4.62e-05 | 2535.19 ms | 53.3% bf16 MFU | 206959 tok/s +step 16186/19560 | loss 3.347054 (+0.58z)| norm 0.2772 (+0.96z)| lr 4.62e-05 | 2534.40 ms | 53.3% bf16 MFU | 206955 tok/s +step 16187/19560 | loss 3.296632 (-0.33z)| norm 0.2794 (+1.10z)| lr 4.61e-05 | 2532.23 ms | 53.3% bf16 MFU | 206959 tok/s +step 16188/19560 | loss 3.288599 (-0.46z)| norm 0.2729 (+0.64z)| lr 4.61e-05 | 2534.88 ms | 53.3% bf16 MFU | 206953 tok/s +step 16189/19560 | loss 3.291304 (-0.41z)| norm 0.2438 (-1.33z)| lr 4.61e-05 | 2533.43 ms | 53.3% bf16 MFU | 206952 tok/s +step 16190/19560 | loss 3.284625 (-0.52z)| norm 0.2758 (+0.83z)| lr 4.61e-05 | 2531.98 ms | 53.3% bf16 MFU | 206958 tok/s +step 16191/19560 | loss 3.292998 (-0.36z)| norm 0.2705 (+0.48z)| lr 4.60e-05 | 2532.82 ms | 53.3% bf16 MFU | 206960 tok/s +step 16192/19560 | loss 3.304230 (-0.15z)| norm 0.2563 (-0.48z)| lr 4.60e-05 | 2534.47 ms | 53.3% bf16 MFU | 206955 tok/s +step 16193/19560 | loss 3.260092 (-0.93z)| norm 0.2745 (+0.75z)| lr 4.60e-05 | 2535.09 ms | 53.3% bf16 MFU | 206948 tok/s +step 16194/19560 | loss 3.330196 (+0.32z)| norm 0.2670 (+0.24z)| lr 4.60e-05 | 2534.33 ms | 53.3% bf16 MFU | 206944 tok/s +step 16195/19560 | loss 3.357453 (+0.80z)| norm 0.2624 (-0.07z)| lr 4.59e-05 | 2533.42 ms | 53.3% bf16 MFU | 206945 tok/s +step 16196/19560 | loss 3.244564 (-1.21z)| norm 0.2854 (+1.50z)| lr 4.59e-05 | 2533.31 ms | 53.3% bf16 MFU | 206945 tok/s +step 16197/19560 | loss 3.359653 (+0.86z)| norm 0.2827 (+1.29z)| lr 4.59e-05 | 2534.49 ms | 53.3% bf16 MFU | 206941 tok/s +step 16198/19560 | loss 3.283360 (-0.50z)| norm 0.2599 (-0.25z)| lr 4.59e-05 | 2533.82 ms | 53.3% bf16 MFU | 206940 tok/s +step 16199/19560 | loss 3.317139 (+0.11z)| norm 0.2540 (-0.64z)| lr 4.58e-05 | 2532.65 ms | 53.3% bf16 MFU | 206943 tok/s +step 16200/19560 | loss 3.300770 (-0.19z)| norm 0.2558 (-0.52z)| lr 4.58e-05 | 2533.05 ms | 53.3% bf16 MFU | 206945 tok/s +step 16201/19560 | loss 3.341100 (+0.54z)| norm 0.2650 (+0.11z)| lr 4.58e-05 | 2533.24 ms | 53.3% bf16 MFU | 206946 tok/s +step 16202/19560 | loss 3.302833 (-0.16z)| norm 0.2596 (-0.26z)| lr 4.57e-05 | 2534.23 ms | 53.3% bf16 MFU | 206943 tok/s +step 16203/19560 | loss 3.282089 (-0.52z)| norm 0.2540 (-0.64z)| lr 4.57e-05 | 2533.37 ms | 53.3% bf16 MFU | 206943 tok/s +step 16204/19560 | loss 3.317154 (+0.10z)| norm 0.2554 (-0.53z)| lr 4.57e-05 | 2532.58 ms | 53.3% bf16 MFU | 206947 tok/s +step 16205/19560 | loss 3.331697 (+0.37z)| norm 0.2593 (-0.27z)| lr 4.57e-05 | 2536.09 ms | 53.2% bf16 MFU | 206936 tok/s +step 16206/19560 | loss 3.317751 (+0.11z)| norm 0.2466 (-1.13z)| lr 4.56e-05 | 2534.03 ms | 53.3% bf16 MFU | 206934 tok/s +step 16207/19560 | loss 3.295296 (-0.30z)| norm 0.2692 (+0.42z)| lr 4.56e-05 | 2536.08 ms | 53.2% bf16 MFU | 206924 tok/s +step 16208/19560 | loss 3.285030 (-0.48z)| norm 0.2666 (+0.24z)| lr 4.56e-05 | 2534.20 ms | 53.3% bf16 MFU | 206922 tok/s +step 16209/19560 | loss 3.294686 (-0.30z)| norm 0.2606 (-0.18z)| lr 4.56e-05 | 2534.77 ms | 53.3% bf16 MFU | 206918 tok/s +step 16210/19560 | loss 3.284732 (-0.48z)| norm 0.2590 (-0.29z)| lr 4.55e-05 | 2534.92 ms | 53.3% bf16 MFU | 206914 tok/s +step 16211/19560 | loss 3.336989 (+0.47z)| norm 0.2566 (-0.45z)| lr 4.55e-05 | 2532.42 ms | 53.3% bf16 MFU | 206919 tok/s +step 16212/19560 | loss 3.274171 (-0.67z)| norm 0.2582 (-0.34z)| lr 4.55e-05 | 2533.64 ms | 53.3% bf16 MFU | 206920 tok/s +step 16213/19560 | loss 3.220346 (-1.61z)| norm 0.2559 (-0.49z)| lr 4.55e-05 | 2536.04 ms | 53.2% bf16 MFU | 206911 tok/s +step 16214/19560 | loss 3.238827 (-1.26z)| norm 0.2734 (+0.71z)| lr 4.54e-05 | 2536.23 ms | 53.2% bf16 MFU | 206901 tok/s +step 16215/19560 | loss 3.296530 (-0.22z)| norm 0.2448 (-1.24z)| lr 4.54e-05 | 2534.43 ms | 53.3% bf16 MFU | 206899 tok/s +step 16216/19560 | loss 3.355846 (+0.84z)| norm 0.2799 (+1.17z)| lr 4.54e-05 | 2532.08 ms | 53.3% bf16 MFU | 206907 tok/s +step 16217/19560 | loss 3.364800 (+0.99z)| norm 0.2600 (-0.19z)| lr 4.54e-05 | 2535.22 ms | 53.3% bf16 MFU | 206902 tok/s +step 16218/19560 | loss 3.287999 (-0.38z)| norm 0.2569 (-0.40z)| lr 4.53e-05 | 2534.28 ms | 53.3% bf16 MFU | 206901 tok/s +step 16219/19560 | loss 3.321918 (+0.23z)| norm 0.2619 (-0.06z)| lr 4.53e-05 | 2531.03 ms | 53.3% bf16 MFU | 206913 tok/s +step 16220/19560 | loss 3.312307 (+0.06z)| norm 0.2563 (-0.45z)| lr 4.53e-05 | 2531.65 ms | 53.3% bf16 MFU | 206922 tok/s +step 16221/19560 | loss 3.295455 (-0.24z)| norm 0.2461 (-1.16z)| lr 4.52e-05 | 2535.03 ms | 53.3% bf16 MFU | 206917 tok/s +step 16222/19560 | loss 3.341781 (+0.58z)| norm 0.2622 (-0.03z)| lr 4.52e-05 | 2532.18 ms | 53.3% bf16 MFU | 206923 tok/s +step 16223/19560 | loss 3.415977 (+1.86z)| norm 0.2476 (-1.04z)| lr 4.52e-05 | 2532.84 ms | 53.3% bf16 MFU | 206927 tok/s +step 16224/19560 | loss 3.299565 (-0.20z)| norm 0.2774 (+1.00z)| lr 4.52e-05 | 2532.11 ms | 53.3% bf16 MFU | 206934 tok/s +step 16225/19560 | loss 3.300200 (-0.18z)| norm 0.2507 (-0.84z)| lr 4.51e-05 | 2532.72 ms | 53.3% bf16 MFU | 206937 tok/s +step 16226/19560 | loss 3.255748 (-0.97z)| norm 0.2562 (-0.45z)| lr 4.51e-05 | 2532.99 ms | 53.3% bf16 MFU | 206939 tok/s +step 16227/19560 | loss 3.296561 (-0.25z)| norm 0.2429 (-1.36z)| lr 4.51e-05 | 2531.92 ms | 53.3% bf16 MFU | 206946 tok/s +step 16228/19560 | loss 3.387534 (+1.34z)| norm 0.2511 (-0.79z)| lr 4.51e-05 | 2532.84 ms | 53.3% bf16 MFU | 206949 tok/s +step 16229/19560 | loss 3.315135 (+0.07z)| norm 0.2558 (-0.45z)| lr 4.50e-05 | 2533.52 ms | 53.3% bf16 MFU | 206948 tok/s +step 16230/19560 | loss 3.356256 (+0.78z)| norm 0.2409 (-1.46z)| lr 4.50e-05 | 2532.60 ms | 53.3% bf16 MFU | 206952 tok/s +step 16231/19560 | loss 3.431027 (+2.04z)| norm 0.2643 (+0.14z)| lr 4.50e-05 | 2533.48 ms | 53.3% bf16 MFU | 206951 tok/s +step 16232/19560 | loss 3.311861 (-0.02z)| norm 0.2594 (-0.19z)| lr 4.50e-05 | 2532.67 ms | 53.3% bf16 MFU | 206954 tok/s +step 16233/19560 | loss 3.342397 (+0.51z)| norm 0.2565 (-0.38z)| lr 4.49e-05 | 2533.81 ms | 53.3% bf16 MFU | 206952 tok/s +step 16234/19560 | loss 3.337399 (+0.42z)| norm 0.2560 (-0.42z)| lr 4.49e-05 | 2532.86 ms | 53.3% bf16 MFU | 206954 tok/s +step 16235/19560 | loss 3.333997 (+0.35z)| norm 0.2541 (-0.55z)| lr 4.49e-05 | 2532.44 ms | 53.3% bf16 MFU | 206958 tok/s +step 16236/19560 | loss 3.273410 (-0.69z)| norm 0.2634 (+0.09z)| lr 4.48e-05 | 2533.77 ms | 53.3% bf16 MFU | 206956 tok/s +step 16237/19560 | loss 3.331545 (+0.31z)| norm 0.2573 (-0.33z)| lr 4.48e-05 | 2533.44 ms | 53.3% bf16 MFU | 206956 tok/s +step 16238/19560 | loss 3.297651 (-0.28z)| norm 0.2578 (-0.30z)| lr 4.48e-05 | 2531.84 ms | 53.3% bf16 MFU | 206962 tok/s +step 16239/19560 | loss 3.281925 (-0.54z)| norm 0.2541 (-0.57z)| lr 4.48e-05 | 2534.28 ms | 53.3% bf16 MFU | 206958 tok/s +step 16240/19560 | loss 3.348464 (+0.59z)| norm 0.2675 (+0.36z)| lr 4.47e-05 | 2533.49 ms | 53.3% bf16 MFU | 206957 tok/s +step 16241/19560 | loss 3.318212 (+0.06z)| norm 0.2639 (+0.10z)| lr 4.47e-05 | 2534.34 ms | 53.3% bf16 MFU | 206953 tok/s +step 16242/19560 | loss 3.320038 (+0.09z)| norm 0.2540 (-0.59z)| lr 4.47e-05 | 2532.04 ms | 53.3% bf16 MFU | 206958 tok/s +step 16243/19560 | loss 3.339666 (+0.42z)| norm 0.2513 (-0.77z)| lr 4.47e-05 | 2534.13 ms | 53.3% bf16 MFU | 206955 tok/s +step 16244/19560 | loss 3.319070 (+0.06z)| norm 0.2498 (-0.87z)| lr 4.46e-05 | 2531.31 ms | 53.3% bf16 MFU | 206963 tok/s +step 16245/19560 | loss 3.369963 (+0.94z)| norm 0.2614 (-0.07z)| lr 4.46e-05 | 2532.64 ms | 53.3% bf16 MFU | 206966 tok/s +step 16246/19560 | loss 3.317088 (+0.02z)| norm 0.2788 (+1.13z)| lr 4.46e-05 | 2531.46 ms | 53.3% bf16 MFU | 206973 tok/s +step 16247/19560 | loss 3.335093 (+0.33z)| norm 0.2721 (+0.66z)| lr 4.46e-05 | 2531.59 ms | 53.3% bf16 MFU | 206979 tok/s +step 16248/19560 | loss 3.290389 (-0.44z)| norm 0.2747 (+0.82z)| lr 4.45e-05 | 2532.60 ms | 53.3% bf16 MFU | 206981 tok/s +step 16249/19560 | loss 3.304912 (-0.18z)| norm 0.2784 (+1.06z)| lr 4.45e-05 | 2533.00 ms | 53.3% bf16 MFU | 206981 tok/s +step 16250/19560 | loss 3.370338 (+0.96z)| norm 0.2659 (+0.20z)| lr 4.45e-05 | 2532.78 ms | 53.3% bf16 MFU | 206982 tok/s +val loss 3.300011 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3027/10042 = 0.301434 +step 16251/19560 | loss 3.318798 (+0.05z)| norm 0.2663 (+0.22z)| lr 4.45e-05 | 2536.05 ms | 53.2% bf16 MFU | 206970 tok/s +step 16252/19560 | loss 3.327355 (+0.19z)| norm 0.2734 (+0.71z)| lr 4.44e-05 | 2534.27 ms | 53.3% bf16 MFU | 206965 tok/s +step 16253/19560 | loss 3.298651 (-0.31z)| norm 0.2780 (+1.02z)| lr 4.44e-05 | 2532.64 ms | 53.3% bf16 MFU | 206967 tok/s +step 16254/19560 | loss 3.320920 (+0.09z)| norm 0.2531 (-0.74z)| lr 4.44e-05 | 2532.58 ms | 53.3% bf16 MFU | 206970 tok/s +step 16255/19560 | loss 3.299872 (-0.28z)| norm 0.2740 (+0.73z)| lr 4.44e-05 | 2532.02 ms | 53.3% bf16 MFU | 206975 tok/s +step 16256/19560 | loss 3.314063 (-0.04z)| norm 0.2674 (+0.25z)| lr 4.43e-05 | 2532.63 ms | 53.3% bf16 MFU | 206977 tok/s +step 16257/19560 | loss 3.290217 (-0.45z)| norm 0.2657 (+0.15z)| lr 4.43e-05 | 2531.95 ms | 53.3% bf16 MFU | 206981 tok/s +step 16258/19560 | loss 3.310697 (-0.07z)| norm 0.2793 (+1.44z)| lr 4.43e-05 | 2532.90 ms | 53.3% bf16 MFU | 206982 tok/s +step 16259/19560 | loss 3.251725 (-1.15z)| norm 0.2729 (+0.87z)| lr 4.42e-05 | 2534.92 ms | 53.3% bf16 MFU | 206974 tok/s +step 16260/19560 | loss 3.278921 (-0.66z)| norm 0.2693 (+0.56z)| lr 4.42e-05 | 2534.14 ms | 53.3% bf16 MFU | 206970 tok/s +step 16261/19560 | loss 3.333283 (+0.34z)| norm 0.2506 (-1.09z)| lr 4.42e-05 | 2531.55 ms | 53.3% bf16 MFU | 206976 tok/s +step 16262/19560 | loss 3.301490 (-0.25z)| norm 0.2685 (+0.54z)| lr 4.42e-05 | 2533.54 ms | 53.3% bf16 MFU | 206974 tok/s +step 16263/19560 | loss 3.303524 (-0.20z)| norm 0.2739 (+1.05z)| lr 4.41e-05 | 2533.89 ms | 53.3% bf16 MFU | 206971 tok/s +step 16264/19560 | loss 3.290655 (-0.51z)| norm 0.2467 (-1.47z)| lr 4.41e-05 | 2533.83 ms | 53.3% bf16 MFU | 206968 tok/s +step 16265/19560 | loss 3.250017 (-1.52z)| norm 0.2599 (-0.23z)| lr 4.41e-05 | 2533.97 ms | 53.3% bf16 MFU | 206965 tok/s +step 16266/19560 | loss 3.273833 (-0.95z)| norm 0.2692 (+0.65z)| lr 4.41e-05 | 2534.54 ms | 53.3% bf16 MFU | 206960 tok/s +step 16267/19560 | loss 3.255125 (-1.44z)| norm 0.2571 (-0.49z)| lr 4.40e-05 | 2532.36 ms | 53.3% bf16 MFU | 206964 tok/s +step 16268/19560 | loss 3.359223 (+1.23z)| norm 0.2567 (-0.51z)| lr 4.40e-05 | 2532.20 ms | 53.3% bf16 MFU | 206968 tok/s +step 16269/19560 | loss 3.345129 (+0.86z)| norm 0.2473 (-1.39z)| lr 4.40e-05 | 2532.65 ms | 53.3% bf16 MFU | 206970 tok/s +step 16270/19560 | loss 3.359657 (+1.24z)| norm 0.2676 (+0.56z)| lr 4.40e-05 | 2533.16 ms | 53.3% bf16 MFU | 206970 tok/s +step 16271/19560 | loss 3.353182 (+1.06z)| norm 0.2826 (+1.96z)| lr 4.39e-05 | 2532.50 ms | 53.3% bf16 MFU | 206973 tok/s +step 16272/19560 | loss 3.357366 (+1.16z)| norm 0.2563 (-0.54z)| lr 4.39e-05 | 2533.06 ms | 53.3% bf16 MFU | 206973 tok/s +step 16273/19560 | loss 3.398152 (+2.14z)| norm 0.2665 (+0.50z)| lr 4.39e-05 | 2531.98 ms | 53.3% bf16 MFU | 206978 tok/s +step 16274/19560 | loss 3.322063 (+0.25z)| norm 0.2505 (-1.12z)| lr 4.39e-05 | 2534.42 ms | 53.3% bf16 MFU | 206972 tok/s +step 16275/19560 | loss 3.375404 (+1.58z)| norm 0.2680 (+0.66z)| lr 4.38e-05 | 2533.15 ms | 53.3% bf16 MFU | 206972 tok/s +step 16276/19560 | loss 3.328773 (+0.39z)| norm 0.2739 (+1.25z)| lr 4.38e-05 | 2531.76 ms | 53.3% bf16 MFU | 206978 tok/s +step 16277/19560 | loss 3.390612 (+1.92z)| norm 0.2568 (-0.48z)| lr 4.38e-05 | 2534.45 ms | 53.3% bf16 MFU | 206972 tok/s +step 16278/19560 | loss 3.341672 (+0.69z)| norm 0.2516 (-0.99z)| lr 4.38e-05 | 2532.97 ms | 53.3% bf16 MFU | 206973 tok/s +step 16279/19560 | loss 3.276974 (-0.91z)| norm 0.2546 (-0.69z)| lr 4.37e-05 | 2533.87 ms | 53.3% bf16 MFU | 206970 tok/s +step 16280/19560 | loss 3.410409 (+2.33z)| norm 0.2634 (+0.21z)| lr 4.37e-05 | 2532.87 ms | 53.3% bf16 MFU | 206971 tok/s +step 16281/19560 | loss 3.335394 (+0.51z)| norm 0.2653 (+0.39z)| lr 4.37e-05 | 2531.85 ms | 53.3% bf16 MFU | 206976 tok/s +step 16282/19560 | loss 3.283813 (-0.75z)| norm 0.2546 (-0.70z)| lr 4.36e-05 | 2532.65 ms | 53.3% bf16 MFU | 206978 tok/s +step 16283/19560 | loss 3.319492 (+0.14z)| norm 0.2564 (-0.52z)| lr 4.36e-05 | 2533.28 ms | 53.3% bf16 MFU | 206977 tok/s +step 16284/19560 | loss 3.290928 (-0.56z)| norm 0.2553 (-0.64z)| lr 4.36e-05 | 2535.04 ms | 53.3% bf16 MFU | 206969 tok/s +step 16285/19560 | loss 3.374746 (+1.56z)| norm 0.2649 (+0.33z)| lr 4.36e-05 | 2534.01 ms | 53.3% bf16 MFU | 206966 tok/s +step 16286/19560 | loss 3.286058 (-0.68z)| norm 0.2724 (+1.09z)| lr 4.35e-05 | 2534.89 ms | 53.3% bf16 MFU | 206959 tok/s +step 16287/19560 | loss 3.256835 (-1.41z)| norm 0.2513 (-1.07z)| lr 4.35e-05 | 2534.42 ms | 53.3% bf16 MFU | 206954 tok/s +step 16288/19560 | loss 3.328668 (+0.39z)| norm 0.2625 (+0.08z)| lr 4.35e-05 | 2533.60 ms | 53.3% bf16 MFU | 206953 tok/s +step 16289/19560 | loss 3.263424 (-1.26z)| norm 0.2716 (+1.00z)| lr 4.35e-05 | 2534.27 ms | 53.3% bf16 MFU | 206949 tok/s +step 16290/19560 | loss 3.353374 (+1.01z)| norm 0.2538 (-0.83z)| lr 4.34e-05 | 2533.11 ms | 53.3% bf16 MFU | 206951 tok/s +step 16291/19560 | loss 3.246764 (-1.69z)| norm 0.2535 (-0.86z)| lr 4.34e-05 | 2533.56 ms | 53.3% bf16 MFU | 206950 tok/s +step 16292/19560 | loss 3.258397 (-1.38z)| norm 0.2495 (-1.26z)| lr 4.34e-05 | 2534.32 ms | 53.3% bf16 MFU | 206946 tok/s +step 16293/19560 | loss 3.331064 (+0.43z)| norm 0.2676 (+0.59z)| lr 4.34e-05 | 2533.51 ms | 53.3% bf16 MFU | 206946 tok/s +step 16294/19560 | loss 3.251997 (-1.58z)| norm 0.2449 (-1.70z)| lr 4.33e-05 | 2533.61 ms | 53.3% bf16 MFU | 206945 tok/s +step 16295/19560 | loss 3.382753 (+1.72z)| norm 0.2520 (-0.97z)| lr 4.33e-05 | 2533.55 ms | 53.3% bf16 MFU | 206945 tok/s +step 16296/19560 | loss 3.332784 (+0.46z)| norm 0.2651 (+0.34z)| lr 4.33e-05 | 2534.15 ms | 53.3% bf16 MFU | 206942 tok/s +step 16297/19560 | loss 3.255101 (-1.48z)| norm 0.2516 (-1.01z)| lr 4.33e-05 | 2533.33 ms | 53.3% bf16 MFU | 206943 tok/s +step 16298/19560 | loss 3.242612 (-1.76z)| norm 0.2533 (-0.84z)| lr 4.32e-05 | 2532.54 ms | 53.3% bf16 MFU | 206947 tok/s +step 16299/19560 | loss 3.342869 (+0.75z)| norm 0.2721 (+1.08z)| lr 4.32e-05 | 2534.22 ms | 53.3% bf16 MFU | 206944 tok/s +step 16300/19560 | loss 3.340552 (+0.69z)| norm 0.2441 (-1.73z)| lr 4.32e-05 | 2533.89 ms | 53.3% bf16 MFU | 206942 tok/s +step 16301/19560 | loss 3.305950 (-0.19z)| norm 0.2509 (-1.05z)| lr 4.32e-05 | 2532.34 ms | 53.3% bf16 MFU | 206947 tok/s +step 16302/19560 | loss 3.335258 (+0.56z)| norm 0.2568 (-0.45z)| lr 4.31e-05 | 2532.63 ms | 53.3% bf16 MFU | 206950 tok/s +step 16303/19560 | loss 3.286329 (-0.68z)| norm 0.2528 (-0.86z)| lr 4.31e-05 | 2532.86 ms | 53.3% bf16 MFU | 206952 tok/s +step 16304/19560 | loss 3.353448 (+1.00z)| norm 0.2520 (-0.92z)| lr 4.31e-05 | 2531.47 ms | 53.3% bf16 MFU | 206960 tok/s +step 16305/19560 | loss 3.359080 (+1.13z)| norm 0.2635 (+0.22z)| lr 4.31e-05 | 2531.95 ms | 53.3% bf16 MFU | 206965 tok/s +step 16306/19560 | loss 3.432939 (+2.87z)| norm 0.2755 (+1.40z)| lr 4.30e-05 | 2532.73 ms | 53.3% bf16 MFU | 206967 tok/s +step 16307/19560 | loss 3.275833 (-0.97z)| norm 0.2505 (-1.09z)| lr 4.30e-05 | 2532.63 ms | 53.3% bf16 MFU | 206970 tok/s +step 16308/19560 | loss 3.468809 (+3.55z)| norm 0.2811 (+1.94z)| lr 4.30e-05 | 2533.28 ms | 53.3% bf16 MFU | 206969 tok/s +step 16309/19560 | loss 3.296849 (-0.45z)| norm 0.2611 (-0.04z)| lr 4.29e-05 | 2532.48 ms | 53.3% bf16 MFU | 206972 tok/s +step 16310/19560 | loss 3.339535 (+0.53z)| norm 0.2607 (-0.09z)| lr 4.29e-05 | 2534.45 ms | 53.3% bf16 MFU | 206967 tok/s +step 16311/19560 | loss 3.320262 (+0.08z)| norm 0.2684 (+0.68z)| lr 4.29e-05 | 2533.08 ms | 53.3% bf16 MFU | 206967 tok/s +step 16312/19560 | loss 3.348316 (+0.72z)| norm 0.3879 (+8.35z)| lr 4.29e-05 | 2531.37 ms | 53.3% bf16 MFU | 206975 tok/s +step 16313/19560 | loss 3.283535 (-0.78z)| norm 0.2811 (+1.22z)| lr 4.28e-05 | 2532.87 ms | 53.3% bf16 MFU | 206976 tok/s +step 16314/19560 | loss 3.375942 (+1.36z)| norm 0.2601 (-0.16z)| lr 4.28e-05 | 2534.83 ms | 53.3% bf16 MFU | 206968 tok/s +step 16315/19560 | loss 3.335871 (+0.42z)| norm 0.2680 (+0.38z)| lr 4.28e-05 | 2533.01 ms | 53.3% bf16 MFU | 206969 tok/s +step 16316/19560 | loss 3.312646 (-0.12z)| norm 0.2775 (+1.01z)| lr 4.28e-05 | 2532.31 ms | 53.3% bf16 MFU | 206973 tok/s +step 16317/19560 | loss 3.325906 (+0.18z)| norm 0.2617 (-0.06z)| lr 4.27e-05 | 2534.80 ms | 53.3% bf16 MFU | 206966 tok/s +step 16318/19560 | loss 3.316508 (-0.04z)| norm 0.2589 (-0.24z)| lr 4.27e-05 | 2531.54 ms | 53.3% bf16 MFU | 206973 tok/s +step 16319/19560 | loss 3.350526 (+0.74z)| norm 0.2806 (+1.21z)| lr 4.27e-05 | 2532.16 ms | 53.3% bf16 MFU | 206977 tok/s +step 16320/19560 | loss 3.343635 (+0.57z)| norm 0.2626 (+0.00z)| lr 4.27e-05 | 2533.37 ms | 53.3% bf16 MFU | 206975 tok/s +step 16321/19560 | loss 3.285193 (-0.80z)| norm 0.2525 (-0.66z)| lr 4.26e-05 | 2531.10 ms | 53.3% bf16 MFU | 206984 tok/s +step 16322/19560 | loss 3.307914 (-0.26z)| norm 0.2615 (-0.06z)| lr 4.26e-05 | 2533.20 ms | 53.3% bf16 MFU | 206983 tok/s +step 16323/19560 | loss 3.411332 (+2.12z)| norm 0.2573 (-0.34z)| lr 4.26e-05 | 2533.34 ms | 53.3% bf16 MFU | 206981 tok/s +step 16324/19560 | loss 3.330408 (+0.24z)| norm 0.2536 (-0.57z)| lr 4.26e-05 | 2533.65 ms | 53.3% bf16 MFU | 206979 tok/s +step 16325/19560 | loss 3.322174 (+0.05z)| norm 0.2624 (+0.03z)| lr 4.25e-05 | 2533.92 ms | 53.3% bf16 MFU | 206975 tok/s +step 16326/19560 | loss 3.316206 (-0.09z)| norm 0.2509 (-0.74z)| lr 4.25e-05 | 2532.01 ms | 53.3% bf16 MFU | 206980 tok/s +step 16327/19560 | loss 3.331297 (+0.26z)| norm 0.2398 (-1.48z)| lr 4.25e-05 | 2533.94 ms | 53.3% bf16 MFU | 206976 tok/s +step 16328/19560 | loss 3.313007 (-0.17z)| norm 0.2560 (-0.39z)| lr 4.25e-05 | 2534.46 ms | 53.3% bf16 MFU | 206970 tok/s +step 16329/19560 | loss 3.328790 (+0.20z)| norm 0.2501 (-0.77z)| lr 4.24e-05 | 2532.86 ms | 53.3% bf16 MFU | 206972 tok/s +step 16330/19560 | loss 3.285445 (-0.81z)| norm 0.2675 (+0.39z)| lr 4.24e-05 | 2534.92 ms | 53.3% bf16 MFU | 206964 tok/s +step 16331/19560 | loss 3.302300 (-0.42z)| norm 0.2686 (+0.46z)| lr 4.24e-05 | 2534.68 ms | 53.3% bf16 MFU | 206958 tok/s +step 16332/19560 | loss 3.346199 (+0.60z)| norm 0.2416 (-1.34z)| lr 4.24e-05 | 2532.13 ms | 53.3% bf16 MFU | 206963 tok/s +step 16333/19560 | loss 3.341989 (+0.50z)| norm 0.2614 (-0.02z)| lr 4.23e-05 | 2531.56 ms | 53.3% bf16 MFU | 206970 tok/s +step 16334/19560 | loss 3.318007 (-0.06z)| norm 0.2638 (+0.13z)| lr 4.23e-05 | 2532.01 ms | 53.3% bf16 MFU | 206975 tok/s +step 16335/19560 | loss 3.245605 (-1.73z)| norm 0.2490 (-0.85z)| lr 4.23e-05 | 2533.40 ms | 53.3% bf16 MFU | 206973 tok/s +step 16336/19560 | loss 3.327938 (+0.17z)| norm 0.2491 (-0.83z)| lr 4.23e-05 | 2533.18 ms | 53.3% bf16 MFU | 206973 tok/s +step 16337/19560 | loss 3.373306 (+1.21z)| norm 0.2720 (+0.70z)| lr 4.22e-05 | 2533.30 ms | 53.3% bf16 MFU | 206972 tok/s +step 16338/19560 | loss 3.265192 (-1.29z)| norm 0.2730 (+0.75z)| lr 4.22e-05 | 2533.47 ms | 53.3% bf16 MFU | 206971 tok/s +step 16339/19560 | loss 3.325344 (+0.10z)| norm 0.2681 (+0.42z)| lr 4.22e-05 | 2533.07 ms | 53.3% bf16 MFU | 206971 tok/s +step 16340/19560 | loss 3.293417 (-0.64z)| norm 0.2475 (-0.94z)| lr 4.22e-05 | 2532.12 ms | 53.3% bf16 MFU | 206976 tok/s +step 16341/19560 | loss 3.294572 (-0.64z)| norm 0.2455 (-1.07z)| lr 4.21e-05 | 2532.97 ms | 53.3% bf16 MFU | 206976 tok/s +step 16342/19560 | loss 3.361670 (+0.94z)| norm 0.2743 (+0.84z)| lr 4.21e-05 | 2532.26 ms | 53.3% bf16 MFU | 206979 tok/s +step 16343/19560 | loss 3.346027 (+0.55z)| norm 0.2517 (-0.67z)| lr 4.21e-05 | 2531.97 ms | 53.3% bf16 MFU | 206984 tok/s +step 16344/19560 | loss 3.480794 (+3.57z)| norm 0.2715 (+0.65z)| lr 4.21e-05 | 2532.79 ms | 53.3% bf16 MFU | 206985 tok/s +step 16345/19560 | loss 3.377360 (+1.21z)| norm 0.2576 (-0.27z)| lr 4.20e-05 | 2532.03 ms | 53.3% bf16 MFU | 206988 tok/s +step 16346/19560 | loss 3.255567 (-1.54z)| norm 0.3200 (+3.65z)| lr 4.20e-05 | 2533.32 ms | 53.3% bf16 MFU | 206987 tok/s +step 16347/19560 | loss 3.316088 (-0.17z)| norm 0.2763 (+0.88z)| lr 4.20e-05 | 2532.18 ms | 53.3% bf16 MFU | 206990 tok/s +step 16348/19560 | loss 3.314437 (-0.21z)| norm 0.2705 (+0.51z)| lr 4.20e-05 | 2534.67 ms | 53.3% bf16 MFU | 206983 tok/s +step 16349/19560 | loss 3.267634 (-1.25z)| norm 0.2641 (+0.10z)| lr 4.19e-05 | 2533.82 ms | 53.3% bf16 MFU | 206980 tok/s +step 16350/19560 | loss 3.323130 (-0.01z)| norm 0.2606 (-0.11z)| lr 4.19e-05 | 2533.27 ms | 53.3% bf16 MFU | 206979 tok/s +step 16351/19560 | loss 3.364243 (+0.94z)| norm 0.2745 (+0.74z)| lr 4.19e-05 | 2532.06 ms | 53.3% bf16 MFU | 206983 tok/s +step 16352/19560 | loss 3.341363 (+0.41z)| norm 0.2862 (+1.47z)| lr 4.18e-05 | 2532.17 ms | 53.3% bf16 MFU | 206986 tok/s +step 16353/19560 | loss 3.329820 (+0.14z)| norm 0.2540 (-0.55z)| lr 4.18e-05 | 2531.60 ms | 53.3% bf16 MFU | 206992 tok/s +step 16354/19560 | loss 3.378232 (+1.23z)| norm 0.2490 (-0.86z)| lr 4.18e-05 | 2532.54 ms | 53.3% bf16 MFU | 206993 tok/s +step 16355/19560 | loss 3.272842 (-1.17z)| norm 0.2604 (-0.15z)| lr 4.18e-05 | 2533.32 ms | 53.3% bf16 MFU | 206991 tok/s +step 16356/19560 | loss 3.355300 (+0.72z)| norm 0.2608 (-0.14z)| lr 4.17e-05 | 2533.23 ms | 53.3% bf16 MFU | 206990 tok/s +step 16357/19560 | loss 3.401336 (+1.74z)| norm 0.2723 (+0.58z)| lr 4.17e-05 | 2533.38 ms | 53.3% bf16 MFU | 206988 tok/s +step 16358/19560 | loss 3.419223 (+2.10z)| norm 0.2467 (-1.04z)| lr 4.17e-05 | 2532.78 ms | 53.3% bf16 MFU | 206989 tok/s +step 16359/19560 | loss 3.288159 (-0.82z)| norm 0.2607 (-0.15z)| lr 4.17e-05 | 2533.84 ms | 53.3% bf16 MFU | 206985 tok/s +step 16360/19560 | loss 3.404704 (+1.80z)| norm 0.3117 (+2.95z)| lr 4.16e-05 | 2533.55 ms | 53.3% bf16 MFU | 206983 tok/s +step 16361/19560 | loss 3.278678 (-1.02z)| norm 0.2379 (-1.54z)| lr 4.16e-05 | 2534.02 ms | 53.3% bf16 MFU | 206978 tok/s +step 16362/19560 | loss 3.369611 (+1.01z)| norm 0.2622 (-0.07z)| lr 4.16e-05 | 2532.36 ms | 53.3% bf16 MFU | 206981 tok/s +step 16363/19560 | loss 3.340622 (+0.36z)| norm 0.2701 (+0.40z)| lr 4.16e-05 | 2533.68 ms | 53.3% bf16 MFU | 206979 tok/s +step 16364/19560 | loss 3.358899 (+0.75z)| norm 0.2581 (-0.33z)| lr 4.15e-05 | 2535.15 ms | 53.3% bf16 MFU | 206970 tok/s +step 16365/19560 | loss 3.298289 (-0.60z)| norm 0.2451 (-1.10z)| lr 4.15e-05 | 2534.77 ms | 53.3% bf16 MFU | 206963 tok/s +step 16366/19560 | loss 3.249162 (-1.67z)| norm 0.2733 (+0.59z)| lr 4.15e-05 | 2533.00 ms | 53.3% bf16 MFU | 206964 tok/s +step 16367/19560 | loss 3.341035 (+0.35z)| norm 0.2444 (-1.14z)| lr 4.15e-05 | 2532.98 ms | 53.3% bf16 MFU | 206965 tok/s +step 16368/19560 | loss 3.296630 (-0.62z)| norm 0.2516 (-0.70z)| lr 4.14e-05 | 2534.17 ms | 53.3% bf16 MFU | 206962 tok/s +step 16369/19560 | loss 3.377373 (+1.15z)| norm 0.2668 (+0.21z)| lr 4.14e-05 | 2534.78 ms | 53.3% bf16 MFU | 206955 tok/s +step 16370/19560 | loss 3.284518 (-0.89z)| norm 0.2513 (-0.72z)| lr 4.14e-05 | 2533.60 ms | 53.3% bf16 MFU | 206954 tok/s +step 16371/19560 | loss 3.360888 (+0.79z)| norm 0.2572 (-0.37z)| lr 4.14e-05 | 2533.17 ms | 53.3% bf16 MFU | 206955 tok/s +step 16372/19560 | loss 3.427006 (+2.18z)| norm 0.2441 (-1.15z)| lr 4.13e-05 | 2535.18 ms | 53.3% bf16 MFU | 206947 tok/s +step 16373/19560 | loss 3.310005 (-0.33z)| norm 0.2865 (+1.37z)| lr 4.13e-05 | 2532.21 ms | 53.3% bf16 MFU | 206952 tok/s +step 16374/19560 | loss 3.303729 (-0.47z)| norm 0.2639 (+0.03z)| lr 4.13e-05 | 2534.44 ms | 53.3% bf16 MFU | 206948 tok/s +step 16375/19560 | loss 3.380906 (+1.19z)| norm 0.2915 (+1.65z)| lr 4.13e-05 | 2531.20 ms | 53.3% bf16 MFU | 206957 tok/s +step 16376/19560 | loss 3.335473 (+0.20z)| norm 0.2566 (-0.40z)| lr 4.12e-05 | 2533.98 ms | 53.3% bf16 MFU | 206955 tok/s +step 16377/19560 | loss 3.355405 (+0.62z)| norm 0.2728 (+0.56z)| lr 4.12e-05 | 2532.31 ms | 53.3% bf16 MFU | 206959 tok/s +step 16378/19560 | loss 3.338878 (+0.27z)| norm 0.2579 (-0.32z)| lr 4.12e-05 | 2532.85 ms | 53.3% bf16 MFU | 206961 tok/s +step 16379/19560 | loss 3.390491 (+1.37z)| norm 0.2587 (-0.27z)| lr 4.12e-05 | 2534.98 ms | 53.3% bf16 MFU | 206954 tok/s +step 16380/19560 | loss 3.374579 (+1.01z)| norm 0.2674 (+0.25z)| lr 4.11e-05 | 2532.34 ms | 53.3% bf16 MFU | 206958 tok/s +step 16381/19560 | loss 3.342209 (+0.31z)| norm 0.2588 (-0.25z)| lr 4.11e-05 | 2534.98 ms | 53.3% bf16 MFU | 206951 tok/s +step 16382/19560 | loss 3.339079 (+0.25z)| norm 0.2570 (-0.36z)| lr 4.11e-05 | 2532.28 ms | 53.3% bf16 MFU | 206956 tok/s +step 16383/19560 | loss 3.304261 (-0.50z)| norm 0.2567 (-0.37z)| lr 4.11e-05 | 2533.37 ms | 53.3% bf16 MFU | 206955 tok/s +step 16384/19560 | loss 3.309796 (-0.38z)| norm 0.2695 (+0.39z)| lr 4.10e-05 | 2532.32 ms | 53.3% bf16 MFU | 206960 tok/s +step 16385/19560 | loss 3.364543 (+0.78z)| norm 0.2452 (-1.04z)| lr 4.10e-05 | 2533.82 ms | 53.3% bf16 MFU | 206957 tok/s +step 16386/19560 | loss 3.260836 (-1.42z)| norm 0.2752 (+0.74z)| lr 4.10e-05 | 2534.62 ms | 53.3% bf16 MFU | 206952 tok/s +step 16387/19560 | loss 3.364859 (+0.77z)| norm 0.2595 (-0.19z)| lr 4.10e-05 | 2535.12 ms | 53.3% bf16 MFU | 206945 tok/s +step 16388/19560 | loss 3.320078 (-0.19z)| norm 0.2670 (+0.26z)| lr 4.09e-05 | 2532.09 ms | 53.3% bf16 MFU | 206951 tok/s +step 16389/19560 | loss 3.307896 (-0.45z)| norm 0.2719 (+0.54z)| lr 4.09e-05 | 2534.93 ms | 53.3% bf16 MFU | 206944 tok/s +step 16390/19560 | loss 3.417024 (+1.85z)| norm 0.2571 (-0.33z)| lr 4.09e-05 | 2532.39 ms | 53.3% bf16 MFU | 206949 tok/s +step 16391/19560 | loss 3.300125 (-0.63z)| norm 0.2639 (+0.07z)| lr 4.09e-05 | 2533.06 ms | 53.3% bf16 MFU | 206950 tok/s +step 16392/19560 | loss 3.561834 (+4.48z)| norm 0.3085 (+2.64z)| lr 4.08e-05 | 2533.03 ms | 53.3% bf16 MFU | 206952 tok/s +step 16393/19560 | loss 3.357355 (+0.49z)| norm 0.2918 (+1.64z)| lr 4.08e-05 | 2534.13 ms | 53.3% bf16 MFU | 206949 tok/s +step 16394/19560 | loss 3.306054 (-0.53z)| norm 0.2642 (+0.05z)| lr 4.08e-05 | 2533.93 ms | 53.3% bf16 MFU | 206947 tok/s +step 16395/19560 | loss 3.298520 (-0.69z)| norm 0.2662 (+0.16z)| lr 4.08e-05 | 2534.20 ms | 53.3% bf16 MFU | 206943 tok/s +step 16396/19560 | loss 3.388373 (+1.09z)| norm 0.2795 (+0.92z)| lr 4.07e-05 | 2535.74 ms | 53.2% bf16 MFU | 206934 tok/s +step 16397/19560 | loss 3.353575 (+0.40z)| norm 0.2604 (-0.19z)| lr 4.07e-05 | 2532.08 ms | 53.3% bf16 MFU | 206940 tok/s +step 16398/19560 | loss 3.499949 (+3.15z)| norm 0.2887 (+1.42z)| lr 4.07e-05 | 2532.63 ms | 53.3% bf16 MFU | 206944 tok/s +step 16399/19560 | loss 3.413507 (+1.48z)| norm 0.2599 (-0.22z)| lr 4.07e-05 | 2533.11 ms | 53.3% bf16 MFU | 206946 tok/s +step 16400/19560 | loss 3.291232 (-0.82z)| norm 0.2816 (+1.02z)| lr 4.06e-05 | 2532.44 ms | 53.3% bf16 MFU | 206950 tok/s +step 16401/19560 | loss 3.332487 (-0.03z)| norm 0.2649 (+0.06z)| lr 4.06e-05 | 2532.43 ms | 53.3% bf16 MFU | 206954 tok/s +step 16402/19560 | loss 3.529579 (+3.50z)| norm 0.3473 (+4.38z)| lr 4.06e-05 | 2532.30 ms | 53.3% bf16 MFU | 206958 tok/s +step 16403/19560 | loss 3.321725 (-0.24z)| norm 0.2971 (+1.69z)| lr 4.06e-05 | 2532.66 ms | 53.3% bf16 MFU | 206961 tok/s +step 16404/19560 | loss 3.284543 (-0.91z)| norm 0.2844 (+1.02z)| lr 4.05e-05 | 2535.49 ms | 53.3% bf16 MFU | 206952 tok/s +step 16405/19560 | loss 3.286805 (-0.85z)| norm 0.2676 (+0.14z)| lr 4.05e-05 | 2532.79 ms | 53.3% bf16 MFU | 206954 tok/s +step 16406/19560 | loss 3.323667 (-0.19z)| norm 0.2648 (-0.02z)| lr 4.05e-05 | 2532.51 ms | 53.3% bf16 MFU | 206958 tok/s +step 16407/19560 | loss 3.352023 (+0.32z)| norm 0.2960 (+1.58z)| lr 4.05e-05 | 2534.23 ms | 53.3% bf16 MFU | 206954 tok/s +step 16408/19560 | loss 3.295888 (-0.69z)| norm 0.2637 (-0.09z)| lr 4.04e-05 | 2532.16 ms | 53.3% bf16 MFU | 206959 tok/s +step 16409/19560 | loss 3.311402 (-0.40z)| norm 0.2469 (-0.95z)| lr 4.04e-05 | 2535.42 ms | 53.3% bf16 MFU | 206950 tok/s +step 16410/19560 | loss 3.369681 (+0.65z)| norm 0.2680 (+0.14z)| lr 4.04e-05 | 2532.00 ms | 53.3% bf16 MFU | 206956 tok/s +step 16411/19560 | loss 3.313426 (-0.38z)| norm 0.2613 (-0.21z)| lr 4.04e-05 | 2531.54 ms | 53.3% bf16 MFU | 206963 tok/s +step 16412/19560 | loss 3.335508 (+0.02z)| norm 0.2637 (-0.09z)| lr 4.03e-05 | 2528.70 ms | 53.4% bf16 MFU | 206982 tok/s +step 16413/19560 | loss 3.317705 (-0.30z)| norm 0.2433 (-1.14z)| lr 4.03e-05 | 2530.64 ms | 53.4% bf16 MFU | 206991 tok/s +step 16414/19560 | loss 3.309678 (-0.45z)| norm 0.2516 (-0.70z)| lr 4.03e-05 | 2531.11 ms | 53.3% bf16 MFU | 206999 tok/s +step 16415/19560 | loss 3.266391 (-1.25z)| norm 0.2672 (+0.10z)| lr 4.03e-05 | 2532.05 ms | 53.3% bf16 MFU | 207002 tok/s +step 16416/19560 | loss 3.292671 (-0.76z)| norm 0.2639 (-0.07z)| lr 4.02e-05 | 2532.21 ms | 53.3% bf16 MFU | 207004 tok/s +step 16417/19560 | loss 3.403085 (+1.25z)| norm 0.2451 (-1.03z)| lr 4.02e-05 | 2532.38 ms | 53.3% bf16 MFU | 207006 tok/s +step 16418/19560 | loss 3.354619 (+0.36z)| norm 0.2613 (-0.20z)| lr 4.02e-05 | 2534.03 ms | 53.3% bf16 MFU | 207000 tok/s +step 16419/19560 | loss 3.316668 (-0.35z)| norm 0.2601 (-0.26z)| lr 4.02e-05 | 2532.70 ms | 53.3% bf16 MFU | 207001 tok/s +step 16420/19560 | loss 3.255119 (-1.49z)| norm 0.2663 (+0.05z)| lr 4.01e-05 | 2532.85 ms | 53.3% bf16 MFU | 207000 tok/s +step 16421/19560 | loss 3.393959 (+1.07z)| norm 0.2542 (-0.57z)| lr 4.01e-05 | 2536.74 ms | 53.2% bf16 MFU | 206984 tok/s +step 16422/19560 | loss 3.347046 (+0.19z)| norm 0.2536 (-0.61z)| lr 4.01e-05 | 2533.59 ms | 53.3% bf16 MFU | 206982 tok/s +step 16423/19560 | loss 3.299236 (-0.69z)| norm 0.2537 (-0.60z)| lr 4.01e-05 | 2534.96 ms | 53.3% bf16 MFU | 206974 tok/s +step 16424/19560 | loss 3.249621 (-1.58z)| norm 0.2712 (+0.30z)| lr 4.00e-05 | 2534.29 ms | 53.3% bf16 MFU | 206969 tok/s +step 16425/19560 | loss 3.372741 (+0.67z)| norm 0.2623 (-0.16z)| lr 4.00e-05 | 2533.52 ms | 53.3% bf16 MFU | 206968 tok/s +step 16426/19560 | loss 3.299801 (-0.70z)| norm 0.2575 (-0.41z)| lr 4.00e-05 | 2532.64 ms | 53.3% bf16 MFU | 206970 tok/s +step 16427/19560 | loss 3.334346 (-0.05z)| norm 0.2658 (+0.02z)| lr 4.00e-05 | 2534.05 ms | 53.3% bf16 MFU | 206966 tok/s +step 16428/19560 | loss 3.332572 (-0.08z)| norm 0.2771 (+0.59z)| lr 3.99e-05 | 2531.71 ms | 53.3% bf16 MFU | 206972 tok/s +step 16429/19560 | loss 3.302991 (-0.63z)| norm 0.2516 (-0.73z)| lr 3.99e-05 | 2534.09 ms | 53.3% bf16 MFU | 206968 tok/s +step 16430/19560 | loss 3.354226 (+0.32z)| norm 0.2801 (+0.74z)| lr 3.99e-05 | 2534.87 ms | 53.3% bf16 MFU | 206962 tok/s +step 16431/19560 | loss 3.356072 (+0.35z)| norm 0.2660 (+0.00z)| lr 3.99e-05 | 2534.06 ms | 53.3% bf16 MFU | 206958 tok/s +step 16432/19560 | loss 3.288849 (-0.90z)| norm 0.2597 (-0.33z)| lr 3.98e-05 | 2533.12 ms | 53.3% bf16 MFU | 206959 tok/s +step 16433/19560 | loss 3.258287 (-1.45z)| norm 0.2594 (-0.34z)| lr 3.98e-05 | 2533.03 ms | 53.3% bf16 MFU | 206960 tok/s +step 16434/19560 | loss 3.351970 (+0.31z)| norm 0.2607 (-0.27z)| lr 3.98e-05 | 2531.50 ms | 53.3% bf16 MFU | 206967 tok/s +step 16435/19560 | loss 3.248999 (-1.62z)| norm 0.2701 (+0.21z)| lr 3.98e-05 | 2533.25 ms | 53.3% bf16 MFU | 206967 tok/s +step 16436/19560 | loss 3.305228 (-0.55z)| norm 0.2411 (-1.28z)| lr 3.97e-05 | 2534.78 ms | 53.3% bf16 MFU | 206961 tok/s +step 16437/19560 | loss 3.348837 (+0.28z)| norm 0.2669 (+0.06z)| lr 3.97e-05 | 2535.79 ms | 53.2% bf16 MFU | 206950 tok/s +step 16438/19560 | loss 3.355020 (+0.39z)| norm 0.2612 (-0.24z)| lr 3.97e-05 | 2535.71 ms | 53.2% bf16 MFU | 206941 tok/s +step 16439/19560 | loss 3.399714 (+1.23z)| norm 0.2552 (-0.54z)| lr 3.97e-05 | 2534.07 ms | 53.3% bf16 MFU | 206939 tok/s +step 16440/19560 | loss 3.293163 (-0.79z)| norm 0.2557 (-0.56z)| lr 3.96e-05 | 2533.82 ms | 53.3% bf16 MFU | 206938 tok/s +step 16441/19560 | loss 3.257360 (-1.46z)| norm 0.2556 (-0.56z)| lr 3.96e-05 | 2535.97 ms | 53.2% bf16 MFU | 206928 tok/s +step 16442/19560 | loss 3.360401 (+0.49z)| norm 0.2643 (-0.01z)| lr 3.96e-05 | 2535.04 ms | 53.3% bf16 MFU | 206922 tok/s +step 16443/19560 | loss 3.312438 (-0.41z)| norm 0.2512 (-0.83z)| lr 3.96e-05 | 2534.06 ms | 53.3% bf16 MFU | 206921 tok/s +step 16444/19560 | loss 3.348465 (+0.26z)| norm 0.2462 (-1.13z)| lr 3.95e-05 | 2533.41 ms | 53.3% bf16 MFU | 206922 tok/s +step 16445/19560 | loss 3.314582 (-0.38z)| norm 0.2444 (-1.22z)| lr 3.95e-05 | 2532.82 ms | 53.3% bf16 MFU | 206926 tok/s +step 16446/19560 | loss 3.383394 (+0.92z)| norm 0.2510 (-0.80z)| lr 3.95e-05 | 2533.90 ms | 53.3% bf16 MFU | 206925 tok/s +step 16447/19560 | loss 3.344925 (+0.19z)| norm 0.2456 (-1.12z)| lr 3.95e-05 | 2532.71 ms | 53.3% bf16 MFU | 206929 tok/s +step 16448/19560 | loss 3.409585 (+1.39z)| norm 0.2459 (-1.09z)| lr 3.94e-05 | 2534.19 ms | 53.3% bf16 MFU | 206927 tok/s +step 16449/19560 | loss 3.325738 (-0.19z)| norm 0.2575 (-0.38z)| lr 3.94e-05 | 2534.13 ms | 53.3% bf16 MFU | 206925 tok/s +step 16450/19560 | loss 3.311210 (-0.46z)| norm 0.2608 (-0.17z)| lr 3.94e-05 | 2533.40 ms | 53.3% bf16 MFU | 206926 tok/s +step 16451/19560 | loss 3.298165 (-0.70z)| norm 0.2605 (-0.19z)| lr 3.94e-05 | 2533.36 ms | 53.3% bf16 MFU | 206928 tok/s +step 16452/19560 | loss 3.311469 (-0.44z)| norm 0.2576 (-0.37z)| lr 3.93e-05 | 2532.12 ms | 53.3% bf16 MFU | 206934 tok/s +step 16453/19560 | loss 3.301390 (-0.63z)| norm 0.2503 (-0.82z)| lr 3.93e-05 | 2532.94 ms | 53.3% bf16 MFU | 206937 tok/s +step 16454/19560 | loss 3.374912 (+0.75z)| norm 0.2575 (-0.37z)| lr 3.93e-05 | 2533.92 ms | 53.3% bf16 MFU | 206935 tok/s +step 16455/19560 | loss 3.286702 (-0.90z)| norm 0.2575 (-0.39z)| lr 3.93e-05 | 2534.20 ms | 53.3% bf16 MFU | 206933 tok/s +step 16456/19560 | loss 3.494445 (+2.88z)| norm 0.3052 (+2.52z)| lr 3.92e-05 | 2532.91 ms | 53.3% bf16 MFU | 206936 tok/s +step 16457/19560 | loss 3.277713 (-1.05z)| norm 0.2540 (-0.62z)| lr 3.92e-05 | 2533.83 ms | 53.3% bf16 MFU | 206935 tok/s +step 16458/19560 | loss 3.412475 (+1.37z)| norm 0.2820 (+1.09z)| lr 3.92e-05 | 2533.57 ms | 53.3% bf16 MFU | 206935 tok/s +step 16459/19560 | loss 3.297370 (-0.71z)| norm 0.2548 (-0.57z)| lr 3.92e-05 | 2532.62 ms | 53.3% bf16 MFU | 206939 tok/s +step 16460/19560 | loss 3.351480 (+0.27z)| norm 0.2768 (+0.76z)| lr 3.91e-05 | 2532.09 ms | 53.3% bf16 MFU | 206945 tok/s +step 16461/19560 | loss 3.262976 (-1.31z)| norm 0.2618 (-0.16z)| lr 3.91e-05 | 2533.93 ms | 53.3% bf16 MFU | 206943 tok/s +step 16462/19560 | loss 3.300683 (-0.63z)| norm 0.2517 (-0.77z)| lr 3.91e-05 | 2536.06 ms | 53.2% bf16 MFU | 206932 tok/s +step 16463/19560 | loss 3.381686 (+0.81z)| norm 0.2518 (-0.77z)| lr 3.91e-05 | 2534.23 ms | 53.3% bf16 MFU | 206930 tok/s +step 16464/19560 | loss 3.272315 (-1.15z)| norm 0.2684 (+0.24z)| lr 3.90e-05 | 2532.91 ms | 53.3% bf16 MFU | 206933 tok/s +step 16465/19560 | loss 3.351251 (+0.27z)| norm 0.2899 (+1.54z)| lr 3.90e-05 | 2533.89 ms | 53.3% bf16 MFU | 206932 tok/s +step 16466/19560 | loss 3.338002 (+0.02z)| norm 0.2569 (-0.46z)| lr 3.90e-05 | 2532.93 ms | 53.3% bf16 MFU | 206935 tok/s +step 16467/19560 | loss 3.335618 (-0.03z)| norm 0.2759 (+0.69z)| lr 3.90e-05 | 2532.24 ms | 53.3% bf16 MFU | 206940 tok/s +step 16468/19560 | loss 3.356820 (+0.35z)| norm 0.2842 (+1.17z)| lr 3.89e-05 | 2532.17 ms | 53.3% bf16 MFU | 206946 tok/s +step 16469/19560 | loss 3.351889 (+0.25z)| norm 0.2721 (+0.43z)| lr 3.89e-05 | 2532.53 ms | 53.3% bf16 MFU | 206949 tok/s +step 16470/19560 | loss 3.406934 (+1.24z)| norm 0.2620 (-0.18z)| lr 3.89e-05 | 2532.97 ms | 53.3% bf16 MFU | 206951 tok/s +step 16471/19560 | loss 3.314998 (-0.42z)| norm 0.2584 (-0.40z)| lr 3.89e-05 | 2535.65 ms | 53.2% bf16 MFU | 206942 tok/s +step 16472/19560 | loss 3.292701 (-0.81z)| norm 0.2676 (+0.16z)| lr 3.88e-05 | 2533.53 ms | 53.3% bf16 MFU | 206942 tok/s +step 16473/19560 | loss 3.338713 (+0.04z)| norm 0.2823 (+1.05z)| lr 3.88e-05 | 2535.01 ms | 53.3% bf16 MFU | 206936 tok/s +step 16474/19560 | loss 3.308227 (-0.53z)| norm 0.2483 (-1.03z)| lr 3.88e-05 | 2532.65 ms | 53.3% bf16 MFU | 206940 tok/s +step 16475/19560 | loss 3.285197 (-0.96z)| norm 0.2889 (+1.53z)| lr 3.88e-05 | 2532.90 ms | 53.3% bf16 MFU | 206942 tok/s +step 16476/19560 | loss 3.306343 (-0.56z)| norm 0.2643 (-0.02z)| lr 3.87e-05 | 2533.73 ms | 53.3% bf16 MFU | 206941 tok/s +step 16477/19560 | loss 3.305543 (-0.58z)| norm 0.2647 (+0.00z)| lr 3.87e-05 | 2533.94 ms | 53.3% bf16 MFU | 206939 tok/s +step 16478/19560 | loss 3.309270 (-0.51z)| norm 0.2814 (+1.04z)| lr 3.87e-05 | 2534.49 ms | 53.3% bf16 MFU | 206935 tok/s +step 16479/19560 | loss 3.272608 (-1.18z)| norm 0.2591 (-0.35z)| lr 3.87e-05 | 2533.28 ms | 53.3% bf16 MFU | 206937 tok/s +step 16480/19560 | loss 3.244341 (-1.67z)| norm 0.2666 (+0.13z)| lr 3.86e-05 | 2535.02 ms | 53.3% bf16 MFU | 206931 tok/s +step 16481/19560 | loss 3.331116 (-0.07z)| norm 0.2851 (+1.28z)| lr 3.86e-05 | 2534.54 ms | 53.3% bf16 MFU | 206927 tok/s +step 16482/19560 | loss 3.330927 (-0.07z)| norm 0.2851 (+1.26z)| lr 3.86e-05 | 2534.14 ms | 53.3% bf16 MFU | 206925 tok/s +step 16483/19560 | loss 3.413074 (+1.42z)| norm 0.3058 (+2.48z)| lr 3.86e-05 | 2534.75 ms | 53.3% bf16 MFU | 206921 tok/s +step 16484/19560 | loss 3.373866 (+0.70z)| norm 0.2691 (+0.23z)| lr 3.86e-05 | 2533.32 ms | 53.3% bf16 MFU | 206923 tok/s +step 16485/19560 | loss 3.354326 (+0.35z)| norm 0.2862 (+1.26z)| lr 3.85e-05 | 2535.17 ms | 53.3% bf16 MFU | 206917 tok/s +step 16486/19560 | loss 3.319966 (-0.28z)| norm 0.2800 (+0.87z)| lr 3.85e-05 | 2535.32 ms | 53.3% bf16 MFU | 206911 tok/s +step 16487/19560 | loss 3.442233 (+1.96z)| norm 0.2695 (+0.22z)| lr 3.85e-05 | 2530.90 ms | 53.3% bf16 MFU | 206923 tok/s +step 16488/19560 | loss 3.296801 (-0.71z)| norm 0.2726 (+0.44z)| lr 3.85e-05 | 2533.86 ms | 53.3% bf16 MFU | 206922 tok/s +step 16489/19560 | loss 3.338198 (+0.05z)| norm 0.2651 (-0.05z)| lr 3.84e-05 | 2532.99 ms | 53.3% bf16 MFU | 206926 tok/s +step 16490/19560 | loss 3.314069 (-0.39z)| norm 0.2661 (+0.02z)| lr 3.84e-05 | 2531.41 ms | 53.3% bf16 MFU | 206935 tok/s +step 16491/19560 | loss 3.321890 (-0.25z)| norm 0.2453 (-1.29z)| lr 3.84e-05 | 2532.74 ms | 53.3% bf16 MFU | 206938 tok/s +step 16492/19560 | loss 3.349236 (+0.26z)| norm 0.2661 (+0.03z)| lr 3.84e-05 | 2533.70 ms | 53.3% bf16 MFU | 206938 tok/s +step 16493/19560 | loss 3.334871 (-0.01z)| norm 0.3033 (+2.33z)| lr 3.83e-05 | 2536.58 ms | 53.2% bf16 MFU | 206925 tok/s +step 16494/19560 | loss 3.311522 (-0.46z)| norm 0.2701 (+0.25z)| lr 3.83e-05 | 2533.26 ms | 53.3% bf16 MFU | 206927 tok/s +step 16495/19560 | loss 3.341956 (+0.12z)| norm 0.2662 (-0.01z)| lr 3.83e-05 | 2533.04 ms | 53.3% bf16 MFU | 206930 tok/s +step 16496/19560 | loss 3.296654 (-0.74z)| norm 0.2885 (+1.38z)| lr 3.83e-05 | 2534.62 ms | 53.3% bf16 MFU | 206926 tok/s +step 16497/19560 | loss 3.392022 (+1.05z)| norm 0.2668 (+0.02z)| lr 3.82e-05 | 2532.65 ms | 53.3% bf16 MFU | 206930 tok/s +step 16498/19560 | loss 3.355350 (+0.36z)| norm 0.2584 (-0.52z)| lr 3.82e-05 | 2533.32 ms | 53.3% bf16 MFU | 206931 tok/s +step 16499/19560 | loss 3.319907 (-0.31z)| norm 0.2589 (-0.49z)| lr 3.82e-05 | 2532.75 ms | 53.3% bf16 MFU | 206935 tok/s +step 16500/19560 | loss 3.354145 (+0.35z)| norm 0.2642 (-0.16z)| lr 3.82e-05 | 2531.63 ms | 53.3% bf16 MFU | 206943 tok/s +val loss 3.297320 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3018/10042 = 0.300538 +step 16501/19560 | loss 3.371080 (+0.67z)| norm 0.2519 (-0.93z)| lr 3.81e-05 | 2534.77 ms | 53.3% bf16 MFU | 206938 tok/s +step 16502/19560 | loss 3.333645 (-0.05z)| norm 0.2472 (-1.22z)| lr 3.81e-05 | 2533.71 ms | 53.3% bf16 MFU | 206937 tok/s +step 16503/19560 | loss 3.367579 (+0.60z)| norm 0.2484 (-1.12z)| lr 3.81e-05 | 2532.01 ms | 53.3% bf16 MFU | 206944 tok/s +step 16504/19560 | loss 3.294473 (-0.79z)| norm 0.2554 (-0.68z)| lr 3.81e-05 | 2532.90 ms | 53.3% bf16 MFU | 206946 tok/s +step 16505/19560 | loss 3.263823 (-1.35z)| norm 0.2537 (-0.78z)| lr 3.80e-05 | 2532.38 ms | 53.3% bf16 MFU | 206950 tok/s +step 16506/19560 | loss 3.345740 (+0.20z)| norm 0.2480 (-1.13z)| lr 3.80e-05 | 2533.82 ms | 53.3% bf16 MFU | 206949 tok/s +step 16507/19560 | loss 3.346952 (+0.23z)| norm 0.2651 (-0.05z)| lr 3.80e-05 | 2532.15 ms | 53.3% bf16 MFU | 206954 tok/s +step 16508/19560 | loss 3.282241 (-0.99z)| norm 0.2512 (-0.92z)| lr 3.80e-05 | 2531.87 ms | 53.3% bf16 MFU | 206960 tok/s +step 16509/19560 | loss 3.308469 (-0.48z)| norm 0.2469 (-1.18z)| lr 3.79e-05 | 2532.58 ms | 53.3% bf16 MFU | 206963 tok/s +step 16510/19560 | loss 3.326582 (-0.14z)| norm 0.2500 (-0.98z)| lr 3.79e-05 | 2530.94 ms | 53.3% bf16 MFU | 206972 tok/s +step 16511/19560 | loss 3.297321 (-0.69z)| norm 0.2581 (-0.47z)| lr 3.79e-05 | 2532.56 ms | 53.3% bf16 MFU | 206974 tok/s +step 16512/19560 | loss 3.381762 (+0.90z)| norm 0.2412 (-1.51z)| lr 3.79e-05 | 2533.19 ms | 53.3% bf16 MFU | 206974 tok/s +step 16513/19560 | loss 3.340101 (+0.11z)| norm 0.2650 (-0.04z)| lr 3.78e-05 | 2531.73 ms | 53.3% bf16 MFU | 206980 tok/s +step 16514/19560 | loss 3.255380 (-1.49z)| norm 0.2461 (-1.20z)| lr 3.78e-05 | 2532.61 ms | 53.3% bf16 MFU | 206982 tok/s +step 16515/19560 | loss 3.399014 (+1.22z)| norm 0.2483 (-1.05z)| lr 3.78e-05 | 2533.25 ms | 53.3% bf16 MFU | 206981 tok/s +step 16516/19560 | loss 3.342417 (+0.15z)| norm 0.2521 (-0.81z)| lr 3.78e-05 | 2534.03 ms | 53.3% bf16 MFU | 206976 tok/s +step 16517/19560 | loss 3.346009 (+0.21z)| norm 0.2700 (+0.30z)| lr 3.77e-05 | 2533.47 ms | 53.3% bf16 MFU | 206975 tok/s +step 16518/19560 | loss 3.304025 (-0.57z)| norm 0.2550 (-0.62z)| lr 3.77e-05 | 2532.97 ms | 53.3% bf16 MFU | 206975 tok/s +step 16519/19560 | loss 3.269697 (-1.21z)| norm 0.2543 (-0.66z)| lr 3.77e-05 | 2534.03 ms | 53.3% bf16 MFU | 206972 tok/s +step 16520/19560 | loss 3.370018 (+0.77z)| norm 0.2655 (+0.05z)| lr 3.77e-05 | 2533.89 ms | 53.3% bf16 MFU | 206968 tok/s +step 16521/19560 | loss 3.311414 (-0.42z)| norm 0.2448 (-1.25z)| lr 3.76e-05 | 2534.76 ms | 53.3% bf16 MFU | 206962 tok/s +step 16522/19560 | loss 3.292064 (-0.81z)| norm 0.2699 (+0.35z)| lr 3.76e-05 | 2535.60 ms | 53.2% bf16 MFU | 206952 tok/s +step 16523/19560 | loss 3.334771 (+0.06z)| norm 0.2633 (-0.07z)| lr 3.76e-05 | 2534.97 ms | 53.3% bf16 MFU | 206946 tok/s +step 16524/19560 | loss 3.313150 (-0.38z)| norm 0.2646 (+0.02z)| lr 3.76e-05 | 2533.23 ms | 53.3% bf16 MFU | 206947 tok/s +step 16525/19560 | loss 3.398560 (+1.36z)| norm 0.3011 (+2.30z)| lr 3.76e-05 | 2534.46 ms | 53.3% bf16 MFU | 206943 tok/s +step 16526/19560 | loss 3.306486 (-0.51z)| norm 0.2640 (-0.02z)| lr 3.75e-05 | 2533.40 ms | 53.3% bf16 MFU | 206943 tok/s +step 16527/19560 | loss 3.346379 (+0.36z)| norm 0.2767 (+0.78z)| lr 3.75e-05 | 2533.75 ms | 53.3% bf16 MFU | 206942 tok/s +step 16528/19560 | loss 3.362437 (+0.70z)| norm 0.2678 (+0.21z)| lr 3.75e-05 | 2533.22 ms | 53.3% bf16 MFU | 206943 tok/s +step 16529/19560 | loss 3.351501 (+0.46z)| norm 0.2641 (-0.02z)| lr 3.75e-05 | 2533.21 ms | 53.3% bf16 MFU | 206944 tok/s +step 16530/19560 | loss 3.332410 (+0.08z)| norm 0.2801 (+1.17z)| lr 3.74e-05 | 2532.39 ms | 53.3% bf16 MFU | 206949 tok/s +step 16531/19560 | loss 3.322474 (-0.15z)| norm 0.2531 (-0.76z)| lr 3.74e-05 | 2533.61 ms | 53.3% bf16 MFU | 206948 tok/s +step 16532/19560 | loss 3.320306 (-0.21z)| norm 0.2795 (+1.18z)| lr 3.74e-05 | 2533.89 ms | 53.3% bf16 MFU | 206946 tok/s +step 16533/19560 | loss 3.327567 (-0.05z)| norm 0.2515 (-0.86z)| lr 3.74e-05 | 2535.78 ms | 53.2% bf16 MFU | 206937 tok/s +step 16534/19560 | loss 3.262838 (-1.55z)| norm 0.2531 (-0.74z)| lr 3.73e-05 | 2534.21 ms | 53.3% bf16 MFU | 206934 tok/s +step 16535/19560 | loss 3.317904 (-0.25z)| norm 0.2745 (+0.85z)| lr 3.73e-05 | 2533.01 ms | 53.3% bf16 MFU | 206936 tok/s +step 16536/19560 | loss 3.271187 (-1.34z)| norm 0.2478 (-1.12z)| lr 3.73e-05 | 2535.10 ms | 53.3% bf16 MFU | 206930 tok/s +step 16537/19560 | loss 3.304115 (-0.57z)| norm 0.2634 (+0.03z)| lr 3.73e-05 | 2533.55 ms | 53.3% bf16 MFU | 206930 tok/s +step 16538/19560 | loss 3.295730 (-0.75z)| norm 0.2592 (-0.28z)| lr 3.72e-05 | 2531.94 ms | 53.3% bf16 MFU | 206937 tok/s +step 16539/19560 | loss 3.297844 (-0.70z)| norm 0.2441 (-1.39z)| lr 3.72e-05 | 2532.60 ms | 53.3% bf16 MFU | 206941 tok/s +step 16540/19560 | loss 3.300604 (-0.63z)| norm 0.2496 (-0.97z)| lr 3.72e-05 | 2533.34 ms | 53.3% bf16 MFU | 206942 tok/s +step 16541/19560 | loss 3.307809 (-0.46z)| norm 0.2436 (-1.41z)| lr 3.72e-05 | 2530.86 ms | 53.3% bf16 MFU | 206953 tok/s +step 16542/19560 | loss 3.294917 (-0.75z)| norm 0.2525 (-0.76z)| lr 3.71e-05 | 2532.55 ms | 53.3% bf16 MFU | 206956 tok/s +step 16543/19560 | loss 3.288709 (-0.91z)| norm 0.2455 (-1.26z)| lr 3.71e-05 | 2531.83 ms | 53.3% bf16 MFU | 206962 tok/s +step 16544/19560 | loss 3.288201 (-0.92z)| norm 0.2575 (-0.37z)| lr 3.71e-05 | 2532.90 ms | 53.3% bf16 MFU | 206964 tok/s +step 16545/19560 | loss 3.328968 (+0.05z)| norm 0.2469 (-1.15z)| lr 3.71e-05 | 2532.11 ms | 53.3% bf16 MFU | 206968 tok/s +step 16546/19560 | loss 3.354705 (+0.66z)| norm 0.2453 (-1.26z)| lr 3.70e-05 | 2533.94 ms | 53.3% bf16 MFU | 206965 tok/s +step 16547/19560 | loss 3.357771 (+0.72z)| norm 0.2565 (-0.43z)| lr 3.70e-05 | 2533.16 ms | 53.3% bf16 MFU | 206965 tok/s +step 16548/19560 | loss 3.327862 (+0.00z)| norm 0.2742 (+0.85z)| lr 3.70e-05 | 2533.03 ms | 53.3% bf16 MFU | 206966 tok/s +step 16549/19560 | loss 3.308064 (-0.46z)| norm 0.2443 (-1.31z)| lr 3.70e-05 | 2532.99 ms | 53.3% bf16 MFU | 206967 tok/s +step 16550/19560 | loss 3.382344 (+1.31z)| norm 0.2625 (-0.00z)| lr 3.69e-05 | 2533.00 ms | 53.3% bf16 MFU | 206968 tok/s +step 16551/19560 | loss 3.284944 (-1.01z)| norm 0.2523 (-0.73z)| lr 3.69e-05 | 2534.82 ms | 53.3% bf16 MFU | 206961 tok/s +step 16552/19560 | loss 3.287028 (-0.98z)| norm 0.2536 (-0.63z)| lr 3.69e-05 | 2533.23 ms | 53.3% bf16 MFU | 206961 tok/s +step 16553/19560 | loss 3.273768 (-1.28z)| norm 0.2442 (-1.29z)| lr 3.69e-05 | 2535.56 ms | 53.2% bf16 MFU | 206952 tok/s +step 16554/19560 | loss 3.282022 (-1.07z)| norm 0.2459 (-1.16z)| lr 3.69e-05 | 2532.01 ms | 53.3% bf16 MFU | 206958 tok/s +step 16555/19560 | loss 3.302063 (-0.58z)| norm 0.2492 (-0.91z)| lr 3.68e-05 | 2534.07 ms | 53.3% bf16 MFU | 206954 tok/s +step 16556/19560 | loss 3.272634 (-1.27z)| norm 0.2368 (-1.76z)| lr 3.68e-05 | 2534.56 ms | 53.3% bf16 MFU | 206950 tok/s +step 16557/19560 | loss 3.267704 (-1.37z)| norm 0.2430 (-1.31z)| lr 3.68e-05 | 2534.23 ms | 53.3% bf16 MFU | 206946 tok/s +step 16558/19560 | loss 3.285059 (-0.95z)| norm 0.2506 (-0.76z)| lr 3.68e-05 | 2533.60 ms | 53.3% bf16 MFU | 206946 tok/s +step 16559/19560 | loss 3.346355 (+0.50z)| norm 0.2455 (-1.10z)| lr 3.67e-05 | 2532.97 ms | 53.3% bf16 MFU | 206948 tok/s +step 16560/19560 | loss 3.297205 (-0.66z)| norm 0.2448 (-1.14z)| lr 3.67e-05 | 2533.50 ms | 53.3% bf16 MFU | 206947 tok/s +step 16561/19560 | loss 3.263631 (-1.46z)| norm 0.2400 (-1.46z)| lr 3.67e-05 | 2531.44 ms | 53.3% bf16 MFU | 206955 tok/s +step 16562/19560 | loss 3.367601 (+1.00z)| norm 0.2470 (-0.96z)| lr 3.67e-05 | 2534.17 ms | 53.3% bf16 MFU | 206952 tok/s +step 16563/19560 | loss 3.305786 (-0.48z)| norm 0.2624 (+0.11z)| lr 3.66e-05 | 2533.70 ms | 53.3% bf16 MFU | 206951 tok/s +step 16564/19560 | loss 3.286901 (-0.92z)| norm 0.2432 (-1.22z)| lr 3.66e-05 | 2531.90 ms | 53.3% bf16 MFU | 206957 tok/s +step 16565/19560 | loss 3.300472 (-0.59z)| norm 0.2393 (-1.47z)| lr 3.66e-05 | 2533.38 ms | 53.3% bf16 MFU | 206957 tok/s +step 16566/19560 | loss 3.320920 (-0.10z)| norm 0.2447 (-1.08z)| lr 3.66e-05 | 2532.68 ms | 53.3% bf16 MFU | 206959 tok/s +step 16567/19560 | loss 3.359905 (+0.85z)| norm 0.2381 (-1.51z)| lr 3.65e-05 | 2534.12 ms | 53.3% bf16 MFU | 206956 tok/s +step 16568/19560 | loss 3.320431 (-0.11z)| norm 0.2480 (-0.83z)| lr 3.65e-05 | 2535.68 ms | 53.2% bf16 MFU | 206946 tok/s +step 16569/19560 | loss 3.321294 (-0.10z)| norm 0.2574 (-0.19z)| lr 3.65e-05 | 2534.67 ms | 53.3% bf16 MFU | 206941 tok/s +step 16570/19560 | loss 3.296438 (-0.70z)| norm 0.2469 (-0.90z)| lr 3.65e-05 | 2535.08 ms | 53.3% bf16 MFU | 206935 tok/s +step 16571/19560 | loss 3.260418 (-1.55z)| norm 0.2549 (-0.35z)| lr 3.64e-05 | 2534.03 ms | 53.3% bf16 MFU | 206933 tok/s +step 16572/19560 | loss 3.311834 (-0.30z)| norm 0.2613 (+0.07z)| lr 3.64e-05 | 2534.69 ms | 53.3% bf16 MFU | 206929 tok/s +step 16573/19560 | loss 3.240756 (-1.98z)| norm 0.2826 (+1.49z)| lr 3.64e-05 | 2533.43 ms | 53.3% bf16 MFU | 206930 tok/s +step 16574/19560 | loss 3.320010 (-0.07z)| norm 0.2660 (+0.36z)| lr 3.64e-05 | 2532.04 ms | 53.3% bf16 MFU | 206936 tok/s +step 16575/19560 | loss 3.338317 (+0.37z)| norm 0.2739 (+0.88z)| lr 3.64e-05 | 2533.08 ms | 53.3% bf16 MFU | 206938 tok/s +step 16576/19560 | loss 3.266065 (-1.36z)| norm 0.2603 (-0.05z)| lr 3.63e-05 | 2535.67 ms | 53.2% bf16 MFU | 206930 tok/s +step 16577/19560 | loss 3.296803 (-0.61z)| norm 0.2598 (-0.08z)| lr 3.63e-05 | 2534.53 ms | 53.3% bf16 MFU | 206926 tok/s +step 16578/19560 | loss 3.294027 (-0.67z)| norm 0.2544 (-0.45z)| lr 3.63e-05 | 2535.61 ms | 53.2% bf16 MFU | 206918 tok/s +step 16579/19560 | loss 3.326259 (+0.11z)| norm 0.2654 (+0.30z)| lr 3.63e-05 | 2533.40 ms | 53.3% bf16 MFU | 206920 tok/s +step 16580/19560 | loss 3.294723 (-0.65z)| norm 0.2520 (-0.61z)| lr 3.62e-05 | 2533.63 ms | 53.3% bf16 MFU | 206920 tok/s +step 16581/19560 | loss 3.353139 (+0.75z)| norm 0.2682 (+0.48z)| lr 3.62e-05 | 2535.93 ms | 53.2% bf16 MFU | 206912 tok/s +step 16582/19560 | loss 3.342570 (+0.51z)| norm 0.2454 (-1.06z)| lr 3.62e-05 | 2532.76 ms | 53.3% bf16 MFU | 206916 tok/s +step 16583/19560 | loss 3.342384 (+0.49z)| norm 0.2587 (-0.16z)| lr 3.62e-05 | 2534.30 ms | 53.3% bf16 MFU | 206914 tok/s +step 16584/19560 | loss 3.310287 (-0.28z)| norm 0.2658 (+0.36z)| lr 3.61e-05 | 2532.70 ms | 53.3% bf16 MFU | 206919 tok/s +step 16585/19560 | loss 3.296839 (-0.64z)| norm 0.2437 (-1.19z)| lr 3.61e-05 | 2535.19 ms | 53.3% bf16 MFU | 206913 tok/s +step 16586/19560 | loss 3.280489 (-1.06z)| norm 0.2396 (-1.45z)| lr 3.61e-05 | 2533.74 ms | 53.3% bf16 MFU | 206914 tok/s +step 16587/19560 | loss 3.268447 (-1.37z)| norm 0.2679 (+0.52z)| lr 3.61e-05 | 2533.99 ms | 53.3% bf16 MFU | 206913 tok/s +step 16588/19560 | loss 3.375989 (+1.49z)| norm 0.2730 (+0.88z)| lr 3.60e-05 | 2534.01 ms | 53.3% bf16 MFU | 206912 tok/s +step 16589/19560 | loss 3.347013 (+0.71z)| norm 0.2448 (-1.08z)| lr 3.60e-05 | 2533.14 ms | 53.3% bf16 MFU | 206915 tok/s +step 16590/19560 | loss 3.322323 (+0.04z)| norm 0.2493 (-0.77z)| lr 3.60e-05 | 2533.43 ms | 53.3% bf16 MFU | 206917 tok/s +step 16591/19560 | loss 3.341265 (+0.57z)| norm 0.2487 (-0.80z)| lr 3.60e-05 | 2534.95 ms | 53.3% bf16 MFU | 206912 tok/s +step 16592/19560 | loss 3.352479 (+0.86z)| norm 0.2793 (+1.32z)| lr 3.59e-05 | 2534.72 ms | 53.3% bf16 MFU | 206909 tok/s +step 16593/19560 | loss 3.306844 (-0.38z)| norm 0.2573 (-0.19z)| lr 3.59e-05 | 2532.98 ms | 53.3% bf16 MFU | 206913 tok/s +step 16594/19560 | loss 3.307938 (-0.34z)| norm 0.2572 (-0.20z)| lr 3.59e-05 | 2533.32 ms | 53.3% bf16 MFU | 206915 tok/s +step 16595/19560 | loss 3.300209 (-0.54z)| norm 0.2387 (-1.48z)| lr 3.59e-05 | 2531.54 ms | 53.3% bf16 MFU | 206924 tok/s +step 16596/19560 | loss 3.401120 (+2.17z)| norm 0.2622 (+0.19z)| lr 3.59e-05 | 2534.05 ms | 53.3% bf16 MFU | 206923 tok/s +step 16597/19560 | loss 3.333172 (+0.35z)| norm 0.2493 (-0.72z)| lr 3.58e-05 | 2534.50 ms | 53.3% bf16 MFU | 206920 tok/s +step 16598/19560 | loss 3.292666 (-0.74z)| norm 0.2520 (-0.52z)| lr 3.58e-05 | 2533.14 ms | 53.3% bf16 MFU | 206922 tok/s +step 16599/19560 | loss 3.284843 (-0.94z)| norm 0.2440 (-1.08z)| lr 3.58e-05 | 2531.86 ms | 53.3% bf16 MFU | 206930 tok/s +step 16600/19560 | loss 3.336774 (+0.47z)| norm 0.2463 (-0.90z)| lr 3.58e-05 | 2534.30 ms | 53.3% bf16 MFU | 206927 tok/s +step 16601/19560 | loss 3.342060 (+0.62z)| norm 0.2740 (+1.07z)| lr 3.57e-05 | 2532.94 ms | 53.3% bf16 MFU | 206930 tok/s +step 16602/19560 | loss 3.432846 (+2.97z)| norm 0.2570 (-0.14z)| lr 3.57e-05 | 2531.36 ms | 53.3% bf16 MFU | 206940 tok/s +step 16603/19560 | loss 3.275993 (-1.17z)| norm 0.2582 (-0.05z)| lr 3.57e-05 | 2534.36 ms | 53.3% bf16 MFU | 206936 tok/s +step 16604/19560 | loss 3.299416 (-0.55z)| norm 0.2593 (+0.04z)| lr 3.57e-05 | 2531.99 ms | 53.3% bf16 MFU | 206943 tok/s +step 16605/19560 | loss 3.321849 (+0.03z)| norm 0.2532 (-0.40z)| lr 3.56e-05 | 2532.44 ms | 53.3% bf16 MFU | 206947 tok/s +step 16606/19560 | loss 3.300982 (-0.52z)| norm 0.2667 (+0.60z)| lr 3.56e-05 | 2533.66 ms | 53.3% bf16 MFU | 206946 tok/s +step 16607/19560 | loss 3.323184 (+0.06z)| norm 0.2438 (-1.07z)| lr 3.56e-05 | 2532.73 ms | 53.3% bf16 MFU | 206949 tok/s +step 16608/19560 | loss 3.262643 (-1.56z)| norm 0.2377 (-1.48z)| lr 3.56e-05 | 2534.32 ms | 53.3% bf16 MFU | 206945 tok/s +step 16609/19560 | loss 3.222187 (-2.55z)| norm 0.3100 (+3.59z)| lr 3.55e-05 | 2534.58 ms | 53.3% bf16 MFU | 206941 tok/s +step 16610/19560 | loss 3.285558 (-0.89z)| norm 0.2442 (-0.98z)| lr 3.55e-05 | 2535.19 ms | 53.3% bf16 MFU | 206934 tok/s +step 16611/19560 | loss 3.336482 (+0.46z)| norm 0.2414 (-1.19z)| lr 3.55e-05 | 2533.25 ms | 53.3% bf16 MFU | 206935 tok/s +step 16612/19560 | loss 3.331241 (+0.33z)| norm 0.2505 (-0.51z)| lr 3.55e-05 | 2534.29 ms | 53.3% bf16 MFU | 206933 tok/s +step 16613/19560 | loss 3.360139 (+1.10z)| norm 0.2637 (+0.48z)| lr 3.55e-05 | 2536.21 ms | 53.2% bf16 MFU | 206922 tok/s +step 16614/19560 | loss 3.276420 (-1.12z)| norm 0.2437 (-1.00z)| lr 3.54e-05 | 2532.89 ms | 53.3% bf16 MFU | 206925 tok/s +step 16615/19560 | loss 3.318291 (+0.02z)| norm 0.2540 (-0.21z)| lr 3.54e-05 | 2532.28 ms | 53.3% bf16 MFU | 206931 tok/s +step 16616/19560 | loss 3.342377 (+0.68z)| norm 0.2459 (-0.81z)| lr 3.54e-05 | 2534.38 ms | 53.3% bf16 MFU | 206928 tok/s +step 16617/19560 | loss 3.308259 (-0.26z)| norm 0.2481 (-0.64z)| lr 3.54e-05 | 2537.63 ms | 53.2% bf16 MFU | 206912 tok/s +step 16618/19560 | loss 3.301627 (-0.45z)| norm 0.2444 (-0.90z)| lr 3.53e-05 | 2534.47 ms | 53.3% bf16 MFU | 206910 tok/s +step 16619/19560 | loss 3.344135 (+0.73z)| norm 0.2644 (+0.59z)| lr 3.53e-05 | 2534.64 ms | 53.3% bf16 MFU | 206907 tok/s +step 16620/19560 | loss 3.276370 (-1.13z)| norm 0.2533 (-0.23z)| lr 3.53e-05 | 2536.79 ms | 53.2% bf16 MFU | 206895 tok/s +step 16621/19560 | loss 3.362320 (+1.23z)| norm 0.2484 (-0.61z)| lr 3.53e-05 | 2534.92 ms | 53.3% bf16 MFU | 206892 tok/s +step 16622/19560 | loss 3.319959 (+0.07z)| norm 0.2634 (+0.60z)| lr 3.52e-05 | 2533.84 ms | 53.3% bf16 MFU | 206893 tok/s +step 16623/19560 | loss 3.299169 (-0.50z)| norm 0.2527 (-0.25z)| lr 3.52e-05 | 2533.36 ms | 53.3% bf16 MFU | 206896 tok/s +step 16624/19560 | loss 3.305820 (-0.32z)| norm 0.2685 (+1.06z)| lr 3.52e-05 | 2535.35 ms | 53.3% bf16 MFU | 206890 tok/s +step 16625/19560 | loss 3.298944 (-0.49z)| norm 0.2548 (-0.06z)| lr 3.52e-05 | 2533.45 ms | 53.3% bf16 MFU | 206893 tok/s +step 16626/19560 | loss 3.281797 (-0.96z)| norm 0.2561 (+0.04z)| lr 3.51e-05 | 2533.35 ms | 53.3% bf16 MFU | 206896 tok/s +step 16627/19560 | loss 3.342129 (+0.73z)| norm 0.2654 (+0.81z)| lr 3.51e-05 | 2532.31 ms | 53.3% bf16 MFU | 206903 tok/s +step 16628/19560 | loss 3.268702 (-1.31z)| norm 0.2440 (-0.94z)| lr 3.51e-05 | 2532.80 ms | 53.3% bf16 MFU | 206908 tok/s +step 16629/19560 | loss 3.301100 (-0.39z)| norm 0.2375 (-1.45z)| lr 3.51e-05 | 2534.03 ms | 53.3% bf16 MFU | 206908 tok/s +step 16630/19560 | loss 3.330054 (+0.43z)| norm 0.2459 (-0.77z)| lr 3.51e-05 | 2532.26 ms | 53.3% bf16 MFU | 206915 tok/s +step 16631/19560 | loss 3.275057 (-1.11z)| norm 0.2599 (+0.37z)| lr 3.50e-05 | 2534.39 ms | 53.3% bf16 MFU | 206912 tok/s +step 16632/19560 | loss 3.297997 (-0.46z)| norm 0.2484 (-0.57z)| lr 3.50e-05 | 2531.68 ms | 53.3% bf16 MFU | 206921 tok/s +step 16633/19560 | loss 3.340455 (+0.73z)| norm 0.2404 (-1.20z)| lr 3.50e-05 | 2532.43 ms | 53.3% bf16 MFU | 206927 tok/s +step 16634/19560 | loss 3.363559 (+1.38z)| norm 0.2526 (-0.22z)| lr 3.50e-05 | 2532.42 ms | 53.3% bf16 MFU | 206932 tok/s +step 16635/19560 | loss 3.275880 (-1.09z)| norm 0.2659 (+0.86z)| lr 3.49e-05 | 2533.28 ms | 53.3% bf16 MFU | 206933 tok/s +step 16636/19560 | loss 3.231678 (-2.29z)| norm 0.2532 (-0.17z)| lr 3.49e-05 | 2534.33 ms | 53.3% bf16 MFU | 206930 tok/s +step 16637/19560 | loss 3.265536 (-1.33z)| norm 0.2565 (+0.09z)| lr 3.49e-05 | 2534.07 ms | 53.3% bf16 MFU | 206929 tok/s +step 16638/19560 | loss 3.329087 (+0.43z)| norm 0.2728 (+1.39z)| lr 3.49e-05 | 2534.34 ms | 53.3% bf16 MFU | 206926 tok/s +step 16639/19560 | loss 3.327685 (+0.38z)| norm 0.2614 (+0.47z)| lr 3.48e-05 | 2532.13 ms | 53.3% bf16 MFU | 206932 tok/s +step 16640/19560 | loss 3.335451 (+0.61z)| norm 0.2500 (-0.46z)| lr 3.48e-05 | 2533.03 ms | 53.3% bf16 MFU | 206935 tok/s +step 16641/19560 | loss 3.305780 (-0.21z)| norm 0.2541 (-0.12z)| lr 3.48e-05 | 2532.55 ms | 53.3% bf16 MFU | 206939 tok/s +step 16642/19560 | loss 3.365595 (+1.45z)| norm 0.2496 (-0.48z)| lr 3.48e-05 | 2532.63 ms | 53.3% bf16 MFU | 206943 tok/s +step 16643/19560 | loss 3.277428 (-1.03z)| norm 0.2358 (-1.59z)| lr 3.47e-05 | 2533.51 ms | 53.3% bf16 MFU | 206942 tok/s +step 16644/19560 | loss 3.241560 (-2.00z)| norm 0.2408 (-1.17z)| lr 3.47e-05 | 2531.79 ms | 53.3% bf16 MFU | 206949 tok/s +step 16645/19560 | loss 3.256002 (-1.57z)| norm 0.2513 (-0.32z)| lr 3.47e-05 | 2532.52 ms | 53.3% bf16 MFU | 206953 tok/s +step 16646/19560 | loss 3.296419 (-0.43z)| norm 0.2470 (-0.66z)| lr 3.47e-05 | 2532.14 ms | 53.3% bf16 MFU | 206958 tok/s +step 16647/19560 | loss 3.330869 (+0.53z)| norm 0.2548 (-0.04z)| lr 3.47e-05 | 2534.25 ms | 53.3% bf16 MFU | 206954 tok/s +step 16648/19560 | loss 3.308295 (-0.10z)| norm 0.2558 (+0.05z)| lr 3.46e-05 | 2534.22 ms | 53.3% bf16 MFU | 206951 tok/s +step 16649/19560 | loss 3.275267 (-1.03z)| norm 0.2572 (+0.16z)| lr 3.46e-05 | 2532.29 ms | 53.3% bf16 MFU | 206955 tok/s +step 16650/19560 | loss 3.343834 (+0.91z)| norm 0.2502 (-0.40z)| lr 3.46e-05 | 2533.97 ms | 53.3% bf16 MFU | 206953 tok/s +step 16651/19560 | loss 3.314714 (+0.09z)| norm 0.2468 (-0.66z)| lr 3.46e-05 | 2532.75 ms | 53.3% bf16 MFU | 206955 tok/s +step 16652/19560 | loss 3.422250 (+3.00z)| norm 0.3264 (+5.14z)| lr 3.45e-05 | 2534.71 ms | 53.3% bf16 MFU | 206950 tok/s +step 16653/19560 | loss 3.318369 (+0.18z)| norm 0.2620 (+0.52z)| lr 3.45e-05 | 2534.51 ms | 53.3% bf16 MFU | 206945 tok/s +step 16654/19560 | loss 3.326725 (+0.41z)| norm 0.2756 (+1.53z)| lr 3.45e-05 | 2531.58 ms | 53.3% bf16 MFU | 206953 tok/s +step 16655/19560 | loss 3.342181 (+0.84z)| norm 0.2628 (+0.58z)| lr 3.45e-05 | 2531.88 ms | 53.3% bf16 MFU | 206959 tok/s +step 16656/19560 | loss 3.335472 (+0.67z)| norm 0.2563 (+0.10z)| lr 3.44e-05 | 2535.42 ms | 53.3% bf16 MFU | 206950 tok/s +step 16657/19560 | loss 3.333859 (+0.63z)| norm 0.2478 (-0.54z)| lr 3.44e-05 | 2531.65 ms | 53.3% bf16 MFU | 206957 tok/s +step 16658/19560 | loss 3.328294 (+0.47z)| norm 0.2692 (+1.11z)| lr 3.44e-05 | 2535.87 ms | 53.2% bf16 MFU | 206947 tok/s +step 16659/19560 | loss 3.271443 (-1.12z)| norm 0.2588 (+0.30z)| lr 3.44e-05 | 2535.01 ms | 53.3% bf16 MFU | 206941 tok/s +step 16660/19560 | loss 3.340599 (+0.82z)| norm 0.2563 (+0.13z)| lr 3.44e-05 | 2534.89 ms | 53.3% bf16 MFU | 206935 tok/s +step 16661/19560 | loss 3.290690 (-0.57z)| norm 0.2507 (-0.31z)| lr 3.43e-05 | 2536.03 ms | 53.2% bf16 MFU | 206925 tok/s +step 16662/19560 | loss 3.329671 (+0.51z)| norm 0.2590 (+0.34z)| lr 3.43e-05 | 2533.39 ms | 53.3% bf16 MFU | 206926 tok/s +step 16663/19560 | loss 3.290562 (-0.59z)| norm 0.2679 (+1.04z)| lr 3.43e-05 | 2534.45 ms | 53.3% bf16 MFU | 206923 tok/s +step 16664/19560 | loss 3.310276 (-0.04z)| norm 0.2628 (+0.63z)| lr 3.43e-05 | 2532.90 ms | 53.3% bf16 MFU | 206927 tok/s +step 16665/19560 | loss 3.289054 (-0.64z)| norm 0.2497 (-0.39z)| lr 3.42e-05 | 2533.19 ms | 53.3% bf16 MFU | 206929 tok/s +step 16666/19560 | loss 3.334648 (+0.64z)| norm 0.2657 (+0.86z)| lr 3.42e-05 | 2534.12 ms | 53.3% bf16 MFU | 206927 tok/s +step 16667/19560 | loss 3.313691 (+0.05z)| norm 0.2589 (+0.32z)| lr 3.42e-05 | 2530.53 ms | 53.4% bf16 MFU | 206940 tok/s +step 16668/19560 | loss 3.274350 (-1.05z)| norm 0.2572 (+0.18z)| lr 3.42e-05 | 2533.49 ms | 53.3% bf16 MFU | 206940 tok/s +step 16669/19560 | loss 3.253159 (-1.62z)| norm 0.2606 (+0.44z)| lr 3.41e-05 | 2533.84 ms | 53.3% bf16 MFU | 206939 tok/s +step 16670/19560 | loss 3.260611 (-1.40z)| norm 0.2589 (+0.30z)| lr 3.41e-05 | 2533.19 ms | 53.3% bf16 MFU | 206940 tok/s +step 16671/19560 | loss 3.275160 (-0.99z)| norm 0.2686 (+1.05z)| lr 3.41e-05 | 2533.71 ms | 53.3% bf16 MFU | 206939 tok/s +step 16672/19560 | loss 3.331223 (+0.55z)| norm 0.2611 (+0.46z)| lr 3.41e-05 | 2533.78 ms | 53.3% bf16 MFU | 206938 tok/s +step 16673/19560 | loss 3.237367 (-1.99z)| norm 0.4205 (+8.48z)| lr 3.40e-05 | 2533.08 ms | 53.3% bf16 MFU | 206940 tok/s +step 16674/19560 | loss 3.270856 (-1.07z)| norm 0.2934 (+1.86z)| lr 3.40e-05 | 2532.74 ms | 53.3% bf16 MFU | 206943 tok/s +step 16675/19560 | loss 3.256114 (-1.44z)| norm 0.3108 (+2.65z)| lr 3.40e-05 | 2533.57 ms | 53.3% bf16 MFU | 206943 tok/s +step 16676/19560 | loss 3.281434 (-0.74z)| norm 0.2723 (+0.74z)| lr 3.40e-05 | 2533.65 ms | 53.3% bf16 MFU | 206942 tok/s +step 16677/19560 | loss 3.306080 (-0.07z)| norm 0.2918 (+1.67z)| lr 3.40e-05 | 2532.97 ms | 53.3% bf16 MFU | 206945 tok/s +step 16678/19560 | loss 3.299253 (-0.25z)| norm 0.2634 (+0.27z)| lr 3.39e-05 | 2533.93 ms | 53.3% bf16 MFU | 206943 tok/s +step 16679/19560 | loss 3.358801 (+1.37z)| norm 0.2578 (-0.00z)| lr 3.39e-05 | 2533.09 ms | 53.3% bf16 MFU | 206944 tok/s +step 16680/19560 | loss 3.325949 (+0.46z)| norm 0.2712 (+0.65z)| lr 3.39e-05 | 2533.73 ms | 53.3% bf16 MFU | 206943 tok/s +step 16681/19560 | loss 3.303963 (-0.15z)| norm 0.2529 (-0.25z)| lr 3.39e-05 | 2533.35 ms | 53.3% bf16 MFU | 206944 tok/s +step 16682/19560 | loss 3.252354 (-1.55z)| norm 0.2587 (+0.03z)| lr 3.38e-05 | 2533.23 ms | 53.3% bf16 MFU | 206945 tok/s +step 16683/19560 | loss 3.372961 (+1.71z)| norm 0.2642 (+0.29z)| lr 3.38e-05 | 2532.13 ms | 53.3% bf16 MFU | 206950 tok/s +step 16684/19560 | loss 3.309286 (-0.02z)| norm 0.2592 (+0.04z)| lr 3.38e-05 | 2533.65 ms | 53.3% bf16 MFU | 206949 tok/s +step 16685/19560 | loss 3.273877 (-0.98z)| norm 0.2480 (-0.52z)| lr 3.38e-05 | 2532.86 ms | 53.3% bf16 MFU | 206952 tok/s +step 16686/19560 | loss 3.305750 (-0.12z)| norm 0.2873 (+1.40z)| lr 3.37e-05 | 2534.39 ms | 53.3% bf16 MFU | 206947 tok/s +step 16687/19560 | loss 3.368417 (+1.57z)| norm 0.2682 (+0.46z)| lr 3.37e-05 | 2535.92 ms | 53.2% bf16 MFU | 206937 tok/s +step 16688/19560 | loss 3.324055 (+0.37z)| norm 0.2606 (+0.07z)| lr 3.37e-05 | 2531.50 ms | 53.3% bf16 MFU | 206946 tok/s +step 16689/19560 | loss 3.267819 (-1.16z)| norm 0.2633 (+0.20z)| lr 3.37e-05 | 2534.83 ms | 53.3% bf16 MFU | 206940 tok/s +step 16690/19560 | loss 3.297704 (-0.33z)| norm 0.2553 (-0.20z)| lr 3.37e-05 | 2533.72 ms | 53.3% bf16 MFU | 206939 tok/s +step 16691/19560 | loss 3.262943 (-1.27z)| norm 0.2611 (+0.09z)| lr 3.36e-05 | 2533.63 ms | 53.3% bf16 MFU | 206939 tok/s +step 16692/19560 | loss 3.371403 (+1.65z)| norm 0.2658 (+0.32z)| lr 3.36e-05 | 2532.70 ms | 53.3% bf16 MFU | 206942 tok/s +step 16693/19560 | loss 3.315817 (+0.15z)| norm 0.2668 (+0.35z)| lr 3.36e-05 | 2534.05 ms | 53.3% bf16 MFU | 206940 tok/s +step 16694/19560 | loss 3.279966 (-0.81z)| norm 0.2578 (-0.10z)| lr 3.36e-05 | 2532.98 ms | 53.3% bf16 MFU | 206942 tok/s +step 16695/19560 | loss 3.387565 (+2.06z)| norm 0.2578 (-0.11z)| lr 3.35e-05 | 2533.07 ms | 53.3% bf16 MFU | 206944 tok/s +step 16696/19560 | loss 3.306079 (-0.11z)| norm 0.2648 (+0.24z)| lr 3.35e-05 | 2533.46 ms | 53.3% bf16 MFU | 206944 tok/s +step 16697/19560 | loss 3.329685 (+0.52z)| norm 0.2751 (+0.75z)| lr 3.35e-05 | 2533.38 ms | 53.3% bf16 MFU | 206945 tok/s +step 16698/19560 | loss 3.331889 (+0.57z)| norm 0.2735 (+0.65z)| lr 3.35e-05 | 2533.32 ms | 53.3% bf16 MFU | 206945 tok/s +step 16699/19560 | loss 3.303627 (-0.19z)| norm 0.2667 (+0.31z)| lr 3.35e-05 | 2530.78 ms | 53.4% bf16 MFU | 206956 tok/s +step 16700/19560 | loss 3.241560 (-1.82z)| norm 0.2386 (-1.08z)| lr 3.34e-05 | 2532.43 ms | 53.3% bf16 MFU | 206960 tok/s +step 16701/19560 | loss 3.255113 (-1.47z)| norm 0.2496 (-0.52z)| lr 3.34e-05 | 2531.86 ms | 53.3% bf16 MFU | 206966 tok/s +step 16702/19560 | loss 3.254022 (-1.47z)| norm 0.2667 (+0.33z)| lr 3.34e-05 | 2532.39 ms | 53.3% bf16 MFU | 206969 tok/s +step 16703/19560 | loss 3.346582 (+0.96z)| norm 0.2426 (-0.86z)| lr 3.34e-05 | 2532.20 ms | 53.3% bf16 MFU | 206973 tok/s +step 16704/19560 | loss 3.263470 (-1.22z)| norm 0.2471 (-0.63z)| lr 3.33e-05 | 2533.33 ms | 53.3% bf16 MFU | 206972 tok/s +step 16705/19560 | loss 3.264728 (-1.17z)| norm 0.2374 (-1.11z)| lr 3.33e-05 | 2533.81 ms | 53.3% bf16 MFU | 206969 tok/s +step 16706/19560 | loss 3.306230 (-0.09z)| norm 0.2510 (-0.42z)| lr 3.33e-05 | 2532.98 ms | 53.3% bf16 MFU | 206970 tok/s +step 16707/19560 | loss 3.320138 (+0.27z)| norm 0.2577 (-0.09z)| lr 3.33e-05 | 2533.79 ms | 53.3% bf16 MFU | 206967 tok/s +step 16708/19560 | loss 3.322521 (+0.33z)| norm 0.2474 (-0.60z)| lr 3.32e-05 | 2533.44 ms | 53.3% bf16 MFU | 206966 tok/s +step 16709/19560 | loss 3.251976 (-1.49z)| norm 0.2513 (-0.40z)| lr 3.32e-05 | 2533.21 ms | 53.3% bf16 MFU | 206966 tok/s +step 16710/19560 | loss 3.339567 (+0.79z)| norm 0.2566 (-0.14z)| lr 3.32e-05 | 2534.03 ms | 53.3% bf16 MFU | 206963 tok/s +step 16711/19560 | loss 3.302982 (-0.15z)| norm 0.2576 (-0.09z)| lr 3.32e-05 | 2535.27 ms | 53.3% bf16 MFU | 206955 tok/s +step 16712/19560 | loss 3.318509 (+0.25z)| norm 0.2471 (-0.60z)| lr 3.32e-05 | 2533.10 ms | 53.3% bf16 MFU | 206956 tok/s +step 16713/19560 | loss 3.281406 (-0.71z)| norm 0.2457 (-0.67z)| lr 3.31e-05 | 2536.05 ms | 53.2% bf16 MFU | 206945 tok/s +step 16714/19560 | loss 3.354899 (+1.18z)| norm 0.2590 (-0.02z)| lr 3.31e-05 | 2534.78 ms | 53.3% bf16 MFU | 206939 tok/s +step 16715/19560 | loss 3.336070 (+0.68z)| norm 0.2648 (+0.27z)| lr 3.31e-05 | 2534.80 ms | 53.3% bf16 MFU | 206934 tok/s +step 16716/19560 | loss 3.274434 (-0.91z)| norm 0.2542 (-0.25z)| lr 3.31e-05 | 2534.93 ms | 53.3% bf16 MFU | 206929 tok/s +step 16717/19560 | loss 3.275318 (-0.87z)| norm 0.2522 (-0.36z)| lr 3.30e-05 | 2533.19 ms | 53.3% bf16 MFU | 206931 tok/s +step 16718/19560 | loss 3.264811 (-1.13z)| norm 0.2525 (-0.34z)| lr 3.30e-05 | 2534.62 ms | 53.3% bf16 MFU | 206927 tok/s +step 16719/19560 | loss 3.246755 (-1.58z)| norm 0.2412 (-0.90z)| lr 3.30e-05 | 2534.63 ms | 53.3% bf16 MFU | 206923 tok/s +step 16720/19560 | loss 3.281298 (-0.67z)| norm 0.2366 (-1.12z)| lr 3.30e-05 | 2534.10 ms | 53.3% bf16 MFU | 206921 tok/s +step 16721/19560 | loss 3.323956 (+0.44z)| norm 0.2548 (-0.20z)| lr 3.29e-05 | 2533.63 ms | 53.3% bf16 MFU | 206922 tok/s +step 16722/19560 | loss 3.301482 (-0.14z)| norm 0.2401 (-0.93z)| lr 3.29e-05 | 2534.04 ms | 53.3% bf16 MFU | 206921 tok/s +step 16723/19560 | loss 3.443961 (+3.39z)| norm 0.3123 (+2.58z)| lr 3.29e-05 | 2534.05 ms | 53.3% bf16 MFU | 206920 tok/s +step 16724/19560 | loss 3.338139 (+0.78z)| norm 0.2585 (-0.04z)| lr 3.29e-05 | 2534.24 ms | 53.3% bf16 MFU | 206918 tok/s +step 16725/19560 | loss 3.307971 (+0.02z)| norm 0.2606 (+0.06z)| lr 3.29e-05 | 2533.16 ms | 53.3% bf16 MFU | 206920 tok/s +step 16726/19560 | loss 3.324606 (+0.43z)| norm 0.2500 (-0.46z)| lr 3.28e-05 | 2535.59 ms | 53.2% bf16 MFU | 206913 tok/s +step 16727/19560 | loss 3.286556 (-0.53z)| norm 0.2517 (-0.38z)| lr 3.28e-05 | 2533.08 ms | 53.3% bf16 MFU | 206916 tok/s +step 16728/19560 | loss 3.302253 (-0.13z)| norm 0.2428 (-0.81z)| lr 3.28e-05 | 2532.69 ms | 53.3% bf16 MFU | 206921 tok/s +step 16729/19560 | loss 3.344546 (+0.95z)| norm 0.2547 (-0.22z)| lr 3.28e-05 | 2534.85 ms | 53.3% bf16 MFU | 206916 tok/s +step 16730/19560 | loss 3.298110 (-0.22z)| norm 0.2514 (-0.38z)| lr 3.27e-05 | 2533.77 ms | 53.3% bf16 MFU | 206916 tok/s +step 16731/19560 | loss 3.306149 (-0.01z)| norm 0.2546 (-0.23z)| lr 3.27e-05 | 2532.76 ms | 53.3% bf16 MFU | 206921 tok/s +step 16732/19560 | loss 3.361012 (+1.43z)| norm 0.2700 (+0.52z)| lr 3.27e-05 | 2532.32 ms | 53.3% bf16 MFU | 206926 tok/s +step 16733/19560 | loss 3.280796 (-0.68z)| norm 0.2517 (-0.37z)| lr 3.27e-05 | 2533.70 ms | 53.3% bf16 MFU | 206926 tok/s +step 16734/19560 | loss 3.379688 (+1.88z)| norm 0.2540 (-0.25z)| lr 3.27e-05 | 2534.38 ms | 53.3% bf16 MFU | 206924 tok/s +step 16735/19560 | loss 3.280391 (-0.69z)| norm 0.2431 (-0.78z)| lr 3.26e-05 | 2533.61 ms | 53.3% bf16 MFU | 206924 tok/s +step 16736/19560 | loss 3.331828 (+0.63z)| norm 0.2574 (-0.09z)| lr 3.26e-05 | 2532.93 ms | 53.3% bf16 MFU | 206927 tok/s +step 16737/19560 | loss 3.303797 (-0.12z)| norm 0.2552 (-0.18z)| lr 3.26e-05 | 2534.78 ms | 53.3% bf16 MFU | 206923 tok/s +step 16738/19560 | loss 3.316987 (+0.23z)| norm 0.2602 (+0.06z)| lr 3.26e-05 | 2533.60 ms | 53.3% bf16 MFU | 206923 tok/s +step 16739/19560 | loss 3.335283 (+0.72z)| norm 0.2603 (+0.06z)| lr 3.25e-05 | 2532.42 ms | 53.3% bf16 MFU | 206929 tok/s +step 16740/19560 | loss 3.312654 (+0.12z)| norm 0.2462 (-0.65z)| lr 3.25e-05 | 2532.59 ms | 53.3% bf16 MFU | 206933 tok/s +step 16741/19560 | loss 3.299157 (-0.23z)| norm 0.2595 (+0.02z)| lr 3.25e-05 | 2531.76 ms | 53.3% bf16 MFU | 206941 tok/s +step 16742/19560 | loss 3.316308 (+0.22z)| norm 0.2640 (+0.24z)| lr 3.25e-05 | 2532.27 ms | 53.3% bf16 MFU | 206946 tok/s +step 16743/19560 | loss 3.305749 (-0.06z)| norm 0.2479 (-0.57z)| lr 3.24e-05 | 2532.22 ms | 53.3% bf16 MFU | 206951 tok/s +step 16744/19560 | loss 3.390696 (+2.18z)| norm 0.2406 (-0.94z)| lr 3.24e-05 | 2532.49 ms | 53.3% bf16 MFU | 206955 tok/s +step 16745/19560 | loss 3.314551 (+0.16z)| norm 0.2431 (-0.81z)| lr 3.24e-05 | 2531.96 ms | 53.3% bf16 MFU | 206960 tok/s +step 16746/19560 | loss 3.310104 (+0.04z)| norm 0.2518 (-0.37z)| lr 3.24e-05 | 2533.52 ms | 53.3% bf16 MFU | 206959 tok/s +step 16747/19560 | loss 3.315234 (+0.19z)| norm 0.2569 (-0.11z)| lr 3.24e-05 | 2533.73 ms | 53.3% bf16 MFU | 206957 tok/s +step 16748/19560 | loss 3.360392 (+1.37z)| norm 0.2425 (-0.83z)| lr 3.23e-05 | 2532.76 ms | 53.3% bf16 MFU | 206960 tok/s +step 16749/19560 | loss 3.318491 (+0.27z)| norm 0.2463 (-0.64z)| lr 3.23e-05 | 2533.38 ms | 53.3% bf16 MFU | 206959 tok/s +step 16750/19560 | loss 3.280474 (-0.74z)| norm 0.2416 (-0.87z)| lr 3.23e-05 | 2531.68 ms | 53.3% bf16 MFU | 206966 tok/s +val loss 3.295071 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3023/10042 = 0.301036 +step 16751/19560 | loss 3.291645 (-0.44z)| norm 0.2728 (+0.69z)| lr 3.23e-05 | 2533.19 ms | 53.3% bf16 MFU | 206966 tok/s +step 16752/19560 | loss 3.295804 (-0.33z)| norm 0.2525 (-0.32z)| lr 3.22e-05 | 2533.69 ms | 53.3% bf16 MFU | 206964 tok/s +step 16753/19560 | loss 3.294187 (-0.37z)| norm 0.2408 (-0.90z)| lr 3.22e-05 | 2534.12 ms | 53.3% bf16 MFU | 206960 tok/s +step 16754/19560 | loss 3.307042 (-0.03z)| norm 0.2559 (-0.14z)| lr 3.22e-05 | 2531.74 ms | 53.3% bf16 MFU | 206967 tok/s +step 16755/19560 | loss 3.320400 (+0.33z)| norm 0.2448 (-0.69z)| lr 3.22e-05 | 2532.29 ms | 53.3% bf16 MFU | 206970 tok/s +step 16756/19560 | loss 3.263487 (-1.19z)| norm 0.2572 (-0.08z)| lr 3.22e-05 | 2534.49 ms | 53.3% bf16 MFU | 206965 tok/s +step 16757/19560 | loss 3.337292 (+0.77z)| norm 0.2499 (-0.45z)| lr 3.21e-05 | 2532.55 ms | 53.3% bf16 MFU | 206968 tok/s +step 16758/19560 | loss 3.267484 (-1.07z)| norm 0.2428 (-0.80z)| lr 3.21e-05 | 2532.70 ms | 53.3% bf16 MFU | 206970 tok/s +step 16759/19560 | loss 3.283661 (-0.64z)| norm 0.2412 (-0.87z)| lr 3.21e-05 | 2532.70 ms | 53.3% bf16 MFU | 206972 tok/s +step 16760/19560 | loss 3.394976 (+2.25z)| norm 0.2592 (+0.02z)| lr 3.21e-05 | 2533.22 ms | 53.3% bf16 MFU | 206971 tok/s +step 16761/19560 | loss 3.346804 (+0.99z)| norm 0.2411 (-0.88z)| lr 3.20e-05 | 2533.27 ms | 53.3% bf16 MFU | 206971 tok/s +step 16762/19560 | loss 3.298186 (-0.26z)| norm 0.2438 (-0.74z)| lr 3.20e-05 | 2531.92 ms | 53.3% bf16 MFU | 206976 tok/s +step 16763/19560 | loss 3.301496 (-0.18z)| norm 0.2412 (-0.86z)| lr 3.20e-05 | 2533.42 ms | 53.3% bf16 MFU | 206974 tok/s +step 16764/19560 | loss 3.305175 (-0.10z)| norm 0.2437 (-0.73z)| lr 3.20e-05 | 2534.14 ms | 53.3% bf16 MFU | 206970 tok/s +step 16765/19560 | loss 3.352670 (+1.15z)| norm 0.2591 (+0.03z)| lr 3.20e-05 | 2533.46 ms | 53.3% bf16 MFU | 206969 tok/s +step 16766/19560 | loss 3.330534 (+0.56z)| norm 0.2444 (-0.69z)| lr 3.19e-05 | 2534.44 ms | 53.3% bf16 MFU | 206964 tok/s +step 16767/19560 | loss 3.290726 (-0.50z)| norm 0.2673 (+0.45z)| lr 3.19e-05 | 2534.18 ms | 53.3% bf16 MFU | 206960 tok/s +step 16768/19560 | loss 3.299113 (-0.27z)| norm 0.2416 (-0.82z)| lr 3.19e-05 | 2535.99 ms | 53.2% bf16 MFU | 206949 tok/s +step 16769/19560 | loss 3.274758 (-0.91z)| norm 0.2381 (-0.99z)| lr 3.19e-05 | 2531.68 ms | 53.3% bf16 MFU | 206956 tok/s +step 16770/19560 | loss 3.275085 (-0.89z)| norm 0.2556 (-0.12z)| lr 3.18e-05 | 2534.95 ms | 53.3% bf16 MFU | 206949 tok/s +step 16771/19560 | loss 3.281013 (-0.73z)| norm 0.2428 (-0.76z)| lr 3.18e-05 | 2535.41 ms | 53.3% bf16 MFU | 206941 tok/s +step 16772/19560 | loss 3.274007 (-0.93z)| norm 0.2430 (-0.75z)| lr 3.18e-05 | 2533.97 ms | 53.3% bf16 MFU | 206939 tok/s +step 16773/19560 | loss 3.307526 (-0.04z)| norm 0.2596 (+0.07z)| lr 3.18e-05 | 2531.44 ms | 53.3% bf16 MFU | 206948 tok/s +step 16774/19560 | loss 3.341515 (+0.88z)| norm 0.2638 (+0.27z)| lr 3.18e-05 | 2532.53 ms | 53.3% bf16 MFU | 206951 tok/s +step 16775/19560 | loss 3.283241 (-0.70z)| norm 0.2438 (-0.72z)| lr 3.17e-05 | 2534.69 ms | 53.3% bf16 MFU | 206946 tok/s +step 16776/19560 | loss 3.333527 (+0.67z)| norm 0.2391 (-0.94z)| lr 3.17e-05 | 2532.21 ms | 53.3% bf16 MFU | 206951 tok/s +step 16777/19560 | loss 3.269839 (-1.06z)| norm 0.2573 (-0.04z)| lr 3.17e-05 | 2533.02 ms | 53.3% bf16 MFU | 206953 tok/s +step 16778/19560 | loss 3.301317 (-0.20z)| norm 0.2678 (+0.47z)| lr 3.17e-05 | 2533.82 ms | 53.3% bf16 MFU | 206951 tok/s +step 16779/19560 | loss 3.265418 (-1.16z)| norm 0.2737 (+0.75z)| lr 3.16e-05 | 2533.42 ms | 53.3% bf16 MFU | 206951 tok/s +step 16780/19560 | loss 3.310056 (+0.08z)| norm 0.2574 (-0.03z)| lr 3.16e-05 | 2534.29 ms | 53.3% bf16 MFU | 206947 tok/s +step 16781/19560 | loss 3.383955 (+2.11z)| norm 0.2396 (-0.94z)| lr 3.16e-05 | 2532.87 ms | 53.3% bf16 MFU | 206949 tok/s +step 16782/19560 | loss 3.288394 (-0.53z)| norm 0.2532 (-0.23z)| lr 3.16e-05 | 2533.63 ms | 53.3% bf16 MFU | 206949 tok/s +step 16783/19560 | loss 3.316788 (+0.26z)| norm 0.2611 (+0.18z)| lr 3.16e-05 | 2531.73 ms | 53.3% bf16 MFU | 206955 tok/s +step 16784/19560 | loss 3.264590 (-1.17z)| norm 0.2529 (-0.24z)| lr 3.15e-05 | 2532.69 ms | 53.3% bf16 MFU | 206958 tok/s +step 16785/19560 | loss 3.209727 (-2.60z)| norm 0.2579 (+0.01z)| lr 3.15e-05 | 2533.78 ms | 53.3% bf16 MFU | 206956 tok/s +step 16786/19560 | loss 3.262580 (-1.15z)| norm 0.2443 (-0.68z)| lr 3.15e-05 | 2531.59 ms | 53.3% bf16 MFU | 206963 tok/s +step 16787/19560 | loss 3.313612 (+0.21z)| norm 0.2531 (-0.22z)| lr 3.15e-05 | 2532.65 ms | 53.3% bf16 MFU | 206966 tok/s +step 16788/19560 | loss 3.311448 (+0.16z)| norm 0.2539 (-0.18z)| lr 3.14e-05 | 2532.92 ms | 53.3% bf16 MFU | 206967 tok/s +step 16789/19560 | loss 3.273692 (-0.86z)| norm 0.2612 (+0.19z)| lr 3.14e-05 | 2534.36 ms | 53.3% bf16 MFU | 206962 tok/s +step 16790/19560 | loss 3.311069 (+0.16z)| norm 0.2478 (-0.50z)| lr 3.14e-05 | 2533.13 ms | 53.3% bf16 MFU | 206963 tok/s +step 16791/19560 | loss 3.321401 (+0.43z)| norm 0.2430 (-0.73z)| lr 3.14e-05 | 2534.92 ms | 53.3% bf16 MFU | 206956 tok/s +step 16792/19560 | loss 3.240269 (-1.73z)| norm 0.2610 (+0.20z)| lr 3.14e-05 | 2533.17 ms | 53.3% bf16 MFU | 206956 tok/s +step 16793/19560 | loss 3.278946 (-0.69z)| norm 0.2547 (-0.13z)| lr 3.13e-05 | 2533.27 ms | 53.3% bf16 MFU | 206957 tok/s +step 16794/19560 | loss 3.299029 (-0.15z)| norm 0.2535 (-0.19z)| lr 3.13e-05 | 2533.12 ms | 53.3% bf16 MFU | 206957 tok/s +step 16795/19560 | loss 3.252653 (-1.37z)| norm 0.2455 (-0.59z)| lr 3.13e-05 | 2532.54 ms | 53.3% bf16 MFU | 206961 tok/s +step 16796/19560 | loss 3.483968 (+4.38z)| norm 0.3061 (+2.45z)| lr 3.13e-05 | 2533.48 ms | 53.3% bf16 MFU | 206960 tok/s +step 16797/19560 | loss 3.358619 (+1.28z)| norm 0.2557 (-0.09z)| lr 3.12e-05 | 2531.84 ms | 53.3% bf16 MFU | 206966 tok/s +step 16798/19560 | loss 3.296731 (-0.25z)| norm 0.2457 (-0.58z)| lr 3.12e-05 | 2533.55 ms | 53.3% bf16 MFU | 206964 tok/s +step 16799/19560 | loss 3.274467 (-0.80z)| norm 0.2668 (+0.48z)| lr 3.12e-05 | 2534.04 ms | 53.3% bf16 MFU | 206961 tok/s +step 16800/19560 | loss 3.296127 (-0.26z)| norm 0.2613 (+0.20z)| lr 3.12e-05 | 2532.01 ms | 53.3% bf16 MFU | 206966 tok/s +step 16801/19560 | loss 3.258422 (-1.20z)| norm 0.2542 (-0.13z)| lr 3.12e-05 | 2533.58 ms | 53.3% bf16 MFU | 206965 tok/s +step 16802/19560 | loss 3.382698 (+1.85z)| norm 0.2516 (-0.31z)| lr 3.11e-05 | 2531.03 ms | 53.3% bf16 MFU | 206974 tok/s +step 16803/19560 | loss 3.341098 (+0.81z)| norm 0.2563 (+0.09z)| lr 3.11e-05 | 2533.98 ms | 53.3% bf16 MFU | 206970 tok/s +step 16804/19560 | loss 3.239684 (-1.67z)| norm 0.2648 (+0.80z)| lr 3.11e-05 | 2535.06 ms | 53.3% bf16 MFU | 206962 tok/s +step 16805/19560 | loss 3.425006 (+2.76z)| norm 0.2834 (+2.36z)| lr 3.11e-05 | 2531.89 ms | 53.3% bf16 MFU | 206968 tok/s +step 16806/19560 | loss 3.219276 (-2.07z)| norm 0.2758 (+1.70z)| lr 3.10e-05 | 2535.40 ms | 53.3% bf16 MFU | 206959 tok/s +step 16807/19560 | loss 3.268294 (-0.92z)| norm 0.2653 (+0.83z)| lr 3.10e-05 | 2534.48 ms | 53.3% bf16 MFU | 206954 tok/s +step 16808/19560 | loss 3.284194 (-0.54z)| norm 0.2538 (-0.11z)| lr 3.10e-05 | 2536.07 ms | 53.2% bf16 MFU | 206943 tok/s +step 16809/19560 | loss 3.287096 (-0.47z)| norm 0.2523 (-0.23z)| lr 3.10e-05 | 2534.09 ms | 53.3% bf16 MFU | 206940 tok/s +step 16810/19560 | loss 3.394189 (+1.99z)| norm 0.2622 (+0.59z)| lr 3.10e-05 | 2532.45 ms | 53.3% bf16 MFU | 206945 tok/s +step 16811/19560 | loss 3.251805 (-1.28z)| norm 0.2654 (+0.85z)| lr 3.09e-05 | 2532.24 ms | 53.3% bf16 MFU | 206950 tok/s +step 16812/19560 | loss 3.292049 (-0.35z)| norm 0.2556 (+0.04z)| lr 3.09e-05 | 2532.67 ms | 53.3% bf16 MFU | 206953 tok/s +step 16813/19560 | loss 3.333779 (+0.61z)| norm 0.2586 (+0.28z)| lr 3.09e-05 | 2532.82 ms | 53.3% bf16 MFU | 206955 tok/s +step 16814/19560 | loss 3.317673 (+0.23z)| norm 0.2647 (+0.82z)| lr 3.09e-05 | 2532.73 ms | 53.3% bf16 MFU | 206958 tok/s +step 16815/19560 | loss 3.310600 (+0.08z)| norm 0.2605 (+0.47z)| lr 3.08e-05 | 2533.70 ms | 53.3% bf16 MFU | 206956 tok/s +step 16816/19560 | loss 3.325035 (+0.42z)| norm 0.2678 (+1.09z)| lr 3.08e-05 | 2533.73 ms | 53.3% bf16 MFU | 206954 tok/s +step 16817/19560 | loss 3.327809 (+0.47z)| norm 0.2440 (-0.92z)| lr 3.08e-05 | 2533.67 ms | 53.3% bf16 MFU | 206953 tok/s +step 16818/19560 | loss 3.325936 (+0.42z)| norm 0.2884 (+2.75z)| lr 3.08e-05 | 2533.79 ms | 53.3% bf16 MFU | 206951 tok/s +step 16819/19560 | loss 3.220096 (-2.02z)| norm 0.2630 (+0.65z)| lr 3.08e-05 | 2531.68 ms | 53.3% bf16 MFU | 206958 tok/s +step 16820/19560 | loss 3.288588 (-0.43z)| norm 0.2673 (+1.00z)| lr 3.07e-05 | 2536.47 ms | 53.2% bf16 MFU | 206945 tok/s +step 16821/19560 | loss 3.253733 (-1.22z)| norm 0.2603 (+0.43z)| lr 3.07e-05 | 2532.64 ms | 53.3% bf16 MFU | 206949 tok/s +step 16822/19560 | loss 3.388740 (+1.87z)| norm 0.2662 (+0.91z)| lr 3.07e-05 | 2532.97 ms | 53.3% bf16 MFU | 206951 tok/s +step 16823/19560 | loss 3.280147 (-0.61z)| norm 0.2570 (+0.15z)| lr 3.07e-05 | 2532.99 ms | 53.3% bf16 MFU | 206952 tok/s +step 16824/19560 | loss 3.315339 (+0.21z)| norm 0.2522 (-0.23z)| lr 3.06e-05 | 2534.06 ms | 53.3% bf16 MFU | 206949 tok/s +step 16825/19560 | loss 3.253016 (-1.22z)| norm 0.2417 (-1.09z)| lr 3.06e-05 | 2534.77 ms | 53.3% bf16 MFU | 206944 tok/s +step 16826/19560 | loss 3.250543 (-1.26z)| norm 0.2394 (-1.26z)| lr 3.06e-05 | 2534.09 ms | 53.3% bf16 MFU | 206941 tok/s +step 16827/19560 | loss 3.286277 (-0.43z)| norm 0.2617 (+0.60z)| lr 3.06e-05 | 2532.93 ms | 53.3% bf16 MFU | 206944 tok/s +step 16828/19560 | loss 3.284484 (-0.48z)| norm 0.2576 (+0.25z)| lr 3.06e-05 | 2534.37 ms | 53.3% bf16 MFU | 206940 tok/s +step 16829/19560 | loss 3.327979 (+0.51z)| norm 0.2569 (+0.18z)| lr 3.05e-05 | 2535.28 ms | 53.3% bf16 MFU | 206933 tok/s +step 16830/19560 | loss 3.254146 (-1.20z)| norm 0.2451 (-0.79z)| lr 3.05e-05 | 2534.02 ms | 53.3% bf16 MFU | 206931 tok/s +step 16831/19560 | loss 3.374429 (+1.57z)| norm 0.2723 (+1.47z)| lr 3.05e-05 | 2532.02 ms | 53.3% bf16 MFU | 206938 tok/s +step 16832/19560 | loss 3.288375 (-0.42z)| norm 0.2438 (-0.91z)| lr 3.05e-05 | 2534.44 ms | 53.3% bf16 MFU | 206934 tok/s +step 16833/19560 | loss 3.311668 (+0.11z)| norm 0.2454 (-0.79z)| lr 3.04e-05 | 2532.25 ms | 53.3% bf16 MFU | 206940 tok/s +step 16834/19560 | loss 3.341155 (+0.79z)| norm 0.2516 (-0.27z)| lr 3.04e-05 | 2533.58 ms | 53.3% bf16 MFU | 206940 tok/s +step 16835/19560 | loss 3.287187 (-0.45z)| norm 0.2540 (-0.07z)| lr 3.04e-05 | 2533.11 ms | 53.3% bf16 MFU | 206941 tok/s +step 16836/19560 | loss 3.307329 (+0.02z)| norm 0.2420 (-1.07z)| lr 3.04e-05 | 2535.38 ms | 53.3% bf16 MFU | 206934 tok/s +step 16837/19560 | loss 3.305388 (-0.04z)| norm 0.2588 (+0.34z)| lr 3.04e-05 | 2533.97 ms | 53.3% bf16 MFU | 206932 tok/s +step 16838/19560 | loss 3.365016 (+1.34z)| norm 0.2411 (-1.13z)| lr 3.03e-05 | 2534.44 ms | 53.3% bf16 MFU | 206929 tok/s +step 16839/19560 | loss 3.320399 (+0.30z)| norm 0.2524 (-0.19z)| lr 3.03e-05 | 2533.59 ms | 53.3% bf16 MFU | 206929 tok/s +step 16840/19560 | loss 3.359106 (+1.19z)| norm 0.2450 (-0.80z)| lr 3.03e-05 | 2534.56 ms | 53.3% bf16 MFU | 206925 tok/s +step 16841/19560 | loss 3.263691 (-1.01z)| norm 0.2602 (+0.46z)| lr 3.03e-05 | 2534.65 ms | 53.3% bf16 MFU | 206922 tok/s +step 16842/19560 | loss 3.350958 (+1.00z)| norm 0.2462 (-0.70z)| lr 3.02e-05 | 2534.49 ms | 53.3% bf16 MFU | 206919 tok/s +step 16843/19560 | loss 3.277128 (-0.69z)| norm 0.2578 (+0.27z)| lr 3.02e-05 | 2532.49 ms | 53.3% bf16 MFU | 206924 tok/s +step 16844/19560 | loss 3.273531 (-0.77z)| norm 0.2545 (-0.01z)| lr 3.02e-05 | 2534.43 ms | 53.3% bf16 MFU | 206921 tok/s +step 16845/19560 | loss 3.306318 (-0.02z)| norm 0.2499 (-0.39z)| lr 3.02e-05 | 2535.36 ms | 53.3% bf16 MFU | 206914 tok/s +step 16846/19560 | loss 3.279417 (-0.65z)| norm 0.2431 (-0.95z)| lr 3.02e-05 | 2534.30 ms | 53.3% bf16 MFU | 206913 tok/s +step 16847/19560 | loss 3.324233 (+0.38z)| norm 0.2403 (-1.18z)| lr 3.01e-05 | 2535.57 ms | 53.2% bf16 MFU | 206906 tok/s +step 16848/19560 | loss 3.303302 (-0.12z)| norm 0.2410 (-1.13z)| lr 3.01e-05 | 2532.16 ms | 53.3% bf16 MFU | 206913 tok/s +step 16849/19560 | loss 3.306520 (-0.04z)| norm 0.2592 (+0.39z)| lr 3.01e-05 | 2533.47 ms | 53.3% bf16 MFU | 206914 tok/s +step 16850/19560 | loss 3.284894 (-0.54z)| norm 0.2421 (-1.04z)| lr 3.01e-05 | 2533.76 ms | 53.3% bf16 MFU | 206915 tok/s +step 16851/19560 | loss 3.308918 (+0.05z)| norm 0.2563 (+0.20z)| lr 3.01e-05 | 2534.57 ms | 53.3% bf16 MFU | 206912 tok/s +step 16852/19560 | loss 3.306944 (+0.01z)| norm 0.2516 (-0.23z)| lr 3.00e-05 | 2531.92 ms | 53.3% bf16 MFU | 206920 tok/s +step 16853/19560 | loss 3.266937 (-0.96z)| norm 0.2517 (-0.22z)| lr 3.00e-05 | 2534.60 ms | 53.3% bf16 MFU | 206916 tok/s +step 16854/19560 | loss 3.254457 (-1.24z)| norm 0.2436 (-0.96z)| lr 3.00e-05 | 2535.41 ms | 53.3% bf16 MFU | 206910 tok/s +step 16855/19560 | loss 3.303323 (-0.06z)| norm 0.2500 (-0.36z)| lr 3.00e-05 | 2532.57 ms | 53.3% bf16 MFU | 206915 tok/s +step 16856/19560 | loss 3.338957 (+0.79z)| norm 0.2506 (-0.32z)| lr 2.99e-05 | 2535.65 ms | 53.2% bf16 MFU | 206908 tok/s +step 16857/19560 | loss 3.314538 (+0.21z)| norm 0.2472 (-0.63z)| lr 2.99e-05 | 2534.44 ms | 53.3% bf16 MFU | 206906 tok/s +step 16858/19560 | loss 3.254063 (-1.24z)| norm 0.2429 (-1.02z)| lr 2.99e-05 | 2533.92 ms | 53.3% bf16 MFU | 206906 tok/s +step 16859/19560 | loss 3.300630 (-0.12z)| norm 0.2511 (-0.25z)| lr 2.99e-05 | 2534.78 ms | 53.3% bf16 MFU | 206902 tok/s +step 16860/19560 | loss 3.287090 (-0.43z)| norm 0.2418 (-1.10z)| lr 2.99e-05 | 2532.82 ms | 53.3% bf16 MFU | 206907 tok/s +step 16861/19560 | loss 3.288543 (-0.40z)| norm 0.2405 (-1.20z)| lr 2.98e-05 | 2533.20 ms | 53.3% bf16 MFU | 206910 tok/s +step 16862/19560 | loss 3.269513 (-0.85z)| norm 0.2519 (-0.15z)| lr 2.98e-05 | 2532.14 ms | 53.3% bf16 MFU | 206917 tok/s +step 16863/19560 | loss 3.232289 (-1.73z)| norm 0.2360 (-1.60z)| lr 2.98e-05 | 2533.07 ms | 53.3% bf16 MFU | 206920 tok/s +step 16864/19560 | loss 3.301434 (-0.05z)| norm 0.2411 (-1.11z)| lr 2.98e-05 | 2534.81 ms | 53.3% bf16 MFU | 206916 tok/s +step 16865/19560 | loss 3.346537 (+1.03z)| norm 0.2460 (-0.66z)| lr 2.97e-05 | 2533.00 ms | 53.3% bf16 MFU | 206919 tok/s +step 16866/19560 | loss 3.277979 (-0.62z)| norm 0.2729 (+1.75z)| lr 2.97e-05 | 2533.23 ms | 53.3% bf16 MFU | 206922 tok/s +step 16867/19560 | loss 3.243390 (-1.43z)| norm 0.2531 (-0.02z)| lr 2.97e-05 | 2532.53 ms | 53.3% bf16 MFU | 206927 tok/s +step 16868/19560 | loss 3.300853 (-0.05z)| norm 0.2334 (-1.77z)| lr 2.97e-05 | 2533.63 ms | 53.3% bf16 MFU | 206927 tok/s +step 16869/19560 | loss 3.277477 (-0.60z)| norm 0.2460 (-0.63z)| lr 2.97e-05 | 2533.73 ms | 53.3% bf16 MFU | 206927 tok/s +step 16870/19560 | loss 3.281650 (-0.50z)| norm 0.2466 (-0.57z)| lr 2.96e-05 | 2533.44 ms | 53.3% bf16 MFU | 206928 tok/s +step 16871/19560 | loss 3.284642 (-0.42z)| norm 0.2472 (-0.51z)| lr 2.96e-05 | 2533.45 ms | 53.3% bf16 MFU | 206929 tok/s +step 16872/19560 | loss 3.363808 (+1.49z)| norm 0.2457 (-0.65z)| lr 2.96e-05 | 2534.54 ms | 53.3% bf16 MFU | 206925 tok/s +step 16873/19560 | loss 3.425589 (+2.87z)| norm 0.2660 (+1.14z)| lr 2.96e-05 | 2534.48 ms | 53.3% bf16 MFU | 206922 tok/s +step 16874/19560 | loss 3.264885 (-0.88z)| norm 0.2316 (-1.88z)| lr 2.96e-05 | 2533.75 ms | 53.3% bf16 MFU | 206922 tok/s +step 16875/19560 | loss 3.277069 (-0.59z)| norm 0.2476 (-0.47z)| lr 2.95e-05 | 2533.32 ms | 53.3% bf16 MFU | 206924 tok/s +step 16876/19560 | loss 3.253184 (-1.12z)| norm 0.2378 (-1.32z)| lr 2.95e-05 | 2534.30 ms | 53.3% bf16 MFU | 206921 tok/s +step 16877/19560 | loss 3.281219 (-0.46z)| norm 0.2544 (+0.13z)| lr 2.95e-05 | 2535.87 ms | 53.2% bf16 MFU | 206913 tok/s +step 16878/19560 | loss 3.341818 (+0.94z)| norm 0.2403 (-1.11z)| lr 2.95e-05 | 2533.53 ms | 53.3% bf16 MFU | 206914 tok/s +step 16879/19560 | loss 3.316110 (+0.33z)| norm 0.2594 (+0.58z)| lr 2.94e-05 | 2533.45 ms | 53.3% bf16 MFU | 206916 tok/s +step 16880/19560 | loss 3.304034 (+0.05z)| norm 0.2444 (-0.74z)| lr 2.94e-05 | 2534.24 ms | 53.3% bf16 MFU | 206914 tok/s +step 16881/19560 | loss 3.269439 (-0.75z)| norm 0.2469 (-0.53z)| lr 2.94e-05 | 2533.54 ms | 53.3% bf16 MFU | 206915 tok/s +step 16882/19560 | loss 3.287978 (-0.31z)| norm 0.2467 (-0.54z)| lr 2.94e-05 | 2534.36 ms | 53.3% bf16 MFU | 206913 tok/s +step 16883/19560 | loss 3.309879 (+0.20z)| norm 0.2395 (-1.17z)| lr 2.94e-05 | 2532.21 ms | 53.3% bf16 MFU | 206920 tok/s +step 16884/19560 | loss 3.246479 (-1.27z)| norm 0.2561 (+0.30z)| lr 2.93e-05 | 2534.52 ms | 53.3% bf16 MFU | 206917 tok/s +step 16885/19560 | loss 3.271987 (-0.67z)| norm 0.2557 (+0.26z)| lr 2.93e-05 | 2532.31 ms | 53.3% bf16 MFU | 206923 tok/s +step 16886/19560 | loss 3.350619 (+1.14z)| norm 0.2727 (+1.73z)| lr 2.93e-05 | 2533.76 ms | 53.3% bf16 MFU | 206923 tok/s +step 16887/19560 | loss 3.280008 (-0.49z)| norm 0.2455 (-0.66z)| lr 2.93e-05 | 2531.76 ms | 53.3% bf16 MFU | 206931 tok/s +step 16888/19560 | loss 3.249768 (-1.18z)| norm 0.2373 (-1.36z)| lr 2.92e-05 | 2534.23 ms | 53.3% bf16 MFU | 206928 tok/s +step 16889/19560 | loss 3.239309 (-1.40z)| norm 0.2724 (+1.67z)| lr 2.92e-05 | 2533.66 ms | 53.3% bf16 MFU | 206928 tok/s +step 16890/19560 | loss 3.284009 (-0.36z)| norm 0.2535 (+0.03z)| lr 2.92e-05 | 2532.78 ms | 53.3% bf16 MFU | 206932 tok/s +step 16891/19560 | loss 3.278669 (-0.48z)| norm 0.2456 (-0.66z)| lr 2.92e-05 | 2533.36 ms | 53.3% bf16 MFU | 206933 tok/s +step 16892/19560 | loss 3.304166 (+0.12z)| norm 0.2457 (-0.66z)| lr 2.92e-05 | 2534.21 ms | 53.3% bf16 MFU | 206931 tok/s +step 16893/19560 | loss 3.256771 (-0.97z)| norm 0.2792 (+2.21z)| lr 2.91e-05 | 2533.07 ms | 53.3% bf16 MFU | 206933 tok/s +step 16894/19560 | loss 3.297868 (-0.01z)| norm 0.2703 (+1.42z)| lr 2.91e-05 | 2536.44 ms | 53.2% bf16 MFU | 206921 tok/s +step 16895/19560 | loss 3.356310 (+1.34z)| norm 0.2768 (+1.95z)| lr 2.91e-05 | 2534.59 ms | 53.3% bf16 MFU | 206918 tok/s +step 16896/19560 | loss 3.255874 (-0.98z)| norm 0.2581 (+0.37z)| lr 2.91e-05 | 2534.70 ms | 53.3% bf16 MFU | 206914 tok/s +step 16897/19560 | loss 3.271896 (-0.61z)| norm 0.2825 (+2.36z)| lr 2.91e-05 | 2534.81 ms | 53.3% bf16 MFU | 206910 tok/s +step 16898/19560 | loss 3.240055 (-1.33z)| norm 0.2504 (-0.31z)| lr 2.90e-05 | 2535.05 ms | 53.3% bf16 MFU | 206906 tok/s +step 16899/19560 | loss 3.330448 (+0.74z)| norm 0.2588 (+0.39z)| lr 2.90e-05 | 2532.54 ms | 53.3% bf16 MFU | 206911 tok/s +step 16900/19560 | loss 3.253268 (-1.03z)| norm 0.2487 (-0.47z)| lr 2.90e-05 | 2533.61 ms | 53.3% bf16 MFU | 206912 tok/s +step 16901/19560 | loss 3.329764 (+0.72z)| norm 0.2657 (+0.95z)| lr 2.90e-05 | 2534.93 ms | 53.3% bf16 MFU | 206908 tok/s +step 16902/19560 | loss 3.230313 (-1.53z)| norm 0.2536 (-0.05z)| lr 2.89e-05 | 2533.64 ms | 53.3% bf16 MFU | 206909 tok/s +step 16903/19560 | loss 3.266580 (-0.70z)| norm 0.2546 (+0.02z)| lr 2.89e-05 | 2533.49 ms | 53.3% bf16 MFU | 206911 tok/s +step 16904/19560 | loss 3.293903 (-0.07z)| norm 0.2496 (-0.40z)| lr 2.89e-05 | 2531.02 ms | 53.3% bf16 MFU | 206923 tok/s +step 16905/19560 | loss 3.323556 (+0.59z)| norm 0.2481 (-0.52z)| lr 2.89e-05 | 2534.98 ms | 53.3% bf16 MFU | 206918 tok/s +step 16906/19560 | loss 3.282125 (-0.35z)| norm 0.2679 (+1.14z)| lr 2.89e-05 | 2532.70 ms | 53.3% bf16 MFU | 206922 tok/s +step 16907/19560 | loss 3.295458 (-0.05z)| norm 0.2488 (-0.46z)| lr 2.88e-05 | 2532.07 ms | 53.3% bf16 MFU | 206929 tok/s +step 16908/19560 | loss 3.300366 (+0.07z)| norm 0.2440 (-0.85z)| lr 2.88e-05 | 2533.30 ms | 53.3% bf16 MFU | 206930 tok/s +step 16909/19560 | loss 3.260254 (-0.84z)| norm 0.2431 (-0.93z)| lr 2.88e-05 | 2533.18 ms | 53.3% bf16 MFU | 206932 tok/s +step 16910/19560 | loss 3.302086 (+0.13z)| norm 0.2516 (-0.21z)| lr 2.88e-05 | 2534.14 ms | 53.3% bf16 MFU | 206930 tok/s +step 16911/19560 | loss 3.286358 (-0.23z)| norm 0.2548 (+0.07z)| lr 2.88e-05 | 2533.30 ms | 53.3% bf16 MFU | 206932 tok/s +step 16912/19560 | loss 3.337308 (+0.93z)| norm 0.2385 (-1.30z)| lr 2.87e-05 | 2534.41 ms | 53.3% bf16 MFU | 206928 tok/s +step 16913/19560 | loss 3.281036 (-0.39z)| norm 0.2357 (-1.52z)| lr 2.87e-05 | 2533.72 ms | 53.3% bf16 MFU | 206928 tok/s +step 16914/19560 | loss 3.252342 (-1.06z)| norm 0.2671 (+1.11z)| lr 2.87e-05 | 2533.03 ms | 53.3% bf16 MFU | 206931 tok/s +step 16915/19560 | loss 3.317147 (+0.46z)| norm 0.2525 (-0.12z)| lr 2.87e-05 | 2532.75 ms | 53.3% bf16 MFU | 206934 tok/s +step 16916/19560 | loss 3.320849 (+0.54z)| norm 0.2515 (-0.20z)| lr 2.86e-05 | 2533.03 ms | 53.3% bf16 MFU | 206937 tok/s +step 16917/19560 | loss 3.218187 (-1.82z)| norm 0.2542 (+0.04z)| lr 2.86e-05 | 2531.04 ms | 53.3% bf16 MFU | 206947 tok/s +step 16918/19560 | loss 3.249598 (-1.08z)| norm 0.2545 (+0.06z)| lr 2.86e-05 | 2534.17 ms | 53.3% bf16 MFU | 206944 tok/s +step 16919/19560 | loss 3.265748 (-0.70z)| norm 0.2423 (-0.97z)| lr 2.86e-05 | 2533.32 ms | 53.3% bf16 MFU | 206945 tok/s +step 16920/19560 | loss 3.328651 (+0.73z)| norm 0.2569 (+0.26z)| lr 2.86e-05 | 2531.50 ms | 53.3% bf16 MFU | 206953 tok/s +step 16921/19560 | loss 3.272102 (-0.57z)| norm 0.2388 (-1.25z)| lr 2.85e-05 | 2532.63 ms | 53.3% bf16 MFU | 206956 tok/s +step 16922/19560 | loss 3.304600 (+0.18z)| norm 0.2385 (-1.25z)| lr 2.85e-05 | 2533.21 ms | 53.3% bf16 MFU | 206956 tok/s +step 16923/19560 | loss 3.311267 (+0.32z)| norm 0.2552 (+0.13z)| lr 2.85e-05 | 2533.67 ms | 53.3% bf16 MFU | 206955 tok/s +step 16924/19560 | loss 3.309426 (+0.34z)| norm 0.2580 (+0.43z)| lr 2.85e-05 | 2532.21 ms | 53.3% bf16 MFU | 206960 tok/s +step 16925/19560 | loss 3.231423 (-1.59z)| norm 0.2332 (-1.77z)| lr 2.85e-05 | 2532.46 ms | 53.3% bf16 MFU | 206963 tok/s +step 16926/19560 | loss 3.400585 (+2.56z)| norm 0.2502 (-0.26z)| lr 2.84e-05 | 2532.95 ms | 53.3% bf16 MFU | 206964 tok/s +step 16927/19560 | loss 3.253304 (-1.03z)| norm 0.2329 (-1.78z)| lr 2.84e-05 | 2533.09 ms | 53.3% bf16 MFU | 206965 tok/s +step 16928/19560 | loss 3.260319 (-0.85z)| norm 0.2396 (-1.16z)| lr 2.84e-05 | 2531.96 ms | 53.3% bf16 MFU | 206970 tok/s +step 16929/19560 | loss 3.316877 (+0.51z)| norm 0.2480 (-0.41z)| lr 2.84e-05 | 2534.01 ms | 53.3% bf16 MFU | 206966 tok/s +step 16930/19560 | loss 3.330087 (+0.86z)| norm 0.2446 (-0.71z)| lr 2.84e-05 | 2532.21 ms | 53.3% bf16 MFU | 206970 tok/s +step 16931/19560 | loss 3.395722 (+2.42z)| norm 0.2580 (+0.47z)| lr 2.83e-05 | 2532.00 ms | 53.3% bf16 MFU | 206975 tok/s +step 16932/19560 | loss 3.346669 (+1.21z)| norm 0.2384 (-1.23z)| lr 2.83e-05 | 2533.20 ms | 53.3% bf16 MFU | 206975 tok/s +step 16933/19560 | loss 3.291840 (-0.10z)| norm 0.2505 (-0.15z)| lr 2.83e-05 | 2532.78 ms | 53.3% bf16 MFU | 206976 tok/s +step 16934/19560 | loss 3.278908 (-0.44z)| norm 0.2429 (-0.83z)| lr 2.83e-05 | 2533.44 ms | 53.3% bf16 MFU | 206975 tok/s +step 16935/19560 | loss 3.250311 (-1.16z)| norm 0.2425 (-0.85z)| lr 2.82e-05 | 2533.11 ms | 53.3% bf16 MFU | 206975 tok/s +step 16936/19560 | loss 3.320101 (+0.61z)| norm 0.2345 (-1.56z)| lr 2.82e-05 | 2532.91 ms | 53.3% bf16 MFU | 206975 tok/s +step 16937/19560 | loss 3.299487 (+0.08z)| norm 0.2474 (-0.38z)| lr 2.82e-05 | 2532.67 ms | 53.3% bf16 MFU | 206977 tok/s +step 16938/19560 | loss 3.324628 (+0.75z)| norm 0.2583 (+0.62z)| lr 2.82e-05 | 2534.55 ms | 53.3% bf16 MFU | 206971 tok/s +step 16939/19560 | loss 3.438070 (+3.51z)| norm 0.2465 (-0.45z)| lr 2.82e-05 | 2533.04 ms | 53.3% bf16 MFU | 206972 tok/s +step 16940/19560 | loss 3.327508 (+0.75z)| norm 0.2480 (-0.31z)| lr 2.81e-05 | 2533.89 ms | 53.3% bf16 MFU | 206968 tok/s +step 16941/19560 | loss 3.305245 (+0.20z)| norm 0.2552 (+0.36z)| lr 2.81e-05 | 2534.47 ms | 53.3% bf16 MFU | 206963 tok/s +step 16942/19560 | loss 3.265345 (-0.79z)| norm 0.2790 (+2.50z)| lr 2.81e-05 | 2533.82 ms | 53.3% bf16 MFU | 206961 tok/s +step 16943/19560 | loss 3.250028 (-1.15z)| norm 0.2579 (+0.60z)| lr 2.81e-05 | 2534.49 ms | 53.3% bf16 MFU | 206956 tok/s +step 16944/19560 | loss 3.332026 (+0.88z)| norm 0.2416 (-0.87z)| lr 2.81e-05 | 2533.74 ms | 53.3% bf16 MFU | 206954 tok/s +step 16945/19560 | loss 3.295157 (-0.03z)| norm 0.2839 (+2.87z)| lr 2.80e-05 | 2533.44 ms | 53.3% bf16 MFU | 206954 tok/s +step 16946/19560 | loss 3.269487 (-0.65z)| norm 0.2616 (+0.95z)| lr 2.80e-05 | 2533.82 ms | 53.3% bf16 MFU | 206952 tok/s +step 16947/19560 | loss 3.250269 (-1.15z)| norm 0.2473 (-0.36z)| lr 2.80e-05 | 2532.59 ms | 53.3% bf16 MFU | 206955 tok/s +step 16948/19560 | loss 3.360892 (+1.60z)| norm 0.2520 (+0.09z)| lr 2.80e-05 | 2534.82 ms | 53.3% bf16 MFU | 206949 tok/s +step 16949/19560 | loss 3.316821 (+0.49z)| norm 0.2593 (+0.78z)| lr 2.80e-05 | 2533.98 ms | 53.3% bf16 MFU | 206947 tok/s +step 16950/19560 | loss 3.237923 (-1.46z)| norm 0.2493 (-0.15z)| lr 2.79e-05 | 2534.07 ms | 53.3% bf16 MFU | 206944 tok/s +step 16951/19560 | loss 3.301363 (+0.13z)| norm 0.2593 (+0.79z)| lr 2.79e-05 | 2533.09 ms | 53.3% bf16 MFU | 206946 tok/s +step 16952/19560 | loss 3.295202 (-0.02z)| norm 0.2542 (+0.31z)| lr 2.79e-05 | 2532.94 ms | 53.3% bf16 MFU | 206948 tok/s +step 16953/19560 | loss 3.302798 (+0.17z)| norm 0.2653 (+1.33z)| lr 2.79e-05 | 2534.95 ms | 53.3% bf16 MFU | 206942 tok/s +step 16954/19560 | loss 3.597295 (+6.30z)| norm 0.3545 (+7.32z)| lr 2.78e-05 | 2533.74 ms | 53.3% bf16 MFU | 206941 tok/s +step 16955/19560 | loss 3.269110 (-0.63z)| norm 0.2495 (-0.17z)| lr 2.78e-05 | 2535.71 ms | 53.2% bf16 MFU | 206932 tok/s +step 16956/19560 | loss 3.307571 (+0.18z)| norm 0.2646 (+0.90z)| lr 2.78e-05 | 2534.00 ms | 53.3% bf16 MFU | 206930 tok/s +step 16957/19560 | loss 3.311231 (+0.26z)| norm 0.2526 (+0.05z)| lr 2.78e-05 | 2533.19 ms | 53.3% bf16 MFU | 206932 tok/s +step 16958/19560 | loss 3.257626 (-0.87z)| norm 0.2553 (+0.24z)| lr 2.78e-05 | 2532.03 ms | 53.3% bf16 MFU | 206939 tok/s +step 16959/19560 | loss 3.297175 (-0.02z)| norm 0.2556 (+0.27z)| lr 2.77e-05 | 2532.35 ms | 53.3% bf16 MFU | 206944 tok/s +step 16960/19560 | loss 3.292361 (-0.13z)| norm 0.2470 (-0.35z)| lr 2.77e-05 | 2533.11 ms | 53.3% bf16 MFU | 206945 tok/s +step 16961/19560 | loss 3.305004 (+0.14z)| norm 0.2621 (+0.73z)| lr 2.77e-05 | 2532.55 ms | 53.3% bf16 MFU | 206949 tok/s +step 16962/19560 | loss 3.337410 (+0.84z)| norm 0.2454 (-0.47z)| lr 2.77e-05 | 2533.08 ms | 53.3% bf16 MFU | 206950 tok/s +step 16963/19560 | loss 3.317540 (+0.41z)| norm 0.2482 (-0.27z)| lr 2.77e-05 | 2533.36 ms | 53.3% bf16 MFU | 206950 tok/s +step 16964/19560 | loss 3.335246 (+0.78z)| norm 0.2451 (-0.49z)| lr 2.76e-05 | 2534.42 ms | 53.3% bf16 MFU | 206946 tok/s +step 16965/19560 | loss 3.310575 (+0.25z)| norm 0.2540 (+0.15z)| lr 2.76e-05 | 2535.10 ms | 53.3% bf16 MFU | 206939 tok/s +step 16966/19560 | loss 3.294240 (-0.09z)| norm 0.2556 (+0.26z)| lr 2.76e-05 | 2533.32 ms | 53.3% bf16 MFU | 206940 tok/s +step 16967/19560 | loss 3.264712 (-0.71z)| norm 0.2568 (+0.34z)| lr 2.76e-05 | 2532.44 ms | 53.3% bf16 MFU | 206945 tok/s +step 16968/19560 | loss 3.349603 (+1.12z)| norm 0.2476 (-0.32z)| lr 2.76e-05 | 2532.05 ms | 53.3% bf16 MFU | 206951 tok/s +step 16969/19560 | loss 3.250426 (-1.01z)| norm 0.2435 (-0.61z)| lr 2.75e-05 | 2532.66 ms | 53.3% bf16 MFU | 206954 tok/s +step 16970/19560 | loss 3.346412 (+1.05z)| norm 0.2388 (-0.94z)| lr 2.75e-05 | 2533.20 ms | 53.3% bf16 MFU | 206954 tok/s +step 16971/19560 | loss 3.300771 (+0.06z)| norm 0.2478 (-0.29z)| lr 2.75e-05 | 2534.02 ms | 53.3% bf16 MFU | 206951 tok/s +step 16972/19560 | loss 3.283192 (-0.31z)| norm 0.2429 (-0.63z)| lr 2.75e-05 | 2532.83 ms | 53.3% bf16 MFU | 206954 tok/s +step 16973/19560 | loss 3.285450 (-0.26z)| norm 0.2520 (+0.02z)| lr 2.74e-05 | 2534.81 ms | 53.3% bf16 MFU | 206948 tok/s +step 16974/19560 | loss 3.284105 (-0.29z)| norm 0.2532 (+0.10z)| lr 2.74e-05 | 2534.01 ms | 53.3% bf16 MFU | 206945 tok/s +step 16975/19560 | loss 3.311728 (+0.30z)| norm 0.2473 (-0.33z)| lr 2.74e-05 | 2535.15 ms | 53.3% bf16 MFU | 206939 tok/s +step 16976/19560 | loss 3.260959 (-0.78z)| norm 0.2516 (-0.03z)| lr 2.74e-05 | 2535.15 ms | 53.3% bf16 MFU | 206932 tok/s +step 16977/19560 | loss 3.188716 (-2.27z)| norm 0.2802 (+2.01z)| lr 2.74e-05 | 2535.64 ms | 53.2% bf16 MFU | 206924 tok/s +step 16978/19560 | loss 3.381380 (+1.75z)| norm 0.2545 (+0.16z)| lr 2.73e-05 | 2533.01 ms | 53.3% bf16 MFU | 206927 tok/s +step 16979/19560 | loss 3.207196 (-1.83z)| norm 0.2768 (+1.73z)| lr 2.73e-05 | 2534.35 ms | 53.3% bf16 MFU | 206924 tok/s +step 16980/19560 | loss 3.355909 (+1.21z)| norm 0.2630 (+0.75z)| lr 2.73e-05 | 2533.75 ms | 53.3% bf16 MFU | 206924 tok/s +step 16981/19560 | loss 3.382899 (+1.72z)| norm 0.2592 (+0.47z)| lr 2.73e-05 | 2533.39 ms | 53.3% bf16 MFU | 206925 tok/s +step 16982/19560 | loss 3.382374 (+1.68z)| norm 0.2580 (+0.38z)| lr 2.73e-05 | 2533.17 ms | 53.3% bf16 MFU | 206927 tok/s +step 16983/19560 | loss 3.301757 (+0.06z)| norm 0.2446 (-0.56z)| lr 2.72e-05 | 2532.82 ms | 53.3% bf16 MFU | 206931 tok/s +step 16984/19560 | loss 3.319960 (+0.43z)| norm 0.2417 (-0.76z)| lr 2.72e-05 | 2531.75 ms | 53.3% bf16 MFU | 206939 tok/s +step 16985/19560 | loss 3.276354 (-0.44z)| norm 0.2445 (-0.56z)| lr 2.72e-05 | 2531.71 ms | 53.3% bf16 MFU | 206946 tok/s +step 16986/19560 | loss 3.356789 (+1.16z)| norm 0.2631 (+0.73z)| lr 2.72e-05 | 2534.12 ms | 53.3% bf16 MFU | 206943 tok/s +step 16987/19560 | loss 3.271598 (-0.54z)| norm 0.2503 (-0.16z)| lr 2.72e-05 | 2531.14 ms | 53.3% bf16 MFU | 206953 tok/s +step 16988/19560 | loss 3.268618 (-0.60z)| norm 0.2514 (-0.09z)| lr 2.71e-05 | 2534.26 ms | 53.3% bf16 MFU | 206949 tok/s +step 16989/19560 | loss 3.246956 (-1.02z)| norm 0.2466 (-0.43z)| lr 2.71e-05 | 2534.44 ms | 53.3% bf16 MFU | 206945 tok/s +step 16990/19560 | loss 3.289112 (-0.19z)| norm 0.2453 (-0.52z)| lr 2.71e-05 | 2532.29 ms | 53.3% bf16 MFU | 206950 tok/s +step 16991/19560 | loss 3.296489 (-0.05z)| norm 0.2445 (-0.59z)| lr 2.71e-05 | 2535.14 ms | 53.3% bf16 MFU | 206943 tok/s +step 16992/19560 | loss 3.405782 (+2.09z)| norm 0.2672 (+1.00z)| lr 2.71e-05 | 2533.49 ms | 53.3% bf16 MFU | 206943 tok/s +step 16993/19560 | loss 3.281254 (-0.35z)| norm 0.2443 (-0.62z)| lr 2.70e-05 | 2535.83 ms | 53.2% bf16 MFU | 206933 tok/s +step 16994/19560 | loss 3.299256 (-0.00z)| norm 0.2564 (+0.25z)| lr 2.70e-05 | 2533.27 ms | 53.3% bf16 MFU | 206935 tok/s +step 16995/19560 | loss 3.342496 (+0.84z)| norm 0.2464 (-0.46z)| lr 2.70e-05 | 2533.81 ms | 53.3% bf16 MFU | 206934 tok/s +step 16996/19560 | loss 3.242825 (-1.12z)| norm 0.3432 (+5.58z)| lr 2.70e-05 | 2533.02 ms | 53.3% bf16 MFU | 206936 tok/s +step 16997/19560 | loss 3.311470 (+0.23z)| norm 0.2790 (+1.55z)| lr 2.69e-05 | 2532.92 ms | 53.3% bf16 MFU | 206939 tok/s +step 16998/19560 | loss 3.330297 (+0.59z)| norm 0.2685 (+0.89z)| lr 2.69e-05 | 2533.67 ms | 53.3% bf16 MFU | 206938 tok/s +step 16999/19560 | loss 3.303545 (+0.06z)| norm 0.2468 (-0.45z)| lr 2.69e-05 | 2533.44 ms | 53.3% bf16 MFU | 206939 tok/s +step 17000/19560 | loss 3.341522 (+0.82z)| norm 0.2707 (+1.01z)| lr 2.69e-05 | 2532.75 ms | 53.3% bf16 MFU | 206942 tok/s +val loss 3.293225 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3031/10042 = 0.301832 +step 17001/19560 | loss 3.327417 (+0.56z)| norm 0.2527 (-0.09z)| lr 2.69e-05 | 2534.81 ms | 53.3% bf16 MFU | 206937 tok/s +step 17002/19560 | loss 3.310369 (+0.21z)| norm 0.2842 (+1.81z)| lr 2.68e-05 | 2531.44 ms | 53.3% bf16 MFU | 206945 tok/s +step 17003/19560 | loss 3.243635 (-1.13z)| norm 0.2482 (-0.39z)| lr 2.68e-05 | 2535.67 ms | 53.2% bf16 MFU | 206936 tok/s +step 17004/19560 | loss 3.319814 (+0.40z)| norm 0.2418 (-0.79z)| lr 2.68e-05 | 2532.88 ms | 53.3% bf16 MFU | 206939 tok/s +step 17005/19560 | loss 3.324815 (+0.49z)| norm 0.2460 (-0.52z)| lr 2.68e-05 | 2532.32 ms | 53.3% bf16 MFU | 206944 tok/s +step 17006/19560 | loss 3.292069 (-0.16z)| norm 0.2438 (-0.66z)| lr 2.68e-05 | 2533.95 ms | 53.3% bf16 MFU | 206942 tok/s +step 17007/19560 | loss 3.441948 (+2.77z)| norm 0.2732 (+1.13z)| lr 2.67e-05 | 2529.65 ms | 53.4% bf16 MFU | 206958 tok/s +step 17008/19560 | loss 3.305754 (+0.09z)| norm 0.2498 (-0.30z)| lr 2.67e-05 | 2531.41 ms | 53.3% bf16 MFU | 206966 tok/s +step 17009/19560 | loss 3.304986 (+0.07z)| norm 0.2538 (-0.06z)| lr 2.67e-05 | 2533.47 ms | 53.3% bf16 MFU | 206965 tok/s +step 17010/19560 | loss 3.233924 (-1.31z)| norm 0.2560 (+0.07z)| lr 2.67e-05 | 2532.90 ms | 53.3% bf16 MFU | 206966 tok/s +step 17011/19560 | loss 3.309109 (+0.16z)| norm 0.2417 (-0.81z)| lr 2.67e-05 | 2533.05 ms | 53.3% bf16 MFU | 206967 tok/s +step 17012/19560 | loss 3.291016 (-0.20z)| norm 0.2514 (-0.21z)| lr 2.66e-05 | 2533.17 ms | 53.3% bf16 MFU | 206967 tok/s +step 17013/19560 | loss 3.300588 (-0.02z)| norm 0.2632 (+0.51z)| lr 2.66e-05 | 2532.51 ms | 53.3% bf16 MFU | 206970 tok/s +step 17014/19560 | loss 3.304949 (+0.07z)| norm 0.2487 (-0.37z)| lr 2.66e-05 | 2533.09 ms | 53.3% bf16 MFU | 206970 tok/s +step 17015/19560 | loss 3.237787 (-1.24z)| norm 0.2411 (-0.83z)| lr 2.66e-05 | 2530.94 ms | 53.3% bf16 MFU | 206979 tok/s +step 17016/19560 | loss 3.221425 (-1.55z)| norm 0.2598 (+0.30z)| lr 2.66e-05 | 2533.17 ms | 53.3% bf16 MFU | 206978 tok/s +step 17017/19560 | loss 3.364441 (+1.23z)| norm 0.2574 (+0.17z)| lr 2.65e-05 | 2531.54 ms | 53.3% bf16 MFU | 206985 tok/s +step 17018/19560 | loss 3.299007 (-0.05z)| norm 0.2419 (-0.79z)| lr 2.65e-05 | 2533.72 ms | 53.3% bf16 MFU | 206982 tok/s +step 17019/19560 | loss 3.238041 (-1.23z)| norm 0.2500 (-0.29z)| lr 2.65e-05 | 2533.25 ms | 53.3% bf16 MFU | 206981 tok/s +step 17020/19560 | loss 3.256827 (-0.86z)| norm 0.2585 (+0.23z)| lr 2.65e-05 | 2532.64 ms | 53.3% bf16 MFU | 206982 tok/s +step 17021/19560 | loss 3.294441 (-0.13z)| norm 0.2431 (-0.71z)| lr 2.65e-05 | 2535.03 ms | 53.3% bf16 MFU | 206974 tok/s +step 17022/19560 | loss 3.249397 (-1.00z)| norm 0.2471 (-0.45z)| lr 2.64e-05 | 2532.95 ms | 53.3% bf16 MFU | 206975 tok/s +step 17023/19560 | loss 3.256255 (-0.85z)| norm 0.2598 (+0.35z)| lr 2.64e-05 | 2533.12 ms | 53.3% bf16 MFU | 206974 tok/s +step 17024/19560 | loss 3.279891 (-0.40z)| norm 0.2681 (+0.87z)| lr 2.64e-05 | 2534.77 ms | 53.3% bf16 MFU | 206968 tok/s +step 17025/19560 | loss 3.256251 (-0.85z)| norm 0.2554 (+0.09z)| lr 2.64e-05 | 2534.53 ms | 53.3% bf16 MFU | 206962 tok/s +step 17026/19560 | loss 3.588833 (+5.01z)| norm 0.2769 (+1.42z)| lr 2.64e-05 | 2534.41 ms | 53.3% bf16 MFU | 206957 tok/s +step 17027/19560 | loss 3.335439 (+0.57z)| norm 0.2786 (+1.51z)| lr 2.63e-05 | 2535.83 ms | 53.2% bf16 MFU | 206947 tok/s +step 17028/19560 | loss 3.312275 (+0.15z)| norm 0.3346 (+4.55z)| lr 2.63e-05 | 2532.06 ms | 53.3% bf16 MFU | 206953 tok/s +step 17029/19560 | loss 3.321909 (+0.33z)| norm 0.2562 (+0.07z)| lr 2.63e-05 | 2533.42 ms | 53.3% bf16 MFU | 206953 tok/s +step 17030/19560 | loss 3.291044 (-0.23z)| norm 0.2667 (+0.66z)| lr 2.63e-05 | 2533.14 ms | 53.3% bf16 MFU | 206954 tok/s +step 17031/19560 | loss 3.287732 (-0.29z)| norm 0.2688 (+0.78z)| lr 2.62e-05 | 2531.40 ms | 53.3% bf16 MFU | 206962 tok/s +step 17032/19560 | loss 3.316848 (+0.22z)| norm 0.2537 (-0.09z)| lr 2.62e-05 | 2534.98 ms | 53.3% bf16 MFU | 206955 tok/s +step 17033/19560 | loss 3.365742 (+1.08z)| norm 0.2549 (-0.02z)| lr 2.62e-05 | 2533.12 ms | 53.3% bf16 MFU | 206955 tok/s +step 17034/19560 | loss 3.318918 (+0.25z)| norm 0.2702 (+0.85z)| lr 2.62e-05 | 2534.49 ms | 53.3% bf16 MFU | 206951 tok/s +step 17035/19560 | loss 3.258156 (-0.82z)| norm 0.2495 (-0.33z)| lr 2.62e-05 | 2534.36 ms | 53.3% bf16 MFU | 206947 tok/s +step 17036/19560 | loss 3.249075 (-0.96z)| norm 0.2499 (-0.32z)| lr 2.61e-05 | 2535.67 ms | 53.2% bf16 MFU | 206938 tok/s +step 17037/19560 | loss 3.291323 (-0.23z)| norm 0.2556 (+0.01z)| lr 2.61e-05 | 2533.27 ms | 53.3% bf16 MFU | 206939 tok/s +step 17038/19560 | loss 3.323568 (+0.33z)| norm 0.2520 (-0.20z)| lr 2.61e-05 | 2534.12 ms | 53.3% bf16 MFU | 206937 tok/s +step 17039/19560 | loss 3.302271 (-0.04z)| norm 0.2565 (+0.06z)| lr 2.61e-05 | 2533.89 ms | 53.3% bf16 MFU | 206935 tok/s +step 17040/19560 | loss 3.355778 (+0.89z)| norm 0.2600 (+0.25z)| lr 2.61e-05 | 2533.31 ms | 53.3% bf16 MFU | 206936 tok/s +step 17041/19560 | loss 3.321250 (+0.28z)| norm 0.2396 (-0.93z)| lr 2.60e-05 | 2532.44 ms | 53.3% bf16 MFU | 206941 tok/s +step 17042/19560 | loss 3.277663 (-0.49z)| norm 0.2439 (-0.67z)| lr 2.60e-05 | 2531.96 ms | 53.3% bf16 MFU | 206947 tok/s +step 17043/19560 | loss 3.330740 (+0.44z)| norm 0.2454 (-0.58z)| lr 2.60e-05 | 2533.92 ms | 53.3% bf16 MFU | 206945 tok/s +step 17044/19560 | loss 3.299499 (-0.10z)| norm 0.2422 (-0.76z)| lr 2.60e-05 | 2533.15 ms | 53.3% bf16 MFU | 206947 tok/s +step 17045/19560 | loss 3.333670 (+0.49z)| norm 0.2544 (-0.06z)| lr 2.60e-05 | 2531.00 ms | 53.3% bf16 MFU | 206957 tok/s +step 17046/19560 | loss 3.290550 (-0.28z)| norm 0.2582 (+0.16z)| lr 2.59e-05 | 2532.60 ms | 53.3% bf16 MFU | 206960 tok/s +step 17047/19560 | loss 3.298673 (-0.14z)| norm 0.2416 (-0.79z)| lr 2.59e-05 | 2531.38 ms | 53.3% bf16 MFU | 206967 tok/s +step 17048/19560 | loss 3.296856 (-0.17z)| norm 0.2451 (-0.58z)| lr 2.59e-05 | 2533.95 ms | 53.3% bf16 MFU | 206964 tok/s +step 17049/19560 | loss 3.454108 (+2.54z)| norm 0.2584 (+0.17z)| lr 2.59e-05 | 2532.75 ms | 53.3% bf16 MFU | 206966 tok/s +step 17050/19560 | loss 3.311705 (+0.06z)| norm 0.2409 (-0.84z)| lr 2.59e-05 | 2532.62 ms | 53.3% bf16 MFU | 206969 tok/s +step 17051/19560 | loss 3.220607 (-1.49z)| norm 0.2425 (-0.74z)| lr 2.58e-05 | 2531.39 ms | 53.3% bf16 MFU | 206976 tok/s +step 17052/19560 | loss 3.289091 (-0.31z)| norm 0.2579 (+0.14z)| lr 2.58e-05 | 2531.91 ms | 53.3% bf16 MFU | 206981 tok/s +step 17053/19560 | loss 3.220694 (-1.48z)| norm 0.2534 (-0.12z)| lr 2.58e-05 | 2532.75 ms | 53.3% bf16 MFU | 206982 tok/s +step 17054/19560 | loss 3.274308 (-0.55z)| norm 0.2408 (-0.85z)| lr 2.58e-05 | 2533.49 ms | 53.3% bf16 MFU | 206980 tok/s +step 17055/19560 | loss 3.438827 (+2.24z)| norm 0.2701 (+0.83z)| lr 2.58e-05 | 2532.43 ms | 53.3% bf16 MFU | 206982 tok/s +step 17056/19560 | loss 3.250329 (-0.97z)| norm 0.2530 (-0.17z)| lr 2.57e-05 | 2532.47 ms | 53.3% bf16 MFU | 206985 tok/s +step 17057/19560 | loss 3.217476 (-1.51z)| norm 0.2588 (+0.17z)| lr 2.57e-05 | 2533.72 ms | 53.3% bf16 MFU | 206982 tok/s +step 17058/19560 | loss 3.294192 (-0.21z)| norm 0.2615 (+0.31z)| lr 2.57e-05 | 2533.05 ms | 53.3% bf16 MFU | 206981 tok/s +step 17059/19560 | loss 3.281796 (-0.40z)| norm 0.2533 (-0.16z)| lr 2.57e-05 | 2533.37 ms | 53.3% bf16 MFU | 206980 tok/s +step 17060/19560 | loss 3.279050 (-0.44z)| norm 0.2503 (-0.34z)| lr 2.57e-05 | 2532.23 ms | 53.3% bf16 MFU | 206983 tok/s +step 17061/19560 | loss 3.249050 (-0.94z)| norm 0.2590 (+0.17z)| lr 2.56e-05 | 2532.23 ms | 53.3% bf16 MFU | 206986 tok/s +step 17062/19560 | loss 3.277661 (-0.46z)| norm 0.2390 (-1.00z)| lr 2.56e-05 | 2532.63 ms | 53.3% bf16 MFU | 206988 tok/s +step 17063/19560 | loss 3.299286 (-0.10z)| norm 0.2427 (-0.79z)| lr 2.56e-05 | 2534.97 ms | 53.3% bf16 MFU | 206979 tok/s +step 17064/19560 | loss 3.262844 (-0.71z)| norm 0.2409 (-0.90z)| lr 2.56e-05 | 2531.52 ms | 53.3% bf16 MFU | 206986 tok/s +step 17065/19560 | loss 3.278384 (-0.44z)| norm 0.2420 (-0.83z)| lr 2.56e-05 | 2533.74 ms | 53.3% bf16 MFU | 206982 tok/s +step 17066/19560 | loss 3.276577 (-0.47z)| norm 0.2554 (-0.04z)| lr 2.55e-05 | 2535.26 ms | 53.3% bf16 MFU | 206973 tok/s +step 17067/19560 | loss 3.410673 (+1.83z)| norm 0.2623 (+0.35z)| lr 2.55e-05 | 2533.08 ms | 53.3% bf16 MFU | 206973 tok/s +step 17068/19560 | loss 3.238023 (-1.11z)| norm 0.2500 (-0.37z)| lr 2.55e-05 | 2532.65 ms | 53.3% bf16 MFU | 206975 tok/s +step 17069/19560 | loss 3.315902 (+0.22z)| norm 0.2441 (-0.71z)| lr 2.55e-05 | 2531.28 ms | 53.3% bf16 MFU | 206983 tok/s +step 17070/19560 | loss 3.316862 (+0.23z)| norm 0.2415 (-0.85z)| lr 2.55e-05 | 2533.02 ms | 53.3% bf16 MFU | 206983 tok/s +step 17071/19560 | loss 3.307416 (+0.06z)| norm 0.2507 (-0.30z)| lr 2.54e-05 | 2534.59 ms | 53.3% bf16 MFU | 206976 tok/s +step 17072/19560 | loss 3.346002 (+0.72z)| norm 0.2465 (-0.55z)| lr 2.54e-05 | 2533.24 ms | 53.3% bf16 MFU | 206976 tok/s +step 17073/19560 | loss 3.192500 (-1.87z)| norm 0.2609 (+0.31z)| lr 2.54e-05 | 2532.86 ms | 53.3% bf16 MFU | 206977 tok/s +step 17074/19560 | loss 3.306532 (+0.05z)| norm 0.2449 (-0.64z)| lr 2.54e-05 | 2532.16 ms | 53.3% bf16 MFU | 206980 tok/s +step 17075/19560 | loss 3.288291 (-0.26z)| norm 0.2541 (-0.09z)| lr 2.54e-05 | 2533.27 ms | 53.3% bf16 MFU | 206979 tok/s +step 17076/19560 | loss 3.288233 (-0.26z)| norm 0.2767 (+1.23z)| lr 2.53e-05 | 2534.16 ms | 53.3% bf16 MFU | 206975 tok/s +step 17077/19560 | loss 3.304169 (+0.02z)| norm 0.2550 (-0.05z)| lr 2.53e-05 | 2532.91 ms | 53.3% bf16 MFU | 206976 tok/s +step 17078/19560 | loss 3.285626 (-0.31z)| norm 0.2464 (-0.56z)| lr 2.53e-05 | 2533.21 ms | 53.3% bf16 MFU | 206975 tok/s +step 17079/19560 | loss 3.278155 (-0.43z)| norm 0.2604 (+0.27z)| lr 2.53e-05 | 2532.58 ms | 53.3% bf16 MFU | 206977 tok/s +step 17080/19560 | loss 3.314661 (+0.19z)| norm 0.2398 (-0.93z)| lr 2.53e-05 | 2534.31 ms | 53.3% bf16 MFU | 206972 tok/s +step 17081/19560 | loss 3.293308 (-0.17z)| norm 0.2484 (-0.42z)| lr 2.52e-05 | 2534.75 ms | 53.3% bf16 MFU | 206966 tok/s +step 17082/19560 | loss 3.242970 (-1.10z)| norm 0.2435 (-0.76z)| lr 2.52e-05 | 2534.70 ms | 53.3% bf16 MFU | 206959 tok/s +step 17083/19560 | loss 3.311828 (+0.20z)| norm 0.2422 (-0.85z)| lr 2.52e-05 | 2533.29 ms | 53.3% bf16 MFU | 206959 tok/s +step 17084/19560 | loss 3.234221 (-1.25z)| norm 0.2778 (+1.56z)| lr 2.52e-05 | 2533.66 ms | 53.3% bf16 MFU | 206958 tok/s +step 17085/19560 | loss 3.298814 (-0.03z)| norm 0.2375 (-1.15z)| lr 2.52e-05 | 2534.26 ms | 53.3% bf16 MFU | 206954 tok/s +step 17086/19560 | loss 3.275156 (-0.48z)| norm 0.2582 (+0.24z)| lr 2.51e-05 | 2533.34 ms | 53.3% bf16 MFU | 206954 tok/s +step 17087/19560 | loss 3.233588 (-1.25z)| norm 0.2518 (-0.19z)| lr 2.51e-05 | 2534.18 ms | 53.3% bf16 MFU | 206951 tok/s +step 17088/19560 | loss 3.280732 (-0.36z)| norm 0.2400 (-0.98z)| lr 2.51e-05 | 2534.26 ms | 53.3% bf16 MFU | 206947 tok/s +step 17089/19560 | loss 3.341381 (+0.77z)| norm 0.2546 (+0.01z)| lr 2.51e-05 | 2532.54 ms | 53.3% bf16 MFU | 206951 tok/s +step 17090/19560 | loss 3.344996 (+0.84z)| norm 0.2780 (+1.55z)| lr 2.51e-05 | 2533.62 ms | 53.3% bf16 MFU | 206950 tok/s +step 17091/19560 | loss 3.314058 (+0.26z)| norm 0.2529 (-0.13z)| lr 2.50e-05 | 2532.50 ms | 53.3% bf16 MFU | 206954 tok/s +step 17092/19560 | loss 3.240550 (-1.10z)| norm 0.2423 (-0.83z)| lr 2.50e-05 | 2533.15 ms | 53.3% bf16 MFU | 206954 tok/s +step 17093/19560 | loss 3.312689 (+0.25z)| norm 0.2524 (-0.16z)| lr 2.50e-05 | 2534.47 ms | 53.3% bf16 MFU | 206950 tok/s +step 17094/19560 | loss 3.272174 (-0.51z)| norm 0.2570 (+0.15z)| lr 2.50e-05 | 2533.72 ms | 53.3% bf16 MFU | 206949 tok/s +step 17095/19560 | loss 3.298373 (-0.02z)| norm 0.2536 (-0.08z)| lr 2.50e-05 | 2535.63 ms | 53.2% bf16 MFU | 206940 tok/s +step 17096/19560 | loss 3.318117 (+0.35z)| norm 0.2482 (-0.44z)| lr 2.49e-05 | 2532.04 ms | 53.3% bf16 MFU | 206946 tok/s +step 17097/19560 | loss 3.284034 (-0.29z)| norm 0.2454 (-0.62z)| lr 2.49e-05 | 2532.90 ms | 53.3% bf16 MFU | 206948 tok/s +step 17098/19560 | loss 3.279781 (-0.37z)| norm 0.2686 (+0.91z)| lr 2.49e-05 | 2533.54 ms | 53.3% bf16 MFU | 206947 tok/s +step 17099/19560 | loss 3.262485 (-0.68z)| norm 0.2777 (+1.49z)| lr 2.49e-05 | 2532.06 ms | 53.3% bf16 MFU | 206953 tok/s +step 17100/19560 | loss 3.265069 (-0.63z)| norm 0.2338 (-1.41z)| lr 2.49e-05 | 2534.93 ms | 53.3% bf16 MFU | 206947 tok/s +step 17101/19560 | loss 3.353736 (+1.02z)| norm 0.2437 (-0.75z)| lr 2.48e-05 | 2531.88 ms | 53.3% bf16 MFU | 206953 tok/s +step 17102/19560 | loss 3.226507 (-1.34z)| norm 0.2701 (+0.98z)| lr 2.48e-05 | 2534.02 ms | 53.3% bf16 MFU | 206950 tok/s +step 17103/19560 | loss 3.304453 (+0.11z)| norm 0.2901 (+2.22z)| lr 2.48e-05 | 2533.47 ms | 53.3% bf16 MFU | 206950 tok/s +step 17104/19560 | loss 3.350407 (+0.95z)| norm 0.2515 (-0.26z)| lr 2.48e-05 | 2535.35 ms | 53.3% bf16 MFU | 206942 tok/s +step 17105/19560 | loss 3.314649 (+0.27z)| norm 0.2458 (-0.62z)| lr 2.48e-05 | 2536.78 ms | 53.2% bf16 MFU | 206929 tok/s +step 17106/19560 | loss 3.313704 (+0.26z)| norm 0.2475 (-0.50z)| lr 2.47e-05 | 2532.81 ms | 53.3% bf16 MFU | 206932 tok/s +step 17107/19560 | loss 3.315840 (+0.29z)| norm 0.2778 (+1.46z)| lr 2.47e-05 | 2532.89 ms | 53.3% bf16 MFU | 206935 tok/s +step 17108/19560 | loss 3.294227 (-0.12z)| norm 0.2492 (-0.38z)| lr 2.47e-05 | 2535.65 ms | 53.2% bf16 MFU | 206927 tok/s +step 17109/19560 | loss 3.338274 (+0.75z)| norm 0.2549 (-0.01z)| lr 2.47e-05 | 2534.39 ms | 53.3% bf16 MFU | 206924 tok/s +step 17110/19560 | loss 3.295535 (-0.07z)| norm 0.2445 (-0.68z)| lr 2.47e-05 | 2534.90 ms | 53.3% bf16 MFU | 206919 tok/s +step 17111/19560 | loss 3.289569 (-0.19z)| norm 0.2504 (-0.30z)| lr 2.46e-05 | 2532.78 ms | 53.3% bf16 MFU | 206923 tok/s +step 17112/19560 | loss 3.298400 (-0.01z)| norm 0.2517 (-0.22z)| lr 2.46e-05 | 2533.88 ms | 53.3% bf16 MFU | 206923 tok/s +step 17113/19560 | loss 3.324197 (+0.49z)| norm 0.2379 (-1.11z)| lr 2.46e-05 | 2533.61 ms | 53.3% bf16 MFU | 206923 tok/s +step 17114/19560 | loss 3.329747 (+0.61z)| norm 0.2541 (-0.06z)| lr 2.46e-05 | 2533.65 ms | 53.3% bf16 MFU | 206924 tok/s +step 17115/19560 | loss 3.238365 (-1.19z)| norm 0.2462 (-0.57z)| lr 2.46e-05 | 2534.24 ms | 53.3% bf16 MFU | 206921 tok/s +step 17116/19560 | loss 3.321872 (+0.45z)| norm 0.2385 (-1.06z)| lr 2.45e-05 | 2532.33 ms | 53.3% bf16 MFU | 206927 tok/s +step 17117/19560 | loss 3.259371 (-0.79z)| norm 0.2412 (-0.88z)| lr 2.45e-05 | 2531.92 ms | 53.3% bf16 MFU | 206934 tok/s +step 17118/19560 | loss 3.211066 (-1.71z)| norm 0.2440 (-0.70z)| lr 2.45e-05 | 2530.79 ms | 53.3% bf16 MFU | 206946 tok/s +step 17119/19560 | loss 3.287463 (-0.22z)| norm 0.2595 (+0.30z)| lr 2.45e-05 | 2534.49 ms | 53.3% bf16 MFU | 206942 tok/s +step 17120/19560 | loss 3.268273 (-0.58z)| norm 0.2395 (-0.98z)| lr 2.45e-05 | 2533.09 ms | 53.3% bf16 MFU | 206943 tok/s +step 17121/19560 | loss 3.263242 (-0.68z)| norm 0.2427 (-0.77z)| lr 2.44e-05 | 2531.79 ms | 53.3% bf16 MFU | 206950 tok/s +step 17122/19560 | loss 3.231439 (-1.29z)| norm 0.2497 (-0.32z)| lr 2.44e-05 | 2533.64 ms | 53.3% bf16 MFU | 206949 tok/s +step 17123/19560 | loss 3.270669 (-0.51z)| norm 0.2454 (-0.59z)| lr 2.44e-05 | 2534.79 ms | 53.3% bf16 MFU | 206944 tok/s +step 17124/19560 | loss 3.267182 (-0.58z)| norm 0.2368 (-1.26z)| lr 2.44e-05 | 2533.48 ms | 53.3% bf16 MFU | 206944 tok/s +step 17125/19560 | loss 3.311507 (+0.30z)| norm 0.2511 (-0.19z)| lr 2.44e-05 | 2534.00 ms | 53.3% bf16 MFU | 206942 tok/s +step 17126/19560 | loss 3.323091 (+0.53z)| norm 0.2604 (+0.52z)| lr 2.43e-05 | 2534.20 ms | 53.3% bf16 MFU | 206939 tok/s +step 17127/19560 | loss 3.299736 (+0.06z)| norm 0.2409 (-0.95z)| lr 2.43e-05 | 2535.30 ms | 53.3% bf16 MFU | 206932 tok/s +step 17128/19560 | loss 3.288119 (-0.16z)| norm 0.2495 (-0.29z)| lr 2.43e-05 | 2532.79 ms | 53.3% bf16 MFU | 206935 tok/s +step 17129/19560 | loss 3.173869 (-2.36z)| norm 0.2582 (+0.36z)| lr 2.43e-05 | 2534.22 ms | 53.3% bf16 MFU | 206932 tok/s +step 17130/19560 | loss 3.295942 (+0.02z)| norm 0.2508 (-0.18z)| lr 2.43e-05 | 2534.32 ms | 53.3% bf16 MFU | 206930 tok/s +step 17131/19560 | loss 3.268082 (-0.52z)| norm 0.2364 (-1.27z)| lr 2.42e-05 | 2534.36 ms | 53.3% bf16 MFU | 206927 tok/s +step 17132/19560 | loss 3.259768 (-0.68z)| norm 0.2503 (-0.21z)| lr 2.42e-05 | 2535.04 ms | 53.3% bf16 MFU | 206921 tok/s +step 17133/19560 | loss 3.237119 (-1.10z)| norm 0.2452 (-0.60z)| lr 2.42e-05 | 2535.16 ms | 53.3% bf16 MFU | 206915 tok/s +step 17134/19560 | loss 3.195090 (-1.88z)| norm 0.2475 (-0.43z)| lr 2.42e-05 | 2533.40 ms | 53.3% bf16 MFU | 206917 tok/s +step 17135/19560 | loss 3.279156 (-0.25z)| norm 0.2641 (+0.86z)| lr 2.42e-05 | 2532.55 ms | 53.3% bf16 MFU | 206922 tok/s +step 17136/19560 | loss 3.356509 (+1.27z)| norm 0.2722 (+1.46z)| lr 2.41e-05 | 2530.98 ms | 53.3% bf16 MFU | 206934 tok/s +step 17137/19560 | loss 3.286071 (-0.12z)| norm 0.2420 (-0.85z)| lr 2.41e-05 | 2533.25 ms | 53.3% bf16 MFU | 206935 tok/s +step 17138/19560 | loss 3.250782 (-0.82z)| norm 0.2474 (-0.43z)| lr 2.41e-05 | 2533.28 ms | 53.3% bf16 MFU | 206936 tok/s +step 17139/19560 | loss 3.313317 (+0.42z)| norm 0.2642 (+0.84z)| lr 2.41e-05 | 2533.16 ms | 53.3% bf16 MFU | 206938 tok/s +step 17140/19560 | loss 3.290853 (-0.03z)| norm 0.2478 (-0.41z)| lr 2.41e-05 | 2532.02 ms | 53.3% bf16 MFU | 206944 tok/s +step 17141/19560 | loss 3.321455 (+0.57z)| norm 0.2485 (-0.35z)| lr 2.40e-05 | 2533.95 ms | 53.3% bf16 MFU | 206942 tok/s +step 17142/19560 | loss 3.251053 (-0.80z)| norm 0.2362 (-1.28z)| lr 2.40e-05 | 2532.47 ms | 53.3% bf16 MFU | 206946 tok/s +step 17143/19560 | loss 3.282691 (-0.19z)| norm 0.2439 (-0.69z)| lr 2.40e-05 | 2534.02 ms | 53.3% bf16 MFU | 206944 tok/s +step 17144/19560 | loss 3.292813 (-0.00z)| norm 0.2617 (+0.67z)| lr 2.40e-05 | 2534.05 ms | 53.3% bf16 MFU | 206942 tok/s +step 17145/19560 | loss 3.322994 (+0.61z)| norm 0.2470 (-0.45z)| lr 2.40e-05 | 2533.38 ms | 53.3% bf16 MFU | 206942 tok/s +step 17146/19560 | loss 3.201724 (-1.78z)| norm 0.2569 (+0.30z)| lr 2.39e-05 | 2531.25 ms | 53.3% bf16 MFU | 206951 tok/s +step 17147/19560 | loss 3.253040 (-0.77z)| norm 0.2496 (-0.26z)| lr 2.39e-05 | 2533.32 ms | 53.3% bf16 MFU | 206952 tok/s +step 17148/19560 | loss 3.241566 (-0.99z)| norm 0.2504 (-0.19z)| lr 2.39e-05 | 2533.31 ms | 53.3% bf16 MFU | 206952 tok/s +step 17149/19560 | loss 3.262336 (-0.58z)| norm 0.2564 (+0.25z)| lr 2.39e-05 | 2533.58 ms | 53.3% bf16 MFU | 206951 tok/s +step 17150/19560 | loss 3.321098 (+0.58z)| norm 0.2519 (-0.09z)| lr 2.39e-05 | 2534.44 ms | 53.3% bf16 MFU | 206947 tok/s +step 17151/19560 | loss 3.239714 (-1.03z)| norm 0.2569 (+0.29z)| lr 2.39e-05 | 2532.65 ms | 53.3% bf16 MFU | 206950 tok/s +step 17152/19560 | loss 3.306084 (+0.28z)| norm 0.2436 (-0.72z)| lr 2.38e-05 | 2536.74 ms | 53.2% bf16 MFU | 206937 tok/s +step 17153/19560 | loss 3.224885 (-1.32z)| norm 0.2455 (-0.56z)| lr 2.38e-05 | 2533.94 ms | 53.3% bf16 MFU | 206935 tok/s +step 17154/19560 | loss 3.339626 (+1.14z)| norm 0.2564 (+0.29z)| lr 2.38e-05 | 2535.07 ms | 53.3% bf16 MFU | 206929 tok/s +step 17155/19560 | loss 3.290447 (+0.02z)| norm 0.2537 (+0.10z)| lr 2.38e-05 | 2533.89 ms | 53.3% bf16 MFU | 206928 tok/s +step 17156/19560 | loss 3.286380 (-0.07z)| norm 0.2674 (+1.49z)| lr 2.38e-05 | 2533.74 ms | 53.3% bf16 MFU | 206928 tok/s +step 17157/19560 | loss 3.370156 (+1.83z)| norm 0.2597 (+0.75z)| lr 2.37e-05 | 2533.53 ms | 53.3% bf16 MFU | 206928 tok/s +step 17158/19560 | loss 3.235343 (-1.22z)| norm 0.2468 (-0.49z)| lr 2.37e-05 | 2533.46 ms | 53.3% bf16 MFU | 206929 tok/s +step 17159/19560 | loss 3.260454 (-0.65z)| norm 0.2458 (-0.57z)| lr 2.37e-05 | 2535.34 ms | 53.3% bf16 MFU | 206922 tok/s +step 17160/19560 | loss 3.305599 (+0.37z)| norm 0.2520 (+0.04z)| lr 2.37e-05 | 2534.20 ms | 53.3% bf16 MFU | 206920 tok/s +step 17161/19560 | loss 3.263203 (-0.57z)| norm 0.2651 (+1.30z)| lr 2.37e-05 | 2535.93 ms | 53.2% bf16 MFU | 206912 tok/s +step 17162/19560 | loss 3.207275 (-1.81z)| norm 0.2512 (-0.03z)| lr 2.36e-05 | 2535.05 ms | 53.3% bf16 MFU | 206907 tok/s +step 17163/19560 | loss 3.246874 (-0.91z)| norm 0.2427 (-0.87z)| lr 2.36e-05 | 2536.45 ms | 53.2% bf16 MFU | 206897 tok/s +step 17164/19560 | loss 3.324343 (+0.82z)| norm 0.2511 (-0.04z)| lr 2.36e-05 | 2535.69 ms | 53.2% bf16 MFU | 206890 tok/s +step 17165/19560 | loss 3.267598 (-0.45z)| norm 0.2476 (-0.38z)| lr 2.36e-05 | 2536.21 ms | 53.2% bf16 MFU | 206881 tok/s +step 17166/19560 | loss 3.230904 (-1.26z)| norm 0.2413 (-0.99z)| lr 2.36e-05 | 2535.57 ms | 53.2% bf16 MFU | 206876 tok/s +step 17167/19560 | loss 3.314502 (+0.62z)| norm 0.2470 (-0.42z)| lr 2.35e-05 | 2533.77 ms | 53.3% bf16 MFU | 206878 tok/s +step 17168/19560 | loss 3.295551 (+0.20z)| norm 0.2560 (+0.47z)| lr 2.35e-05 | 2535.30 ms | 53.3% bf16 MFU | 206874 tok/s +step 17169/19560 | loss 3.232815 (-1.20z)| norm 0.2822 (+2.91z)| lr 2.35e-05 | 2534.14 ms | 53.3% bf16 MFU | 206875 tok/s +step 17170/19560 | loss 3.247100 (-0.87z)| norm 0.2483 (-0.32z)| lr 2.35e-05 | 2535.64 ms | 53.2% bf16 MFU | 206870 tok/s +step 17171/19560 | loss 3.280465 (-0.11z)| norm 0.2453 (-0.61z)| lr 2.35e-05 | 2531.21 ms | 53.3% bf16 MFU | 206882 tok/s +step 17172/19560 | loss 3.308242 (+0.52z)| norm 0.2382 (-1.28z)| lr 2.34e-05 | 2534.69 ms | 53.3% bf16 MFU | 206881 tok/s +step 17173/19560 | loss 3.304229 (+0.43z)| norm 0.2409 (-1.00z)| lr 2.34e-05 | 2532.95 ms | 53.3% bf16 MFU | 206886 tok/s +step 17174/19560 | loss 3.295307 (+0.23z)| norm 0.2402 (-1.06z)| lr 2.34e-05 | 2533.56 ms | 53.3% bf16 MFU | 206888 tok/s +step 17175/19560 | loss 3.235079 (-1.12z)| norm 0.2476 (-0.36z)| lr 2.34e-05 | 2535.45 ms | 53.3% bf16 MFU | 206883 tok/s +step 17176/19560 | loss 3.334668 (+1.12z)| norm 0.2428 (-0.81z)| lr 2.34e-05 | 2530.97 ms | 53.3% bf16 MFU | 206897 tok/s +step 17177/19560 | loss 3.321930 (+0.91z)| norm 0.2476 (-0.35z)| lr 2.33e-05 | 2534.52 ms | 53.3% bf16 MFU | 206895 tok/s +step 17178/19560 | loss 3.346813 (+1.48z)| norm 0.2289 (-2.09z)| lr 2.33e-05 | 2533.25 ms | 53.3% bf16 MFU | 206898 tok/s +step 17179/19560 | loss 3.291104 (+0.15z)| norm 0.2507 (-0.05z)| lr 2.33e-05 | 2533.64 ms | 53.3% bf16 MFU | 206900 tok/s +step 17180/19560 | loss 3.313113 (+0.67z)| norm 0.2479 (-0.31z)| lr 2.33e-05 | 2534.34 ms | 53.3% bf16 MFU | 206898 tok/s +step 17181/19560 | loss 3.290499 (+0.12z)| norm 0.2572 (+0.56z)| lr 2.33e-05 | 2533.91 ms | 53.3% bf16 MFU | 206899 tok/s +step 17182/19560 | loss 3.265965 (-0.47z)| norm 0.2375 (-1.28z)| lr 2.32e-05 | 2533.26 ms | 53.3% bf16 MFU | 206902 tok/s +step 17183/19560 | loss 3.325083 (+1.03z)| norm 0.2935 (+3.76z)| lr 2.32e-05 | 2531.96 ms | 53.3% bf16 MFU | 206910 tok/s +step 17184/19560 | loss 3.248136 (-0.92z)| norm 0.2441 (-0.64z)| lr 2.32e-05 | 2534.92 ms | 53.3% bf16 MFU | 206906 tok/s +step 17185/19560 | loss 3.283119 (-0.05z)| norm 0.2467 (-0.40z)| lr 2.32e-05 | 2531.56 ms | 53.3% bf16 MFU | 206916 tok/s +step 17186/19560 | loss 3.342112 (+1.44z)| norm 0.2547 (+0.32z)| lr 2.32e-05 | 2531.82 ms | 53.3% bf16 MFU | 206924 tok/s +step 17187/19560 | loss 3.241731 (-1.09z)| norm 0.2558 (+0.41z)| lr 2.32e-05 | 2532.28 ms | 53.3% bf16 MFU | 206930 tok/s +step 17188/19560 | loss 3.208687 (-1.89z)| norm 0.2468 (-0.39z)| lr 2.31e-05 | 2533.04 ms | 53.3% bf16 MFU | 206932 tok/s +step 17189/19560 | loss 3.289007 (+0.11z)| norm 0.2312 (-1.75z)| lr 2.31e-05 | 2532.85 ms | 53.3% bf16 MFU | 206936 tok/s +step 17190/19560 | loss 3.312253 (+0.68z)| norm 0.2548 (+0.34z)| lr 2.31e-05 | 2534.48 ms | 53.3% bf16 MFU | 206932 tok/s +step 17191/19560 | loss 3.349096 (+1.57z)| norm 0.2465 (-0.41z)| lr 2.31e-05 | 2535.05 ms | 53.3% bf16 MFU | 206926 tok/s +step 17192/19560 | loss 3.270594 (-0.37z)| norm 0.2532 (+0.18z)| lr 2.31e-05 | 2533.32 ms | 53.3% bf16 MFU | 206928 tok/s +step 17193/19560 | loss 3.258841 (-0.65z)| norm 0.2512 (-0.00z)| lr 2.30e-05 | 2532.26 ms | 53.3% bf16 MFU | 206933 tok/s +step 17194/19560 | loss 3.314298 (+0.71z)| norm 0.2454 (-0.51z)| lr 2.30e-05 | 2532.38 ms | 53.3% bf16 MFU | 206938 tok/s +step 17195/19560 | loss 3.297930 (+0.34z)| norm 0.2553 (+0.38z)| lr 2.30e-05 | 2532.76 ms | 53.3% bf16 MFU | 206942 tok/s +step 17196/19560 | loss 3.306249 (+0.54z)| norm 0.2415 (-0.85z)| lr 2.30e-05 | 2531.85 ms | 53.3% bf16 MFU | 206948 tok/s +step 17197/19560 | loss 3.248861 (-0.92z)| norm 0.2506 (-0.04z)| lr 2.30e-05 | 2533.18 ms | 53.3% bf16 MFU | 206949 tok/s +step 17198/19560 | loss 3.239707 (-1.14z)| norm 0.2507 (-0.04z)| lr 2.29e-05 | 2534.66 ms | 53.3% bf16 MFU | 206944 tok/s +step 17199/19560 | loss 3.226055 (-1.46z)| norm 0.2440 (-0.64z)| lr 2.29e-05 | 2534.49 ms | 53.3% bf16 MFU | 206940 tok/s +step 17200/19560 | loss 3.326743 (+1.11z)| norm 0.2478 (-0.30z)| lr 2.29e-05 | 2534.40 ms | 53.3% bf16 MFU | 206937 tok/s +step 17201/19560 | loss 3.293917 (+0.25z)| norm 0.2406 (-0.93z)| lr 2.29e-05 | 2531.58 ms | 53.3% bf16 MFU | 206945 tok/s +step 17202/19560 | loss 3.382992 (+2.51z)| norm 0.2368 (-1.26z)| lr 2.29e-05 | 2534.72 ms | 53.3% bf16 MFU | 206940 tok/s +step 17203/19560 | loss 3.284083 (-0.02z)| norm 0.2365 (-1.27z)| lr 2.28e-05 | 2532.27 ms | 53.3% bf16 MFU | 206945 tok/s +step 17204/19560 | loss 3.258730 (-0.66z)| norm 0.2349 (-1.39z)| lr 2.28e-05 | 2534.79 ms | 53.3% bf16 MFU | 206939 tok/s +step 17205/19560 | loss 3.299113 (+0.37z)| norm 0.2551 (+0.42z)| lr 2.28e-05 | 2532.06 ms | 53.3% bf16 MFU | 206945 tok/s +step 17206/19560 | loss 3.229894 (-1.37z)| norm 0.2359 (-1.30z)| lr 2.28e-05 | 2534.22 ms | 53.3% bf16 MFU | 206942 tok/s +step 17207/19560 | loss 3.215782 (-1.69z)| norm 0.2364 (-1.23z)| lr 2.28e-05 | 2532.13 ms | 53.3% bf16 MFU | 206948 tok/s +step 17208/19560 | loss 3.291220 (+0.20z)| norm 0.2414 (-0.78z)| lr 2.28e-05 | 2532.24 ms | 53.3% bf16 MFU | 206953 tok/s +step 17209/19560 | loss 3.351964 (+1.69z)| norm 0.2541 (+0.35z)| lr 2.27e-05 | 2533.22 ms | 53.3% bf16 MFU | 206953 tok/s +step 17210/19560 | loss 3.307395 (+0.57z)| norm 0.2385 (-1.04z)| lr 2.27e-05 | 2531.46 ms | 53.3% bf16 MFU | 206961 tok/s +step 17211/19560 | loss 3.207591 (-1.86z)| norm 0.2349 (-1.35z)| lr 2.27e-05 | 2533.97 ms | 53.3% bf16 MFU | 206958 tok/s +step 17212/19560 | loss 3.249055 (-0.85z)| norm 0.2361 (-1.23z)| lr 2.27e-05 | 2533.18 ms | 53.3% bf16 MFU | 206959 tok/s +step 17213/19560 | loss 3.254747 (-0.70z)| norm 0.2548 (+0.44z)| lr 2.27e-05 | 2532.79 ms | 53.3% bf16 MFU | 206961 tok/s +step 17214/19560 | loss 3.269305 (-0.34z)| norm 0.2308 (-1.70z)| lr 2.26e-05 | 2530.87 ms | 53.3% bf16 MFU | 206971 tok/s +step 17215/19560 | loss 3.302469 (+0.46z)| norm 0.2362 (-1.19z)| lr 2.26e-05 | 2532.00 ms | 53.3% bf16 MFU | 206975 tok/s +step 17216/19560 | loss 3.247451 (-0.89z)| norm 0.2420 (-0.68z)| lr 2.26e-05 | 2534.62 ms | 53.3% bf16 MFU | 206969 tok/s +step 17217/19560 | loss 3.330395 (+1.16z)| norm 0.2571 (+0.67z)| lr 2.26e-05 | 2533.95 ms | 53.3% bf16 MFU | 206966 tok/s +step 17218/19560 | loss 3.242825 (-0.99z)| norm 0.2444 (-0.45z)| lr 2.26e-05 | 2532.35 ms | 53.3% bf16 MFU | 206969 tok/s +step 17219/19560 | loss 3.287642 (+0.13z)| norm 0.2532 (+0.35z)| lr 2.25e-05 | 2531.74 ms | 53.3% bf16 MFU | 206975 tok/s +step 17220/19560 | loss 3.210311 (-1.77z)| norm 0.2487 (-0.07z)| lr 2.25e-05 | 2532.89 ms | 53.3% bf16 MFU | 206976 tok/s +step 17221/19560 | loss 3.283554 (+0.04z)| norm 0.2449 (-0.41z)| lr 2.25e-05 | 2534.57 ms | 53.3% bf16 MFU | 206970 tok/s +step 17222/19560 | loss 3.311259 (+0.72z)| norm 0.2471 (-0.20z)| lr 2.25e-05 | 2531.53 ms | 53.3% bf16 MFU | 206977 tok/s +step 17223/19560 | loss 3.273140 (-0.22z)| norm 0.2513 (+0.18z)| lr 2.25e-05 | 2532.52 ms | 53.3% bf16 MFU | 206979 tok/s +step 17224/19560 | loss 3.194630 (-2.11z)| norm 0.2402 (-0.83z)| lr 2.24e-05 | 2532.09 ms | 53.3% bf16 MFU | 206983 tok/s +step 17225/19560 | loss 3.216289 (-1.55z)| norm 0.2437 (-0.50z)| lr 2.24e-05 | 2532.96 ms | 53.3% bf16 MFU | 206983 tok/s +step 17226/19560 | loss 3.289274 (+0.21z)| norm 0.2470 (-0.19z)| lr 2.24e-05 | 2535.01 ms | 53.3% bf16 MFU | 206975 tok/s +step 17227/19560 | loss 3.266400 (-0.34z)| norm 0.2530 (+0.39z)| lr 2.24e-05 | 2535.81 ms | 53.2% bf16 MFU | 206964 tok/s +step 17228/19560 | loss 3.271755 (-0.22z)| norm 0.2390 (-0.94z)| lr 2.24e-05 | 2531.89 ms | 53.3% bf16 MFU | 206969 tok/s +step 17229/19560 | loss 3.273373 (-0.16z)| norm 0.2742 (+2.35z)| lr 2.24e-05 | 2533.88 ms | 53.3% bf16 MFU | 206966 tok/s +step 17230/19560 | loss 3.278996 (-0.04z)| norm 0.2629 (+1.31z)| lr 2.23e-05 | 2534.53 ms | 53.3% bf16 MFU | 206961 tok/s +step 17231/19560 | loss 3.258845 (-0.52z)| norm 0.2534 (+0.46z)| lr 2.23e-05 | 2535.66 ms | 53.2% bf16 MFU | 206951 tok/s +step 17232/19560 | loss 3.307910 (+0.70z)| norm 0.2655 (+1.65z)| lr 2.23e-05 | 2534.88 ms | 53.3% bf16 MFU | 206945 tok/s +step 17233/19560 | loss 3.289346 (+0.24z)| norm 0.2813 (+3.07z)| lr 2.23e-05 | 2533.73 ms | 53.3% bf16 MFU | 206944 tok/s +step 17234/19560 | loss 3.295485 (+0.40z)| norm 0.2518 (+0.25z)| lr 2.23e-05 | 2533.08 ms | 53.3% bf16 MFU | 206946 tok/s +step 17235/19560 | loss 3.273418 (-0.14z)| norm 0.2679 (+1.83z)| lr 2.22e-05 | 2533.41 ms | 53.3% bf16 MFU | 206946 tok/s +step 17236/19560 | loss 3.272986 (-0.15z)| norm 0.2874 (+3.52z)| lr 2.22e-05 | 2533.62 ms | 53.3% bf16 MFU | 206945 tok/s +step 17237/19560 | loss 3.332469 (+1.34z)| norm 0.2551 (+0.52z)| lr 2.22e-05 | 2533.81 ms | 53.3% bf16 MFU | 206944 tok/s +step 17238/19560 | loss 3.341579 (+1.55z)| norm 0.2856 (+3.19z)| lr 2.22e-05 | 2534.06 ms | 53.3% bf16 MFU | 206941 tok/s +step 17239/19560 | loss 3.311171 (+0.79z)| norm 0.2447 (-0.44z)| lr 2.22e-05 | 2535.56 ms | 53.2% bf16 MFU | 206933 tok/s +step 17240/19560 | loss 3.281482 (+0.05z)| norm 0.2499 (+0.02z)| lr 2.21e-05 | 2532.73 ms | 53.3% bf16 MFU | 206937 tok/s +step 17241/19560 | loss 3.321742 (+1.05z)| norm 0.2588 (+0.79z)| lr 2.21e-05 | 2534.14 ms | 53.3% bf16 MFU | 206934 tok/s +step 17242/19560 | loss 3.287019 (+0.20z)| norm 0.2470 (-0.25z)| lr 2.21e-05 | 2533.66 ms | 53.3% bf16 MFU | 206934 tok/s +step 17243/19560 | loss 3.252117 (-0.68z)| norm 0.2706 (+1.81z)| lr 2.21e-05 | 2534.40 ms | 53.3% bf16 MFU | 206931 tok/s +step 17244/19560 | loss 3.271720 (-0.18z)| norm 0.2536 (+0.31z)| lr 2.21e-05 | 2532.35 ms | 53.3% bf16 MFU | 206936 tok/s +step 17245/19560 | loss 3.322766 (+1.09z)| norm 0.2520 (+0.16z)| lr 2.20e-05 | 2533.93 ms | 53.3% bf16 MFU | 206935 tok/s +step 17246/19560 | loss 3.253437 (-0.66z)| norm 0.2541 (+0.33z)| lr 2.20e-05 | 2532.47 ms | 53.3% bf16 MFU | 206939 tok/s +step 17247/19560 | loss 3.328999 (+1.24z)| norm 0.2738 (+2.04z)| lr 2.20e-05 | 2534.50 ms | 53.3% bf16 MFU | 206935 tok/s +step 17248/19560 | loss 3.274483 (-0.14z)| norm 0.2433 (-0.62z)| lr 2.20e-05 | 2535.32 ms | 53.3% bf16 MFU | 206928 tok/s +step 17249/19560 | loss 3.281458 (+0.04z)| norm 0.2342 (-1.41z)| lr 2.20e-05 | 2533.67 ms | 53.3% bf16 MFU | 206928 tok/s +step 17250/19560 | loss 3.212049 (-1.70z)| norm 0.2834 (+2.76z)| lr 2.20e-05 | 2532.64 ms | 53.3% bf16 MFU | 206932 tok/s +val loss 3.291815 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3017/10042 = 0.300438 +step 17251/19560 | loss 3.336702 (+1.40z)| norm 0.2457 (-0.41z)| lr 2.19e-05 | 2533.73 ms | 53.3% bf16 MFU | 206932 tok/s +step 17252/19560 | loss 3.248732 (-0.78z)| norm 0.2409 (-0.82z)| lr 2.19e-05 | 2533.74 ms | 53.3% bf16 MFU | 206931 tok/s +step 17253/19560 | loss 3.258356 (-0.53z)| norm 0.2492 (-0.12z)| lr 2.19e-05 | 2533.46 ms | 53.3% bf16 MFU | 206932 tok/s +step 17254/19560 | loss 3.279717 (+0.00z)| norm 0.2501 (-0.04z)| lr 2.19e-05 | 2533.50 ms | 53.3% bf16 MFU | 206933 tok/s +step 17255/19560 | loss 3.263116 (-0.40z)| norm 0.2457 (-0.41z)| lr 2.19e-05 | 2534.83 ms | 53.3% bf16 MFU | 206928 tok/s +step 17256/19560 | loss 3.372464 (+2.27z)| norm 0.2699 (+1.60z)| lr 2.18e-05 | 2533.49 ms | 53.3% bf16 MFU | 206928 tok/s +step 17257/19560 | loss 3.262865 (-0.45z)| norm 0.2600 (+0.78z)| lr 2.18e-05 | 2532.55 ms | 53.3% bf16 MFU | 206933 tok/s +step 17258/19560 | loss 3.250266 (-0.75z)| norm 0.2511 (+0.03z)| lr 2.18e-05 | 2533.23 ms | 53.3% bf16 MFU | 206935 tok/s +step 17259/19560 | loss 3.349075 (+1.69z)| norm 0.2533 (+0.21z)| lr 2.18e-05 | 2534.28 ms | 53.3% bf16 MFU | 206932 tok/s +step 17260/19560 | loss 3.314125 (+0.81z)| norm 0.2810 (+2.46z)| lr 2.18e-05 | 2532.70 ms | 53.3% bf16 MFU | 206935 tok/s +step 17261/19560 | loss 3.260272 (-0.53z)| norm 0.2454 (-0.47z)| lr 2.17e-05 | 2533.88 ms | 53.3% bf16 MFU | 206934 tok/s +step 17262/19560 | loss 3.266256 (-0.40z)| norm 0.2471 (-0.33z)| lr 2.17e-05 | 2534.73 ms | 53.3% bf16 MFU | 206930 tok/s +step 17263/19560 | loss 3.228714 (-1.33z)| norm 0.2422 (-0.73z)| lr 2.17e-05 | 2534.76 ms | 53.3% bf16 MFU | 206925 tok/s +step 17264/19560 | loss 3.227418 (-1.35z)| norm 0.2400 (-0.89z)| lr 2.17e-05 | 2532.79 ms | 53.3% bf16 MFU | 206929 tok/s +step 17265/19560 | loss 3.304324 (+0.60z)| norm 0.2382 (-1.04z)| lr 2.17e-05 | 2534.37 ms | 53.3% bf16 MFU | 206926 tok/s +step 17266/19560 | loss 3.246487 (-0.87z)| norm 0.2470 (-0.30z)| lr 2.17e-05 | 2535.57 ms | 53.2% bf16 MFU | 206918 tok/s +step 17267/19560 | loss 3.276722 (-0.09z)| norm 0.2471 (-0.29z)| lr 2.16e-05 | 2537.05 ms | 53.2% bf16 MFU | 206905 tok/s +step 17268/19560 | loss 3.311643 (+0.78z)| norm 0.2434 (-0.59z)| lr 2.16e-05 | 2533.28 ms | 53.3% bf16 MFU | 206908 tok/s +step 17269/19560 | loss 3.327727 (+1.19z)| norm 0.2425 (-0.66z)| lr 2.16e-05 | 2533.80 ms | 53.3% bf16 MFU | 206908 tok/s +step 17270/19560 | loss 3.269163 (-0.29z)| norm 0.2453 (-0.43z)| lr 2.16e-05 | 2533.87 ms | 53.3% bf16 MFU | 206908 tok/s +step 17271/19560 | loss 3.283629 (+0.07z)| norm 0.2621 (+0.95z)| lr 2.16e-05 | 2531.16 ms | 53.3% bf16 MFU | 206920 tok/s +step 17272/19560 | loss 3.257871 (-0.57z)| norm 0.2488 (-0.15z)| lr 2.15e-05 | 2533.35 ms | 53.3% bf16 MFU | 206921 tok/s +step 17273/19560 | loss 3.289294 (+0.23z)| norm 0.2472 (-0.28z)| lr 2.15e-05 | 2533.41 ms | 53.3% bf16 MFU | 206923 tok/s +step 17274/19560 | loss 3.275055 (-0.15z)| norm 0.2411 (-0.78z)| lr 2.15e-05 | 2532.40 ms | 53.3% bf16 MFU | 206928 tok/s +step 17275/19560 | loss 3.253428 (-0.71z)| norm 0.2419 (-0.71z)| lr 2.15e-05 | 2534.45 ms | 53.3% bf16 MFU | 206925 tok/s +step 17276/19560 | loss 3.205613 (-1.91z)| norm 0.2461 (-0.35z)| lr 2.15e-05 | 2531.63 ms | 53.3% bf16 MFU | 206934 tok/s +step 17277/19560 | loss 3.184832 (-2.37z)| norm 0.2426 (-0.64z)| lr 2.15e-05 | 2533.63 ms | 53.3% bf16 MFU | 206933 tok/s +step 17278/19560 | loss 3.371830 (+2.25z)| norm 0.2580 (+0.64z)| lr 2.14e-05 | 2535.19 ms | 53.3% bf16 MFU | 206927 tok/s +step 17279/19560 | loss 3.298181 (+0.43z)| norm 0.2564 (+0.51z)| lr 2.14e-05 | 2535.72 ms | 53.2% bf16 MFU | 206919 tok/s +step 17280/19560 | loss 3.181333 (-2.38z)| norm 0.2360 (-1.18z)| lr 2.14e-05 | 2534.10 ms | 53.3% bf16 MFU | 206917 tok/s +step 17281/19560 | loss 3.267112 (-0.32z)| norm 0.2358 (-1.18z)| lr 2.14e-05 | 2533.79 ms | 53.3% bf16 MFU | 206918 tok/s +step 17282/19560 | loss 3.277848 (-0.05z)| norm 0.2438 (-0.52z)| lr 2.14e-05 | 2533.68 ms | 53.3% bf16 MFU | 206918 tok/s +step 17283/19560 | loss 3.211429 (-1.64z)| norm 0.2474 (-0.21z)| lr 2.13e-05 | 2533.99 ms | 53.3% bf16 MFU | 206917 tok/s +step 17284/19560 | loss 3.331666 (+1.26z)| norm 0.2381 (-0.97z)| lr 2.13e-05 | 2533.94 ms | 53.3% bf16 MFU | 206917 tok/s +step 17285/19560 | loss 3.242684 (-0.87z)| norm 0.2286 (-1.72z)| lr 2.13e-05 | 2536.72 ms | 53.2% bf16 MFU | 206905 tok/s +step 17286/19560 | loss 3.296062 (+0.42z)| norm 0.2380 (-0.94z)| lr 2.13e-05 | 2533.72 ms | 53.3% bf16 MFU | 206906 tok/s +step 17287/19560 | loss 3.339309 (+1.46z)| norm 0.2452 (-0.35z)| lr 2.13e-05 | 2533.90 ms | 53.3% bf16 MFU | 206906 tok/s +step 17288/19560 | loss 3.255610 (-0.57z)| norm 0.2530 (+0.29z)| lr 2.12e-05 | 2535.30 ms | 53.3% bf16 MFU | 206900 tok/s +step 17289/19560 | loss 3.244469 (-0.84z)| norm 0.2340 (-1.25z)| lr 2.12e-05 | 2534.55 ms | 53.3% bf16 MFU | 206898 tok/s +step 17290/19560 | loss 3.262393 (-0.42z)| norm 0.2549 (+0.46z)| lr 2.12e-05 | 2532.48 ms | 53.3% bf16 MFU | 206905 tok/s +step 17291/19560 | loss 3.313450 (+0.82z)| norm 0.2345 (-1.19z)| lr 2.12e-05 | 2533.72 ms | 53.3% bf16 MFU | 206906 tok/s +step 17292/19560 | loss 3.282081 (+0.06z)| norm 0.2588 (+0.78z)| lr 2.12e-05 | 2532.15 ms | 53.3% bf16 MFU | 206913 tok/s +step 17293/19560 | loss 3.291830 (+0.30z)| norm 0.2523 (+0.24z)| lr 2.12e-05 | 2533.63 ms | 53.3% bf16 MFU | 206914 tok/s +step 17294/19560 | loss 3.218502 (-1.51z)| norm 0.2378 (-0.93z)| lr 2.11e-05 | 2533.76 ms | 53.3% bf16 MFU | 206914 tok/s +step 17295/19560 | loss 3.257189 (-0.54z)| norm 0.2476 (-0.13z)| lr 2.11e-05 | 2533.47 ms | 53.3% bf16 MFU | 206916 tok/s +step 17296/19560 | loss 3.326451 (+1.15z)| norm 0.2514 (+0.18z)| lr 2.11e-05 | 2533.95 ms | 53.3% bf16 MFU | 206915 tok/s +step 17297/19560 | loss 3.288314 (+0.21z)| norm 0.2473 (-0.14z)| lr 2.11e-05 | 2534.84 ms | 53.3% bf16 MFU | 206911 tok/s +step 17298/19560 | loss 3.276597 (-0.09z)| norm 0.2424 (-0.54z)| lr 2.11e-05 | 2533.90 ms | 53.3% bf16 MFU | 206911 tok/s +step 17299/19560 | loss 3.300615 (+0.50z)| norm 0.2362 (-1.05z)| lr 2.10e-05 | 2534.66 ms | 53.3% bf16 MFU | 206908 tok/s +step 17300/19560 | loss 3.296776 (+0.41z)| norm 0.2470 (-0.16z)| lr 2.10e-05 | 2532.95 ms | 53.3% bf16 MFU | 206912 tok/s +step 17301/19560 | loss 3.278092 (-0.05z)| norm 0.2444 (-0.38z)| lr 2.10e-05 | 2533.83 ms | 53.3% bf16 MFU | 206912 tok/s +step 17302/19560 | loss 3.296816 (+0.42z)| norm 0.2399 (-0.76z)| lr 2.10e-05 | 2535.16 ms | 53.3% bf16 MFU | 206907 tok/s +step 17303/19560 | loss 3.348633 (+1.67z)| norm 0.2652 (+1.34z)| lr 2.10e-05 | 2534.10 ms | 53.3% bf16 MFU | 206906 tok/s +step 17304/19560 | loss 3.293770 (+0.33z)| norm 0.2426 (-0.54z)| lr 2.10e-05 | 2533.55 ms | 53.3% bf16 MFU | 206908 tok/s +step 17305/19560 | loss 3.266785 (-0.33z)| norm 0.2443 (-0.39z)| lr 2.09e-05 | 2534.57 ms | 53.3% bf16 MFU | 206905 tok/s +step 17306/19560 | loss 3.284560 (+0.12z)| norm 0.2515 (+0.19z)| lr 2.09e-05 | 2533.06 ms | 53.3% bf16 MFU | 206909 tok/s +step 17307/19560 | loss 3.212332 (-1.66z)| norm 0.2334 (-1.30z)| lr 2.09e-05 | 2532.88 ms | 53.3% bf16 MFU | 206913 tok/s +step 17308/19560 | loss 3.332985 (+1.33z)| norm 0.2692 (+1.64z)| lr 2.09e-05 | 2532.43 ms | 53.3% bf16 MFU | 206919 tok/s +step 17309/19560 | loss 3.257699 (-0.53z)| norm 0.2450 (-0.34z)| lr 2.09e-05 | 2534.19 ms | 53.3% bf16 MFU | 206917 tok/s +step 17310/19560 | loss 3.269188 (-0.24z)| norm 0.2428 (-0.53z)| lr 2.08e-05 | 2532.61 ms | 53.3% bf16 MFU | 206922 tok/s +step 17311/19560 | loss 3.227633 (-1.25z)| norm 0.2514 (+0.22z)| lr 2.08e-05 | 2533.26 ms | 53.3% bf16 MFU | 206924 tok/s +step 17312/19560 | loss 3.266032 (-0.31z)| norm 0.2337 (-1.31z)| lr 2.08e-05 | 2533.23 ms | 53.3% bf16 MFU | 206926 tok/s +step 17313/19560 | loss 3.309110 (+0.76z)| norm 0.2333 (-1.33z)| lr 2.08e-05 | 2531.34 ms | 53.3% bf16 MFU | 206936 tok/s +step 17314/19560 | loss 3.210085 (-1.67z)| norm 0.2366 (-1.02z)| lr 2.08e-05 | 2532.45 ms | 53.3% bf16 MFU | 206940 tok/s +step 17315/19560 | loss 3.269229 (-0.21z)| norm 0.2407 (-0.67z)| lr 2.08e-05 | 2532.76 ms | 53.3% bf16 MFU | 206943 tok/s +step 17316/19560 | loss 3.224056 (-1.34z)| norm 0.2273 (-1.78z)| lr 2.07e-05 | 2531.08 ms | 53.3% bf16 MFU | 206953 tok/s +step 17317/19560 | loss 3.359454 (+1.99z)| norm 0.2423 (-0.51z)| lr 2.07e-05 | 2532.90 ms | 53.3% bf16 MFU | 206955 tok/s +step 17318/19560 | loss 3.201533 (-1.84z)| norm 0.2540 (+0.48z)| lr 2.07e-05 | 2534.57 ms | 53.3% bf16 MFU | 206950 tok/s +step 17319/19560 | loss 3.330739 (+1.30z)| norm 0.2602 (+1.00z)| lr 2.07e-05 | 2532.08 ms | 53.3% bf16 MFU | 206955 tok/s +step 17320/19560 | loss 3.335469 (+1.39z)| norm 0.2301 (-1.53z)| lr 2.07e-05 | 2533.71 ms | 53.3% bf16 MFU | 206954 tok/s +step 17321/19560 | loss 3.295163 (+0.41z)| norm 0.2394 (-0.74z)| lr 2.06e-05 | 2531.89 ms | 53.3% bf16 MFU | 206960 tok/s +step 17322/19560 | loss 3.279545 (+0.04z)| norm 0.2587 (+0.87z)| lr 2.06e-05 | 2533.16 ms | 53.3% bf16 MFU | 206960 tok/s +step 17323/19560 | loss 3.395063 (+2.74z)| norm 0.2520 (+0.31z)| lr 2.06e-05 | 2532.76 ms | 53.3% bf16 MFU | 206962 tok/s +step 17324/19560 | loss 3.273659 (-0.11z)| norm 0.2316 (-1.38z)| lr 2.06e-05 | 2534.06 ms | 53.3% bf16 MFU | 206959 tok/s +step 17325/19560 | loss 3.255824 (-0.54z)| norm 0.2570 (+0.73z)| lr 2.06e-05 | 2534.25 ms | 53.3% bf16 MFU | 206955 tok/s +step 17326/19560 | loss 3.294721 (+0.37z)| norm 0.2460 (-0.19z)| lr 2.06e-05 | 2532.27 ms | 53.3% bf16 MFU | 206960 tok/s +step 17327/19560 | loss 3.215184 (-1.50z)| norm 0.2543 (+0.50z)| lr 2.05e-05 | 2533.06 ms | 53.3% bf16 MFU | 206961 tok/s +step 17328/19560 | loss 3.318367 (+0.94z)| norm 0.2334 (-1.23z)| lr 2.05e-05 | 2531.59 ms | 53.3% bf16 MFU | 206967 tok/s +step 17329/19560 | loss 3.279519 (+0.02z)| norm 0.2495 (+0.11z)| lr 2.05e-05 | 2533.47 ms | 53.3% bf16 MFU | 206966 tok/s +step 17330/19560 | loss 3.476269 (+4.40z)| norm 0.2930 (+3.51z)| lr 2.05e-05 | 2534.37 ms | 53.3% bf16 MFU | 206962 tok/s +step 17331/19560 | loss 3.277133 (-0.05z)| norm 0.2425 (-0.49z)| lr 2.05e-05 | 2534.81 ms | 53.3% bf16 MFU | 206955 tok/s +step 17332/19560 | loss 3.272441 (-0.16z)| norm 0.2513 (+0.20z)| lr 2.04e-05 | 2531.36 ms | 53.3% bf16 MFU | 206963 tok/s +step 17333/19560 | loss 3.198837 (-1.76z)| norm 0.2565 (+0.61z)| lr 2.04e-05 | 2533.52 ms | 53.3% bf16 MFU | 206962 tok/s +step 17334/19560 | loss 3.151244 (-2.73z)| norm 0.2422 (-0.54z)| lr 2.04e-05 | 2533.33 ms | 53.3% bf16 MFU | 206962 tok/s +step 17335/19560 | loss 3.327234 (+1.04z)| norm 0.2508 (+0.14z)| lr 2.04e-05 | 2533.15 ms | 53.3% bf16 MFU | 206962 tok/s +step 17336/19560 | loss 3.288394 (+0.21z)| norm 0.2496 (+0.04z)| lr 2.04e-05 | 2533.49 ms | 53.3% bf16 MFU | 206961 tok/s +step 17337/19560 | loss 3.196301 (-1.75z)| norm 0.2418 (-0.58z)| lr 2.04e-05 | 2533.03 ms | 53.3% bf16 MFU | 206962 tok/s +step 17338/19560 | loss 3.270132 (-0.16z)| norm 0.2552 (+0.49z)| lr 2.03e-05 | 2533.07 ms | 53.3% bf16 MFU | 206963 tok/s +step 17339/19560 | loss 3.277342 (-0.01z)| norm 0.2455 (-0.30z)| lr 2.03e-05 | 2532.95 ms | 53.3% bf16 MFU | 206964 tok/s +step 17340/19560 | loss 3.238375 (-0.86z)| norm 0.2505 (+0.10z)| lr 2.03e-05 | 2533.65 ms | 53.3% bf16 MFU | 206963 tok/s +step 17341/19560 | loss 3.258563 (-0.42z)| norm 0.2468 (-0.20z)| lr 2.03e-05 | 2531.85 ms | 53.3% bf16 MFU | 206968 tok/s +step 17342/19560 | loss 3.250565 (-0.59z)| norm 0.2450 (-0.36z)| lr 2.03e-05 | 2531.20 ms | 53.3% bf16 MFU | 206976 tok/s +step 17343/19560 | loss 3.234955 (-0.92z)| norm 0.2507 (+0.10z)| lr 2.02e-05 | 2534.04 ms | 53.3% bf16 MFU | 206972 tok/s +step 17344/19560 | loss 3.297217 (+0.43z)| norm 0.2515 (+0.16z)| lr 2.02e-05 | 2532.94 ms | 53.3% bf16 MFU | 206973 tok/s +step 17345/19560 | loss 3.261543 (-0.34z)| norm 0.2495 (-0.00z)| lr 2.02e-05 | 2533.71 ms | 53.3% bf16 MFU | 206971 tok/s +step 17346/19560 | loss 3.295253 (+0.39z)| norm 0.2718 (+1.80z)| lr 2.02e-05 | 2532.95 ms | 53.3% bf16 MFU | 206972 tok/s +step 17347/19560 | loss 3.303198 (+0.56z)| norm 0.2647 (+1.21z)| lr 2.02e-05 | 2533.55 ms | 53.3% bf16 MFU | 206970 tok/s +step 17348/19560 | loss 3.293989 (+0.35z)| norm 0.2570 (+0.58z)| lr 2.02e-05 | 2532.96 ms | 53.3% bf16 MFU | 206971 tok/s +step 17349/19560 | loss 3.362836 (+1.82z)| norm 0.3035 (+4.03z)| lr 2.01e-05 | 2533.36 ms | 53.3% bf16 MFU | 206970 tok/s +step 17350/19560 | loss 3.225223 (-1.15z)| norm 0.2554 (+0.38z)| lr 2.01e-05 | 2533.11 ms | 53.3% bf16 MFU | 206970 tok/s +step 17351/19560 | loss 3.286235 (+0.17z)| norm 0.2546 (+0.32z)| lr 2.01e-05 | 2532.99 ms | 53.3% bf16 MFU | 206971 tok/s +step 17352/19560 | loss 3.305874 (+0.58z)| norm 0.2508 (+0.02z)| lr 2.01e-05 | 2534.10 ms | 53.3% bf16 MFU | 206967 tok/s +step 17353/19560 | loss 3.319004 (+0.86z)| norm 0.2693 (+1.41z)| lr 2.01e-05 | 2533.91 ms | 53.3% bf16 MFU | 206964 tok/s +step 17354/19560 | loss 3.252106 (-0.61z)| norm 0.2537 (+0.22z)| lr 2.00e-05 | 2533.54 ms | 53.3% bf16 MFU | 206963 tok/s +step 17355/19560 | loss 3.240774 (-0.85z)| norm 0.2554 (+0.35z)| lr 2.00e-05 | 2533.59 ms | 53.3% bf16 MFU | 206961 tok/s +step 17356/19560 | loss 3.264146 (-0.33z)| norm 0.2420 (-0.67z)| lr 2.00e-05 | 2534.59 ms | 53.3% bf16 MFU | 206956 tok/s +step 17357/19560 | loss 3.281458 (+0.04z)| norm 0.2495 (-0.08z)| lr 2.00e-05 | 2534.30 ms | 53.3% bf16 MFU | 206952 tok/s +step 17358/19560 | loss 3.259672 (-0.43z)| norm 0.2452 (-0.40z)| lr 2.00e-05 | 2533.97 ms | 53.3% bf16 MFU | 206950 tok/s +step 17359/19560 | loss 3.281558 (+0.05z)| norm 0.2429 (-0.57z)| lr 2.00e-05 | 2534.01 ms | 53.3% bf16 MFU | 206947 tok/s +step 17360/19560 | loss 3.238701 (-0.88z)| norm 0.2363 (-1.06z)| lr 1.99e-05 | 2534.90 ms | 53.3% bf16 MFU | 206941 tok/s +step 17361/19560 | loss 3.378714 (+2.13z)| norm 0.2481 (-0.14z)| lr 1.99e-05 | 2533.60 ms | 53.3% bf16 MFU | 206941 tok/s +step 17362/19560 | loss 3.282042 (+0.05z)| norm 0.2415 (-0.65z)| lr 1.99e-05 | 2533.47 ms | 53.3% bf16 MFU | 206941 tok/s +step 17363/19560 | loss 3.288303 (+0.19z)| norm 0.2479 (-0.14z)| lr 1.99e-05 | 2532.99 ms | 53.3% bf16 MFU | 206943 tok/s +step 17364/19560 | loss 3.295812 (+0.34z)| norm 0.2514 (+0.17z)| lr 1.99e-05 | 2533.12 ms | 53.3% bf16 MFU | 206945 tok/s +step 17365/19560 | loss 3.285945 (+0.14z)| norm 0.2546 (+0.43z)| lr 1.98e-05 | 2534.48 ms | 53.3% bf16 MFU | 206941 tok/s +step 17366/19560 | loss 3.331715 (+1.13z)| norm 0.2510 (+0.16z)| lr 1.98e-05 | 2534.86 ms | 53.3% bf16 MFU | 206935 tok/s +step 17367/19560 | loss 3.241673 (-0.80z)| norm 0.2503 (+0.10z)| lr 1.98e-05 | 2533.76 ms | 53.3% bf16 MFU | 206934 tok/s +step 17368/19560 | loss 3.298275 (+0.42z)| norm 0.2597 (+0.88z)| lr 1.98e-05 | 2532.67 ms | 53.3% bf16 MFU | 206938 tok/s +step 17369/19560 | loss 3.236259 (-0.91z)| norm 0.2499 (+0.06z)| lr 1.98e-05 | 2532.37 ms | 53.3% bf16 MFU | 206943 tok/s +step 17370/19560 | loss 3.240095 (-0.82z)| norm 0.2400 (-0.77z)| lr 1.98e-05 | 2533.14 ms | 53.3% bf16 MFU | 206944 tok/s +step 17371/19560 | loss 3.330055 (+1.11z)| norm 0.2455 (-0.29z)| lr 1.97e-05 | 2533.42 ms | 53.3% bf16 MFU | 206945 tok/s +step 17372/19560 | loss 3.262730 (-0.34z)| norm 0.2514 (+0.21z)| lr 1.97e-05 | 2532.39 ms | 53.3% bf16 MFU | 206949 tok/s +step 17373/19560 | loss 3.218133 (-1.27z)| norm 0.2471 (-0.15z)| lr 1.97e-05 | 2532.59 ms | 53.3% bf16 MFU | 206952 tok/s +step 17374/19560 | loss 3.345460 (+1.43z)| norm 0.2549 (+0.51z)| lr 1.97e-05 | 2535.51 ms | 53.3% bf16 MFU | 206944 tok/s +step 17375/19560 | loss 3.223503 (-1.15z)| norm 0.2594 (+0.93z)| lr 1.97e-05 | 2536.56 ms | 53.2% bf16 MFU | 206931 tok/s +step 17376/19560 | loss 3.342296 (+1.35z)| norm 0.2481 (-0.06z)| lr 1.97e-05 | 2535.69 ms | 53.2% bf16 MFU | 206923 tok/s +step 17377/19560 | loss 3.227414 (-1.06z)| norm 0.2442 (-0.40z)| lr 1.96e-05 | 2534.45 ms | 53.3% bf16 MFU | 206920 tok/s +step 17378/19560 | loss 3.317456 (+0.82z)| norm 0.2376 (-0.99z)| lr 1.96e-05 | 2533.81 ms | 53.3% bf16 MFU | 206920 tok/s +step 17379/19560 | loss 3.246967 (-0.65z)| norm 0.2489 (+0.04z)| lr 1.96e-05 | 2532.50 ms | 53.3% bf16 MFU | 206925 tok/s +step 17380/19560 | loss 3.291163 (+0.28z)| norm 0.2429 (-0.51z)| lr 1.96e-05 | 2536.51 ms | 53.2% bf16 MFU | 206913 tok/s +step 17381/19560 | loss 3.236621 (-0.88z)| norm 0.2363 (-1.09z)| lr 1.96e-05 | 2533.58 ms | 53.3% bf16 MFU | 206914 tok/s +step 17382/19560 | loss 3.262867 (-0.32z)| norm 0.2515 (+0.28z)| lr 1.95e-05 | 2533.04 ms | 53.3% bf16 MFU | 206918 tok/s +step 17383/19560 | loss 3.376260 (+2.03z)| norm 0.2380 (-0.93z)| lr 1.95e-05 | 2534.39 ms | 53.3% bf16 MFU | 206915 tok/s +step 17384/19560 | loss 3.231151 (-0.98z)| norm 0.2430 (-0.47z)| lr 1.95e-05 | 2533.13 ms | 53.3% bf16 MFU | 206918 tok/s +step 17385/19560 | loss 3.255116 (-0.47z)| norm 0.2343 (-1.24z)| lr 1.95e-05 | 2533.07 ms | 53.3% bf16 MFU | 206921 tok/s +step 17386/19560 | loss 3.269902 (-0.16z)| norm 0.2518 (+0.34z)| lr 1.95e-05 | 2533.13 ms | 53.3% bf16 MFU | 206924 tok/s +step 17387/19560 | loss 3.377021 (+2.07z)| norm 0.2616 (+1.23z)| lr 1.95e-05 | 2534.05 ms | 53.3% bf16 MFU | 206922 tok/s +step 17388/19560 | loss 3.289540 (+0.25z)| norm 0.2505 (+0.26z)| lr 1.94e-05 | 2532.44 ms | 53.3% bf16 MFU | 206928 tok/s +step 17389/19560 | loss 3.281161 (+0.07z)| norm 0.2528 (+0.47z)| lr 1.94e-05 | 2532.91 ms | 53.3% bf16 MFU | 206931 tok/s +step 17390/19560 | loss 3.297138 (+0.40z)| norm 0.2369 (-1.01z)| lr 1.94e-05 | 2532.08 ms | 53.3% bf16 MFU | 206937 tok/s +step 17391/19560 | loss 3.281095 (+0.05z)| norm 0.2360 (-1.09z)| lr 1.94e-05 | 2533.10 ms | 53.3% bf16 MFU | 206939 tok/s +step 17392/19560 | loss 3.286593 (+0.16z)| norm 0.2576 (+0.90z)| lr 1.94e-05 | 2532.06 ms | 53.3% bf16 MFU | 206945 tok/s +step 17393/19560 | loss 3.173858 (-2.16z)| norm 0.2484 (+0.05z)| lr 1.94e-05 | 2533.20 ms | 53.3% bf16 MFU | 206946 tok/s +step 17394/19560 | loss 3.296782 (+0.38z)| norm 0.2552 (+0.67z)| lr 1.93e-05 | 2535.06 ms | 53.3% bf16 MFU | 206940 tok/s +step 17395/19560 | loss 3.305735 (+0.56z)| norm 0.2453 (-0.25z)| lr 1.93e-05 | 2532.89 ms | 53.3% bf16 MFU | 206942 tok/s +step 17396/19560 | loss 3.352979 (+1.53z)| norm 0.2465 (-0.14z)| lr 1.93e-05 | 2531.64 ms | 53.3% bf16 MFU | 206950 tok/s +step 17397/19560 | loss 3.262877 (-0.32z)| norm 0.2508 (+0.25z)| lr 1.93e-05 | 2535.71 ms | 53.2% bf16 MFU | 206940 tok/s +step 17398/19560 | loss 3.302117 (+0.48z)| norm 0.2593 (+1.03z)| lr 1.93e-05 | 2533.24 ms | 53.3% bf16 MFU | 206942 tok/s +step 17399/19560 | loss 3.279421 (+0.02z)| norm 0.2322 (-1.46z)| lr 1.92e-05 | 2534.93 ms | 53.3% bf16 MFU | 206936 tok/s +step 17400/19560 | loss 3.285625 (+0.14z)| norm 0.2485 (+0.05z)| lr 1.92e-05 | 2535.04 ms | 53.3% bf16 MFU | 206930 tok/s +step 17401/19560 | loss 3.258533 (-0.42z)| norm 0.2443 (-0.34z)| lr 1.92e-05 | 2535.38 ms | 53.3% bf16 MFU | 206923 tok/s +step 17402/19560 | loss 3.342473 (+1.30z)| norm 0.3253 (+6.01z)| lr 1.92e-05 | 2534.35 ms | 53.3% bf16 MFU | 206920 tok/s +step 17403/19560 | loss 3.260774 (-0.38z)| norm 0.2356 (-1.01z)| lr 1.92e-05 | 2533.89 ms | 53.3% bf16 MFU | 206920 tok/s +step 17404/19560 | loss 3.274134 (-0.12z)| norm 0.2477 (-0.07z)| lr 1.92e-05 | 2534.63 ms | 53.3% bf16 MFU | 206916 tok/s +step 17405/19560 | loss 3.245759 (-0.72z)| norm 0.2387 (-0.77z)| lr 1.91e-05 | 2534.97 ms | 53.3% bf16 MFU | 206912 tok/s +step 17406/19560 | loss 3.328606 (+1.03z)| norm 0.3144 (+4.66z)| lr 1.91e-05 | 2532.04 ms | 53.3% bf16 MFU | 206919 tok/s +step 17407/19560 | loss 3.261562 (-0.38z)| norm 0.2398 (-0.65z)| lr 1.91e-05 | 2532.60 ms | 53.3% bf16 MFU | 206924 tok/s +step 17408/19560 | loss 3.234711 (-0.98z)| norm 0.2509 (+0.14z)| lr 1.91e-05 | 2533.10 ms | 53.3% bf16 MFU | 206926 tok/s +step 17409/19560 | loss 3.220195 (-1.27z)| norm 0.2493 (+0.02z)| lr 1.91e-05 | 2532.77 ms | 53.3% bf16 MFU | 206930 tok/s +step 17410/19560 | loss 3.319486 (+0.84z)| norm 0.2405 (-0.61z)| lr 1.91e-05 | 2531.95 ms | 53.3% bf16 MFU | 206937 tok/s +step 17411/19560 | loss 3.243277 (-0.79z)| norm 0.2623 (+0.94z)| lr 1.90e-05 | 2532.20 ms | 53.3% bf16 MFU | 206943 tok/s +step 17412/19560 | loss 3.357370 (+1.64z)| norm 0.2648 (+1.10z)| lr 1.90e-05 | 2532.42 ms | 53.3% bf16 MFU | 206947 tok/s +step 17413/19560 | loss 3.297796 (+0.36z)| norm 0.2428 (-0.48z)| lr 1.90e-05 | 2531.54 ms | 53.3% bf16 MFU | 206955 tok/s +step 17414/19560 | loss 3.232141 (-1.03z)| norm 0.2565 (+0.50z)| lr 1.90e-05 | 2533.53 ms | 53.3% bf16 MFU | 206954 tok/s +step 17415/19560 | loss 3.247571 (-0.69z)| norm 0.2323 (-1.23z)| lr 1.90e-05 | 2532.28 ms | 53.3% bf16 MFU | 206958 tok/s +step 17416/19560 | loss 3.231951 (-1.01z)| norm 0.2429 (-0.46z)| lr 1.89e-05 | 2534.55 ms | 53.3% bf16 MFU | 206953 tok/s +step 17417/19560 | loss 3.296525 (+0.36z)| norm 0.2482 (-0.09z)| lr 1.89e-05 | 2533.46 ms | 53.3% bf16 MFU | 206953 tok/s +step 17418/19560 | loss 3.236261 (-0.92z)| norm 0.2787 (+2.04z)| lr 1.89e-05 | 2534.52 ms | 53.3% bf16 MFU | 206948 tok/s +step 17419/19560 | loss 3.261522 (-0.38z)| norm 0.2331 (-1.17z)| lr 1.89e-05 | 2535.27 ms | 53.3% bf16 MFU | 206941 tok/s +step 17420/19560 | loss 3.268550 (-0.23z)| norm 0.2460 (-0.26z)| lr 1.89e-05 | 2533.41 ms | 53.3% bf16 MFU | 206941 tok/s +step 17421/19560 | loss 3.189448 (-1.87z)| norm 0.2316 (-1.26z)| lr 1.89e-05 | 2534.23 ms | 53.3% bf16 MFU | 206938 tok/s +step 17422/19560 | loss 3.306595 (+0.58z)| norm 0.2425 (-0.49z)| lr 1.88e-05 | 2536.50 ms | 53.2% bf16 MFU | 206926 tok/s +step 17423/19560 | loss 3.267278 (-0.25z)| norm 0.2423 (-0.50z)| lr 1.88e-05 | 2534.80 ms | 53.3% bf16 MFU | 206922 tok/s +step 17424/19560 | loss 3.276342 (-0.05z)| norm 0.2257 (-1.64z)| lr 1.88e-05 | 2535.07 ms | 53.3% bf16 MFU | 206916 tok/s +step 17425/19560 | loss 3.273575 (-0.11z)| norm 0.2308 (-1.27z)| lr 1.88e-05 | 2531.25 ms | 53.3% bf16 MFU | 206927 tok/s +step 17426/19560 | loss 3.236590 (-0.88z)| norm 0.2395 (-0.66z)| lr 1.88e-05 | 2535.04 ms | 53.3% bf16 MFU | 206921 tok/s +step 17427/19560 | loss 3.322611 (+0.93z)| norm 0.2345 (-1.01z)| lr 1.88e-05 | 2532.46 ms | 53.3% bf16 MFU | 206927 tok/s +step 17428/19560 | loss 3.275075 (-0.07z)| norm 0.2472 (-0.13z)| lr 1.87e-05 | 2533.92 ms | 53.3% bf16 MFU | 206926 tok/s +step 17429/19560 | loss 3.219690 (-1.22z)| norm 0.2352 (-0.95z)| lr 1.87e-05 | 2531.95 ms | 53.3% bf16 MFU | 206933 tok/s +step 17430/19560 | loss 3.242428 (-0.73z)| norm 0.2362 (-0.88z)| lr 1.87e-05 | 2533.90 ms | 53.3% bf16 MFU | 206932 tok/s +step 17431/19560 | loss 3.286768 (+0.21z)| norm 0.2653 (+1.12z)| lr 1.87e-05 | 2532.75 ms | 53.3% bf16 MFU | 206935 tok/s +step 17432/19560 | loss 3.283817 (+0.15z)| norm 0.2888 (+2.64z)| lr 1.87e-05 | 2532.66 ms | 53.3% bf16 MFU | 206939 tok/s +step 17433/19560 | loss 3.214474 (-1.30z)| norm 0.2330 (-1.08z)| lr 1.87e-05 | 2534.61 ms | 53.3% bf16 MFU | 206935 tok/s +step 17434/19560 | loss 3.285410 (+0.19z)| norm 0.2384 (-0.71z)| lr 1.86e-05 | 2534.88 ms | 53.3% bf16 MFU | 206929 tok/s +step 17435/19560 | loss 3.318479 (+0.87z)| norm 0.2479 (-0.10z)| lr 1.86e-05 | 2534.05 ms | 53.3% bf16 MFU | 206928 tok/s +step 17436/19560 | loss 3.266483 (-0.22z)| norm 0.2301 (-1.26z)| lr 1.86e-05 | 2532.90 ms | 53.3% bf16 MFU | 206931 tok/s +step 17437/19560 | loss 3.317858 (+0.86z)| norm 0.2389 (-0.67z)| lr 1.86e-05 | 2533.64 ms | 53.3% bf16 MFU | 206931 tok/s +step 17438/19560 | loss 3.323633 (+0.97z)| norm 0.2368 (-0.81z)| lr 1.86e-05 | 2532.41 ms | 53.3% bf16 MFU | 206936 tok/s +step 17439/19560 | loss 3.314713 (+0.77z)| norm 0.2337 (-0.99z)| lr 1.85e-05 | 2534.52 ms | 53.3% bf16 MFU | 206932 tok/s +step 17440/19560 | loss 3.282767 (+0.09z)| norm 0.2456 (-0.22z)| lr 1.85e-05 | 2534.87 ms | 53.3% bf16 MFU | 206927 tok/s +step 17441/19560 | loss 3.256422 (-0.46z)| norm 0.2507 (+0.12z)| lr 1.85e-05 | 2532.55 ms | 53.3% bf16 MFU | 206932 tok/s +step 17442/19560 | loss 3.305188 (+0.56z)| norm 0.2681 (+1.26z)| lr 1.85e-05 | 2534.08 ms | 53.3% bf16 MFU | 206930 tok/s +step 17443/19560 | loss 3.336815 (+1.22z)| norm 0.2496 (+0.02z)| lr 1.85e-05 | 2534.18 ms | 53.3% bf16 MFU | 206928 tok/s +step 17444/19560 | loss 3.340936 (+1.29z)| norm 0.2506 (+0.07z)| lr 1.85e-05 | 2535.21 ms | 53.3% bf16 MFU | 206921 tok/s +step 17445/19560 | loss 3.278808 (-0.02z)| norm 0.2403 (-0.61z)| lr 1.84e-05 | 2533.28 ms | 53.3% bf16 MFU | 206923 tok/s +step 17446/19560 | loss 3.294195 (+0.30z)| norm 0.2345 (-0.99z)| lr 1.84e-05 | 2533.52 ms | 53.3% bf16 MFU | 206924 tok/s +step 17447/19560 | loss 3.360404 (+1.72z)| norm 0.2460 (-0.22z)| lr 1.84e-05 | 2534.40 ms | 53.3% bf16 MFU | 206921 tok/s +step 17448/19560 | loss 3.315416 (+0.75z)| norm 0.2321 (-1.15z)| lr 1.84e-05 | 2533.21 ms | 53.3% bf16 MFU | 206924 tok/s +step 17449/19560 | loss 3.302489 (+0.47z)| norm 0.2423 (-0.46z)| lr 1.84e-05 | 2532.84 ms | 53.3% bf16 MFU | 206927 tok/s +step 17450/19560 | loss 3.196554 (-1.77z)| norm 0.2604 (+0.75z)| lr 1.84e-05 | 2535.24 ms | 53.3% bf16 MFU | 206921 tok/s +step 17451/19560 | loss 3.208162 (-1.52z)| norm 0.2483 (-0.06z)| lr 1.83e-05 | 2532.81 ms | 53.3% bf16 MFU | 206925 tok/s +step 17452/19560 | loss 3.313832 (+0.76z)| norm 0.2504 (+0.07z)| lr 1.83e-05 | 2534.67 ms | 53.3% bf16 MFU | 206921 tok/s +step 17453/19560 | loss 3.308857 (+0.64z)| norm 0.2514 (+0.14z)| lr 1.83e-05 | 2533.95 ms | 53.3% bf16 MFU | 206920 tok/s +step 17454/19560 | loss 3.280425 (+0.03z)| norm 0.2602 (+0.73z)| lr 1.83e-05 | 2532.90 ms | 53.3% bf16 MFU | 206924 tok/s +step 17455/19560 | loss 3.276671 (-0.06z)| norm 0.2437 (-0.38z)| lr 1.83e-05 | 2534.64 ms | 53.3% bf16 MFU | 206920 tok/s +step 17456/19560 | loss 3.319772 (+0.88z)| norm 0.2424 (-0.47z)| lr 1.83e-05 | 2533.70 ms | 53.3% bf16 MFU | 206920 tok/s +step 17457/19560 | loss 3.268390 (-0.24z)| norm 0.2696 (+1.35z)| lr 1.82e-05 | 2533.69 ms | 53.3% bf16 MFU | 206921 tok/s +step 17458/19560 | loss 3.231501 (-1.07z)| norm 0.2607 (+0.79z)| lr 1.82e-05 | 2535.76 ms | 53.2% bf16 MFU | 206912 tok/s +step 17459/19560 | loss 3.295178 (+0.41z)| norm 0.2842 (+2.35z)| lr 1.82e-05 | 2532.81 ms | 53.3% bf16 MFU | 206917 tok/s +step 17460/19560 | loss 3.260217 (-0.40z)| norm 0.2496 (-0.00z)| lr 1.82e-05 | 2532.05 ms | 53.3% bf16 MFU | 206924 tok/s +step 17461/19560 | loss 3.245152 (-0.77z)| norm 0.2345 (-1.01z)| lr 1.82e-05 | 2532.54 ms | 53.3% bf16 MFU | 206929 tok/s +step 17462/19560 | loss 3.244459 (-0.83z)| norm 0.2428 (-0.45z)| lr 1.82e-05 | 2533.58 ms | 53.3% bf16 MFU | 206929 tok/s +step 17463/19560 | loss 3.281642 (+0.08z)| norm 0.2420 (-0.50z)| lr 1.81e-05 | 2532.72 ms | 53.3% bf16 MFU | 206933 tok/s +step 17464/19560 | loss 3.337006 (+1.42z)| norm 0.2612 (+0.79z)| lr 1.81e-05 | 2533.77 ms | 53.3% bf16 MFU | 206932 tok/s +step 17465/19560 | loss 3.241783 (-0.92z)| norm 0.2470 (-0.17z)| lr 1.81e-05 | 2535.07 ms | 53.3% bf16 MFU | 206926 tok/s +step 17466/19560 | loss 3.325939 (+1.14z)| norm 0.2452 (-0.29z)| lr 1.81e-05 | 2533.36 ms | 53.3% bf16 MFU | 206928 tok/s +step 17467/19560 | loss 3.270393 (-0.22z)| norm 0.2376 (-0.80z)| lr 1.81e-05 | 2535.69 ms | 53.2% bf16 MFU | 206920 tok/s +step 17468/19560 | loss 3.345439 (+1.59z)| norm 0.2444 (-0.33z)| lr 1.80e-05 | 2533.35 ms | 53.3% bf16 MFU | 206921 tok/s +step 17469/19560 | loss 3.298528 (+0.44z)| norm 0.2505 (+0.08z)| lr 1.80e-05 | 2532.80 ms | 53.3% bf16 MFU | 206925 tok/s +step 17470/19560 | loss 3.332257 (+1.24z)| norm 0.2492 (-0.01z)| lr 1.80e-05 | 2533.83 ms | 53.3% bf16 MFU | 206925 tok/s +step 17471/19560 | loss 3.279346 (-0.05z)| norm 0.2510 (+0.11z)| lr 1.80e-05 | 2534.87 ms | 53.3% bf16 MFU | 206920 tok/s +step 17472/19560 | loss 3.246291 (-0.85z)| norm 0.2589 (+0.64z)| lr 1.80e-05 | 2533.87 ms | 53.3% bf16 MFU | 206920 tok/s +step 17473/19560 | loss 3.313367 (+0.78z)| norm 0.2423 (-0.48z)| lr 1.80e-05 | 2533.34 ms | 53.3% bf16 MFU | 206921 tok/s +step 17474/19560 | loss 3.279672 (-0.04z)| norm 0.2484 (-0.06z)| lr 1.79e-05 | 2533.13 ms | 53.3% bf16 MFU | 206924 tok/s +step 17475/19560 | loss 3.289490 (+0.20z)| norm 0.2498 (+0.05z)| lr 1.79e-05 | 2534.67 ms | 53.3% bf16 MFU | 206920 tok/s +step 17476/19560 | loss 3.279397 (-0.04z)| norm 0.2443 (-0.32z)| lr 1.79e-05 | 2535.48 ms | 53.3% bf16 MFU | 206913 tok/s +step 17477/19560 | loss 3.301921 (+0.53z)| norm 0.2349 (-0.98z)| lr 1.79e-05 | 2531.70 ms | 53.3% bf16 MFU | 206922 tok/s +step 17478/19560 | loss 3.342417 (+1.50z)| norm 0.3088 (+4.04z)| lr 1.79e-05 | 2534.49 ms | 53.3% bf16 MFU | 206919 tok/s +step 17479/19560 | loss 3.315593 (+0.83z)| norm 0.2453 (-0.23z)| lr 1.79e-05 | 2532.90 ms | 53.3% bf16 MFU | 206922 tok/s +step 17480/19560 | loss 3.335073 (+1.30z)| norm 0.2461 (-0.18z)| lr 1.78e-05 | 2532.86 ms | 53.3% bf16 MFU | 206926 tok/s +step 17481/19560 | loss 3.269930 (-0.29z)| norm 0.2504 (+0.12z)| lr 1.78e-05 | 2534.40 ms | 53.3% bf16 MFU | 206923 tok/s +step 17482/19560 | loss 3.352865 (+1.71z)| norm 0.2422 (-0.43z)| lr 1.78e-05 | 2531.85 ms | 53.3% bf16 MFU | 206931 tok/s +step 17483/19560 | loss 3.286281 (+0.08z)| norm 0.2822 (+2.23z)| lr 1.78e-05 | 2534.46 ms | 53.3% bf16 MFU | 206928 tok/s +step 17484/19560 | loss 3.295935 (+0.31z)| norm 0.2818 (+2.14z)| lr 1.78e-05 | 2532.39 ms | 53.3% bf16 MFU | 206933 tok/s +step 17485/19560 | loss 3.291135 (+0.20z)| norm 0.2382 (-0.70z)| lr 1.78e-05 | 2534.75 ms | 53.3% bf16 MFU | 206928 tok/s +step 17486/19560 | loss 3.277971 (-0.13z)| norm 0.2516 (+0.17z)| lr 1.77e-05 | 2532.55 ms | 53.3% bf16 MFU | 206933 tok/s +step 17487/19560 | loss 3.336628 (+1.29z)| norm 0.2497 (+0.04z)| lr 1.77e-05 | 2535.30 ms | 53.3% bf16 MFU | 206926 tok/s +step 17488/19560 | loss 3.286576 (+0.06z)| norm 0.2400 (-0.59z)| lr 1.77e-05 | 2533.93 ms | 53.3% bf16 MFU | 206925 tok/s +step 17489/19560 | loss 3.274431 (-0.22z)| norm 0.2462 (-0.19z)| lr 1.77e-05 | 2533.28 ms | 53.3% bf16 MFU | 206927 tok/s +step 17490/19560 | loss 3.297749 (+0.36z)| norm 0.2335 (-1.01z)| lr 1.77e-05 | 2533.82 ms | 53.3% bf16 MFU | 206926 tok/s +step 17491/19560 | loss 3.302806 (+0.48z)| norm 0.2507 (+0.11z)| lr 1.77e-05 | 2532.67 ms | 53.3% bf16 MFU | 206930 tok/s +step 17492/19560 | loss 3.276557 (-0.17z)| norm 0.2495 (+0.03z)| lr 1.76e-05 | 2532.74 ms | 53.3% bf16 MFU | 206934 tok/s +step 17493/19560 | loss 3.245044 (-0.94z)| norm 0.2367 (-0.79z)| lr 1.76e-05 | 2533.78 ms | 53.3% bf16 MFU | 206933 tok/s +step 17494/19560 | loss 3.273600 (-0.22z)| norm 0.2351 (-0.89z)| lr 1.76e-05 | 2534.26 ms | 53.3% bf16 MFU | 206931 tok/s +step 17495/19560 | loss 3.291301 (+0.21z)| norm 0.2745 (+1.64z)| lr 1.76e-05 | 2534.85 ms | 53.3% bf16 MFU | 206926 tok/s +step 17496/19560 | loss 3.267253 (-0.39z)| norm 0.2450 (-0.25z)| lr 1.76e-05 | 2533.57 ms | 53.3% bf16 MFU | 206926 tok/s +step 17497/19560 | loss 3.300827 (+0.44z)| norm 0.2473 (-0.10z)| lr 1.76e-05 | 2535.92 ms | 53.2% bf16 MFU | 206917 tok/s +step 17498/19560 | loss 3.377326 (+2.30z)| norm 0.2776 (+1.81z)| lr 1.75e-05 | 2533.73 ms | 53.3% bf16 MFU | 206917 tok/s +step 17499/19560 | loss 3.274843 (-0.22z)| norm 0.2496 (+0.03z)| lr 1.75e-05 | 2534.03 ms | 53.3% bf16 MFU | 206917 tok/s +step 17500/19560 | loss 3.281031 (-0.07z)| norm 0.2417 (-0.47z)| lr 1.75e-05 | 2535.82 ms | 53.2% bf16 MFU | 206908 tok/s +val loss 3.290144 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3031/10042 = 0.301832 +step 17501/19560 | loss 3.304698 (+0.50z)| norm 0.2538 (+0.30z)| lr 1.75e-05 | 2535.04 ms | 53.3% bf16 MFU | 206904 tok/s +step 17502/19560 | loss 3.342421 (+1.45z)| norm 0.2440 (-0.32z)| lr 1.75e-05 | 2534.29 ms | 53.3% bf16 MFU | 206902 tok/s +step 17503/19560 | loss 3.262973 (-0.56z)| norm 0.2368 (-0.77z)| lr 1.75e-05 | 2535.73 ms | 53.2% bf16 MFU | 206895 tok/s +step 17504/19560 | loss 3.282418 (-0.05z)| norm 0.2400 (-0.56z)| lr 1.74e-05 | 2534.75 ms | 53.3% bf16 MFU | 206893 tok/s +step 17505/19560 | loss 3.284785 (-0.00z)| norm 0.2392 (-0.60z)| lr 1.74e-05 | 2533.68 ms | 53.3% bf16 MFU | 206894 tok/s +step 17506/19560 | loss 3.296114 (+0.29z)| norm 0.2583 (+0.60z)| lr 1.74e-05 | 2533.92 ms | 53.3% bf16 MFU | 206895 tok/s +step 17507/19560 | loss 3.298628 (+0.35z)| norm 0.2418 (-0.45z)| lr 1.74e-05 | 2534.34 ms | 53.3% bf16 MFU | 206894 tok/s +step 17508/19560 | loss 3.283199 (-0.05z)| norm 0.2472 (-0.11z)| lr 1.74e-05 | 2533.11 ms | 53.3% bf16 MFU | 206898 tok/s +step 17509/19560 | loss 3.256639 (-0.75z)| norm 0.2294 (-1.23z)| lr 1.74e-05 | 2534.87 ms | 53.3% bf16 MFU | 206894 tok/s +step 17510/19560 | loss 3.264981 (-0.53z)| norm 0.2377 (-0.70z)| lr 1.73e-05 | 2535.15 ms | 53.3% bf16 MFU | 206890 tok/s +step 17511/19560 | loss 3.334763 (+1.32z)| norm 0.2345 (-0.89z)| lr 1.73e-05 | 2534.94 ms | 53.3% bf16 MFU | 206887 tok/s +step 17512/19560 | loss 3.274970 (-0.28z)| norm 0.2469 (-0.12z)| lr 1.73e-05 | 2535.46 ms | 53.3% bf16 MFU | 206882 tok/s +step 17513/19560 | loss 3.294189 (+0.23z)| norm 0.2514 (+0.16z)| lr 1.73e-05 | 2535.13 ms | 53.3% bf16 MFU | 206878 tok/s +step 17514/19560 | loss 3.253440 (-0.86z)| norm 0.2476 (-0.08z)| lr 1.73e-05 | 2533.49 ms | 53.3% bf16 MFU | 206881 tok/s +step 17515/19560 | loss 3.348495 (+1.71z)| norm 0.2332 (-0.98z)| lr 1.73e-05 | 2533.70 ms | 53.3% bf16 MFU | 206884 tok/s +step 17516/19560 | loss 3.305853 (+0.55z)| norm 0.2323 (-1.01z)| lr 1.72e-05 | 2533.63 ms | 53.3% bf16 MFU | 206886 tok/s +step 17517/19560 | loss 3.290170 (+0.13z)| norm 0.2401 (-0.52z)| lr 1.72e-05 | 2531.78 ms | 53.3% bf16 MFU | 206896 tok/s +step 17518/19560 | loss 3.269531 (-0.43z)| norm 0.2484 (-0.01z)| lr 1.72e-05 | 2533.08 ms | 53.3% bf16 MFU | 206900 tok/s +step 17519/19560 | loss 3.289923 (+0.12z)| norm 0.2530 (+0.28z)| lr 1.72e-05 | 2532.41 ms | 53.3% bf16 MFU | 206906 tok/s +step 17520/19560 | loss 3.308159 (+0.61z)| norm 0.2340 (-0.90z)| lr 1.72e-05 | 2533.23 ms | 53.3% bf16 MFU | 206909 tok/s +step 17521/19560 | loss 3.278601 (-0.22z)| norm 0.2316 (-1.04z)| lr 1.72e-05 | 2536.35 ms | 53.2% bf16 MFU | 206899 tok/s +step 17522/19560 | loss 3.247025 (-1.08z)| norm 0.2345 (-0.85z)| lr 1.71e-05 | 2534.03 ms | 53.3% bf16 MFU | 206899 tok/s +step 17523/19560 | loss 3.305834 (+0.55z)| norm 0.2397 (-0.52z)| lr 1.71e-05 | 2536.27 ms | 53.2% bf16 MFU | 206890 tok/s +step 17524/19560 | loss 3.360963 (+2.08z)| norm 0.2486 (+0.03z)| lr 1.71e-05 | 2534.44 ms | 53.3% bf16 MFU | 206889 tok/s +step 17525/19560 | loss 3.311767 (+0.70z)| norm 0.2387 (-0.58z)| lr 1.71e-05 | 2534.53 ms | 53.3% bf16 MFU | 206887 tok/s +step 17526/19560 | loss 3.297585 (+0.31z)| norm 0.2496 (+0.10z)| lr 1.71e-05 | 2535.05 ms | 53.3% bf16 MFU | 206884 tok/s +step 17527/19560 | loss 3.219366 (-1.82z)| norm 0.2537 (+0.35z)| lr 1.71e-05 | 2534.87 ms | 53.3% bf16 MFU | 206881 tok/s +step 17528/19560 | loss 3.303571 (+0.48z)| norm 0.2333 (-0.92z)| lr 1.70e-05 | 2532.94 ms | 53.3% bf16 MFU | 206886 tok/s +step 17529/19560 | loss 3.240186 (-1.25z)| norm 0.2401 (-0.49z)| lr 1.70e-05 | 2532.93 ms | 53.3% bf16 MFU | 206892 tok/s +step 17530/19560 | loss 3.268899 (-0.45z)| norm 0.2383 (-0.62z)| lr 1.70e-05 | 2533.73 ms | 53.3% bf16 MFU | 206893 tok/s +step 17531/19560 | loss 3.234813 (-1.38z)| norm 0.2373 (-0.69z)| lr 1.70e-05 | 2532.20 ms | 53.3% bf16 MFU | 206901 tok/s +step 17532/19560 | loss 3.233118 (-1.40z)| norm 0.2338 (-0.92z)| lr 1.70e-05 | 2533.24 ms | 53.3% bf16 MFU | 206904 tok/s +step 17533/19560 | loss 3.293208 (+0.22z)| norm 0.2406 (-0.45z)| lr 1.70e-05 | 2535.34 ms | 53.3% bf16 MFU | 206898 tok/s +step 17534/19560 | loss 3.296535 (+0.32z)| norm 0.2402 (-0.48z)| lr 1.69e-05 | 2534.39 ms | 53.3% bf16 MFU | 206897 tok/s +step 17535/19560 | loss 3.316927 (+0.87z)| norm 0.2395 (-0.53z)| lr 1.69e-05 | 2530.70 ms | 53.4% bf16 MFU | 206911 tok/s +step 17536/19560 | loss 3.290393 (+0.13z)| norm 0.2357 (-0.81z)| lr 1.69e-05 | 2532.60 ms | 53.3% bf16 MFU | 206916 tok/s +step 17537/19560 | loss 3.276260 (-0.28z)| norm 0.2433 (-0.24z)| lr 1.69e-05 | 2533.35 ms | 53.3% bf16 MFU | 206918 tok/s +step 17538/19560 | loss 3.315329 (+0.82z)| norm 0.2331 (-1.00z)| lr 1.69e-05 | 2535.73 ms | 53.2% bf16 MFU | 206910 tok/s +step 17539/19560 | loss 3.339030 (+1.45z)| norm 0.2695 (+1.72z)| lr 1.69e-05 | 2532.13 ms | 53.3% bf16 MFU | 206917 tok/s +step 17540/19560 | loss 3.308586 (+0.62z)| norm 0.2298 (-1.22z)| lr 1.68e-05 | 2535.39 ms | 53.3% bf16 MFU | 206911 tok/s +step 17541/19560 | loss 3.268701 (-0.50z)| norm 0.2478 (+0.12z)| lr 1.68e-05 | 2530.92 ms | 53.3% bf16 MFU | 206923 tok/s +step 17542/19560 | loss 3.274346 (-0.35z)| norm 0.2478 (+0.13z)| lr 1.68e-05 | 2533.25 ms | 53.3% bf16 MFU | 206925 tok/s +step 17543/19560 | loss 3.225030 (-1.74z)| norm 0.2320 (-1.06z)| lr 1.68e-05 | 2535.24 ms | 53.3% bf16 MFU | 206919 tok/s +step 17544/19560 | loss 3.306658 (+0.56z)| norm 0.2412 (-0.37z)| lr 1.68e-05 | 2534.04 ms | 53.3% bf16 MFU | 206918 tok/s +step 17545/19560 | loss 3.346997 (+1.68z)| norm 0.2271 (-1.40z)| lr 1.68e-05 | 2534.34 ms | 53.3% bf16 MFU | 206915 tok/s +step 17546/19560 | loss 3.306117 (+0.52z)| norm 0.2490 (+0.25z)| lr 1.67e-05 | 2534.31 ms | 53.3% bf16 MFU | 206913 tok/s +step 17547/19560 | loss 3.291891 (+0.10z)| norm 0.2430 (-0.21z)| lr 1.67e-05 | 2535.16 ms | 53.3% bf16 MFU | 206908 tok/s +step 17548/19560 | loss 3.245730 (-1.20z)| norm 0.2400 (-0.44z)| lr 1.67e-05 | 2535.22 ms | 53.3% bf16 MFU | 206903 tok/s +step 17549/19560 | loss 3.250632 (-1.10z)| norm 0.2452 (-0.05z)| lr 1.67e-05 | 2533.38 ms | 53.3% bf16 MFU | 206905 tok/s +step 17550/19560 | loss 3.298544 (+0.29z)| norm 0.2320 (-1.05z)| lr 1.67e-05 | 2533.87 ms | 53.3% bf16 MFU | 206906 tok/s +step 17551/19560 | loss 3.363768 (+2.14z)| norm 0.2440 (-0.14z)| lr 1.67e-05 | 2532.84 ms | 53.3% bf16 MFU | 206910 tok/s +step 17552/19560 | loss 3.352973 (+1.79z)| norm 0.2784 (+2.42z)| lr 1.66e-05 | 2534.30 ms | 53.3% bf16 MFU | 206908 tok/s +step 17553/19560 | loss 3.297019 (+0.20z)| norm 0.2533 (+0.53z)| lr 1.66e-05 | 2535.14 ms | 53.3% bf16 MFU | 206903 tok/s +step 17554/19560 | loss 3.303426 (+0.37z)| norm 0.2633 (+1.26z)| lr 1.66e-05 | 2533.56 ms | 53.3% bf16 MFU | 206905 tok/s +step 17555/19560 | loss 3.285174 (-0.14z)| norm 0.2416 (-0.38z)| lr 1.66e-05 | 2535.16 ms | 53.3% bf16 MFU | 206900 tok/s +step 17556/19560 | loss 3.271674 (-0.53z)| norm 0.2494 (+0.21z)| lr 1.66e-05 | 2535.16 ms | 53.3% bf16 MFU | 206895 tok/s +step 17557/19560 | loss 3.252129 (-1.11z)| norm 0.2426 (-0.31z)| lr 1.66e-05 | 2533.92 ms | 53.3% bf16 MFU | 206896 tok/s +step 17558/19560 | loss 3.252135 (-1.12z)| norm 0.2448 (-0.15z)| lr 1.65e-05 | 2534.38 ms | 53.3% bf16 MFU | 206895 tok/s +step 17559/19560 | loss 3.306097 (+0.45z)| norm 0.2350 (-0.88z)| lr 1.65e-05 | 2532.49 ms | 53.3% bf16 MFU | 206901 tok/s +step 17560/19560 | loss 3.267931 (-0.66z)| norm 0.2898 (+3.29z)| lr 1.65e-05 | 2533.47 ms | 53.3% bf16 MFU | 206903 tok/s +step 17561/19560 | loss 3.340302 (+1.43z)| norm 0.2755 (+2.15z)| lr 1.65e-05 | 2534.91 ms | 53.3% bf16 MFU | 206900 tok/s +step 17562/19560 | loss 3.274082 (-0.51z)| norm 0.2462 (-0.05z)| lr 1.65e-05 | 2533.19 ms | 53.3% bf16 MFU | 206903 tok/s +step 17563/19560 | loss 3.321203 (+0.87z)| norm 0.2431 (-0.28z)| lr 1.65e-05 | 2533.19 ms | 53.3% bf16 MFU | 206906 tok/s +step 17564/19560 | loss 3.252540 (-1.13z)| norm 0.2375 (-0.71z)| lr 1.64e-05 | 2533.75 ms | 53.3% bf16 MFU | 206907 tok/s +step 17565/19560 | loss 3.282191 (-0.26z)| norm 0.2464 (-0.04z)| lr 1.64e-05 | 2532.89 ms | 53.3% bf16 MFU | 206911 tok/s +step 17566/19560 | loss 3.244526 (-1.34z)| norm 0.2454 (-0.13z)| lr 1.64e-05 | 2533.73 ms | 53.3% bf16 MFU | 206912 tok/s +step 17567/19560 | loss 3.304923 (+0.42z)| norm 0.2446 (-0.19z)| lr 1.64e-05 | 2532.51 ms | 53.3% bf16 MFU | 206918 tok/s +step 17568/19560 | loss 3.331810 (+1.19z)| norm 0.2386 (-0.65z)| lr 1.64e-05 | 2531.61 ms | 53.3% bf16 MFU | 206926 tok/s +step 17569/19560 | loss 3.218598 (-2.06z)| norm 0.2595 (+0.93z)| lr 1.64e-05 | 2532.17 ms | 53.3% bf16 MFU | 206933 tok/s +step 17570/19560 | loss 3.297471 (+0.20z)| norm 0.2378 (-0.69z)| lr 1.63e-05 | 2534.01 ms | 53.3% bf16 MFU | 206931 tok/s +step 17571/19560 | loss 3.250988 (-1.11z)| norm 0.2510 (+0.31z)| lr 1.63e-05 | 2532.91 ms | 53.3% bf16 MFU | 206934 tok/s +step 17572/19560 | loss 3.309135 (+0.57z)| norm 0.2452 (-0.13z)| lr 1.63e-05 | 2532.44 ms | 53.3% bf16 MFU | 206939 tok/s +step 17573/19560 | loss 3.249078 (-1.16z)| norm 0.2441 (-0.22z)| lr 1.63e-05 | 2534.33 ms | 53.3% bf16 MFU | 206935 tok/s +step 17574/19560 | loss 3.319429 (+0.86z)| norm 0.2298 (-1.30z)| lr 1.63e-05 | 2532.94 ms | 53.3% bf16 MFU | 206938 tok/s +step 17575/19560 | loss 3.263724 (-0.73z)| norm 0.2441 (-0.21z)| lr 1.63e-05 | 2534.72 ms | 53.3% bf16 MFU | 206933 tok/s +step 17576/19560 | loss 3.203992 (-2.40z)| norm 0.2353 (-0.88z)| lr 1.63e-05 | 2532.77 ms | 53.3% bf16 MFU | 206937 tok/s +step 17577/19560 | loss 3.318638 (+0.87z)| norm 0.2647 (+1.33z)| lr 1.62e-05 | 2533.79 ms | 53.3% bf16 MFU | 206936 tok/s +step 17578/19560 | loss 3.270644 (-0.52z)| norm 0.2369 (-0.76z)| lr 1.62e-05 | 2531.91 ms | 53.3% bf16 MFU | 206943 tok/s +step 17579/19560 | loss 3.186149 (-2.95z)| norm 0.2594 (+0.93z)| lr 1.62e-05 | 2532.52 ms | 53.3% bf16 MFU | 206947 tok/s +step 17580/19560 | loss 3.288059 (-0.00z)| norm 0.2356 (-0.84z)| lr 1.62e-05 | 2531.51 ms | 53.3% bf16 MFU | 206955 tok/s +step 17581/19560 | loss 3.322637 (+0.99z)| norm 0.2460 (-0.06z)| lr 1.62e-05 | 2535.11 ms | 53.3% bf16 MFU | 206947 tok/s +step 17582/19560 | loss 3.242835 (-1.29z)| norm 0.2356 (-0.83z)| lr 1.62e-05 | 2531.90 ms | 53.3% bf16 MFU | 206954 tok/s +step 17583/19560 | loss 3.307224 (+0.54z)| norm 0.2556 (+0.67z)| lr 1.61e-05 | 2532.89 ms | 53.3% bf16 MFU | 206956 tok/s +step 17584/19560 | loss 3.322589 (+0.98z)| norm 0.2751 (+2.08z)| lr 1.61e-05 | 2532.64 ms | 53.3% bf16 MFU | 206958 tok/s +step 17585/19560 | loss 3.334070 (+1.29z)| norm 0.2301 (-1.23z)| lr 1.61e-05 | 2534.68 ms | 53.3% bf16 MFU | 206953 tok/s +step 17586/19560 | loss 3.295202 (+0.17z)| norm 0.2552 (+0.64z)| lr 1.61e-05 | 2534.29 ms | 53.3% bf16 MFU | 206949 tok/s +step 17587/19560 | loss 3.311197 (+0.63z)| norm 0.2519 (+0.42z)| lr 1.61e-05 | 2535.30 ms | 53.3% bf16 MFU | 206941 tok/s +step 17588/19560 | loss 3.301110 (+0.33z)| norm 0.2433 (-0.23z)| lr 1.61e-05 | 2533.20 ms | 53.3% bf16 MFU | 206943 tok/s +step 17589/19560 | loss 3.390942 (+2.81z)| norm 0.2656 (+1.46z)| lr 1.60e-05 | 2533.58 ms | 53.3% bf16 MFU | 206942 tok/s +step 17590/19560 | loss 3.339851 (+1.36z)| norm 0.2461 (-0.04z)| lr 1.60e-05 | 2533.34 ms | 53.3% bf16 MFU | 206943 tok/s +step 17591/19560 | loss 3.325884 (+0.95z)| norm 0.2694 (+1.71z)| lr 1.60e-05 | 2535.03 ms | 53.3% bf16 MFU | 206937 tok/s +step 17592/19560 | loss 3.297904 (+0.18z)| norm 0.2444 (-0.17z)| lr 1.60e-05 | 2535.08 ms | 53.3% bf16 MFU | 206930 tok/s +step 17593/19560 | loss 3.304366 (+0.35z)| norm 0.2504 (+0.28z)| lr 1.60e-05 | 2533.64 ms | 53.3% bf16 MFU | 206930 tok/s +step 17594/19560 | loss 3.270085 (-0.61z)| norm 0.2628 (+1.21z)| lr 1.60e-05 | 2533.41 ms | 53.3% bf16 MFU | 206931 tok/s +step 17595/19560 | loss 3.281886 (-0.28z)| norm 0.2412 (-0.43z)| lr 1.59e-05 | 2533.08 ms | 53.3% bf16 MFU | 206934 tok/s +step 17596/19560 | loss 3.278845 (-0.36z)| norm 0.2591 (+0.92z)| lr 1.59e-05 | 2532.96 ms | 53.3% bf16 MFU | 206936 tok/s +step 17597/19560 | loss 3.298670 (+0.21z)| norm 0.2391 (-0.59z)| lr 1.59e-05 | 2532.65 ms | 53.3% bf16 MFU | 206940 tok/s +step 17598/19560 | loss 3.314534 (+0.68z)| norm 0.2526 (+0.43z)| lr 1.59e-05 | 2532.09 ms | 53.3% bf16 MFU | 206946 tok/s +step 17599/19560 | loss 3.293424 (+0.06z)| norm 0.2384 (-0.64z)| lr 1.59e-05 | 2533.68 ms | 53.3% bf16 MFU | 206945 tok/s +step 17600/19560 | loss 3.291095 (-0.01z)| norm 0.2409 (-0.44z)| lr 1.59e-05 | 2533.75 ms | 53.3% bf16 MFU | 206944 tok/s +step 17601/19560 | loss 3.249735 (-1.19z)| norm 0.2396 (-0.53z)| lr 1.58e-05 | 2533.38 ms | 53.3% bf16 MFU | 206944 tok/s +step 17602/19560 | loss 3.298152 (+0.20z)| norm 0.2991 (+3.71z)| lr 1.58e-05 | 2532.81 ms | 53.3% bf16 MFU | 206947 tok/s +step 17603/19560 | loss 3.284469 (-0.19z)| norm 0.2531 (+0.43z)| lr 1.58e-05 | 2534.23 ms | 53.3% bf16 MFU | 206944 tok/s +step 17604/19560 | loss 3.416536 (+3.42z)| norm 0.2575 (+0.74z)| lr 1.58e-05 | 2534.97 ms | 53.3% bf16 MFU | 206938 tok/s +step 17605/19560 | loss 3.257241 (-0.95z)| norm 0.2391 (-0.58z)| lr 1.58e-05 | 2534.23 ms | 53.3% bf16 MFU | 206935 tok/s +step 17606/19560 | loss 3.312549 (+0.58z)| norm 0.2888 (+3.10z)| lr 1.58e-05 | 2532.84 ms | 53.3% bf16 MFU | 206938 tok/s +step 17607/19560 | loss 3.298472 (+0.19z)| norm 0.2574 (+0.76z)| lr 1.58e-05 | 2534.01 ms | 53.3% bf16 MFU | 206936 tok/s +step 17608/19560 | loss 3.347126 (+1.53z)| norm 0.2355 (-0.86z)| lr 1.57e-05 | 2535.18 ms | 53.3% bf16 MFU | 206930 tok/s +step 17609/19560 | loss 3.221382 (-1.90z)| norm 0.2603 (+0.97z)| lr 1.57e-05 | 2533.52 ms | 53.3% bf16 MFU | 206930 tok/s +step 17610/19560 | loss 3.280236 (-0.29z)| norm 0.2662 (+1.38z)| lr 1.57e-05 | 2535.09 ms | 53.3% bf16 MFU | 206924 tok/s +step 17611/19560 | loss 3.289123 (-0.04z)| norm 0.2343 (-0.94z)| lr 1.57e-05 | 2533.55 ms | 53.3% bf16 MFU | 206925 tok/s +step 17612/19560 | loss 3.257319 (-0.91z)| norm 0.2480 (+0.10z)| lr 1.57e-05 | 2533.23 ms | 53.3% bf16 MFU | 206927 tok/s +step 17613/19560 | loss 3.312511 (+0.60z)| norm 0.2524 (+0.43z)| lr 1.57e-05 | 2533.66 ms | 53.3% bf16 MFU | 206927 tok/s +step 17614/19560 | loss 3.305560 (+0.40z)| norm 0.2324 (-1.09z)| lr 1.56e-05 | 2535.01 ms | 53.3% bf16 MFU | 206922 tok/s +step 17615/19560 | loss 3.189053 (-2.69z)| norm 0.2511 (+0.34z)| lr 1.56e-05 | 2532.47 ms | 53.3% bf16 MFU | 206927 tok/s +step 17616/19560 | loss 3.339214 (+1.31z)| norm 0.2515 (+0.36z)| lr 1.56e-05 | 2535.16 ms | 53.3% bf16 MFU | 206921 tok/s +step 17617/19560 | loss 3.281989 (-0.22z)| norm 0.2378 (-0.68z)| lr 1.56e-05 | 2533.65 ms | 53.3% bf16 MFU | 206921 tok/s +step 17618/19560 | loss 3.337674 (+1.25z)| norm 0.2381 (-0.66z)| lr 1.56e-05 | 2532.30 ms | 53.3% bf16 MFU | 206927 tok/s +step 17619/19560 | loss 3.268574 (-0.57z)| norm 0.2380 (-0.66z)| lr 1.56e-05 | 2532.08 ms | 53.3% bf16 MFU | 206934 tok/s +step 17620/19560 | loss 3.319216 (+0.76z)| norm 0.2324 (-1.07z)| lr 1.55e-05 | 2531.68 ms | 53.3% bf16 MFU | 206942 tok/s +step 17621/19560 | loss 3.256637 (-0.90z)| norm 0.2389 (-0.58z)| lr 1.55e-05 | 2533.38 ms | 53.3% bf16 MFU | 206942 tok/s +step 17622/19560 | loss 3.288237 (-0.06z)| norm 0.2711 (+1.84z)| lr 1.55e-05 | 2534.42 ms | 53.3% bf16 MFU | 206938 tok/s +step 17623/19560 | loss 3.246124 (-1.16z)| norm 0.2539 (+0.56z)| lr 1.55e-05 | 2532.71 ms | 53.3% bf16 MFU | 206942 tok/s +step 17624/19560 | loss 3.334763 (+1.15z)| norm 0.2481 (+0.11z)| lr 1.55e-05 | 2532.63 ms | 53.3% bf16 MFU | 206945 tok/s +step 17625/19560 | loss 3.304572 (+0.36z)| norm 0.2361 (-0.80z)| lr 1.55e-05 | 2534.28 ms | 53.3% bf16 MFU | 206942 tok/s +step 17626/19560 | loss 3.287097 (-0.08z)| norm 0.2483 (+0.16z)| lr 1.54e-05 | 2532.25 ms | 53.3% bf16 MFU | 206947 tok/s +step 17627/19560 | loss 3.309247 (+0.50z)| norm 0.2391 (-0.56z)| lr 1.54e-05 | 2533.99 ms | 53.3% bf16 MFU | 206945 tok/s +step 17628/19560 | loss 3.324007 (+0.89z)| norm 0.2419 (-0.34z)| lr 1.54e-05 | 2532.80 ms | 53.3% bf16 MFU | 206948 tok/s +step 17629/19560 | loss 3.269825 (-0.55z)| norm 0.2520 (+0.45z)| lr 1.54e-05 | 2532.67 ms | 53.3% bf16 MFU | 206951 tok/s +step 17630/19560 | loss 3.269477 (-0.55z)| norm 0.2648 (+1.43z)| lr 1.54e-05 | 2532.83 ms | 53.3% bf16 MFU | 206953 tok/s +step 17631/19560 | loss 3.316803 (+0.71z)| norm 0.2496 (+0.24z)| lr 1.54e-05 | 2531.66 ms | 53.3% bf16 MFU | 206960 tok/s +step 17632/19560 | loss 3.302222 (+0.31z)| norm 0.2592 (+0.97z)| lr 1.54e-05 | 2532.14 ms | 53.3% bf16 MFU | 206965 tok/s +step 17633/19560 | loss 3.267283 (-0.62z)| norm 0.2344 (-0.94z)| lr 1.53e-05 | 2531.53 ms | 53.3% bf16 MFU | 206972 tok/s +step 17634/19560 | loss 3.279392 (-0.29z)| norm 0.2467 (+0.01z)| lr 1.53e-05 | 2533.17 ms | 53.3% bf16 MFU | 206971 tok/s +step 17635/19560 | loss 3.272918 (-0.46z)| norm 0.2731 (+2.01z)| lr 1.53e-05 | 2532.60 ms | 53.3% bf16 MFU | 206974 tok/s +step 17636/19560 | loss 3.288167 (-0.05z)| norm 0.2511 (+0.33z)| lr 1.53e-05 | 2533.64 ms | 53.3% bf16 MFU | 206972 tok/s +step 17637/19560 | loss 3.298722 (+0.22z)| norm 0.2404 (-0.50z)| lr 1.53e-05 | 2533.71 ms | 53.3% bf16 MFU | 206969 tok/s +step 17638/19560 | loss 3.277640 (-0.35z)| norm 0.2332 (-1.05z)| lr 1.53e-05 | 2535.28 ms | 53.3% bf16 MFU | 206961 tok/s +step 17639/19560 | loss 3.302333 (+0.33z)| norm 0.2326 (-1.09z)| lr 1.52e-05 | 2534.13 ms | 53.3% bf16 MFU | 206957 tok/s +step 17640/19560 | loss 3.274645 (-0.42z)| norm 0.2548 (+0.60z)| lr 1.52e-05 | 2535.21 ms | 53.3% bf16 MFU | 206949 tok/s +step 17641/19560 | loss 3.289177 (-0.03z)| norm 0.2662 (+1.46z)| lr 1.52e-05 | 2534.12 ms | 53.3% bf16 MFU | 206946 tok/s +step 17642/19560 | loss 3.253814 (-0.98z)| norm 0.2576 (+0.79z)| lr 1.52e-05 | 2533.66 ms | 53.3% bf16 MFU | 206946 tok/s +step 17643/19560 | loss 3.247912 (-1.13z)| norm 0.2377 (-0.72z)| lr 1.52e-05 | 2533.19 ms | 53.3% bf16 MFU | 206947 tok/s +step 17644/19560 | loss 3.310789 (+0.58z)| norm 0.2291 (-1.36z)| lr 1.52e-05 | 2532.53 ms | 53.3% bf16 MFU | 206950 tok/s +step 17645/19560 | loss 3.299073 (+0.26z)| norm 0.2496 (+0.19z)| lr 1.51e-05 | 2531.05 ms | 53.3% bf16 MFU | 206960 tok/s +step 17646/19560 | loss 3.278076 (-0.31z)| norm 0.2320 (-1.14z)| lr 1.51e-05 | 2531.69 ms | 53.3% bf16 MFU | 206966 tok/s +step 17647/19560 | loss 3.300003 (+0.28z)| norm 0.2532 (+0.46z)| lr 1.51e-05 | 2533.73 ms | 53.3% bf16 MFU | 206964 tok/s +step 17648/19560 | loss 3.297420 (+0.21z)| norm 0.2700 (+1.70z)| lr 1.51e-05 | 2532.18 ms | 53.3% bf16 MFU | 206969 tok/s +step 17649/19560 | loss 3.283038 (-0.18z)| norm 0.2366 (-0.81z)| lr 1.51e-05 | 2532.47 ms | 53.3% bf16 MFU | 206971 tok/s +step 17650/19560 | loss 3.297928 (+0.22z)| norm 0.2583 (+0.81z)| lr 1.51e-05 | 2533.80 ms | 53.3% bf16 MFU | 206969 tok/s +step 17651/19560 | loss 3.424265 (+3.47z)| norm 0.3173 (+4.72z)| lr 1.51e-05 | 2532.08 ms | 53.3% bf16 MFU | 206973 tok/s +step 17652/19560 | loss 3.292283 (+0.05z)| norm 0.2765 (+1.89z)| lr 1.50e-05 | 2533.89 ms | 53.3% bf16 MFU | 206970 tok/s +step 17653/19560 | loss 3.277118 (-0.34z)| norm 0.2423 (-0.41z)| lr 1.50e-05 | 2535.33 ms | 53.3% bf16 MFU | 206961 tok/s +step 17654/19560 | loss 3.270789 (-0.50z)| norm 0.2472 (-0.08z)| lr 1.50e-05 | 2533.29 ms | 53.3% bf16 MFU | 206961 tok/s +step 17655/19560 | loss 3.291308 (+0.02z)| norm 0.2797 (+2.06z)| lr 1.50e-05 | 2533.22 ms | 53.3% bf16 MFU | 206961 tok/s +step 17656/19560 | loss 3.279285 (-0.29z)| norm 0.2659 (+1.13z)| lr 1.50e-05 | 2531.19 ms | 53.3% bf16 MFU | 206970 tok/s +step 17657/19560 | loss 3.242053 (-1.29z)| norm 0.2639 (+0.98z)| lr 1.50e-05 | 2532.55 ms | 53.3% bf16 MFU | 206972 tok/s +step 17658/19560 | loss 3.309307 (+0.50z)| norm 0.2442 (-0.32z)| lr 1.49e-05 | 2532.36 ms | 53.3% bf16 MFU | 206975 tok/s +step 17659/19560 | loss 3.234906 (-1.49z)| norm 0.2397 (-0.62z)| lr 1.49e-05 | 2531.53 ms | 53.3% bf16 MFU | 206982 tok/s +step 17660/19560 | loss 3.319998 (+0.77z)| norm 0.2467 (-0.16z)| lr 1.49e-05 | 2532.19 ms | 53.3% bf16 MFU | 206985 tok/s +step 17661/19560 | loss 3.289509 (-0.05z)| norm 0.2421 (-0.47z)| lr 1.49e-05 | 2531.64 ms | 53.3% bf16 MFU | 206991 tok/s +step 17662/19560 | loss 3.233038 (-1.54z)| norm 0.2455 (-0.25z)| lr 1.49e-05 | 2531.60 ms | 53.3% bf16 MFU | 206996 tok/s +step 17663/19560 | loss 3.278943 (-0.31z)| norm 0.2379 (-0.76z)| lr 1.49e-05 | 2535.21 ms | 53.3% bf16 MFU | 206986 tok/s +step 17664/19560 | loss 3.325714 (+0.93z)| norm 0.2785 (+1.90z)| lr 1.49e-05 | 2534.16 ms | 53.3% bf16 MFU | 206981 tok/s +step 17665/19560 | loss 3.300313 (+0.25z)| norm 0.2354 (-0.93z)| lr 1.48e-05 | 2530.02 ms | 53.4% bf16 MFU | 206994 tok/s +step 17666/19560 | loss 3.286815 (-0.10z)| norm 0.2404 (-0.61z)| lr 1.48e-05 | 2530.05 ms | 53.4% bf16 MFU | 207005 tok/s +step 17667/19560 | loss 3.347740 (+1.51z)| norm 0.2429 (-0.43z)| lr 1.48e-05 | 2530.88 ms | 53.3% bf16 MFU | 207013 tok/s +step 17668/19560 | loss 3.292604 (+0.05z)| norm 0.2710 (+1.41z)| lr 1.48e-05 | 2530.74 ms | 53.4% bf16 MFU | 207021 tok/s +step 17669/19560 | loss 3.253357 (-0.99z)| norm 0.2369 (-0.84z)| lr 1.48e-05 | 2532.53 ms | 53.3% bf16 MFU | 207021 tok/s +step 17670/19560 | loss 3.263824 (-0.71z)| norm 0.2439 (-0.37z)| lr 1.48e-05 | 2532.89 ms | 53.3% bf16 MFU | 207019 tok/s +step 17671/19560 | loss 3.290953 (-0.00z)| norm 0.2518 (+0.14z)| lr 1.47e-05 | 2534.46 ms | 53.3% bf16 MFU | 207011 tok/s +step 17672/19560 | loss 3.300438 (+0.25z)| norm 0.2394 (-0.68z)| lr 1.47e-05 | 2533.00 ms | 53.3% bf16 MFU | 207010 tok/s +step 17673/19560 | loss 3.274698 (-0.43z)| norm 0.2380 (-0.78z)| lr 1.47e-05 | 2534.48 ms | 53.3% bf16 MFU | 207003 tok/s +step 17674/19560 | loss 3.269990 (-0.54z)| norm 0.2545 (+0.31z)| lr 1.47e-05 | 2534.50 ms | 53.3% bf16 MFU | 206995 tok/s +step 17675/19560 | loss 3.354116 (+1.70z)| norm 0.2365 (-0.88z)| lr 1.47e-05 | 2535.23 ms | 53.3% bf16 MFU | 206986 tok/s +step 17676/19560 | loss 3.342252 (+1.36z)| norm 0.2694 (+1.28z)| lr 1.47e-05 | 2532.99 ms | 53.3% bf16 MFU | 206986 tok/s +step 17677/19560 | loss 3.300413 (+0.23z)| norm 0.2476 (-0.16z)| lr 1.47e-05 | 2533.26 ms | 53.3% bf16 MFU | 206984 tok/s +step 17678/19560 | loss 3.346471 (+1.45z)| norm 0.2567 (+0.43z)| lr 1.46e-05 | 2532.36 ms | 53.3% bf16 MFU | 206987 tok/s +step 17679/19560 | loss 3.368535 (+2.03z)| norm 0.2372 (-0.86z)| lr 1.46e-05 | 2534.55 ms | 53.3% bf16 MFU | 206980 tok/s +step 17680/19560 | loss 3.249135 (-1.13z)| norm 0.2332 (-1.11z)| lr 1.46e-05 | 2532.08 ms | 53.3% bf16 MFU | 206984 tok/s +step 17681/19560 | loss 3.331406 (+1.06z)| norm 0.2330 (-1.11z)| lr 1.46e-05 | 2532.60 ms | 53.3% bf16 MFU | 206986 tok/s +step 17682/19560 | loss 3.364129 (+1.89z)| norm 0.2514 (+0.12z)| lr 1.46e-05 | 2533.78 ms | 53.3% bf16 MFU | 206983 tok/s +step 17683/19560 | loss 3.315128 (+0.60z)| norm 0.2739 (+1.59z)| lr 1.46e-05 | 2532.82 ms | 53.3% bf16 MFU | 206983 tok/s +step 17684/19560 | loss 3.309794 (+0.45z)| norm 0.2277 (-1.44z)| lr 1.45e-05 | 2534.29 ms | 53.3% bf16 MFU | 206978 tok/s +step 17685/19560 | loss 3.297477 (+0.12z)| norm 0.2378 (-0.77z)| lr 1.45e-05 | 2532.51 ms | 53.3% bf16 MFU | 206980 tok/s +step 17686/19560 | loss 3.266496 (-0.70z)| norm 0.2501 (+0.03z)| lr 1.45e-05 | 2534.76 ms | 53.3% bf16 MFU | 206973 tok/s +step 17687/19560 | loss 3.241254 (-1.35z)| norm 0.2499 (+0.01z)| lr 1.45e-05 | 2533.36 ms | 53.3% bf16 MFU | 206972 tok/s +step 17688/19560 | loss 3.409260 (+2.93z)| norm 0.2650 (+1.04z)| lr 1.45e-05 | 2531.91 ms | 53.3% bf16 MFU | 206977 tok/s +step 17689/19560 | loss 3.392341 (+2.45z)| norm 0.2452 (-0.28z)| lr 1.45e-05 | 2533.85 ms | 53.3% bf16 MFU | 206974 tok/s +step 17690/19560 | loss 3.320126 (+0.64z)| norm 0.2527 (+0.23z)| lr 1.45e-05 | 2532.41 ms | 53.3% bf16 MFU | 206977 tok/s +step 17691/19560 | loss 3.260237 (-0.84z)| norm 0.2551 (+0.38z)| lr 1.44e-05 | 2533.35 ms | 53.3% bf16 MFU | 206976 tok/s +step 17692/19560 | loss 3.283827 (-0.26z)| norm 0.2445 (-0.35z)| lr 1.44e-05 | 2533.21 ms | 53.3% bf16 MFU | 206975 tok/s +step 17693/19560 | loss 3.289178 (-0.13z)| norm 0.2350 (-0.99z)| lr 1.44e-05 | 2531.57 ms | 53.3% bf16 MFU | 206981 tok/s +step 17694/19560 | loss 3.290240 (-0.11z)| norm 0.2445 (-0.34z)| lr 1.44e-05 | 2533.03 ms | 53.3% bf16 MFU | 206981 tok/s +step 17695/19560 | loss 3.372933 (+1.92z)| norm 0.2541 (+0.31z)| lr 1.44e-05 | 2534.15 ms | 53.3% bf16 MFU | 206977 tok/s +step 17696/19560 | loss 3.307323 (+0.31z)| norm 0.2276 (-1.48z)| lr 1.44e-05 | 2534.68 ms | 53.3% bf16 MFU | 206970 tok/s +step 17697/19560 | loss 3.329865 (+0.85z)| norm 0.2447 (-0.31z)| lr 1.43e-05 | 2534.14 ms | 53.3% bf16 MFU | 206966 tok/s +step 17698/19560 | loss 3.294561 (-0.03z)| norm 0.2413 (-0.55z)| lr 1.43e-05 | 2534.55 ms | 53.3% bf16 MFU | 206961 tok/s +step 17699/19560 | loss 3.338461 (+1.05z)| norm 0.2328 (-1.11z)| lr 1.43e-05 | 2579.05 ms | 52.4% bf16 MFU | 206777 tok/s +step 17700/19560 | loss 3.293521 (-0.07z)| norm 0.2360 (-0.88z)| lr 1.43e-05 | 2534.38 ms | 53.3% bf16 MFU | 206782 tok/s +step 17701/19560 | loss 3.333372 (+0.92z)| norm 0.2424 (-0.45z)| lr 1.43e-05 | 2533.68 ms | 53.3% bf16 MFU | 206789 tok/s +step 17702/19560 | loss 3.329016 (+0.80z)| norm 0.2506 (+0.09z)| lr 1.43e-05 | 2531.64 ms | 53.3% bf16 MFU | 206804 tok/s +step 17703/19560 | loss 3.293470 (-0.10z)| norm 0.2343 (-1.01z)| lr 1.43e-05 | 2533.91 ms | 53.3% bf16 MFU | 206809 tok/s +step 17704/19560 | loss 3.326817 (+0.73z)| norm 0.2335 (-1.06z)| lr 1.42e-05 | 2532.75 ms | 53.3% bf16 MFU | 206819 tok/s +step 17705/19560 | loss 3.209979 (-2.21z)| norm 0.2257 (-1.56z)| lr 1.42e-05 | 2532.54 ms | 53.3% bf16 MFU | 206829 tok/s +step 17706/19560 | loss 3.297853 (+0.00z)| norm 0.2358 (-0.88z)| lr 1.42e-05 | 2533.80 ms | 53.3% bf16 MFU | 206834 tok/s +step 17707/19560 | loss 3.280880 (-0.46z)| norm 0.2437 (-0.34z)| lr 1.42e-05 | 2534.08 ms | 53.3% bf16 MFU | 206837 tok/s +step 17708/19560 | loss 3.289668 (-0.23z)| norm 0.2415 (-0.49z)| lr 1.42e-05 | 2534.52 ms | 53.3% bf16 MFU | 206838 tok/s +step 17709/19560 | loss 3.373564 (+1.93z)| norm 0.2424 (-0.43z)| lr 1.42e-05 | 2537.19 ms | 53.2% bf16 MFU | 206828 tok/s +step 17710/19560 | loss 3.262951 (-0.93z)| norm 0.2371 (-0.79z)| lr 1.41e-05 | 2532.97 ms | 53.3% bf16 MFU | 206836 tok/s +step 17711/19560 | loss 3.263758 (-0.90z)| norm 0.2365 (-0.82z)| lr 1.41e-05 | 2533.23 ms | 53.3% bf16 MFU | 206842 tok/s +step 17712/19560 | loss 3.323170 (+0.63z)| norm 0.2569 (+0.57z)| lr 1.41e-05 | 2533.28 ms | 53.3% bf16 MFU | 206848 tok/s +step 17713/19560 | loss 3.297879 (-0.01z)| norm 0.2941 (+2.98z)| lr 1.41e-05 | 2534.44 ms | 53.3% bf16 MFU | 206849 tok/s +step 17714/19560 | loss 3.296998 (-0.04z)| norm 0.2498 (+0.06z)| lr 1.41e-05 | 2533.93 ms | 53.3% bf16 MFU | 206852 tok/s +step 17715/19560 | loss 3.248278 (-1.28z)| norm 0.2358 (-0.86z)| lr 1.41e-05 | 2533.26 ms | 53.3% bf16 MFU | 206857 tok/s +step 17716/19560 | loss 3.272432 (-0.65z)| norm 0.2357 (-0.86z)| lr 1.41e-05 | 2534.25 ms | 53.3% bf16 MFU | 206859 tok/s +step 17717/19560 | loss 3.386590 (+2.29z)| norm 0.2466 (-0.13z)| lr 1.40e-05 | 2533.48 ms | 53.3% bf16 MFU | 206863 tok/s +step 17718/19560 | loss 3.288156 (-0.23z)| norm 0.2447 (-0.26z)| lr 1.40e-05 | 2533.98 ms | 53.3% bf16 MFU | 206865 tok/s +step 17719/19560 | loss 3.259189 (-0.97z)| norm 0.2459 (-0.17z)| lr 1.40e-05 | 2534.07 ms | 53.3% bf16 MFU | 206866 tok/s +step 17720/19560 | loss 3.433784 (+3.36z)| norm 0.2429 (-0.37z)| lr 1.40e-05 | 2534.88 ms | 53.3% bf16 MFU | 206864 tok/s +step 17721/19560 | loss 3.264547 (-0.81z)| norm 0.2411 (-0.48z)| lr 1.40e-05 | 2531.07 ms | 53.3% bf16 MFU | 206878 tok/s +step 17722/19560 | loss 3.265219 (-0.79z)| norm 0.2596 (+0.75z)| lr 1.40e-05 | 2533.55 ms | 53.3% bf16 MFU | 206881 tok/s +step 17723/19560 | loss 3.306372 (+0.22z)| norm 0.2529 (+0.30z)| lr 1.40e-05 | 2534.00 ms | 53.3% bf16 MFU | 206882 tok/s +step 17724/19560 | loss 3.298375 (+0.02z)| norm 0.2379 (-0.69z)| lr 1.39e-05 | 2534.86 ms | 53.3% bf16 MFU | 206880 tok/s +step 17725/19560 | loss 3.265283 (-0.79z)| norm 0.2400 (-0.55z)| lr 1.39e-05 | 2533.32 ms | 53.3% bf16 MFU | 206884 tok/s +step 17726/19560 | loss 3.264114 (-0.81z)| norm 0.2502 (+0.13z)| lr 1.39e-05 | 2531.00 ms | 53.3% bf16 MFU | 206897 tok/s +step 17727/19560 | loss 3.293603 (-0.09z)| norm 0.2619 (+0.90z)| lr 1.39e-05 | 2534.12 ms | 53.3% bf16 MFU | 206896 tok/s +step 17728/19560 | loss 3.248666 (-1.17z)| norm 0.2364 (-0.80z)| lr 1.39e-05 | 2531.11 ms | 53.3% bf16 MFU | 206909 tok/s +step 17729/19560 | loss 3.343103 (+1.11z)| norm 0.2376 (-0.72z)| lr 1.39e-05 | 2533.10 ms | 53.3% bf16 MFU | 206912 tok/s +step 17730/19560 | loss 3.266971 (-0.74z)| norm 0.2430 (-0.34z)| lr 1.38e-05 | 2533.35 ms | 53.3% bf16 MFU | 206914 tok/s +step 17731/19560 | loss 3.329065 (+0.76z)| norm 0.2449 (-0.21z)| lr 1.38e-05 | 2532.76 ms | 53.3% bf16 MFU | 206918 tok/s +step 17732/19560 | loss 3.248401 (-1.20z)| norm 0.2453 (-0.17z)| lr 1.38e-05 | 2532.57 ms | 53.3% bf16 MFU | 206923 tok/s +step 17733/19560 | loss 3.287864 (-0.22z)| norm 0.2337 (-0.97z)| lr 1.38e-05 | 2531.68 ms | 53.3% bf16 MFU | 206932 tok/s +step 17734/19560 | loss 3.306379 (+0.25z)| norm 0.2448 (-0.18z)| lr 1.38e-05 | 2532.23 ms | 53.3% bf16 MFU | 206937 tok/s +step 17735/19560 | loss 3.359773 (+1.56z)| norm 0.2432 (-0.29z)| lr 1.38e-05 | 2531.97 ms | 53.3% bf16 MFU | 206944 tok/s +step 17736/19560 | loss 3.287679 (-0.22z)| norm 0.2299 (-1.24z)| lr 1.38e-05 | 2532.60 ms | 53.3% bf16 MFU | 206948 tok/s +step 17737/19560 | loss 3.347716 (+1.27z)| norm 0.2445 (-0.19z)| lr 1.37e-05 | 2531.15 ms | 53.3% bf16 MFU | 206957 tok/s +step 17738/19560 | loss 3.354308 (+1.41z)| norm 0.2393 (-0.55z)| lr 1.37e-05 | 2533.25 ms | 53.3% bf16 MFU | 206957 tok/s +step 17739/19560 | loss 3.260766 (-0.92z)| norm 0.2376 (-0.68z)| lr 1.37e-05 | 2534.00 ms | 53.3% bf16 MFU | 206954 tok/s +step 17740/19560 | loss 3.265628 (-0.81z)| norm 0.2441 (-0.20z)| lr 1.37e-05 | 2532.52 ms | 53.3% bf16 MFU | 206958 tok/s +step 17741/19560 | loss 3.303710 (+0.15z)| norm 0.2377 (-0.66z)| lr 1.37e-05 | 2532.25 ms | 53.3% bf16 MFU | 206962 tok/s +step 17742/19560 | loss 3.277946 (-0.49z)| norm 0.2480 (+0.08z)| lr 1.37e-05 | 2534.36 ms | 53.3% bf16 MFU | 206958 tok/s +step 17743/19560 | loss 3.261243 (-0.95z)| norm 0.2541 (+0.53z)| lr 1.37e-05 | 2533.53 ms | 53.3% bf16 MFU | 206957 tok/s +step 17744/19560 | loss 3.298843 (+0.03z)| norm 0.2583 (+0.82z)| lr 1.36e-05 | 2532.49 ms | 53.3% bf16 MFU | 206960 tok/s +step 17745/19560 | loss 3.308769 (+0.28z)| norm 0.2334 (-0.98z)| lr 1.36e-05 | 2533.38 ms | 53.3% bf16 MFU | 206960 tok/s +step 17746/19560 | loss 3.292851 (-0.13z)| norm 0.2337 (-0.95z)| lr 1.36e-05 | 2533.64 ms | 53.3% bf16 MFU | 206958 tok/s +step 17747/19560 | loss 3.255338 (-1.09z)| norm 0.2390 (-0.57z)| lr 1.36e-05 | 2533.19 ms | 53.3% bf16 MFU | 206959 tok/s +step 17748/19560 | loss 3.295163 (-0.06z)| norm 0.2531 (+0.43z)| lr 1.36e-05 | 2534.86 ms | 53.3% bf16 MFU | 206952 tok/s +step 17749/19560 | loss 3.294223 (-0.09z)| norm 0.2435 (-0.26z)| lr 1.36e-05 | 2533.69 ms | 53.3% bf16 MFU | 206951 tok/s +step 17750/19560 | loss 3.386027 (+2.23z)| norm 0.2473 (+0.03z)| lr 1.35e-05 | 2532.93 ms | 53.3% bf16 MFU | 206953 tok/s +val loss 3.288825 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3037/10042 = 0.302430 +step 17751/19560 | loss 3.311368 (+0.32z)| norm 0.2432 (-0.27z)| lr 1.35e-05 | 2533.02 ms | 53.3% bf16 MFU | 206954 tok/s +step 17752/19560 | loss 3.280540 (-0.46z)| norm 0.2319 (-1.08z)| lr 1.35e-05 | 2530.02 ms | 53.4% bf16 MFU | 206968 tok/s +step 17753/19560 | loss 3.251913 (-1.18z)| norm 0.2334 (-0.97z)| lr 1.35e-05 | 2529.41 ms | 53.4% bf16 MFU | 206983 tok/s +step 17754/19560 | loss 3.314007 (+0.40z)| norm 0.2435 (-0.23z)| lr 1.35e-05 | 2532.81 ms | 53.3% bf16 MFU | 206984 tok/s +step 17755/19560 | loss 3.272303 (-0.66z)| norm 0.2415 (-0.38z)| lr 1.35e-05 | 2532.00 ms | 53.3% bf16 MFU | 206988 tok/s +step 17756/19560 | loss 3.321503 (+0.60z)| norm 0.2310 (-1.14z)| lr 1.35e-05 | 2533.10 ms | 53.3% bf16 MFU | 206988 tok/s +step 17757/19560 | loss 3.267448 (-0.78z)| norm 0.2334 (-0.95z)| lr 1.34e-05 | 2531.60 ms | 53.3% bf16 MFU | 206993 tok/s +step 17758/19560 | loss 3.269094 (-0.74z)| norm 0.2271 (-1.38z)| lr 1.34e-05 | 2531.46 ms | 53.3% bf16 MFU | 206999 tok/s +step 17759/19560 | loss 3.277593 (-0.51z)| norm 0.2318 (-1.03z)| lr 1.34e-05 | 2532.76 ms | 53.3% bf16 MFU | 206999 tok/s +step 17760/19560 | loss 3.306096 (+0.21z)| norm 0.2294 (-1.18z)| lr 1.34e-05 | 2530.20 ms | 53.4% bf16 MFU | 207010 tok/s +step 17761/19560 | loss 3.327469 (+0.74z)| norm 0.2413 (-0.32z)| lr 1.34e-05 | 2532.18 ms | 53.3% bf16 MFU | 207012 tok/s +step 17762/19560 | loss 3.266724 (-0.80z)| norm 0.2402 (-0.40z)| lr 1.34e-05 | 2531.44 ms | 53.3% bf16 MFU | 207017 tok/s +step 17763/19560 | loss 3.234672 (-1.59z)| norm 0.2489 (+0.25z)| lr 1.34e-05 | 2531.80 ms | 53.3% bf16 MFU | 207020 tok/s +step 17764/19560 | loss 3.367819 (+1.73z)| norm 0.2485 (+0.21z)| lr 1.33e-05 | 2531.58 ms | 53.3% bf16 MFU | 207024 tok/s +step 17765/19560 | loss 3.194566 (-2.50z)| norm 0.2581 (+0.91z)| lr 1.33e-05 | 2532.02 ms | 53.3% bf16 MFU | 207026 tok/s +step 17766/19560 | loss 3.338270 (+0.97z)| norm 0.2390 (-0.49z)| lr 1.33e-05 | 2532.50 ms | 53.3% bf16 MFU | 207026 tok/s +step 17767/19560 | loss 3.308189 (+0.24z)| norm 0.2348 (-0.80z)| lr 1.33e-05 | 2534.34 ms | 53.3% bf16 MFU | 207018 tok/s +step 17768/19560 | loss 3.220720 (-1.84z)| norm 0.2257 (-1.45z)| lr 1.33e-05 | 2534.28 ms | 53.3% bf16 MFU | 207011 tok/s +step 17769/19560 | loss 3.319893 (+0.52z)| norm 0.2488 (+0.25z)| lr 1.33e-05 | 2534.04 ms | 53.3% bf16 MFU | 207005 tok/s +step 17770/19560 | loss 3.286113 (-0.29z)| norm 0.2736 (+2.04z)| lr 1.33e-05 | 2535.65 ms | 53.2% bf16 MFU | 206994 tok/s +step 17771/19560 | loss 3.262900 (-0.85z)| norm 0.2385 (-0.51z)| lr 1.32e-05 | 2534.18 ms | 53.3% bf16 MFU | 206988 tok/s +step 17772/19560 | loss 3.343530 (+1.08z)| norm 0.2550 (+0.68z)| lr 1.32e-05 | 2534.54 ms | 53.3% bf16 MFU | 206982 tok/s +step 17773/19560 | loss 3.276634 (-0.52z)| norm 0.2334 (-0.89z)| lr 1.32e-05 | 2532.15 ms | 53.3% bf16 MFU | 206985 tok/s +step 17774/19560 | loss 3.502866 (+4.46z)| norm 0.3135 (+4.51z)| lr 1.32e-05 | 2533.06 ms | 53.3% bf16 MFU | 206985 tok/s +step 17775/19560 | loss 3.296994 (-0.07z)| norm 0.2373 (-0.59z)| lr 1.32e-05 | 2533.69 ms | 53.3% bf16 MFU | 206982 tok/s +step 17776/19560 | loss 3.345450 (+0.98z)| norm 0.2396 (-0.42z)| lr 1.32e-05 | 2533.11 ms | 53.3% bf16 MFU | 206982 tok/s +step 17777/19560 | loss 3.259660 (-0.89z)| norm 0.2412 (-0.32z)| lr 1.31e-05 | 2534.19 ms | 53.3% bf16 MFU | 206977 tok/s +step 17778/19560 | loss 3.263923 (-0.79z)| norm 0.2479 (+0.14z)| lr 1.31e-05 | 2530.20 ms | 53.4% bf16 MFU | 206989 tok/s +step 17779/19560 | loss 3.309438 (+0.23z)| norm 0.2345 (-0.80z)| lr 1.31e-05 | 2534.11 ms | 53.3% bf16 MFU | 206984 tok/s +step 17780/19560 | loss 3.281021 (-0.41z)| norm 0.2431 (-0.14z)| lr 1.31e-05 | 2531.00 ms | 53.3% bf16 MFU | 206992 tok/s +step 17781/19560 | loss 3.287278 (-0.27z)| norm 0.2342 (-0.82z)| lr 1.31e-05 | 2532.88 ms | 53.3% bf16 MFU | 206992 tok/s +step 17782/19560 | loss 3.267385 (-0.71z)| norm 0.2272 (-1.33z)| lr 1.31e-05 | 2532.18 ms | 53.3% bf16 MFU | 206995 tok/s +step 17783/19560 | loss 3.301753 (+0.06z)| norm 0.2315 (-1.00z)| lr 1.31e-05 | 2532.51 ms | 53.3% bf16 MFU | 206996 tok/s +step 17784/19560 | loss 3.248638 (-1.13z)| norm 0.2289 (-1.18z)| lr 1.30e-05 | 2531.56 ms | 53.3% bf16 MFU | 207001 tok/s +step 17785/19560 | loss 3.244333 (-1.22z)| norm 0.2309 (-1.01z)| lr 1.30e-05 | 2532.53 ms | 53.3% bf16 MFU | 207002 tok/s +step 17786/19560 | loss 3.308770 (+0.22z)| norm 0.2881 (+3.31z)| lr 1.30e-05 | 2534.39 ms | 53.3% bf16 MFU | 206996 tok/s +step 17787/19560 | loss 3.301979 (+0.06z)| norm 0.2350 (-0.68z)| lr 1.30e-05 | 2532.21 ms | 53.3% bf16 MFU | 206998 tok/s +step 17788/19560 | loss 3.338160 (+0.87z)| norm 0.2354 (-0.65z)| lr 1.30e-05 | 2533.05 ms | 53.3% bf16 MFU | 206997 tok/s +step 17789/19560 | loss 3.347090 (+1.05z)| norm 0.2364 (-0.56z)| lr 1.30e-05 | 2532.17 ms | 53.3% bf16 MFU | 207000 tok/s +step 17790/19560 | loss 3.304934 (+0.10z)| norm 0.2458 (+0.13z)| lr 1.30e-05 | 2532.75 ms | 53.3% bf16 MFU | 207000 tok/s +step 17791/19560 | loss 3.389903 (+1.97z)| norm 0.2786 (+2.51z)| lr 1.29e-05 | 2531.34 ms | 53.3% bf16 MFU | 207006 tok/s +step 17792/19560 | loss 3.337764 (+0.80z)| norm 0.2522 (+0.61z)| lr 1.29e-05 | 2531.23 ms | 53.3% bf16 MFU | 207012 tok/s +step 17793/19560 | loss 3.260472 (-0.91z)| norm 0.2319 (-0.91z)| lr 1.29e-05 | 2533.05 ms | 53.3% bf16 MFU | 207011 tok/s +step 17794/19560 | loss 3.305786 (+0.10z)| norm 0.2323 (-0.87z)| lr 1.29e-05 | 2532.04 ms | 53.3% bf16 MFU | 207013 tok/s +step 17795/19560 | loss 3.293741 (-0.16z)| norm 0.2619 (+1.31z)| lr 1.29e-05 | 2533.01 ms | 53.3% bf16 MFU | 207012 tok/s +step 17796/19560 | loss 3.285521 (-0.35z)| norm 0.2404 (-0.26z)| lr 1.29e-05 | 2531.48 ms | 53.3% bf16 MFU | 207016 tok/s +step 17797/19560 | loss 3.349463 (+1.06z)| norm 0.2406 (-0.25z)| lr 1.29e-05 | 2531.51 ms | 53.3% bf16 MFU | 207021 tok/s +step 17798/19560 | loss 3.347831 (+1.01z)| norm 0.2271 (-1.25z)| lr 1.28e-05 | 2532.01 ms | 53.3% bf16 MFU | 207023 tok/s +step 17799/19560 | loss 3.235812 (-1.46z)| norm 0.2929 (+3.48z)| lr 1.28e-05 | 2531.52 ms | 53.3% bf16 MFU | 207027 tok/s +step 17800/19560 | loss 3.441267 (+2.95z)| norm 0.2565 (+0.87z)| lr 1.28e-05 | 2530.18 ms | 53.4% bf16 MFU | 207036 tok/s +step 17801/19560 | loss 3.311821 (+0.18z)| norm 0.2349 (-0.67z)| lr 1.28e-05 | 2532.49 ms | 53.3% bf16 MFU | 207036 tok/s +step 17802/19560 | loss 3.376482 (+1.53z)| norm 0.2599 (+1.11z)| lr 1.28e-05 | 2534.22 ms | 53.3% bf16 MFU | 207028 tok/s +step 17803/19560 | loss 3.269024 (-0.73z)| norm 0.2384 (-0.41z)| lr 1.28e-05 | 2533.27 ms | 53.3% bf16 MFU | 207025 tok/s +step 17804/19560 | loss 3.349355 (+0.97z)| norm 0.2331 (-0.78z)| lr 1.28e-05 | 2533.16 ms | 53.3% bf16 MFU | 207022 tok/s +step 17805/19560 | loss 3.287805 (-0.33z)| norm 0.2358 (-0.58z)| lr 1.27e-05 | 2532.38 ms | 53.3% bf16 MFU | 207023 tok/s +step 17806/19560 | loss 3.328934 (+0.54z)| norm 0.2452 (+0.10z)| lr 1.27e-05 | 2531.47 ms | 53.3% bf16 MFU | 207027 tok/s +step 17807/19560 | loss 3.333073 (+0.64z)| norm 0.2387 (-0.37z)| lr 1.27e-05 | 2532.44 ms | 53.3% bf16 MFU | 207027 tok/s +step 17808/19560 | loss 3.280771 (-0.48z)| norm 0.2370 (-0.49z)| lr 1.27e-05 | 2531.84 ms | 53.3% bf16 MFU | 207030 tok/s +step 17809/19560 | loss 3.360326 (+1.21z)| norm 0.2314 (-0.89z)| lr 1.27e-05 | 2532.12 ms | 53.3% bf16 MFU | 207031 tok/s +step 17810/19560 | loss 3.248106 (-1.17z)| norm 0.2619 (+1.29z)| lr 1.27e-05 | 2533.47 ms | 53.3% bf16 MFU | 207026 tok/s +step 17811/19560 | loss 3.286032 (-0.35z)| norm 0.2434 (-0.02z)| lr 1.27e-05 | 2531.69 ms | 53.3% bf16 MFU | 207030 tok/s +step 17812/19560 | loss 3.261720 (-0.86z)| norm 0.2308 (-0.94z)| lr 1.26e-05 | 2533.71 ms | 53.3% bf16 MFU | 207024 tok/s +step 17813/19560 | loss 3.315328 (+0.28z)| norm 0.2499 (+0.44z)| lr 1.26e-05 | 2530.54 ms | 53.4% bf16 MFU | 207032 tok/s +step 17814/19560 | loss 3.257033 (-0.96z)| norm 0.2375 (-0.45z)| lr 1.26e-05 | 2533.25 ms | 53.3% bf16 MFU | 207029 tok/s +step 17815/19560 | loss 3.246503 (-1.19z)| norm 0.2340 (-0.70z)| lr 1.26e-05 | 2533.22 ms | 53.3% bf16 MFU | 207026 tok/s +step 17816/19560 | loss 3.328715 (+0.59z)| norm 0.2288 (-1.06z)| lr 1.26e-05 | 2534.14 ms | 53.3% bf16 MFU | 207019 tok/s +step 17817/19560 | loss 3.313967 (+0.29z)| norm 0.2247 (-1.34z)| lr 1.26e-05 | 2532.70 ms | 53.3% bf16 MFU | 207018 tok/s +step 17818/19560 | loss 3.214691 (-1.86z)| norm 0.2440 (+0.07z)| lr 1.26e-05 | 2534.06 ms | 53.3% bf16 MFU | 207012 tok/s +step 17819/19560 | loss 3.348885 (+1.05z)| norm 0.2425 (-0.03z)| lr 1.25e-05 | 2531.93 ms | 53.3% bf16 MFU | 207015 tok/s +step 17820/19560 | loss 3.254769 (-0.99z)| norm 0.2285 (-1.04z)| lr 1.25e-05 | 2531.81 ms | 53.3% bf16 MFU | 207018 tok/s +step 17821/19560 | loss 3.288206 (-0.27z)| norm 0.2321 (-0.78z)| lr 1.25e-05 | 2532.30 ms | 53.3% bf16 MFU | 207019 tok/s +step 17822/19560 | loss 3.285896 (-0.32z)| norm 0.2441 (+0.09z)| lr 1.25e-05 | 2532.01 ms | 53.3% bf16 MFU | 207022 tok/s +step 17823/19560 | loss 3.292803 (-0.16z)| norm 0.2419 (-0.06z)| lr 1.25e-05 | 2530.39 ms | 53.4% bf16 MFU | 207030 tok/s +step 17824/19560 | loss 3.301303 (+0.03z)| norm 0.2393 (-0.26z)| lr 1.25e-05 | 2531.89 ms | 53.3% bf16 MFU | 207033 tok/s +step 17825/19560 | loss 3.328321 (+0.62z)| norm 0.2313 (-0.83z)| lr 1.25e-05 | 2533.06 ms | 53.3% bf16 MFU | 207030 tok/s +step 17826/19560 | loss 3.235182 (-1.40z)| norm 0.2310 (-0.84z)| lr 1.24e-05 | 2532.14 ms | 53.3% bf16 MFU | 207031 tok/s +step 17827/19560 | loss 3.277420 (-0.47z)| norm 0.2352 (-0.54z)| lr 1.24e-05 | 2531.20 ms | 53.3% bf16 MFU | 207036 tok/s +step 17828/19560 | loss 3.245050 (-1.16z)| norm 0.2494 (+0.48z)| lr 1.24e-05 | 2531.66 ms | 53.3% bf16 MFU | 207039 tok/s +step 17829/19560 | loss 3.287666 (-0.23z)| norm 0.2478 (+0.37z)| lr 1.24e-05 | 2531.43 ms | 53.3% bf16 MFU | 207042 tok/s +step 17830/19560 | loss 3.239052 (-1.26z)| norm 0.2486 (+0.42z)| lr 1.24e-05 | 2531.75 ms | 53.3% bf16 MFU | 207045 tok/s +step 17831/19560 | loss 3.297637 (+0.00z)| norm 0.2351 (-0.56z)| lr 1.24e-05 | 2534.37 ms | 53.3% bf16 MFU | 207036 tok/s +step 17832/19560 | loss 3.310553 (+0.29z)| norm 0.2457 (+0.21z)| lr 1.24e-05 | 2531.64 ms | 53.3% bf16 MFU | 207039 tok/s +step 17833/19560 | loss 3.310565 (+0.27z)| norm 0.2405 (-0.19z)| lr 1.23e-05 | 2533.48 ms | 53.3% bf16 MFU | 207034 tok/s +step 17834/19560 | loss 3.298817 (+0.01z)| norm 0.2326 (-0.76z)| lr 1.23e-05 | 2532.27 ms | 53.3% bf16 MFU | 207034 tok/s +step 17835/19560 | loss 3.300043 (+0.04z)| norm 0.2492 (+0.45z)| lr 1.23e-05 | 2530.57 ms | 53.4% bf16 MFU | 207042 tok/s +step 17836/19560 | loss 3.314465 (+0.35z)| norm 0.2807 (+2.66z)| lr 1.23e-05 | 2532.85 ms | 53.3% bf16 MFU | 207039 tok/s +step 17837/19560 | loss 3.316084 (+0.40z)| norm 0.2407 (-0.19z)| lr 1.23e-05 | 2532.73 ms | 53.3% bf16 MFU | 207038 tok/s +step 17838/19560 | loss 3.315680 (+0.38z)| norm 0.2493 (+0.42z)| lr 1.23e-05 | 2531.49 ms | 53.3% bf16 MFU | 207041 tok/s +step 17839/19560 | loss 3.284131 (-0.32z)| norm 0.2384 (-0.36z)| lr 1.23e-05 | 2534.90 ms | 53.3% bf16 MFU | 207030 tok/s +step 17840/19560 | loss 3.355149 (+1.25z)| norm 0.2358 (-0.54z)| lr 1.22e-05 | 2532.17 ms | 53.3% bf16 MFU | 207032 tok/s +step 17841/19560 | loss 3.281987 (-0.37z)| norm 0.2346 (-0.61z)| lr 1.22e-05 | 2531.57 ms | 53.3% bf16 MFU | 207035 tok/s +step 17842/19560 | loss 3.314917 (+0.35z)| norm 0.2685 (+1.90z)| lr 1.22e-05 | 2532.28 ms | 53.3% bf16 MFU | 207035 tok/s +step 17843/19560 | loss 3.287729 (-0.25z)| norm 0.2363 (-0.49z)| lr 1.22e-05 | 2535.76 ms | 53.2% bf16 MFU | 207021 tok/s +step 17844/19560 | loss 3.315063 (+0.35z)| norm 0.2404 (-0.19z)| lr 1.22e-05 | 2533.39 ms | 53.3% bf16 MFU | 207018 tok/s +step 17845/19560 | loss 3.331146 (+0.72z)| norm 0.2378 (-0.38z)| lr 1.22e-05 | 2532.53 ms | 53.3% bf16 MFU | 207018 tok/s +step 17846/19560 | loss 3.285016 (-0.32z)| norm 0.2346 (-0.61z)| lr 1.22e-05 | 2533.87 ms | 53.3% bf16 MFU | 207013 tok/s +step 17847/19560 | loss 3.407421 (+2.37z)| norm 0.2547 (+0.88z)| lr 1.21e-05 | 2531.95 ms | 53.3% bf16 MFU | 207016 tok/s +step 17848/19560 | loss 3.324975 (+0.59z)| norm 0.2320 (-0.80z)| lr 1.21e-05 | 2533.42 ms | 53.3% bf16 MFU | 207012 tok/s +step 17849/19560 | loss 3.366338 (+1.50z)| norm 0.2426 (-0.02z)| lr 1.21e-05 | 2532.97 ms | 53.3% bf16 MFU | 207011 tok/s +step 17850/19560 | loss 3.349228 (+1.10z)| norm 0.2524 (+0.71z)| lr 1.21e-05 | 2533.02 ms | 53.3% bf16 MFU | 207009 tok/s +step 17851/19560 | loss 3.350056 (+1.10z)| norm 0.2792 (+2.62z)| lr 1.21e-05 | 2533.22 ms | 53.3% bf16 MFU | 207007 tok/s +step 17852/19560 | loss 3.343464 (+0.94z)| norm 0.2328 (-0.73z)| lr 1.21e-05 | 2536.44 ms | 53.2% bf16 MFU | 206992 tok/s +step 17853/19560 | loss 3.333305 (+0.70z)| norm 0.2590 (+1.14z)| lr 1.21e-05 | 2534.65 ms | 53.3% bf16 MFU | 206985 tok/s +step 17854/19560 | loss 3.323434 (+0.47z)| norm 0.2438 (+0.05z)| lr 1.20e-05 | 2534.65 ms | 53.3% bf16 MFU | 206978 tok/s +step 17855/19560 | loss 3.328540 (+0.58z)| norm 0.2323 (-0.76z)| lr 1.20e-05 | 2531.89 ms | 53.3% bf16 MFU | 206983 tok/s +step 17856/19560 | loss 3.328954 (+0.58z)| norm 0.2356 (-0.52z)| lr 1.20e-05 | 2532.88 ms | 53.3% bf16 MFU | 206983 tok/s +step 17857/19560 | loss 3.323880 (+0.47z)| norm 0.2439 (+0.08z)| lr 1.20e-05 | 2534.17 ms | 53.3% bf16 MFU | 206978 tok/s +step 17858/19560 | loss 3.320935 (+0.39z)| norm 0.2323 (-0.75z)| lr 1.20e-05 | 2531.54 ms | 53.3% bf16 MFU | 206985 tok/s +step 17859/19560 | loss 3.297338 (-0.14z)| norm 0.2418 (-0.07z)| lr 1.20e-05 | 2534.62 ms | 53.3% bf16 MFU | 206978 tok/s +step 17860/19560 | loss 3.293411 (-0.24z)| norm 0.2398 (-0.21z)| lr 1.20e-05 | 2530.57 ms | 53.4% bf16 MFU | 206988 tok/s +step 17861/19560 | loss 3.358383 (+1.23z)| norm 0.2273 (-1.10z)| lr 1.19e-05 | 2533.60 ms | 53.3% bf16 MFU | 206985 tok/s +step 17862/19560 | loss 3.321823 (+0.39z)| norm 0.2245 (-1.28z)| lr 1.19e-05 | 2533.16 ms | 53.3% bf16 MFU | 206985 tok/s +step 17863/19560 | loss 3.351898 (+1.08z)| norm 0.2469 (+0.31z)| lr 1.19e-05 | 2534.40 ms | 53.3% bf16 MFU | 206979 tok/s +step 17864/19560 | loss 3.293894 (-0.24z)| norm 0.2468 (+0.30z)| lr 1.19e-05 | 2532.20 ms | 53.3% bf16 MFU | 206982 tok/s +step 17865/19560 | loss 3.313706 (+0.22z)| norm 0.2720 (+2.05z)| lr 1.19e-05 | 2533.14 ms | 53.3% bf16 MFU | 206982 tok/s +step 17866/19560 | loss 3.248226 (-1.26z)| norm 0.2370 (-0.41z)| lr 1.19e-05 | 2533.02 ms | 53.3% bf16 MFU | 206982 tok/s +step 17867/19560 | loss 3.339467 (+0.81z)| norm 0.2419 (-0.07z)| lr 1.19e-05 | 2534.64 ms | 53.3% bf16 MFU | 206975 tok/s +step 17868/19560 | loss 3.329889 (+0.58z)| norm 0.2493 (+0.45z)| lr 1.19e-05 | 2532.69 ms | 53.3% bf16 MFU | 206977 tok/s +step 17869/19560 | loss 3.291948 (-0.28z)| norm 0.2518 (+0.62z)| lr 1.18e-05 | 2533.34 ms | 53.3% bf16 MFU | 206976 tok/s +step 17870/19560 | loss 3.318048 (+0.31z)| norm 0.2342 (-0.61z)| lr 1.18e-05 | 2532.76 ms | 53.3% bf16 MFU | 206977 tok/s +step 17871/19560 | loss 3.385034 (+1.80z)| norm 0.2562 (+0.93z)| lr 1.18e-05 | 2531.10 ms | 53.3% bf16 MFU | 206985 tok/s +step 17872/19560 | loss 3.332767 (+0.61z)| norm 0.2397 (-0.22z)| lr 1.18e-05 | 2531.38 ms | 53.3% bf16 MFU | 206992 tok/s +step 17873/19560 | loss 3.353291 (+1.06z)| norm 0.2567 (+0.97z)| lr 1.18e-05 | 2533.44 ms | 53.3% bf16 MFU | 206989 tok/s +step 17874/19560 | loss 3.228185 (-1.73z)| norm 0.2347 (-0.58z)| lr 1.18e-05 | 2532.65 ms | 53.3% bf16 MFU | 206991 tok/s +step 17875/19560 | loss 3.306576 (+0.01z)| norm 0.2330 (-0.70z)| lr 1.18e-05 | 2533.70 ms | 53.3% bf16 MFU | 206987 tok/s +step 17876/19560 | loss 3.387538 (+1.79z)| norm 0.2379 (-0.35z)| lr 1.17e-05 | 2532.10 ms | 53.3% bf16 MFU | 206991 tok/s +step 17877/19560 | loss 3.269966 (-0.81z)| norm 0.2310 (-0.82z)| lr 1.17e-05 | 2530.61 ms | 53.4% bf16 MFU | 207000 tok/s +step 17878/19560 | loss 3.346155 (+0.89z)| norm 0.2448 (+0.15z)| lr 1.17e-05 | 2532.93 ms | 53.3% bf16 MFU | 207000 tok/s +step 17879/19560 | loss 3.292871 (-0.30z)| norm 0.2397 (-0.21z)| lr 1.17e-05 | 2531.99 ms | 53.3% bf16 MFU | 207003 tok/s +step 17880/19560 | loss 3.273339 (-0.73z)| norm 0.2398 (-0.20z)| lr 1.17e-05 | 2532.18 ms | 53.3% bf16 MFU | 207005 tok/s +step 17881/19560 | loss 3.299067 (-0.17z)| norm 0.2339 (-0.62z)| lr 1.17e-05 | 2529.83 ms | 53.4% bf16 MFU | 207017 tok/s +step 17882/19560 | loss 3.298973 (-0.17z)| norm 0.2412 (-0.11z)| lr 1.17e-05 | 2530.62 ms | 53.4% bf16 MFU | 207025 tok/s +step 17883/19560 | loss 3.282750 (-0.53z)| norm 0.2439 (+0.08z)| lr 1.16e-05 | 2533.79 ms | 53.3% bf16 MFU | 207020 tok/s +step 17884/19560 | loss 3.303078 (-0.07z)| norm 0.2408 (-0.14z)| lr 1.16e-05 | 2530.23 ms | 53.4% bf16 MFU | 207029 tok/s +step 17885/19560 | loss 3.309417 (+0.06z)| norm 0.2297 (-0.92z)| lr 1.16e-05 | 2531.05 ms | 53.3% bf16 MFU | 207035 tok/s +step 17886/19560 | loss 3.311251 (+0.10z)| norm 0.2341 (-0.62z)| lr 1.16e-05 | 2530.85 ms | 53.3% bf16 MFU | 207041 tok/s +step 17887/19560 | loss 3.344663 (+0.84z)| norm 0.2535 (+0.74z)| lr 1.16e-05 | 2533.40 ms | 53.3% bf16 MFU | 207036 tok/s +step 17888/19560 | loss 3.268401 (-0.87z)| norm 0.2344 (-0.61z)| lr 1.16e-05 | 2532.39 ms | 53.3% bf16 MFU | 207036 tok/s +step 17889/19560 | loss 3.374110 (+1.48z)| norm 0.2377 (-0.37z)| lr 1.16e-05 | 2532.20 ms | 53.3% bf16 MFU | 207037 tok/s +step 17890/19560 | loss 3.264926 (-0.95z)| norm 0.2434 (+0.03z)| lr 1.15e-05 | 2532.48 ms | 53.3% bf16 MFU | 207036 tok/s +step 17891/19560 | loss 3.344672 (+0.81z)| norm 0.2364 (-0.47z)| lr 1.15e-05 | 2533.59 ms | 53.3% bf16 MFU | 207031 tok/s +step 17892/19560 | loss 3.307215 (-0.02z)| norm 0.2337 (-0.65z)| lr 1.15e-05 | 2532.09 ms | 53.3% bf16 MFU | 207033 tok/s +step 17893/19560 | loss 3.379819 (+1.62z)| norm 0.2333 (-0.66z)| lr 1.15e-05 | 2533.60 ms | 53.3% bf16 MFU | 207028 tok/s +step 17894/19560 | loss 3.336111 (+0.61z)| norm 0.2317 (-0.77z)| lr 1.15e-05 | 2533.44 ms | 53.3% bf16 MFU | 207024 tok/s +step 17895/19560 | loss 3.342643 (+0.76z)| norm 0.2576 (+1.05z)| lr 1.15e-05 | 2531.83 ms | 53.3% bf16 MFU | 207026 tok/s +step 17896/19560 | loss 3.287645 (-0.53z)| norm 0.2306 (-0.86z)| lr 1.15e-05 | 2532.50 ms | 53.3% bf16 MFU | 207026 tok/s +step 17897/19560 | loss 3.263692 (-1.07z)| norm 0.2318 (-0.76z)| lr 1.15e-05 | 2532.91 ms | 53.3% bf16 MFU | 207024 tok/s +step 17898/19560 | loss 3.319660 (+0.22z)| norm 0.2462 (+0.27z)| lr 1.14e-05 | 2532.39 ms | 53.3% bf16 MFU | 207025 tok/s +step 17899/19560 | loss 3.358511 (+1.11z)| norm 0.2351 (-0.53z)| lr 1.14e-05 | 2531.67 ms | 53.3% bf16 MFU | 207028 tok/s +step 17900/19560 | loss 3.281791 (-0.66z)| norm 0.2500 (+0.55z)| lr 1.14e-05 | 2533.89 ms | 53.3% bf16 MFU | 207022 tok/s +step 17901/19560 | loss 3.377513 (+1.53z)| norm 0.2438 (+0.09z)| lr 1.14e-05 | 2531.67 ms | 53.3% bf16 MFU | 207026 tok/s +step 17902/19560 | loss 3.289695 (-0.50z)| norm 0.2421 (+0.01z)| lr 1.14e-05 | 2533.46 ms | 53.3% bf16 MFU | 207022 tok/s +step 17903/19560 | loss 3.303833 (-0.14z)| norm 0.2393 (-0.22z)| lr 1.14e-05 | 2532.03 ms | 53.3% bf16 MFU | 207024 tok/s +step 17904/19560 | loss 3.303096 (-0.15z)| norm 0.2392 (-0.22z)| lr 1.14e-05 | 2533.07 ms | 53.3% bf16 MFU | 207022 tok/s +step 17905/19560 | loss 3.232376 (-1.91z)| norm 0.2320 (-0.80z)| lr 1.13e-05 | 2531.38 ms | 53.3% bf16 MFU | 207026 tok/s +step 17906/19560 | loss 3.265943 (-1.08z)| norm 0.2373 (-0.36z)| lr 1.13e-05 | 2532.70 ms | 53.3% bf16 MFU | 207025 tok/s +step 17907/19560 | loss 3.355740 (+1.15z)| norm 0.2418 (-0.01z)| lr 1.13e-05 | 2531.76 ms | 53.3% bf16 MFU | 207028 tok/s +step 17908/19560 | loss 3.379114 (+1.70z)| norm 0.2597 (+1.42z)| lr 1.13e-05 | 2530.81 ms | 53.3% bf16 MFU | 207035 tok/s +step 17909/19560 | loss 3.308913 (-0.03z)| norm 0.2577 (+1.24z)| lr 1.13e-05 | 2534.01 ms | 53.3% bf16 MFU | 207028 tok/s +step 17910/19560 | loss 3.304253 (-0.16z)| norm 0.2388 (-0.27z)| lr 1.13e-05 | 2533.24 ms | 53.3% bf16 MFU | 207025 tok/s +step 17911/19560 | loss 3.279292 (-0.77z)| norm 0.2317 (-0.84z)| lr 1.13e-05 | 2532.41 ms | 53.3% bf16 MFU | 207025 tok/s +step 17912/19560 | loss 3.294627 (-0.40z)| norm 0.2351 (-0.58z)| lr 1.12e-05 | 2530.76 ms | 53.4% bf16 MFU | 207032 tok/s +step 17913/19560 | loss 3.316497 (+0.13z)| norm 0.2390 (-0.27z)| lr 1.12e-05 | 2531.56 ms | 53.3% bf16 MFU | 207036 tok/s +step 17914/19560 | loss 3.359673 (+1.20z)| norm 0.2393 (-0.23z)| lr 1.12e-05 | 2534.10 ms | 53.3% bf16 MFU | 207029 tok/s +step 17915/19560 | loss 3.270625 (-1.02z)| norm 0.2429 (+0.07z)| lr 1.12e-05 | 2530.94 ms | 53.3% bf16 MFU | 207035 tok/s +step 17916/19560 | loss 3.378604 (+1.65z)| norm 0.2387 (-0.29z)| lr 1.12e-05 | 2532.68 ms | 53.3% bf16 MFU | 207033 tok/s +step 17917/19560 | loss 3.293309 (-0.45z)| norm 0.2351 (-0.59z)| lr 1.12e-05 | 2535.59 ms | 53.2% bf16 MFU | 207020 tok/s +step 17918/19560 | loss 3.300185 (-0.28z)| norm 0.2431 (+0.09z)| lr 1.12e-05 | 2532.61 ms | 53.3% bf16 MFU | 207020 tok/s +step 17919/19560 | loss 3.286798 (-0.60z)| norm 0.2526 (+0.96z)| lr 1.12e-05 | 2532.26 ms | 53.3% bf16 MFU | 207021 tok/s +step 17920/19560 | loss 3.309410 (-0.02z)| norm 0.2452 (+0.30z)| lr 1.11e-05 | 2534.38 ms | 53.3% bf16 MFU | 207014 tok/s +step 17921/19560 | loss 3.308609 (-0.05z)| norm 0.2554 (+1.20z)| lr 1.11e-05 | 2533.36 ms | 53.3% bf16 MFU | 207011 tok/s +step 17922/19560 | loss 3.339905 (+0.73z)| norm 0.2420 (+0.00z)| lr 1.11e-05 | 2531.84 ms | 53.3% bf16 MFU | 207014 tok/s +step 17923/19560 | loss 3.386281 (+1.86z)| norm 0.2490 (+0.63z)| lr 1.11e-05 | 2532.27 ms | 53.3% bf16 MFU | 207015 tok/s +step 17924/19560 | loss 3.329384 (+0.43z)| norm 0.2544 (+1.11z)| lr 1.11e-05 | 2532.24 ms | 53.3% bf16 MFU | 207017 tok/s +step 17925/19560 | loss 3.321197 (+0.24z)| norm 0.2558 (+1.22z)| lr 1.11e-05 | 2531.49 ms | 53.3% bf16 MFU | 207021 tok/s +step 17926/19560 | loss 3.307111 (-0.11z)| norm 0.2357 (-0.58z)| lr 1.11e-05 | 2532.83 ms | 53.3% bf16 MFU | 207020 tok/s +step 17927/19560 | loss 3.278340 (-0.85z)| norm 0.2568 (+1.44z)| lr 1.10e-05 | 2533.21 ms | 53.3% bf16 MFU | 207018 tok/s +step 17928/19560 | loss 3.280476 (-0.80z)| norm 0.2327 (-0.89z)| lr 1.10e-05 | 2533.53 ms | 53.3% bf16 MFU | 207014 tok/s +step 17929/19560 | loss 3.318088 (+0.20z)| norm 0.2433 (+0.15z)| lr 1.10e-05 | 2533.85 ms | 53.3% bf16 MFU | 207009 tok/s +step 17930/19560 | loss 3.351562 (+1.10z)| norm 0.2468 (+0.51z)| lr 1.10e-05 | 2532.01 ms | 53.3% bf16 MFU | 207011 tok/s +step 17931/19560 | loss 3.324942 (+0.38z)| norm 0.2381 (-0.36z)| lr 1.10e-05 | 2532.59 ms | 53.3% bf16 MFU | 207012 tok/s +step 17932/19560 | loss 3.296389 (-0.38z)| norm 0.2412 (-0.06z)| lr 1.10e-05 | 2529.91 ms | 53.4% bf16 MFU | 207023 tok/s +step 17933/19560 | loss 3.425830 (+2.97z)| norm 0.2437 (+0.19z)| lr 1.10e-05 | 2532.73 ms | 53.3% bf16 MFU | 207022 tok/s +step 17934/19560 | loss 3.268017 (-1.11z)| norm 0.2274 (-1.42z)| lr 1.10e-05 | 2532.61 ms | 53.3% bf16 MFU | 207022 tok/s +step 17935/19560 | loss 3.255197 (-1.42z)| norm 0.2354 (-0.62z)| lr 1.09e-05 | 2530.85 ms | 53.3% bf16 MFU | 207028 tok/s +step 17936/19560 | loss 3.238230 (-1.83z)| norm 0.2280 (-1.34z)| lr 1.09e-05 | 2531.79 ms | 53.3% bf16 MFU | 207031 tok/s +step 17937/19560 | loss 3.278355 (-0.79z)| norm 0.2415 (-0.02z)| lr 1.09e-05 | 2534.21 ms | 53.3% bf16 MFU | 207024 tok/s +step 17938/19560 | loss 3.279146 (-0.79z)| norm 0.2309 (-1.05z)| lr 1.09e-05 | 2531.25 ms | 53.3% bf16 MFU | 207029 tok/s +step 17939/19560 | loss 3.250828 (-1.50z)| norm 0.2347 (-0.66z)| lr 1.09e-05 | 2533.17 ms | 53.3% bf16 MFU | 207026 tok/s +step 17940/19560 | loss 3.334020 (+0.62z)| norm 0.2387 (-0.27z)| lr 1.09e-05 | 2531.76 ms | 53.3% bf16 MFU | 207029 tok/s +step 17941/19560 | loss 3.282992 (-0.68z)| norm 0.2326 (-0.87z)| lr 1.09e-05 | 2532.83 ms | 53.3% bf16 MFU | 207027 tok/s +step 17942/19560 | loss 3.238922 (-1.80z)| norm 0.2440 (+0.27z)| lr 1.08e-05 | 2531.96 ms | 53.3% bf16 MFU | 207029 tok/s +step 17943/19560 | loss 3.311000 (+0.02z)| norm 0.2398 (-0.16z)| lr 1.08e-05 | 2530.92 ms | 53.3% bf16 MFU | 207035 tok/s +step 17944/19560 | loss 3.318731 (+0.23z)| norm 0.2432 (+0.17z)| lr 1.08e-05 | 2533.12 ms | 53.3% bf16 MFU | 207032 tok/s +step 17945/19560 | loss 3.294490 (-0.39z)| norm 0.2270 (-1.47z)| lr 1.08e-05 | 2533.36 ms | 53.3% bf16 MFU | 207028 tok/s +step 17946/19560 | loss 3.290261 (-0.53z)| norm 0.2341 (-0.74z)| lr 1.08e-05 | 2533.28 ms | 53.3% bf16 MFU | 207025 tok/s +step 17947/19560 | loss 3.295140 (-0.39z)| norm 0.2342 (-0.72z)| lr 1.08e-05 | 2533.36 ms | 53.3% bf16 MFU | 207021 tok/s +step 17948/19560 | loss 3.299829 (-0.28z)| norm 0.2388 (-0.27z)| lr 1.08e-05 | 2533.11 ms | 53.3% bf16 MFU | 207019 tok/s +step 17949/19560 | loss 3.349102 (+1.02z)| norm 0.2267 (-1.49z)| lr 1.08e-05 | 2531.79 ms | 53.3% bf16 MFU | 207022 tok/s +step 17950/19560 | loss 3.287395 (-0.62z)| norm 0.2357 (-0.57z)| lr 1.07e-05 | 2533.59 ms | 53.3% bf16 MFU | 207018 tok/s +step 17951/19560 | loss 3.261767 (-1.29z)| norm 0.2283 (-1.30z)| lr 1.07e-05 | 2532.45 ms | 53.3% bf16 MFU | 207018 tok/s +step 17952/19560 | loss 3.278206 (-0.85z)| norm 0.2266 (-1.44z)| lr 1.07e-05 | 2532.96 ms | 53.3% bf16 MFU | 207017 tok/s +step 17953/19560 | loss 3.401068 (+2.32z)| norm 0.2416 (+0.03z)| lr 1.07e-05 | 2531.17 ms | 53.3% bf16 MFU | 207022 tok/s +step 17954/19560 | loss 3.309549 (-0.05z)| norm 0.2325 (-0.87z)| lr 1.07e-05 | 2533.02 ms | 53.3% bf16 MFU | 207020 tok/s +step 17955/19560 | loss 3.311560 (-0.01z)| norm 0.2537 (+1.23z)| lr 1.07e-05 | 2533.58 ms | 53.3% bf16 MFU | 207016 tok/s +step 17956/19560 | loss 3.310872 (-0.04z)| norm 0.2249 (-1.60z)| lr 1.07e-05 | 2532.93 ms | 53.3% bf16 MFU | 207015 tok/s +step 17957/19560 | loss 3.273246 (-1.03z)| norm 0.2289 (-1.19z)| lr 1.06e-05 | 2532.27 ms | 53.3% bf16 MFU | 207016 tok/s +step 17958/19560 | loss 3.330018 (+0.46z)| norm 0.2477 (+0.65z)| lr 1.06e-05 | 2534.58 ms | 53.3% bf16 MFU | 207008 tok/s +step 17959/19560 | loss 3.350463 (+1.00z)| norm 0.2681 (+2.57z)| lr 1.06e-05 | 2533.27 ms | 53.3% bf16 MFU | 207006 tok/s +step 17960/19560 | loss 3.287228 (-0.70z)| norm 0.2660 (+2.30z)| lr 1.06e-05 | 2532.58 ms | 53.3% bf16 MFU | 207006 tok/s +step 17961/19560 | loss 3.286590 (-0.71z)| norm 0.2332 (-0.77z)| lr 1.06e-05 | 2530.96 ms | 53.3% bf16 MFU | 207013 tok/s +step 17962/19560 | loss 3.317723 (+0.12z)| norm 0.2918 (+4.33z)| lr 1.06e-05 | 2532.19 ms | 53.3% bf16 MFU | 207015 tok/s +step 17963/19560 | loss 3.322343 (+0.24z)| norm 0.2324 (-0.81z)| lr 1.06e-05 | 2532.29 ms | 53.3% bf16 MFU | 207017 tok/s +step 17964/19560 | loss 3.335530 (+0.59z)| norm 0.2535 (+1.08z)| lr 1.06e-05 | 2531.12 ms | 53.3% bf16 MFU | 207023 tok/s +step 17965/19560 | loss 3.304077 (-0.25z)| norm 0.2500 (+0.76z)| lr 1.05e-05 | 2532.06 ms | 53.3% bf16 MFU | 207024 tok/s +step 17966/19560 | loss 3.308579 (-0.13z)| norm 0.2470 (+0.49z)| lr 1.05e-05 | 2531.20 ms | 53.3% bf16 MFU | 207030 tok/s +step 17967/19560 | loss 3.340806 (+0.72z)| norm 0.2681 (+2.32z)| lr 1.05e-05 | 2532.68 ms | 53.3% bf16 MFU | 207029 tok/s +step 17968/19560 | loss 3.266045 (-1.25z)| norm 0.2503 (+0.74z)| lr 1.05e-05 | 2534.78 ms | 53.3% bf16 MFU | 207019 tok/s +step 17969/19560 | loss 3.369416 (+1.48z)| norm 0.2453 (+0.29z)| lr 1.05e-05 | 2532.72 ms | 53.3% bf16 MFU | 207018 tok/s +step 17970/19560 | loss 3.307940 (-0.15z)| norm 0.2437 (+0.17z)| lr 1.05e-05 | 2531.59 ms | 53.3% bf16 MFU | 207022 tok/s +step 17971/19560 | loss 3.303553 (-0.27z)| norm 0.2460 (+0.38z)| lr 1.05e-05 | 2532.90 ms | 53.3% bf16 MFU | 207021 tok/s +step 17972/19560 | loss 3.344846 (+0.82z)| norm 0.2297 (-1.09z)| lr 1.04e-05 | 2535.00 ms | 53.3% bf16 MFU | 207011 tok/s +step 17973/19560 | loss 3.407935 (+2.42z)| norm 0.2647 (+2.01z)| lr 1.04e-05 | 2531.55 ms | 53.3% bf16 MFU | 207015 tok/s +step 17974/19560 | loss 3.271735 (-1.11z)| norm 0.2343 (-0.68z)| lr 1.04e-05 | 2533.14 ms | 53.3% bf16 MFU | 207013 tok/s +step 17975/19560 | loss 3.324010 (+0.27z)| norm 0.2476 (+0.51z)| lr 1.04e-05 | 2532.92 ms | 53.3% bf16 MFU | 207012 tok/s +step 17976/19560 | loss 3.332592 (+0.49z)| norm 0.2536 (+1.02z)| lr 1.04e-05 | 2530.96 ms | 53.3% bf16 MFU | 207019 tok/s +step 17977/19560 | loss 3.291821 (-0.57z)| norm 0.2465 (+0.38z)| lr 1.04e-05 | 2532.58 ms | 53.3% bf16 MFU | 207019 tok/s +step 17978/19560 | loss 3.191748 (-3.09z)| norm 0.2328 (-0.82z)| lr 1.04e-05 | 2533.87 ms | 53.3% bf16 MFU | 207013 tok/s +step 17979/19560 | loss 3.307679 (-0.11z)| norm 0.2275 (-1.30z)| lr 1.04e-05 | 2533.17 ms | 53.3% bf16 MFU | 207011 tok/s +step 17980/19560 | loss 3.311725 (+0.00z)| norm 0.2437 (+0.19z)| lr 1.03e-05 | 2532.07 ms | 53.3% bf16 MFU | 207014 tok/s +step 17981/19560 | loss 3.260596 (-1.29z)| norm 0.2481 (+0.61z)| lr 1.03e-05 | 2533.05 ms | 53.3% bf16 MFU | 207012 tok/s +step 17982/19560 | loss 3.290892 (-0.51z)| norm 0.2360 (-0.51z)| lr 1.03e-05 | 2533.68 ms | 53.3% bf16 MFU | 207008 tok/s +step 17983/19560 | loss 3.331332 (+0.53z)| norm 0.2371 (-0.42z)| lr 1.03e-05 | 2535.11 ms | 53.3% bf16 MFU | 206998 tok/s +step 17984/19560 | loss 3.277239 (-0.85z)| norm 0.2356 (-0.56z)| lr 1.03e-05 | 2533.30 ms | 53.3% bf16 MFU | 206996 tok/s +step 17985/19560 | loss 3.239546 (-1.78z)| norm 0.2358 (-0.54z)| lr 1.03e-05 | 2532.80 ms | 53.3% bf16 MFU | 206996 tok/s +step 17986/19560 | loss 3.296451 (-0.33z)| norm 0.2453 (+0.34z)| lr 1.03e-05 | 2534.78 ms | 53.3% bf16 MFU | 206988 tok/s +step 17987/19560 | loss 3.300724 (-0.22z)| norm 0.2390 (-0.24z)| lr 1.03e-05 | 2530.77 ms | 53.4% bf16 MFU | 206997 tok/s +step 17988/19560 | loss 3.384598 (+1.86z)| norm 0.2477 (+0.56z)| lr 1.02e-05 | 2532.13 ms | 53.3% bf16 MFU | 207000 tok/s +step 17989/19560 | loss 3.342397 (+0.81z)| norm 0.2437 (+0.18z)| lr 1.02e-05 | 2533.50 ms | 53.3% bf16 MFU | 206997 tok/s +step 17990/19560 | loss 3.247265 (-1.55z)| norm 0.2202 (-2.01z)| lr 1.02e-05 | 2530.09 ms | 53.4% bf16 MFU | 207008 tok/s +step 17991/19560 | loss 3.334452 (+0.62z)| norm 0.2368 (-0.46z)| lr 1.02e-05 | 2533.17 ms | 53.3% bf16 MFU | 207006 tok/s +step 17992/19560 | loss 3.347169 (+0.93z)| norm 0.2344 (-0.67z)| lr 1.02e-05 | 2533.57 ms | 53.3% bf16 MFU | 207003 tok/s +step 17993/19560 | loss 3.304842 (-0.12z)| norm 0.2361 (-0.50z)| lr 1.02e-05 | 2531.25 ms | 53.3% bf16 MFU | 207009 tok/s +step 17994/19560 | loss 3.322786 (+0.31z)| norm 0.2304 (-1.04z)| lr 1.02e-05 | 2533.00 ms | 53.3% bf16 MFU | 207008 tok/s +step 17995/19560 | loss 3.345868 (+0.89z)| norm 0.2299 (-1.08z)| lr 1.01e-05 | 2533.41 ms | 53.3% bf16 MFU | 207005 tok/s +step 17996/19560 | loss 3.362043 (+1.28z)| norm 0.2612 (+1.89z)| lr 1.01e-05 | 2534.18 ms | 53.3% bf16 MFU | 206999 tok/s +step 17997/19560 | loss 3.304758 (-0.15z)| norm 0.2306 (-0.99z)| lr 1.01e-05 | 2533.88 ms | 53.3% bf16 MFU | 206994 tok/s +step 17998/19560 | loss 3.285314 (-0.63z)| norm 0.2382 (-0.27z)| lr 1.01e-05 | 2532.92 ms | 53.3% bf16 MFU | 206994 tok/s +step 17999/19560 | loss 3.312890 (+0.07z)| norm 0.2484 (+0.70z)| lr 1.01e-05 | 2534.91 ms | 53.3% bf16 MFU | 206986 tok/s +step 18000/19560 | loss 3.318493 (+0.22z)| norm 0.2576 (+1.55z)| lr 1.01e-05 | 2533.67 ms | 53.3% bf16 MFU | 206983 tok/s +val loss 3.287839 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3038/10042 = 0.302529 +step 18001/19560 | loss 3.257389 (-1.30z)| norm 0.2368 (-0.40z)| lr 1.01e-05 | 2531.74 ms | 53.3% bf16 MFU | 206988 tok/s +step 18002/19560 | loss 3.261766 (-1.21z)| norm 0.2281 (-1.22z)| lr 1.01e-05 | 2531.32 ms | 53.3% bf16 MFU | 206995 tok/s +step 18003/19560 | loss 3.436793 (+3.10z)| norm 0.3286 (+6.65z)| lr 1.00e-05 | 2532.25 ms | 53.3% bf16 MFU | 206997 tok/s +step 18004/19560 | loss 3.292503 (-0.42z)| norm 0.2937 (+3.73z)| lr 1.00e-05 | 2531.67 ms | 53.3% bf16 MFU | 207002 tok/s +step 18005/19560 | loss 3.311406 (+0.04z)| norm 0.2618 (+1.39z)| lr 1.00e-05 | 2532.22 ms | 53.3% bf16 MFU | 207004 tok/s +step 18006/19560 | loss 3.345253 (+0.88z)| norm 0.2362 (-0.44z)| lr 1.00e-05 | 2533.08 ms | 53.3% bf16 MFU | 207003 tok/s +step 18007/19560 | loss 3.298743 (-0.28z)| norm 0.2407 (-0.12z)| lr 1.00e-05 | 2531.94 ms | 53.3% bf16 MFU | 207006 tok/s +step 18008/19560 | loss 3.299078 (-0.28z)| norm 0.2297 (-0.90z)| lr 9.98e-06 | 2532.63 ms | 53.3% bf16 MFU | 207006 tok/s +step 18009/19560 | loss 3.306083 (-0.10z)| norm 0.2459 (+0.25z)| lr 9.97e-06 | 2532.73 ms | 53.3% bf16 MFU | 207006 tok/s +step 18010/19560 | loss 3.395662 (+2.08z)| norm 0.2502 (+0.56z)| lr 9.96e-06 | 2533.56 ms | 53.3% bf16 MFU | 207003 tok/s +step 18011/19560 | loss 3.324527 (+0.33z)| norm 0.2438 (+0.10z)| lr 9.94e-06 | 2531.78 ms | 53.3% bf16 MFU | 207007 tok/s +step 18012/19560 | loss 3.324369 (+0.32z)| norm 0.2453 (+0.21z)| lr 9.93e-06 | 2534.13 ms | 53.3% bf16 MFU | 207001 tok/s +step 18013/19560 | loss 3.269387 (-1.02z)| norm 0.2344 (-0.58z)| lr 9.92e-06 | 2532.52 ms | 53.3% bf16 MFU | 207002 tok/s +step 18014/19560 | loss 3.223239 (-2.10z)| norm 0.2360 (-0.47z)| lr 9.91e-06 | 2533.22 ms | 53.3% bf16 MFU | 207000 tok/s +step 18015/19560 | loss 3.345313 (+0.84z)| norm 0.2408 (-0.12z)| lr 9.89e-06 | 2532.14 ms | 53.3% bf16 MFU | 207003 tok/s +step 18016/19560 | loss 3.280792 (-0.72z)| norm 0.2299 (-0.89z)| lr 9.88e-06 | 2531.34 ms | 53.3% bf16 MFU | 207009 tok/s +step 18017/19560 | loss 3.284286 (-0.62z)| norm 0.2348 (-0.54z)| lr 9.87e-06 | 2533.50 ms | 53.3% bf16 MFU | 207005 tok/s +step 18018/19560 | loss 3.333455 (+0.56z)| norm 0.2417 (-0.05z)| lr 9.85e-06 | 2533.01 ms | 53.3% bf16 MFU | 207004 tok/s +step 18019/19560 | loss 3.298041 (-0.29z)| norm 0.2428 (+0.03z)| lr 9.84e-06 | 2531.14 ms | 53.3% bf16 MFU | 207011 tok/s +step 18020/19560 | loss 3.341686 (+0.77z)| norm 0.2474 (+0.35z)| lr 9.83e-06 | 2531.82 ms | 53.3% bf16 MFU | 207014 tok/s +step 18021/19560 | loss 3.349949 (+0.98z)| norm 0.2225 (-1.42z)| lr 9.82e-06 | 2531.91 ms | 53.3% bf16 MFU | 207017 tok/s +step 18022/19560 | loss 3.274087 (-0.87z)| norm 0.2482 (+0.40z)| lr 9.80e-06 | 2532.86 ms | 53.3% bf16 MFU | 207016 tok/s +step 18023/19560 | loss 3.335793 (+0.65z)| norm 0.2480 (+0.40z)| lr 9.79e-06 | 2533.80 ms | 53.3% bf16 MFU | 207011 tok/s +step 18024/19560 | loss 3.269810 (-0.97z)| norm 0.2320 (-0.75z)| lr 9.78e-06 | 2533.18 ms | 53.3% bf16 MFU | 207009 tok/s +step 18025/19560 | loss 3.342815 (+0.81z)| norm 0.2352 (-0.52z)| lr 9.77e-06 | 2532.72 ms | 53.3% bf16 MFU | 207009 tok/s +step 18026/19560 | loss 3.351096 (+1.00z)| norm 0.2463 (+0.27z)| lr 9.75e-06 | 2532.76 ms | 53.3% bf16 MFU | 207008 tok/s +step 18027/19560 | loss 3.296293 (-0.33z)| norm 0.2408 (-0.13z)| lr 9.74e-06 | 2533.10 ms | 53.3% bf16 MFU | 207007 tok/s +step 18028/19560 | loss 3.286874 (-0.56z)| norm 0.2351 (-0.53z)| lr 9.73e-06 | 2532.86 ms | 53.3% bf16 MFU | 207006 tok/s +step 18029/19560 | loss 3.330596 (+0.53z)| norm 0.2279 (-1.03z)| lr 9.72e-06 | 2532.81 ms | 53.3% bf16 MFU | 207006 tok/s +step 18030/19560 | loss 3.259263 (-1.24z)| norm 0.2335 (-0.63z)| lr 9.70e-06 | 2532.73 ms | 53.3% bf16 MFU | 207006 tok/s +step 18031/19560 | loss 3.291428 (-0.44z)| norm 0.2508 (+0.61z)| lr 9.69e-06 | 2534.89 ms | 53.3% bf16 MFU | 206997 tok/s +step 18032/19560 | loss 3.281229 (-0.68z)| norm 0.2310 (-0.81z)| lr 9.68e-06 | 2533.92 ms | 53.3% bf16 MFU | 206992 tok/s +step 18033/19560 | loss 3.283612 (-0.64z)| norm 0.2508 (+0.60z)| lr 9.67e-06 | 2534.12 ms | 53.3% bf16 MFU | 206987 tok/s +step 18034/19560 | loss 3.294158 (-0.39z)| norm 0.2429 (+0.03z)| lr 9.65e-06 | 2534.40 ms | 53.3% bf16 MFU | 206981 tok/s +step 18035/19560 | loss 3.314868 (+0.14z)| norm 0.2248 (-1.25z)| lr 9.64e-06 | 2532.14 ms | 53.3% bf16 MFU | 206985 tok/s +step 18036/19560 | loss 3.309504 (+0.02z)| norm 0.2351 (-0.50z)| lr 9.63e-06 | 2532.11 ms | 53.3% bf16 MFU | 206988 tok/s +step 18037/19560 | loss 3.282642 (-0.66z)| norm 0.2293 (-0.90z)| lr 9.61e-06 | 2534.11 ms | 53.3% bf16 MFU | 206984 tok/s +step 18038/19560 | loss 3.285760 (-0.58z)| norm 0.2564 (+1.03z)| lr 9.60e-06 | 2532.37 ms | 53.3% bf16 MFU | 206986 tok/s +step 18039/19560 | loss 3.271900 (-0.93z)| norm 0.2431 (+0.07z)| lr 9.59e-06 | 2534.91 ms | 53.3% bf16 MFU | 206978 tok/s +step 18040/19560 | loss 3.323538 (+0.38z)| norm 0.2385 (-0.26z)| lr 9.58e-06 | 2531.50 ms | 53.3% bf16 MFU | 206985 tok/s +step 18041/19560 | loss 3.278069 (-0.77z)| norm 0.2472 (+0.36z)| lr 9.56e-06 | 2533.77 ms | 53.3% bf16 MFU | 206981 tok/s +step 18042/19560 | loss 3.319662 (+0.30z)| norm 0.2619 (+1.39z)| lr 9.55e-06 | 2533.56 ms | 53.3% bf16 MFU | 206979 tok/s +step 18043/19560 | loss 3.289508 (-0.48z)| norm 0.2339 (-0.60z)| lr 9.54e-06 | 2533.90 ms | 53.3% bf16 MFU | 206976 tok/s +step 18044/19560 | loss 3.307061 (-0.01z)| norm 0.2264 (-1.12z)| lr 9.53e-06 | 2532.88 ms | 53.3% bf16 MFU | 206977 tok/s +step 18045/19560 | loss 3.298589 (-0.23z)| norm 0.2304 (-0.83z)| lr 9.51e-06 | 2532.35 ms | 53.3% bf16 MFU | 206980 tok/s +step 18046/19560 | loss 3.339543 (+0.82z)| norm 0.2278 (-1.00z)| lr 9.50e-06 | 2534.15 ms | 53.3% bf16 MFU | 206975 tok/s +step 18047/19560 | loss 3.358197 (+1.28z)| norm 0.2458 (+0.27z)| lr 9.49e-06 | 2531.94 ms | 53.3% bf16 MFU | 206980 tok/s +step 18048/19560 | loss 3.330543 (+0.57z)| norm 0.2692 (+1.87z)| lr 9.48e-06 | 2532.18 ms | 53.3% bf16 MFU | 206983 tok/s +step 18049/19560 | loss 3.300203 (-0.21z)| norm 0.2459 (+0.26z)| lr 9.46e-06 | 2531.75 ms | 53.3% bf16 MFU | 206988 tok/s +step 18050/19560 | loss 3.312433 (+0.11z)| norm 0.2284 (-0.94z)| lr 9.45e-06 | 2531.85 ms | 53.3% bf16 MFU | 206993 tok/s +step 18051/19560 | loss 3.341491 (+0.88z)| norm 0.2264 (-1.07z)| lr 9.44e-06 | 2535.77 ms | 53.2% bf16 MFU | 206981 tok/s +step 18052/19560 | loss 3.353745 (+1.19z)| norm 0.2294 (-0.85z)| lr 9.43e-06 | 2531.94 ms | 53.3% bf16 MFU | 206985 tok/s +step 18053/19560 | loss 3.272660 (-0.91z)| norm 0.2236 (-1.22z)| lr 9.42e-06 | 2532.26 ms | 53.3% bf16 MFU | 206988 tok/s +step 18054/19560 | loss 3.326743 (+0.49z)| norm 0.2367 (-0.33z)| lr 9.40e-06 | 2532.06 ms | 53.3% bf16 MFU | 206992 tok/s +step 18055/19560 | loss 3.429487 (+3.01z)| norm 0.2582 (+1.15z)| lr 9.39e-06 | 2532.29 ms | 53.3% bf16 MFU | 206994 tok/s +step 18056/19560 | loss 3.427103 (+2.84z)| norm 0.4280 (+8.45z)| lr 9.38e-06 | 2533.18 ms | 53.3% bf16 MFU | 206993 tok/s +step 18057/19560 | loss 3.381663 (+1.70z)| norm 0.2356 (-0.33z)| lr 9.37e-06 | 2532.78 ms | 53.3% bf16 MFU | 206994 tok/s +step 18058/19560 | loss 3.342001 (+0.75z)| norm 0.2333 (-0.43z)| lr 9.35e-06 | 2534.27 ms | 53.3% bf16 MFU | 206988 tok/s +step 18059/19560 | loss 3.327537 (+0.41z)| norm 0.2495 (+0.30z)| lr 9.34e-06 | 2533.95 ms | 53.3% bf16 MFU | 206984 tok/s +step 18060/19560 | loss 3.346141 (+0.84z)| norm 0.2370 (-0.27z)| lr 9.33e-06 | 2534.41 ms | 53.3% bf16 MFU | 206978 tok/s +step 18061/19560 | loss 3.254603 (-1.35z)| norm 0.2375 (-0.24z)| lr 9.32e-06 | 2534.46 ms | 53.3% bf16 MFU | 206972 tok/s +step 18062/19560 | loss 3.302430 (-0.19z)| norm 0.2261 (-0.76z)| lr 9.30e-06 | 2536.16 ms | 53.2% bf16 MFU | 206960 tok/s +step 18063/19560 | loss 3.293066 (-0.43z)| norm 0.2255 (-0.78z)| lr 9.29e-06 | 2534.62 ms | 53.3% bf16 MFU | 206954 tok/s +step 18064/19560 | loss 3.350590 (+0.99z)| norm 0.2324 (-0.47z)| lr 9.28e-06 | 2534.58 ms | 53.3% bf16 MFU | 206949 tok/s +step 18065/19560 | loss 3.306430 (-0.12z)| norm 0.2431 (+0.02z)| lr 9.27e-06 | 2534.59 ms | 53.3% bf16 MFU | 206945 tok/s +step 18066/19560 | loss 3.286247 (-0.63z)| norm 0.2329 (-0.45z)| lr 9.25e-06 | 2532.58 ms | 53.3% bf16 MFU | 206948 tok/s +step 18067/19560 | loss 3.295879 (-0.40z)| norm 0.2467 (+0.17z)| lr 9.24e-06 | 2532.72 ms | 53.3% bf16 MFU | 206951 tok/s +step 18068/19560 | loss 3.358616 (+1.18z)| norm 0.2331 (-0.44z)| lr 9.23e-06 | 2533.80 ms | 53.3% bf16 MFU | 206949 tok/s +step 18069/19560 | loss 3.301288 (-0.27z)| norm 0.2404 (-0.11z)| lr 9.22e-06 | 2535.16 ms | 53.3% bf16 MFU | 206942 tok/s +step 18070/19560 | loss 3.288282 (-0.62z)| norm 0.2371 (-0.26z)| lr 9.21e-06 | 2534.29 ms | 53.3% bf16 MFU | 206939 tok/s +step 18071/19560 | loss 3.316712 (+0.11z)| norm 0.2450 (+0.10z)| lr 9.19e-06 | 2533.30 ms | 53.3% bf16 MFU | 206940 tok/s +step 18072/19560 | loss 3.309075 (-0.09z)| norm 0.2338 (-0.41z)| lr 9.18e-06 | 2535.23 ms | 53.3% bf16 MFU | 206933 tok/s +step 18073/19560 | loss 3.271890 (-1.03z)| norm 0.2329 (-0.45z)| lr 9.17e-06 | 2533.86 ms | 53.3% bf16 MFU | 206932 tok/s +step 18074/19560 | loss 3.320924 (+0.21z)| norm 0.2472 (+0.19z)| lr 9.16e-06 | 2532.89 ms | 53.3% bf16 MFU | 206935 tok/s +step 18075/19560 | loss 3.304816 (-0.20z)| norm 0.2370 (-0.27z)| lr 9.14e-06 | 2533.45 ms | 53.3% bf16 MFU | 206936 tok/s +step 18076/19560 | loss 3.287437 (-0.64z)| norm 0.2370 (-0.27z)| lr 9.13e-06 | 2532.65 ms | 53.3% bf16 MFU | 206939 tok/s +step 18077/19560 | loss 3.319248 (+0.18z)| norm 0.2537 (+0.48z)| lr 9.12e-06 | 2534.56 ms | 53.3% bf16 MFU | 206935 tok/s +step 18078/19560 | loss 3.342088 (+0.75z)| norm 0.2335 (-0.44z)| lr 9.11e-06 | 2532.86 ms | 53.3% bf16 MFU | 206938 tok/s +step 18079/19560 | loss 3.346565 (+0.85z)| norm 0.2339 (-0.42z)| lr 9.09e-06 | 2532.85 ms | 53.3% bf16 MFU | 206941 tok/s +step 18080/19560 | loss 3.388991 (+1.90z)| norm 0.2386 (-0.21z)| lr 9.08e-06 | 2533.07 ms | 53.3% bf16 MFU | 206943 tok/s +step 18081/19560 | loss 3.262395 (-1.31z)| norm 0.2385 (-0.22z)| lr 9.07e-06 | 2532.31 ms | 53.3% bf16 MFU | 206948 tok/s +step 18082/19560 | loss 3.331719 (+0.47z)| norm 0.2354 (-0.36z)| lr 9.06e-06 | 2535.65 ms | 53.2% bf16 MFU | 206939 tok/s +step 18083/19560 | loss 3.356030 (+1.08z)| norm 0.2344 (-0.40z)| lr 9.05e-06 | 2532.57 ms | 53.3% bf16 MFU | 206943 tok/s +step 18084/19560 | loss 3.323228 (+0.24z)| norm 0.2378 (-0.25z)| lr 9.03e-06 | 2533.45 ms | 53.3% bf16 MFU | 206943 tok/s +step 18085/19560 | loss 3.384387 (+1.77z)| norm 0.2341 (-0.42z)| lr 9.02e-06 | 2535.99 ms | 53.2% bf16 MFU | 206933 tok/s +step 18086/19560 | loss 3.318012 (+0.09z)| norm 0.2246 (-0.85z)| lr 9.01e-06 | 2534.84 ms | 53.3% bf16 MFU | 206928 tok/s +step 18087/19560 | loss 3.558959 (+5.42z)| norm 0.3096 (+2.94z)| lr 9.00e-06 | 2535.21 ms | 53.3% bf16 MFU | 206921 tok/s +step 18088/19560 | loss 3.291001 (-0.56z)| norm 0.2345 (-0.39z)| lr 8.99e-06 | 2536.48 ms | 53.2% bf16 MFU | 206910 tok/s +step 18089/19560 | loss 3.317811 (+0.03z)| norm 0.2362 (-0.31z)| lr 8.97e-06 | 2533.91 ms | 53.3% bf16 MFU | 206910 tok/s +step 18090/19560 | loss 3.270235 (-1.02z)| norm 0.2303 (-0.56z)| lr 8.96e-06 | 2533.81 ms | 53.3% bf16 MFU | 206910 tok/s +step 18091/19560 | loss 3.301330 (-0.32z)| norm 0.2242 (-0.84z)| lr 8.95e-06 | 2534.10 ms | 53.3% bf16 MFU | 206910 tok/s +step 18092/19560 | loss 3.276228 (-0.87z)| norm 0.2657 (+1.04z)| lr 8.94e-06 | 2535.10 ms | 53.3% bf16 MFU | 206905 tok/s +step 18093/19560 | loss 3.347377 (+0.70z)| norm 0.2356 (-0.32z)| lr 8.92e-06 | 2533.76 ms | 53.3% bf16 MFU | 206905 tok/s +step 18094/19560 | loss 3.266162 (-1.09z)| norm 0.2319 (-0.48z)| lr 8.91e-06 | 2532.40 ms | 53.3% bf16 MFU | 206912 tok/s +step 18095/19560 | loss 3.271914 (-0.95z)| norm 0.2292 (-0.59z)| lr 8.90e-06 | 2533.34 ms | 53.3% bf16 MFU | 206914 tok/s +step 18096/19560 | loss 3.311550 (-0.08z)| norm 0.2322 (-0.45z)| lr 8.89e-06 | 2533.69 ms | 53.3% bf16 MFU | 206915 tok/s +step 18097/19560 | loss 3.331366 (+0.36z)| norm 0.2294 (-0.57z)| lr 8.88e-06 | 2534.45 ms | 53.3% bf16 MFU | 206912 tok/s +step 18098/19560 | loss 3.302596 (-0.27z)| norm 0.2316 (-0.46z)| lr 8.86e-06 | 2534.24 ms | 53.3% bf16 MFU | 206911 tok/s +step 18099/19560 | loss 3.349785 (+0.77z)| norm 0.2345 (-0.33z)| lr 8.85e-06 | 2531.33 ms | 53.3% bf16 MFU | 206921 tok/s +step 18100/19560 | loss 3.244030 (-1.55z)| norm 0.2290 (-0.58z)| lr 8.84e-06 | 2533.94 ms | 53.3% bf16 MFU | 206920 tok/s +step 18101/19560 | loss 3.375200 (+1.35z)| norm 0.2580 (+0.74z)| lr 8.83e-06 | 2534.05 ms | 53.3% bf16 MFU | 206919 tok/s +step 18102/19560 | loss 3.248557 (-1.45z)| norm 0.2442 (+0.11z)| lr 8.82e-06 | 2532.77 ms | 53.3% bf16 MFU | 206923 tok/s +step 18103/19560 | loss 3.339670 (+0.56z)| norm 0.2358 (-0.27z)| lr 8.80e-06 | 2533.29 ms | 53.3% bf16 MFU | 206925 tok/s +step 18104/19560 | loss 3.386889 (+1.58z)| norm 0.2476 (+0.27z)| lr 8.79e-06 | 2534.45 ms | 53.3% bf16 MFU | 206922 tok/s +step 18105/19560 | loss 3.322415 (+0.17z)| norm 0.3261 (+3.60z)| lr 8.78e-06 | 2533.14 ms | 53.3% bf16 MFU | 206925 tok/s +step 18106/19560 | loss 3.329089 (+0.30z)| norm 0.2405 (-0.08z)| lr 8.77e-06 | 2532.54 ms | 53.3% bf16 MFU | 206929 tok/s +step 18107/19560 | loss 3.298601 (-0.39z)| norm 0.2437 (+0.05z)| lr 8.76e-06 | 2533.48 ms | 53.3% bf16 MFU | 206930 tok/s +step 18108/19560 | loss 3.360295 (+0.99z)| norm 0.2445 (+0.09z)| lr 8.74e-06 | 2532.53 ms | 53.3% bf16 MFU | 206935 tok/s +step 18109/19560 | loss 3.332536 (+0.36z)| norm 0.2472 (+0.21z)| lr 8.73e-06 | 2532.90 ms | 53.3% bf16 MFU | 206937 tok/s +step 18110/19560 | loss 3.208425 (-2.38z)| norm 0.2327 (-0.42z)| lr 8.72e-06 | 2533.26 ms | 53.3% bf16 MFU | 206939 tok/s +step 18111/19560 | loss 3.290393 (-0.56z)| norm 0.2371 (-0.23z)| lr 8.71e-06 | 2534.31 ms | 53.3% bf16 MFU | 206936 tok/s +step 18112/19560 | loss 3.245908 (-1.53z)| norm 0.2267 (-0.67z)| lr 8.70e-06 | 2531.80 ms | 53.3% bf16 MFU | 206943 tok/s +step 18113/19560 | loss 3.229702 (-1.88z)| norm 0.2337 (-0.37z)| lr 8.68e-06 | 2535.88 ms | 53.2% bf16 MFU | 206933 tok/s +step 18114/19560 | loss 3.261717 (-1.17z)| norm 0.2500 (+0.33z)| lr 8.67e-06 | 2534.56 ms | 53.3% bf16 MFU | 206929 tok/s +step 18115/19560 | loss 3.336186 (+0.45z)| norm 0.2455 (+0.13z)| lr 8.66e-06 | 2534.97 ms | 53.3% bf16 MFU | 206924 tok/s +step 18116/19560 | loss 3.281254 (-0.73z)| norm 0.2411 (-0.05z)| lr 8.65e-06 | 2535.01 ms | 53.3% bf16 MFU | 206919 tok/s +step 18117/19560 | loss 3.300220 (-0.31z)| norm 0.2362 (-0.26z)| lr 8.64e-06 | 2533.12 ms | 53.3% bf16 MFU | 206921 tok/s +step 18118/19560 | loss 3.252445 (-1.36z)| norm 0.2256 (-0.72z)| lr 8.62e-06 | 2534.79 ms | 53.3% bf16 MFU | 206917 tok/s +step 18119/19560 | loss 3.239036 (-1.62z)| norm 0.2353 (-0.30z)| lr 8.61e-06 | 2535.80 ms | 53.2% bf16 MFU | 206909 tok/s +step 18120/19560 | loss 3.240540 (-1.56z)| norm 0.2379 (-0.19z)| lr 8.60e-06 | 2535.33 ms | 53.3% bf16 MFU | 206903 tok/s +step 18121/19560 | loss 3.287121 (-0.55z)| norm 0.2340 (-0.36z)| lr 8.59e-06 | 2534.19 ms | 53.3% bf16 MFU | 206902 tok/s +step 18122/19560 | loss 3.306071 (-0.14z)| norm 0.2318 (-0.45z)| lr 8.58e-06 | 2533.27 ms | 53.3% bf16 MFU | 206905 tok/s +step 18123/19560 | loss 3.282883 (-0.63z)| norm 0.2447 (+0.09z)| lr 8.57e-06 | 2533.39 ms | 53.3% bf16 MFU | 206907 tok/s +step 18124/19560 | loss 3.305949 (-0.12z)| norm 0.2241 (-0.78z)| lr 8.55e-06 | 2535.74 ms | 53.2% bf16 MFU | 206900 tok/s +step 18125/19560 | loss 3.303028 (-0.19z)| norm 0.2559 (+0.58z)| lr 8.54e-06 | 2531.63 ms | 53.3% bf16 MFU | 206910 tok/s +step 18126/19560 | loss 3.277772 (-0.73z)| norm 0.2261 (-0.69z)| lr 8.53e-06 | 2532.63 ms | 53.3% bf16 MFU | 206915 tok/s +step 18127/19560 | loss 3.306332 (-0.11z)| norm 0.2387 (-0.15z)| lr 8.52e-06 | 2534.30 ms | 53.3% bf16 MFU | 206913 tok/s +step 18128/19560 | loss 3.280423 (-0.67z)| norm 0.2393 (-0.12z)| lr 8.51e-06 | 2534.20 ms | 53.3% bf16 MFU | 206912 tok/s +step 18129/19560 | loss 3.283518 (-0.61z)| norm 0.2426 (+0.02z)| lr 8.49e-06 | 2532.89 ms | 53.3% bf16 MFU | 206916 tok/s +step 18130/19560 | loss 3.301708 (-0.22z)| norm 0.2402 (-0.09z)| lr 8.48e-06 | 2532.51 ms | 53.3% bf16 MFU | 206921 tok/s +step 18131/19560 | loss 3.488086 (+3.73z)| norm 0.2548 (+0.60z)| lr 8.47e-06 | 2532.44 ms | 53.3% bf16 MFU | 206926 tok/s +step 18132/19560 | loss 3.412050 (+2.07z)| norm 0.2582 (+0.79z)| lr 8.46e-06 | 2534.59 ms | 53.3% bf16 MFU | 206923 tok/s +step 18133/19560 | loss 3.300325 (-0.26z)| norm 0.2304 (-0.50z)| lr 8.45e-06 | 2533.71 ms | 53.3% bf16 MFU | 206923 tok/s +step 18134/19560 | loss 3.318712 (+0.12z)| norm 0.2422 (+0.05z)| lr 8.44e-06 | 2532.90 ms | 53.3% bf16 MFU | 206926 tok/s +step 18135/19560 | loss 3.255270 (-1.19z)| norm 0.2372 (-0.18z)| lr 8.42e-06 | 2533.20 ms | 53.3% bf16 MFU | 206928 tok/s +step 18136/19560 | loss 3.407180 (+1.92z)| norm 0.2663 (+1.16z)| lr 8.41e-06 | 2534.17 ms | 53.3% bf16 MFU | 206926 tok/s +step 18137/19560 | loss 3.347927 (+0.70z)| norm 0.2263 (-0.69z)| lr 8.40e-06 | 2534.87 ms | 53.3% bf16 MFU | 206922 tok/s +step 18138/19560 | loss 3.322721 (+0.20z)| norm 0.2448 (+0.17z)| lr 8.39e-06 | 2536.80 ms | 53.2% bf16 MFU | 206909 tok/s +step 18139/19560 | loss 3.318505 (+0.11z)| norm 0.2384 (-0.13z)| lr 8.38e-06 | 2534.06 ms | 53.3% bf16 MFU | 206908 tok/s +step 18140/19560 | loss 3.301228 (-0.24z)| norm 0.2307 (-0.48z)| lr 8.37e-06 | 2534.68 ms | 53.3% bf16 MFU | 206905 tok/s +step 18141/19560 | loss 3.287939 (-0.52z)| norm 0.2313 (-0.45z)| lr 8.35e-06 | 2534.88 ms | 53.3% bf16 MFU | 206902 tok/s +step 18142/19560 | loss 3.296035 (-0.37z)| norm 0.2331 (-0.37z)| lr 8.34e-06 | 2535.48 ms | 53.3% bf16 MFU | 206896 tok/s +step 18143/19560 | loss 3.290348 (-0.48z)| norm 0.2348 (-0.28z)| lr 8.33e-06 | 2534.35 ms | 53.3% bf16 MFU | 206894 tok/s +step 18144/19560 | loss 3.228941 (-1.74z)| norm 0.2452 (+0.19z)| lr 8.32e-06 | 2534.81 ms | 53.3% bf16 MFU | 206891 tok/s +step 18145/19560 | loss 3.288009 (-0.51z)| norm 0.2205 (-0.95z)| lr 8.31e-06 | 2535.21 ms | 53.3% bf16 MFU | 206887 tok/s +step 18146/19560 | loss 3.339162 (+0.55z)| norm 0.2393 (-0.08z)| lr 8.29e-06 | 2532.50 ms | 53.3% bf16 MFU | 206894 tok/s +step 18147/19560 | loss 3.334256 (+0.44z)| norm 0.2400 (-0.05z)| lr 8.28e-06 | 2534.82 ms | 53.3% bf16 MFU | 206891 tok/s +step 18148/19560 | loss 3.331576 (+0.39z)| norm 0.2208 (-0.92z)| lr 8.27e-06 | 2531.51 ms | 53.3% bf16 MFU | 206902 tok/s +step 18149/19560 | loss 3.273305 (-0.81z)| norm 0.2320 (-0.41z)| lr 8.26e-06 | 2533.94 ms | 53.3% bf16 MFU | 206902 tok/s +step 18150/19560 | loss 3.303240 (-0.20z)| norm 0.2339 (-0.31z)| lr 8.25e-06 | 2533.75 ms | 53.3% bf16 MFU | 206903 tok/s +step 18151/19560 | loss 3.313118 (+0.01z)| norm 0.2347 (-0.27z)| lr 8.24e-06 | 2531.35 ms | 53.3% bf16 MFU | 206914 tok/s +step 18152/19560 | loss 3.280327 (-0.67z)| norm 0.2255 (-0.70z)| lr 8.22e-06 | 2535.29 ms | 53.3% bf16 MFU | 206908 tok/s +step 18153/19560 | loss 3.329171 (+0.35z)| norm 0.2401 (-0.02z)| lr 8.21e-06 | 2532.79 ms | 53.3% bf16 MFU | 206912 tok/s +step 18154/19560 | loss 3.264585 (-0.99z)| norm 0.2296 (-0.50z)| lr 8.20e-06 | 2534.18 ms | 53.3% bf16 MFU | 206911 tok/s +step 18155/19560 | loss 3.286003 (-0.54z)| norm 0.2247 (-0.72z)| lr 8.19e-06 | 2535.62 ms | 53.2% bf16 MFU | 206904 tok/s +step 18156/19560 | loss 3.367886 (+1.16z)| norm 0.2731 (+1.48z)| lr 8.18e-06 | 2533.52 ms | 53.3% bf16 MFU | 206906 tok/s +step 18157/19560 | loss 3.346471 (+0.71z)| norm 0.2478 (+0.32z)| lr 8.17e-06 | 2532.14 ms | 53.3% bf16 MFU | 206913 tok/s +step 18158/19560 | loss 3.314017 (+0.02z)| norm 0.2323 (-0.39z)| lr 8.16e-06 | 2532.57 ms | 53.3% bf16 MFU | 206918 tok/s +step 18159/19560 | loss 3.301598 (-0.24z)| norm 0.2435 (+0.13z)| lr 8.14e-06 | 2532.15 ms | 53.3% bf16 MFU | 206925 tok/s +step 18160/19560 | loss 3.257334 (-1.15z)| norm 0.2382 (-0.12z)| lr 8.13e-06 | 2533.94 ms | 53.3% bf16 MFU | 206924 tok/s +step 18161/19560 | loss 3.358311 (+0.93z)| norm 0.2506 (+0.45z)| lr 8.12e-06 | 2531.80 ms | 53.3% bf16 MFU | 206932 tok/s +step 18162/19560 | loss 3.339893 (+0.54z)| norm 0.2493 (+0.39z)| lr 8.11e-06 | 2533.73 ms | 53.3% bf16 MFU | 206932 tok/s +step 18163/19560 | loss 3.348221 (+0.71z)| norm 0.2237 (-0.78z)| lr 8.10e-06 | 2531.65 ms | 53.3% bf16 MFU | 206940 tok/s +step 18164/19560 | loss 3.279306 (-0.71z)| norm 0.2287 (-0.55z)| lr 8.09e-06 | 2532.47 ms | 53.3% bf16 MFU | 206944 tok/s +step 18165/19560 | loss 3.299871 (-0.29z)| norm 0.2288 (-0.54z)| lr 8.07e-06 | 2533.40 ms | 53.3% bf16 MFU | 206944 tok/s +step 18166/19560 | loss 3.315197 (+0.02z)| norm 0.2433 (+0.12z)| lr 8.06e-06 | 2534.11 ms | 53.3% bf16 MFU | 206942 tok/s +step 18167/19560 | loss 3.326468 (+0.25z)| norm 0.2284 (-0.55z)| lr 8.05e-06 | 2532.78 ms | 53.3% bf16 MFU | 206945 tok/s +step 18168/19560 | loss 3.296723 (-0.36z)| norm 0.2363 (-0.20z)| lr 8.04e-06 | 2535.17 ms | 53.3% bf16 MFU | 206938 tok/s +step 18169/19560 | loss 3.279747 (-0.72z)| norm 0.2319 (-0.39z)| lr 8.03e-06 | 2533.15 ms | 53.3% bf16 MFU | 206939 tok/s +step 18170/19560 | loss 3.271868 (-0.87z)| norm 0.2362 (-0.19z)| lr 8.02e-06 | 2533.11 ms | 53.3% bf16 MFU | 206941 tok/s +step 18171/19560 | loss 3.314663 (+0.01z)| norm 0.2332 (-0.32z)| lr 8.01e-06 | 2533.79 ms | 53.3% bf16 MFU | 206940 tok/s +step 18172/19560 | loss 3.282751 (-0.65z)| norm 0.2296 (-0.49z)| lr 7.99e-06 | 2533.43 ms | 53.3% bf16 MFU | 206940 tok/s +step 18173/19560 | loss 3.385102 (+1.45z)| norm 0.3023 (+2.75z)| lr 7.98e-06 | 2531.81 ms | 53.3% bf16 MFU | 206947 tok/s +step 18174/19560 | loss 3.317652 (+0.07z)| norm 0.2295 (-0.50z)| lr 7.97e-06 | 2536.12 ms | 53.2% bf16 MFU | 206936 tok/s +step 18175/19560 | loss 3.342713 (+0.59z)| norm 0.2430 (+0.10z)| lr 7.96e-06 | 2534.32 ms | 53.3% bf16 MFU | 206933 tok/s +step 18176/19560 | loss 3.376667 (+1.27z)| norm 0.2533 (+0.57z)| lr 7.95e-06 | 2533.02 ms | 53.3% bf16 MFU | 206936 tok/s +step 18177/19560 | loss 3.322662 (+0.16z)| norm 0.2537 (+0.58z)| lr 7.94e-06 | 2533.55 ms | 53.3% bf16 MFU | 206936 tok/s +step 18178/19560 | loss 3.323145 (+0.17z)| norm 0.2460 (+0.23z)| lr 7.93e-06 | 2532.56 ms | 53.3% bf16 MFU | 206940 tok/s +step 18179/19560 | loss 3.383583 (+1.39z)| norm 0.2338 (-0.32z)| lr 7.91e-06 | 2534.25 ms | 53.3% bf16 MFU | 206937 tok/s +step 18180/19560 | loss 3.337265 (+0.45z)| norm 0.2577 (+0.75z)| lr 7.90e-06 | 2533.53 ms | 53.3% bf16 MFU | 206937 tok/s +step 18181/19560 | loss 3.276814 (-0.78z)| norm 0.2331 (-0.36z)| lr 7.89e-06 | 2533.74 ms | 53.3% bf16 MFU | 206936 tok/s +step 18182/19560 | loss 3.299545 (-0.31z)| norm 0.2525 (+0.50z)| lr 7.88e-06 | 2535.32 ms | 53.3% bf16 MFU | 206929 tok/s +step 18183/19560 | loss 3.358337 (+0.91z)| norm 0.2419 (+0.03z)| lr 7.87e-06 | 2533.86 ms | 53.3% bf16 MFU | 206928 tok/s +step 18184/19560 | loss 3.229929 (-1.74z)| norm 0.2472 (+0.50z)| lr 7.86e-06 | 2534.62 ms | 53.3% bf16 MFU | 206925 tok/s +step 18185/19560 | loss 3.248097 (-1.34z)| norm 0.2418 (+0.13z)| lr 7.85e-06 | 2533.36 ms | 53.3% bf16 MFU | 206926 tok/s +step 18186/19560 | loss 3.305227 (-0.13z)| norm 0.2331 (-0.46z)| lr 7.83e-06 | 2533.49 ms | 53.3% bf16 MFU | 206927 tok/s +step 18187/19560 | loss 3.313388 (+0.04z)| norm 0.2485 (+0.58z)| lr 7.82e-06 | 2533.45 ms | 53.3% bf16 MFU | 206928 tok/s +step 18188/19560 | loss 3.316425 (+0.11z)| norm 0.2336 (-0.42z)| lr 7.81e-06 | 2533.76 ms | 53.3% bf16 MFU | 206927 tok/s +step 18189/19560 | loss 3.283408 (-0.59z)| norm 0.2313 (-0.57z)| lr 7.80e-06 | 2534.40 ms | 53.3% bf16 MFU | 206925 tok/s +step 18190/19560 | loss 3.379836 (+1.43z)| norm 0.2367 (-0.21z)| lr 7.79e-06 | 2533.69 ms | 53.3% bf16 MFU | 206925 tok/s +step 18191/19560 | loss 3.335270 (+0.48z)| norm 0.2283 (-0.78z)| lr 7.78e-06 | 2532.05 ms | 53.3% bf16 MFU | 206931 tok/s +step 18192/19560 | loss 3.300164 (-0.25z)| norm 0.2456 (+0.38z)| lr 7.77e-06 | 2534.02 ms | 53.3% bf16 MFU | 206930 tok/s +step 18193/19560 | loss 3.265596 (-0.97z)| norm 0.2271 (-0.86z)| lr 7.76e-06 | 2531.58 ms | 53.3% bf16 MFU | 206938 tok/s +step 18194/19560 | loss 3.311562 (-0.01z)| norm 0.2356 (-0.28z)| lr 7.74e-06 | 2534.33 ms | 53.3% bf16 MFU | 206935 tok/s +step 18195/19560 | loss 3.248738 (-1.31z)| norm 0.2365 (-0.22z)| lr 7.73e-06 | 2533.65 ms | 53.3% bf16 MFU | 206935 tok/s +step 18196/19560 | loss 3.317905 (+0.14z)| norm 0.2282 (-0.78z)| lr 7.72e-06 | 2532.77 ms | 53.3% bf16 MFU | 206938 tok/s +step 18197/19560 | loss 3.307436 (-0.08z)| norm 0.2347 (-0.33z)| lr 7.71e-06 | 2533.53 ms | 53.3% bf16 MFU | 206938 tok/s +step 18198/19560 | loss 3.274116 (-0.77z)| norm 0.2387 (-0.06z)| lr 7.70e-06 | 2532.78 ms | 53.3% bf16 MFU | 206941 tok/s +step 18199/19560 | loss 3.269034 (-0.87z)| norm 0.2403 (+0.04z)| lr 7.69e-06 | 2535.08 ms | 53.3% bf16 MFU | 206935 tok/s +step 18200/19560 | loss 3.364650 (+1.11z)| norm 0.2294 (-0.69z)| lr 7.68e-06 | 2531.69 ms | 53.3% bf16 MFU | 206943 tok/s +step 18201/19560 | loss 3.320539 (+0.19z)| norm 0.2410 (+0.09z)| lr 7.67e-06 | 2534.12 ms | 53.3% bf16 MFU | 206940 tok/s +step 18202/19560 | loss 3.320339 (+0.18z)| norm 0.2231 (-1.10z)| lr 7.65e-06 | 2535.16 ms | 53.3% bf16 MFU | 206933 tok/s +step 18203/19560 | loss 3.307879 (-0.08z)| norm 0.2298 (-0.65z)| lr 7.64e-06 | 2534.27 ms | 53.3% bf16 MFU | 206931 tok/s +step 18204/19560 | loss 3.249699 (-1.27z)| norm 0.2356 (-0.26z)| lr 7.63e-06 | 2533.51 ms | 53.3% bf16 MFU | 206931 tok/s +step 18205/19560 | loss 3.406561 (+1.93z)| norm 0.2351 (-0.28z)| lr 7.62e-06 | 2534.72 ms | 53.3% bf16 MFU | 206927 tok/s +step 18206/19560 | loss 3.280008 (-0.64z)| norm 0.2283 (-0.74z)| lr 7.61e-06 | 2533.15 ms | 53.3% bf16 MFU | 206929 tok/s +step 18207/19560 | loss 3.309582 (-0.03z)| norm 0.2238 (-1.03z)| lr 7.60e-06 | 2530.19 ms | 53.4% bf16 MFU | 206943 tok/s +step 18208/19560 | loss 3.297333 (-0.27z)| norm 0.2295 (-0.64z)| lr 7.59e-06 | 2533.72 ms | 53.3% bf16 MFU | 206942 tok/s +step 18209/19560 | loss 3.284458 (-0.54z)| norm 0.2372 (-0.13z)| lr 7.58e-06 | 2533.74 ms | 53.3% bf16 MFU | 206941 tok/s +step 18210/19560 | loss 3.300645 (-0.20z)| norm 0.2541 (+0.99z)| lr 7.56e-06 | 2534.73 ms | 53.3% bf16 MFU | 206936 tok/s +step 18211/19560 | loss 3.271315 (-0.80z)| norm 0.2385 (-0.05z)| lr 7.55e-06 | 2533.46 ms | 53.3% bf16 MFU | 206937 tok/s +step 18212/19560 | loss 3.282992 (-0.55z)| norm 0.2386 (-0.05z)| lr 7.54e-06 | 2533.18 ms | 53.3% bf16 MFU | 206938 tok/s +step 18213/19560 | loss 3.372564 (+1.31z)| norm 0.2920 (+3.33z)| lr 7.53e-06 | 2535.44 ms | 53.3% bf16 MFU | 206931 tok/s +step 18214/19560 | loss 3.299826 (-0.19z)| norm 0.2478 (+0.50z)| lr 7.52e-06 | 2534.44 ms | 53.3% bf16 MFU | 206927 tok/s +step 18215/19560 | loss 3.315319 (+0.19z)| norm 0.2307 (-0.60z)| lr 7.51e-06 | 2534.26 ms | 53.3% bf16 MFU | 206925 tok/s +step 18216/19560 | loss 3.252809 (-1.26z)| norm 0.2369 (-0.17z)| lr 7.50e-06 | 2531.42 ms | 53.3% bf16 MFU | 206934 tok/s +step 18217/19560 | loss 3.347706 (+0.94z)| norm 0.2381 (-0.09z)| lr 7.49e-06 | 2535.01 ms | 53.3% bf16 MFU | 206929 tok/s +step 18218/19560 | loss 3.315316 (+0.18z)| norm 0.2623 (+1.57z)| lr 7.48e-06 | 2534.50 ms | 53.3% bf16 MFU | 206925 tok/s +step 18219/19560 | loss 3.335075 (+0.63z)| norm 0.2462 (+0.44z)| lr 7.46e-06 | 2536.08 ms | 53.2% bf16 MFU | 206916 tok/s +step 18220/19560 | loss 3.300220 (-0.18z)| norm 0.2359 (-0.25z)| lr 7.45e-06 | 2533.32 ms | 53.3% bf16 MFU | 206918 tok/s +step 18221/19560 | loss 3.327398 (+0.46z)| norm 0.2325 (-0.49z)| lr 7.44e-06 | 2533.38 ms | 53.3% bf16 MFU | 206919 tok/s +step 18222/19560 | loss 3.295597 (-0.29z)| norm 0.2406 (+0.07z)| lr 7.43e-06 | 2534.04 ms | 53.3% bf16 MFU | 206918 tok/s +step 18223/19560 | loss 3.247524 (-1.40z)| norm 0.2411 (+0.10z)| lr 7.42e-06 | 2532.53 ms | 53.3% bf16 MFU | 206923 tok/s +step 18224/19560 | loss 3.284976 (-0.53z)| norm 0.2326 (-0.50z)| lr 7.41e-06 | 2534.86 ms | 53.3% bf16 MFU | 206919 tok/s +step 18225/19560 | loss 3.270250 (-0.86z)| norm 0.2355 (-0.29z)| lr 7.40e-06 | 2535.97 ms | 53.2% bf16 MFU | 206910 tok/s +step 18226/19560 | loss 3.274685 (-0.75z)| norm 0.2456 (+0.41z)| lr 7.39e-06 | 2534.00 ms | 53.3% bf16 MFU | 206909 tok/s +step 18227/19560 | loss 3.277331 (-0.67z)| norm 0.2381 (-0.12z)| lr 7.38e-06 | 2534.65 ms | 53.3% bf16 MFU | 206906 tok/s +step 18228/19560 | loss 3.325856 (+0.44z)| norm 0.2693 (+2.02z)| lr 7.37e-06 | 2531.40 ms | 53.3% bf16 MFU | 206917 tok/s +step 18229/19560 | loss 3.297605 (-0.21z)| norm 0.2294 (-0.74z)| lr 7.35e-06 | 2533.19 ms | 53.3% bf16 MFU | 206919 tok/s +step 18230/19560 | loss 3.287754 (-0.45z)| norm 0.2324 (-0.52z)| lr 7.34e-06 | 2534.47 ms | 53.3% bf16 MFU | 206916 tok/s +step 18231/19560 | loss 3.335612 (+0.69z)| norm 0.2547 (+1.02z)| lr 7.33e-06 | 2532.40 ms | 53.3% bf16 MFU | 206922 tok/s +step 18232/19560 | loss 3.286843 (-0.46z)| norm 0.2357 (-0.29z)| lr 7.32e-06 | 2532.32 ms | 53.3% bf16 MFU | 206928 tok/s +step 18233/19560 | loss 3.262913 (-1.02z)| norm 0.2476 (+0.68z)| lr 7.31e-06 | 2531.41 ms | 53.3% bf16 MFU | 206937 tok/s +step 18234/19560 | loss 3.306542 (+0.03z)| norm 0.2490 (+0.79z)| lr 7.30e-06 | 2531.93 ms | 53.3% bf16 MFU | 206944 tok/s +step 18235/19560 | loss 3.281117 (-0.58z)| norm 0.2329 (-0.52z)| lr 7.29e-06 | 2531.57 ms | 53.3% bf16 MFU | 206952 tok/s +step 18236/19560 | loss 3.289916 (-0.36z)| norm 0.2347 (-0.37z)| lr 7.28e-06 | 2530.72 ms | 53.4% bf16 MFU | 206963 tok/s +step 18237/19560 | loss 3.293638 (-0.26z)| norm 0.2320 (-0.58z)| lr 7.27e-06 | 2532.86 ms | 53.3% bf16 MFU | 206964 tok/s +step 18238/19560 | loss 3.229506 (-1.83z)| norm 0.2296 (-0.77z)| lr 7.26e-06 | 2531.93 ms | 53.3% bf16 MFU | 206969 tok/s +step 18239/19560 | loss 3.257186 (-1.14z)| norm 0.2280 (-0.89z)| lr 7.24e-06 | 2532.36 ms | 53.3% bf16 MFU | 206973 tok/s +step 18240/19560 | loss 3.419738 (+2.72z)| norm 0.2386 (-0.04z)| lr 7.23e-06 | 2534.00 ms | 53.3% bf16 MFU | 206969 tok/s +step 18241/19560 | loss 3.310446 (+0.10z)| norm 0.2305 (-0.70z)| lr 7.22e-06 | 2532.32 ms | 53.3% bf16 MFU | 206973 tok/s +step 18242/19560 | loss 3.249420 (-1.36z)| norm 0.2346 (-0.36z)| lr 7.21e-06 | 2533.27 ms | 53.3% bf16 MFU | 206972 tok/s +step 18243/19560 | loss 3.388103 (+1.94z)| norm 0.2449 (+0.49z)| lr 7.20e-06 | 2532.36 ms | 53.3% bf16 MFU | 206975 tok/s +step 18244/19560 | loss 3.242875 (-1.49z)| norm 0.2328 (-0.49z)| lr 7.19e-06 | 2535.41 ms | 53.3% bf16 MFU | 206966 tok/s +step 18245/19560 | loss 3.303428 (-0.07z)| norm 0.2318 (-0.58z)| lr 7.18e-06 | 2533.81 ms | 53.3% bf16 MFU | 206963 tok/s +step 18246/19560 | loss 3.275480 (-0.73z)| norm 0.2381 (-0.07z)| lr 7.17e-06 | 2535.59 ms | 53.2% bf16 MFU | 206954 tok/s +step 18247/19560 | loss 3.297342 (-0.23z)| norm 0.2747 (+2.82z)| lr 7.16e-06 | 2533.16 ms | 53.3% bf16 MFU | 206955 tok/s +step 18248/19560 | loss 3.423512 (+2.70z)| norm 0.3584 (+7.22z)| lr 7.15e-06 | 2534.70 ms | 53.3% bf16 MFU | 206949 tok/s +step 18249/19560 | loss 3.269764 (-0.90z)| norm 0.2342 (-0.36z)| lr 7.14e-06 | 2533.80 ms | 53.3% bf16 MFU | 206948 tok/s +step 18250/19560 | loss 3.265060 (-1.00z)| norm 0.2640 (+1.43z)| lr 7.13e-06 | 2533.21 ms | 53.3% bf16 MFU | 206948 tok/s +val loss 3.286863 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3035/10042 = 0.302231 +step 18251/19560 | loss 3.273795 (-0.79z)| norm 0.2286 (-0.71z)| lr 7.11e-06 | 2532.57 ms | 53.3% bf16 MFU | 206952 tok/s +step 18252/19560 | loss 3.309494 (+0.04z)| norm 0.2389 (-0.09z)| lr 7.10e-06 | 2532.21 ms | 53.3% bf16 MFU | 206957 tok/s +step 18253/19560 | loss 3.393507 (+1.95z)| norm 0.2486 (+0.51z)| lr 7.09e-06 | 2532.03 ms | 53.3% bf16 MFU | 206962 tok/s +step 18254/19560 | loss 3.237584 (-1.60z)| norm 0.2548 (+0.87z)| lr 7.08e-06 | 2534.36 ms | 53.3% bf16 MFU | 206957 tok/s +step 18255/19560 | loss 3.246431 (-1.38z)| norm 0.2295 (-0.67z)| lr 7.07e-06 | 2532.06 ms | 53.3% bf16 MFU | 206963 tok/s +step 18256/19560 | loss 3.278948 (-0.65z)| norm 0.2533 (+0.77z)| lr 7.06e-06 | 2531.96 ms | 53.3% bf16 MFU | 206968 tok/s +step 18257/19560 | loss 3.380798 (+1.62z)| norm 0.2331 (-0.45z)| lr 7.05e-06 | 2531.87 ms | 53.3% bf16 MFU | 206973 tok/s +step 18258/19560 | loss 3.336781 (+0.63z)| norm 0.2209 (-1.17z)| lr 7.04e-06 | 2532.49 ms | 53.3% bf16 MFU | 206976 tok/s +step 18259/19560 | loss 3.230517 (-1.79z)| norm 0.2205 (-1.18z)| lr 7.03e-06 | 2531.83 ms | 53.3% bf16 MFU | 206981 tok/s +step 18260/19560 | loss 3.280152 (-0.62z)| norm 0.2228 (-1.02z)| lr 7.02e-06 | 2532.36 ms | 53.3% bf16 MFU | 206984 tok/s +step 18261/19560 | loss 3.272488 (-0.79z)| norm 0.2308 (-0.54z)| lr 7.01e-06 | 2534.20 ms | 53.3% bf16 MFU | 206979 tok/s +step 18262/19560 | loss 3.282411 (-0.55z)| norm 0.2297 (-0.60z)| lr 7.00e-06 | 2532.85 ms | 53.3% bf16 MFU | 206980 tok/s +step 18263/19560 | loss 3.297803 (-0.19z)| norm 0.2372 (-0.15z)| lr 6.98e-06 | 2534.48 ms | 53.3% bf16 MFU | 206974 tok/s +step 18264/19560 | loss 3.214877 (-2.17z)| norm 0.2269 (-0.76z)| lr 6.97e-06 | 2534.54 ms | 53.3% bf16 MFU | 206968 tok/s +step 18265/19560 | loss 3.323540 (+0.48z)| norm 0.2323 (-0.44z)| lr 6.96e-06 | 2534.26 ms | 53.3% bf16 MFU | 206963 tok/s +step 18266/19560 | loss 3.358754 (+1.32z)| norm 0.2340 (-0.33z)| lr 6.95e-06 | 2533.49 ms | 53.3% bf16 MFU | 206962 tok/s +step 18267/19560 | loss 3.293472 (-0.25z)| norm 0.2372 (-0.13z)| lr 6.94e-06 | 2532.95 ms | 53.3% bf16 MFU | 206964 tok/s +step 18268/19560 | loss 3.277125 (-0.64z)| norm 0.2224 (-1.02z)| lr 6.93e-06 | 2533.52 ms | 53.3% bf16 MFU | 206963 tok/s +step 18269/19560 | loss 3.271284 (-0.78z)| norm 0.2321 (-0.44z)| lr 6.92e-06 | 2533.35 ms | 53.3% bf16 MFU | 206962 tok/s +step 18270/19560 | loss 3.265481 (-0.91z)| norm 0.2275 (-0.71z)| lr 6.91e-06 | 2535.85 ms | 53.2% bf16 MFU | 206952 tok/s +step 18271/19560 | loss 3.226485 (-1.82z)| norm 0.2296 (-0.58z)| lr 6.90e-06 | 2534.06 ms | 53.3% bf16 MFU | 206949 tok/s +step 18272/19560 | loss 3.260858 (-1.01z)| norm 0.2471 (+0.47z)| lr 6.89e-06 | 2534.28 ms | 53.3% bf16 MFU | 206945 tok/s +step 18273/19560 | loss 3.335106 (+0.76z)| norm 0.2334 (-0.36z)| lr 6.88e-06 | 2534.43 ms | 53.3% bf16 MFU | 206941 tok/s +step 18274/19560 | loss 3.279535 (-0.56z)| norm 0.2392 (-0.01z)| lr 6.87e-06 | 2535.08 ms | 53.3% bf16 MFU | 206935 tok/s +step 18275/19560 | loss 3.256021 (-1.11z)| norm 0.2409 (+0.09z)| lr 6.86e-06 | 2534.82 ms | 53.3% bf16 MFU | 206930 tok/s +step 18276/19560 | loss 3.262875 (-0.93z)| norm 0.2277 (-0.71z)| lr 6.85e-06 | 2532.62 ms | 53.3% bf16 MFU | 206934 tok/s +step 18277/19560 | loss 3.308468 (+0.15z)| norm 0.2344 (-0.31z)| lr 6.84e-06 | 2533.51 ms | 53.3% bf16 MFU | 206934 tok/s +step 18278/19560 | loss 3.274912 (-0.64z)| norm 0.2307 (-0.53z)| lr 6.83e-06 | 2531.45 ms | 53.3% bf16 MFU | 206943 tok/s +step 18279/19560 | loss 3.217891 (-1.96z)| norm 0.2358 (-0.22z)| lr 6.81e-06 | 2533.92 ms | 53.3% bf16 MFU | 206941 tok/s +step 18280/19560 | loss 3.271729 (-0.69z)| norm 0.2532 (+0.82z)| lr 6.80e-06 | 2532.62 ms | 53.3% bf16 MFU | 206945 tok/s +step 18281/19560 | loss 3.377425 (+1.77z)| norm 0.2315 (-0.49z)| lr 6.79e-06 | 2534.48 ms | 53.3% bf16 MFU | 206941 tok/s +step 18282/19560 | loss 3.284257 (-0.40z)| norm 0.2443 (+0.28z)| lr 6.78e-06 | 2530.72 ms | 53.4% bf16 MFU | 206952 tok/s +step 18283/19560 | loss 3.279922 (-0.50z)| norm 0.2329 (-0.42z)| lr 6.77e-06 | 2532.60 ms | 53.3% bf16 MFU | 206956 tok/s +step 18284/19560 | loss 3.285775 (-0.36z)| norm 0.2330 (-0.40z)| lr 6.76e-06 | 2533.31 ms | 53.3% bf16 MFU | 206956 tok/s +step 18285/19560 | loss 3.312446 (+0.28z)| norm 0.2264 (-0.79z)| lr 6.75e-06 | 2534.53 ms | 53.3% bf16 MFU | 206951 tok/s +step 18286/19560 | loss 3.266088 (-0.81z)| norm 0.2284 (-0.67z)| lr 6.74e-06 | 2533.14 ms | 53.3% bf16 MFU | 206952 tok/s +step 18287/19560 | loss 3.269681 (-0.71z)| norm 0.2297 (-0.58z)| lr 6.73e-06 | 2532.74 ms | 53.3% bf16 MFU | 206954 tok/s +step 18288/19560 | loss 3.308243 (+0.18z)| norm 0.2294 (-0.60z)| lr 6.72e-06 | 2532.84 ms | 53.3% bf16 MFU | 206957 tok/s +step 18289/19560 | loss 3.281916 (-0.43z)| norm 0.2333 (-0.35z)| lr 6.71e-06 | 2534.58 ms | 53.3% bf16 MFU | 206951 tok/s +step 18290/19560 | loss 3.314378 (+0.35z)| norm 0.2238 (-0.92z)| lr 6.70e-06 | 2533.95 ms | 53.3% bf16 MFU | 206949 tok/s +step 18291/19560 | loss 3.230578 (-1.62z)| norm 0.2315 (-0.45z)| lr 6.69e-06 | 2531.42 ms | 53.3% bf16 MFU | 206957 tok/s +step 18292/19560 | loss 3.286328 (-0.30z)| norm 0.2330 (-0.36z)| lr 6.68e-06 | 2532.05 ms | 53.3% bf16 MFU | 206962 tok/s +step 18293/19560 | loss 3.557843 (+5.36z)| norm 0.2906 (+3.06z)| lr 6.67e-06 | 2534.26 ms | 53.3% bf16 MFU | 206958 tok/s +step 18294/19560 | loss 3.274870 (-0.53z)| norm 0.2298 (-0.56z)| lr 6.66e-06 | 2532.28 ms | 53.3% bf16 MFU | 206962 tok/s +step 18295/19560 | loss 3.294919 (-0.11z)| norm 0.2345 (-0.28z)| lr 6.65e-06 | 2533.26 ms | 53.3% bf16 MFU | 206962 tok/s +step 18296/19560 | loss 3.287056 (-0.27z)| norm 0.2344 (-0.29z)| lr 6.64e-06 | 2533.69 ms | 53.3% bf16 MFU | 206961 tok/s +step 18297/19560 | loss 3.335931 (+0.74z)| norm 0.2277 (-0.68z)| lr 6.63e-06 | 2531.71 ms | 53.3% bf16 MFU | 206967 tok/s +step 18298/19560 | loss 3.246205 (-1.12z)| norm 0.2553 (+0.95z)| lr 6.61e-06 | 2534.77 ms | 53.3% bf16 MFU | 206961 tok/s +step 18299/19560 | loss 3.262763 (-0.77z)| norm 0.2302 (-0.54z)| lr 6.60e-06 | 2534.74 ms | 53.3% bf16 MFU | 206955 tok/s +step 18300/19560 | loss 3.242188 (-1.18z)| norm 0.2292 (-0.60z)| lr 6.59e-06 | 2534.13 ms | 53.3% bf16 MFU | 206951 tok/s +step 18301/19560 | loss 3.274024 (-0.52z)| norm 0.2368 (-0.13z)| lr 6.58e-06 | 2534.23 ms | 53.3% bf16 MFU | 206948 tok/s +step 18302/19560 | loss 3.318047 (+0.40z)| norm 0.2286 (-0.64z)| lr 6.57e-06 | 2535.85 ms | 53.2% bf16 MFU | 206938 tok/s +step 18303/19560 | loss 3.285120 (-0.28z)| norm 0.2663 (+1.69z)| lr 6.56e-06 | 2532.18 ms | 53.3% bf16 MFU | 206944 tok/s +step 18304/19560 | loss 3.275205 (-0.47z)| norm 0.2268 (-0.75z)| lr 6.55e-06 | 2533.18 ms | 53.3% bf16 MFU | 206945 tok/s +step 18305/19560 | loss 3.304410 (+0.15z)| norm 0.2275 (-0.69z)| lr 6.54e-06 | 2531.20 ms | 53.3% bf16 MFU | 206954 tok/s +step 18306/19560 | loss 3.325988 (+0.60z)| norm 0.2338 (-0.29z)| lr 6.53e-06 | 2533.87 ms | 53.3% bf16 MFU | 206952 tok/s +step 18307/19560 | loss 3.343078 (+0.98z)| norm 0.2606 (+1.35z)| lr 6.52e-06 | 2533.83 ms | 53.3% bf16 MFU | 206950 tok/s +step 18308/19560 | loss 3.304365 (+0.16z)| norm 0.2254 (-0.81z)| lr 6.51e-06 | 2532.60 ms | 53.3% bf16 MFU | 206954 tok/s +step 18309/19560 | loss 3.272696 (-0.52z)| norm 0.2505 (+0.74z)| lr 6.50e-06 | 2532.13 ms | 53.3% bf16 MFU | 206959 tok/s +step 18310/19560 | loss 3.292964 (-0.08z)| norm 0.2428 (+0.27z)| lr 6.49e-06 | 2534.36 ms | 53.3% bf16 MFU | 206954 tok/s +step 18311/19560 | loss 3.234214 (-1.32z)| norm 0.2430 (+0.28z)| lr 6.48e-06 | 2532.16 ms | 53.3% bf16 MFU | 206959 tok/s +step 18312/19560 | loss 3.260910 (-0.76z)| norm 0.2376 (-0.05z)| lr 6.47e-06 | 2535.15 ms | 53.3% bf16 MFU | 206952 tok/s +step 18313/19560 | loss 3.293679 (-0.06z)| norm 0.2307 (-0.47z)| lr 6.46e-06 | 2532.97 ms | 53.3% bf16 MFU | 206953 tok/s +step 18314/19560 | loss 3.276756 (-0.42z)| norm 0.2436 (+0.32z)| lr 6.45e-06 | 2532.33 ms | 53.3% bf16 MFU | 206957 tok/s +step 18315/19560 | loss 3.252658 (-0.93z)| norm 0.2431 (+0.29z)| lr 6.44e-06 | 2532.71 ms | 53.3% bf16 MFU | 206960 tok/s +step 18316/19560 | loss 3.276511 (-0.41z)| norm 0.2276 (-0.67z)| lr 6.43e-06 | 2535.14 ms | 53.3% bf16 MFU | 206952 tok/s +step 18317/19560 | loss 3.313620 (+0.39z)| norm 0.2362 (-0.13z)| lr 6.42e-06 | 2532.59 ms | 53.3% bf16 MFU | 206956 tok/s +step 18318/19560 | loss 3.216389 (-1.68z)| norm 0.2251 (-0.82z)| lr 6.41e-06 | 2531.07 ms | 53.3% bf16 MFU | 206965 tok/s +step 18319/19560 | loss 3.264009 (-0.64z)| norm 0.2478 (+0.58z)| lr 6.40e-06 | 2534.38 ms | 53.3% bf16 MFU | 206960 tok/s +step 18320/19560 | loss 3.368925 (+1.59z)| norm 0.2529 (+0.89z)| lr 6.39e-06 | 2532.44 ms | 53.3% bf16 MFU | 206964 tok/s +step 18321/19560 | loss 3.242300 (-1.10z)| norm 0.2336 (-0.31z)| lr 6.38e-06 | 2535.38 ms | 53.3% bf16 MFU | 206955 tok/s +step 18322/19560 | loss 3.239740 (-1.14z)| norm 0.2532 (+0.90z)| lr 6.37e-06 | 2532.00 ms | 53.3% bf16 MFU | 206960 tok/s +step 18323/19560 | loss 3.494811 (+3.97z)| norm 0.2646 (+1.57z)| lr 6.36e-06 | 2534.43 ms | 53.3% bf16 MFU | 206956 tok/s +step 18324/19560 | loss 3.306249 (+0.22z)| norm 0.2394 (+0.02z)| lr 6.35e-06 | 2535.19 ms | 53.3% bf16 MFU | 206948 tok/s +step 18325/19560 | loss 3.362405 (+1.32z)| norm 0.2230 (-0.97z)| lr 6.34e-06 | 2534.86 ms | 53.3% bf16 MFU | 206942 tok/s +step 18326/19560 | loss 3.282144 (-0.27z)| norm 0.2419 (+0.18z)| lr 6.33e-06 | 2533.86 ms | 53.3% bf16 MFU | 206941 tok/s +step 18327/19560 | loss 3.265653 (-0.60z)| norm 0.2230 (-0.96z)| lr 6.32e-06 | 2532.77 ms | 53.3% bf16 MFU | 206944 tok/s +step 18328/19560 | loss 3.318036 (+0.45z)| norm 0.2331 (-0.35z)| lr 6.31e-06 | 2531.81 ms | 53.3% bf16 MFU | 206951 tok/s +step 18329/19560 | loss 3.270553 (-0.49z)| norm 0.2290 (-0.59z)| lr 6.30e-06 | 2534.41 ms | 53.3% bf16 MFU | 206946 tok/s +step 18330/19560 | loss 3.250744 (-0.87z)| norm 0.2462 (+0.44z)| lr 6.28e-06 | 2536.12 ms | 53.2% bf16 MFU | 206935 tok/s +step 18331/19560 | loss 3.198157 (-1.87z)| norm 0.2439 (+0.30z)| lr 6.27e-06 | 2533.00 ms | 53.3% bf16 MFU | 206938 tok/s +step 18332/19560 | loss 3.248399 (-0.89z)| norm 0.2313 (-0.47z)| lr 6.26e-06 | 2535.34 ms | 53.3% bf16 MFU | 206931 tok/s +step 18333/19560 | loss 3.291106 (-0.03z)| norm 0.2436 (+0.28z)| lr 6.25e-06 | 2533.00 ms | 53.3% bf16 MFU | 206933 tok/s +step 18334/19560 | loss 3.268923 (-0.48z)| norm 0.2438 (+0.28z)| lr 6.24e-06 | 2535.15 ms | 53.3% bf16 MFU | 206927 tok/s +step 18335/19560 | loss 3.291495 (-0.02z)| norm 0.2337 (-0.34z)| lr 6.23e-06 | 2533.04 ms | 53.3% bf16 MFU | 206930 tok/s +step 18336/19560 | loss 3.321764 (+0.58z)| norm 0.2352 (-0.25z)| lr 6.22e-06 | 2533.12 ms | 53.3% bf16 MFU | 206932 tok/s +step 18337/19560 | loss 3.287040 (-0.12z)| norm 0.2279 (-0.69z)| lr 6.21e-06 | 2533.18 ms | 53.3% bf16 MFU | 206934 tok/s +step 18338/19560 | loss 3.272343 (-0.40z)| norm 0.2389 (-0.01z)| lr 6.20e-06 | 2534.00 ms | 53.3% bf16 MFU | 206932 tok/s +step 18339/19560 | loss 3.334825 (+0.83z)| norm 0.2329 (-0.38z)| lr 6.19e-06 | 2532.36 ms | 53.3% bf16 MFU | 206937 tok/s +step 18340/19560 | loss 3.325075 (+0.63z)| norm 0.2425 (+0.21z)| lr 6.18e-06 | 2531.75 ms | 53.3% bf16 MFU | 206945 tok/s +step 18341/19560 | loss 3.319597 (+0.53z)| norm 0.2373 (-0.09z)| lr 6.17e-06 | 2532.64 ms | 53.3% bf16 MFU | 206948 tok/s +step 18342/19560 | loss 3.336244 (+0.86z)| norm 0.2388 (+0.01z)| lr 6.16e-06 | 2532.10 ms | 53.3% bf16 MFU | 206953 tok/s +step 18343/19560 | loss 3.283980 (-0.18z)| norm 0.2318 (-0.44z)| lr 6.15e-06 | 2533.32 ms | 53.3% bf16 MFU | 206954 tok/s +step 18344/19560 | loss 3.270609 (-0.45z)| norm 0.2415 (+0.19z)| lr 6.14e-06 | 2533.38 ms | 53.3% bf16 MFU | 206953 tok/s +step 18345/19560 | loss 3.283612 (-0.18z)| norm 0.2285 (-0.64z)| lr 6.13e-06 | 2533.09 ms | 53.3% bf16 MFU | 206955 tok/s +step 18346/19560 | loss 3.303879 (+0.23z)| norm 0.2603 (+1.39z)| lr 6.12e-06 | 2532.41 ms | 53.3% bf16 MFU | 206958 tok/s +step 18347/19560 | loss 3.291785 (-0.01z)| norm 0.2289 (-0.61z)| lr 6.11e-06 | 2535.10 ms | 53.3% bf16 MFU | 206951 tok/s +step 18348/19560 | loss 3.360777 (+1.37z)| norm 0.2307 (-0.49z)| lr 6.10e-06 | 2535.22 ms | 53.3% bf16 MFU | 206944 tok/s +step 18349/19560 | loss 3.257329 (-0.70z)| norm 0.2268 (-0.73z)| lr 6.09e-06 | 2534.61 ms | 53.3% bf16 MFU | 206939 tok/s +step 18350/19560 | loss 3.258217 (-0.67z)| norm 0.2271 (-0.71z)| lr 6.08e-06 | 2533.10 ms | 53.3% bf16 MFU | 206941 tok/s +step 18351/19560 | loss 3.250996 (-0.82z)| norm 0.2287 (-0.60z)| lr 6.07e-06 | 2534.06 ms | 53.3% bf16 MFU | 206939 tok/s +step 18352/19560 | loss 3.239927 (-1.03z)| norm 0.2521 (+0.88z)| lr 6.06e-06 | 2532.22 ms | 53.3% bf16 MFU | 206944 tok/s +step 18353/19560 | loss 3.204854 (-1.70z)| norm 0.2367 (-0.10z)| lr 6.05e-06 | 2534.09 ms | 53.3% bf16 MFU | 206941 tok/s +step 18354/19560 | loss 3.298497 (+0.14z)| norm 0.2306 (-0.48z)| lr 6.04e-06 | 2532.44 ms | 53.3% bf16 MFU | 206946 tok/s +step 18355/19560 | loss 3.302229 (+0.21z)| norm 0.2336 (-0.28z)| lr 6.03e-06 | 2536.38 ms | 53.2% bf16 MFU | 206934 tok/s +step 18356/19560 | loss 3.285861 (-0.10z)| norm 0.2512 (+0.85z)| lr 6.02e-06 | 2531.71 ms | 53.3% bf16 MFU | 206942 tok/s +step 18357/19560 | loss 3.252354 (-0.76z)| norm 0.2416 (+0.23z)| lr 6.01e-06 | 2534.69 ms | 53.3% bf16 MFU | 206937 tok/s +step 18358/19560 | loss 3.394187 (+1.99z)| norm 0.2947 (+3.44z)| lr 6.00e-06 | 2534.12 ms | 53.3% bf16 MFU | 206934 tok/s +step 18359/19560 | loss 3.250806 (-0.78z)| norm 0.2271 (-0.69z)| lr 5.99e-06 | 2532.88 ms | 53.3% bf16 MFU | 206937 tok/s +step 18360/19560 | loss 3.266233 (-0.48z)| norm 0.2443 (+0.36z)| lr 5.98e-06 | 2532.31 ms | 53.3% bf16 MFU | 206942 tok/s +step 18361/19560 | loss 3.302291 (+0.22z)| norm 0.2423 (+0.24z)| lr 5.97e-06 | 2533.53 ms | 53.3% bf16 MFU | 206942 tok/s +step 18362/19560 | loss 3.262535 (-0.55z)| norm 0.2435 (+0.32z)| lr 5.96e-06 | 2532.72 ms | 53.3% bf16 MFU | 206946 tok/s +step 18363/19560 | loss 3.315968 (+0.48z)| norm 0.2249 (-0.82z)| lr 5.95e-06 | 2533.35 ms | 53.3% bf16 MFU | 206946 tok/s +step 18364/19560 | loss 3.314889 (+0.46z)| norm 0.2373 (-0.06z)| lr 5.94e-06 | 2531.90 ms | 53.3% bf16 MFU | 206952 tok/s +step 18365/19560 | loss 3.290132 (-0.02z)| norm 0.2362 (-0.13z)| lr 5.93e-06 | 2533.60 ms | 53.3% bf16 MFU | 206951 tok/s +step 18366/19560 | loss 3.337791 (+0.89z)| norm 0.2433 (+0.30z)| lr 5.92e-06 | 2533.44 ms | 53.3% bf16 MFU | 206951 tok/s +step 18367/19560 | loss 3.354746 (+1.20z)| norm 0.2272 (-0.69z)| lr 5.91e-06 | 2533.70 ms | 53.3% bf16 MFU | 206950 tok/s +step 18368/19560 | loss 3.252150 (-0.78z)| norm 0.2366 (-0.11z)| lr 5.90e-06 | 2533.34 ms | 53.3% bf16 MFU | 206950 tok/s +step 18369/19560 | loss 3.284288 (-0.14z)| norm 0.2345 (-0.24z)| lr 5.89e-06 | 2533.66 ms | 53.3% bf16 MFU | 206949 tok/s +step 18370/19560 | loss 3.288100 (-0.07z)| norm 0.2582 (+1.20z)| lr 5.88e-06 | 2531.57 ms | 53.3% bf16 MFU | 206957 tok/s +step 18371/19560 | loss 3.252103 (-0.77z)| norm 0.2294 (-0.56z)| lr 5.87e-06 | 2533.01 ms | 53.3% bf16 MFU | 206958 tok/s +step 18372/19560 | loss 3.273880 (-0.34z)| norm 0.2412 (+0.16z)| lr 5.86e-06 | 2535.96 ms | 53.2% bf16 MFU | 206947 tok/s +step 18373/19560 | loss 3.334360 (+0.87z)| norm 0.2251 (-0.82z)| lr 5.85e-06 | 2532.90 ms | 53.3% bf16 MFU | 206949 tok/s +step 18374/19560 | loss 3.281294 (-0.20z)| norm 0.2404 (+0.11z)| lr 5.85e-06 | 2533.29 ms | 53.3% bf16 MFU | 206950 tok/s +step 18375/19560 | loss 3.288822 (-0.04z)| norm 0.2225 (-0.97z)| lr 5.84e-06 | 2532.92 ms | 53.3% bf16 MFU | 206952 tok/s +step 18376/19560 | loss 3.265326 (-0.51z)| norm 0.2395 (+0.19z)| lr 5.83e-06 | 2534.90 ms | 53.3% bf16 MFU | 206946 tok/s +step 18377/19560 | loss 3.245985 (-0.90z)| norm 0.2340 (-0.27z)| lr 5.82e-06 | 2532.55 ms | 53.3% bf16 MFU | 206949 tok/s +step 18378/19560 | loss 3.364404 (+1.52z)| norm 0.2338 (-0.27z)| lr 5.81e-06 | 2533.77 ms | 53.3% bf16 MFU | 206948 tok/s +step 18379/19560 | loss 3.334461 (+0.89z)| norm 0.2320 (-0.42z)| lr 5.80e-06 | 2533.57 ms | 53.3% bf16 MFU | 206947 tok/s +step 18380/19560 | loss 3.268667 (-0.45z)| norm 0.2295 (-0.63z)| lr 5.79e-06 | 2533.86 ms | 53.3% bf16 MFU | 206945 tok/s +step 18381/19560 | loss 3.318503 (+0.59z)| norm 0.2309 (-0.50z)| lr 5.78e-06 | 2534.76 ms | 53.3% bf16 MFU | 206940 tok/s +step 18382/19560 | loss 3.321659 (+0.65z)| norm 0.2325 (-0.35z)| lr 5.77e-06 | 2535.67 ms | 53.2% bf16 MFU | 206931 tok/s +step 18383/19560 | loss 3.289492 (-0.03z)| norm 0.2310 (-0.48z)| lr 5.76e-06 | 2535.11 ms | 53.3% bf16 MFU | 206925 tok/s +step 18384/19560 | loss 3.306119 (+0.31z)| norm 0.2388 (+0.20z)| lr 5.75e-06 | 2535.52 ms | 53.3% bf16 MFU | 206918 tok/s +step 18385/19560 | loss 3.258913 (-0.66z)| norm 0.2397 (+0.27z)| lr 5.74e-06 | 2534.04 ms | 53.3% bf16 MFU | 206917 tok/s +step 18386/19560 | loss 3.264712 (-0.53z)| norm 0.2483 (+1.00z)| lr 5.73e-06 | 2531.56 ms | 53.3% bf16 MFU | 206926 tok/s +step 18387/19560 | loss 3.250902 (-0.83z)| norm 0.2262 (-0.92z)| lr 5.72e-06 | 2532.35 ms | 53.3% bf16 MFU | 206932 tok/s +step 18388/19560 | loss 3.250462 (-0.83z)| norm 0.2294 (-0.66z)| lr 5.71e-06 | 2533.41 ms | 53.3% bf16 MFU | 206933 tok/s +step 18389/19560 | loss 3.295320 (+0.12z)| norm 0.5561 (+10.44z)| lr 5.70e-06 | 2533.65 ms | 53.3% bf16 MFU | 206932 tok/s +step 18390/19560 | loss 3.289372 (-0.01z)| norm 0.2593 (+0.65z)| lr 5.69e-06 | 2533.70 ms | 53.3% bf16 MFU | 206932 tok/s +step 18391/19560 | loss 3.311730 (+0.46z)| norm 0.2272 (-0.41z)| lr 5.68e-06 | 2534.10 ms | 53.3% bf16 MFU | 206930 tok/s +step 18392/19560 | loss 3.272622 (-0.38z)| norm 0.2326 (-0.23z)| lr 5.67e-06 | 2532.02 ms | 53.3% bf16 MFU | 206937 tok/s +step 18393/19560 | loss 3.431913 (+2.92z)| norm 0.2451 (+0.18z)| lr 5.66e-06 | 2534.25 ms | 53.3% bf16 MFU | 206934 tok/s +step 18394/19560 | loss 3.322275 (+0.65z)| norm 0.2306 (-0.30z)| lr 5.65e-06 | 2533.14 ms | 53.3% bf16 MFU | 206936 tok/s +step 18395/19560 | loss 3.251722 (-0.81z)| norm 0.2323 (-0.24z)| lr 5.64e-06 | 2533.09 ms | 53.3% bf16 MFU | 206938 tok/s +step 18396/19560 | loss 3.331129 (+0.83z)| norm 0.2271 (-0.41z)| lr 5.63e-06 | 2534.49 ms | 53.3% bf16 MFU | 206934 tok/s +step 18397/19560 | loss 3.328208 (+0.76z)| norm 0.2460 (+0.20z)| lr 5.62e-06 | 2533.59 ms | 53.3% bf16 MFU | 206934 tok/s +step 18398/19560 | loss 3.284030 (-0.16z)| norm 0.2209 (-0.62z)| lr 5.61e-06 | 2533.34 ms | 53.3% bf16 MFU | 206935 tok/s +step 18399/19560 | loss 3.264215 (-0.58z)| norm 0.2367 (-0.10z)| lr 5.60e-06 | 2531.70 ms | 53.3% bf16 MFU | 206943 tok/s +step 18400/19560 | loss 3.252853 (-0.81z)| norm 0.2258 (-0.45z)| lr 5.59e-06 | 2532.61 ms | 53.3% bf16 MFU | 206946 tok/s +step 18401/19560 | loss 3.268343 (-0.48z)| norm 0.2369 (-0.09z)| lr 5.58e-06 | 2532.92 ms | 53.3% bf16 MFU | 206949 tok/s +step 18402/19560 | loss 3.235621 (-1.15z)| norm 0.2311 (-0.28z)| lr 5.57e-06 | 2533.10 ms | 53.3% bf16 MFU | 206950 tok/s +step 18403/19560 | loss 3.319491 (+0.58z)| norm 0.2232 (-0.54z)| lr 5.56e-06 | 2534.36 ms | 53.3% bf16 MFU | 206946 tok/s +step 18404/19560 | loss 3.354427 (+1.29z)| norm 0.2266 (-0.42z)| lr 5.55e-06 | 2534.88 ms | 53.3% bf16 MFU | 206940 tok/s +step 18405/19560 | loss 3.281796 (-0.21z)| norm 0.2246 (-0.48z)| lr 5.54e-06 | 2533.89 ms | 53.3% bf16 MFU | 206939 tok/s +step 18406/19560 | loss 3.328391 (+0.74z)| norm 0.2275 (-0.39z)| lr 5.54e-06 | 2534.00 ms | 53.3% bf16 MFU | 206937 tok/s +step 18407/19560 | loss 3.311303 (+0.38z)| norm 0.2393 (-0.00z)| lr 5.53e-06 | 2533.52 ms | 53.3% bf16 MFU | 206937 tok/s +step 18408/19560 | loss 3.297659 (+0.09z)| norm 0.2456 (+0.21z)| lr 5.52e-06 | 2532.04 ms | 53.3% bf16 MFU | 206943 tok/s +step 18409/19560 | loss 3.313348 (+0.43z)| norm 0.2331 (-0.20z)| lr 5.51e-06 | 2531.41 ms | 53.3% bf16 MFU | 206952 tok/s +step 18410/19560 | loss 3.319691 (+0.56z)| norm 0.2284 (-0.35z)| lr 5.50e-06 | 2533.37 ms | 53.3% bf16 MFU | 206952 tok/s +step 18411/19560 | loss 3.303745 (+0.22z)| norm 0.2405 (+0.04z)| lr 5.49e-06 | 2530.90 ms | 53.3% bf16 MFU | 206962 tok/s +step 18412/19560 | loss 3.368014 (+1.55z)| norm 0.2289 (-0.34z)| lr 5.48e-06 | 2534.42 ms | 53.3% bf16 MFU | 206957 tok/s +step 18413/19560 | loss 3.239143 (-1.13z)| norm 0.2293 (-0.33z)| lr 5.47e-06 | 2531.42 ms | 53.3% bf16 MFU | 206965 tok/s +step 18414/19560 | loss 3.238514 (-1.13z)| norm 0.2292 (-0.33z)| lr 5.46e-06 | 2534.36 ms | 53.3% bf16 MFU | 206960 tok/s +step 18415/19560 | loss 3.295578 (+0.05z)| norm 0.2410 (+0.05z)| lr 5.45e-06 | 2533.65 ms | 53.3% bf16 MFU | 206959 tok/s +step 18416/19560 | loss 3.267158 (-0.54z)| norm 0.2323 (-0.23z)| lr 5.44e-06 | 2533.10 ms | 53.3% bf16 MFU | 206960 tok/s +step 18417/19560 | loss 3.362835 (+1.42z)| norm 0.2309 (-0.28z)| lr 5.43e-06 | 2534.54 ms | 53.3% bf16 MFU | 206954 tok/s +step 18418/19560 | loss 3.249038 (-0.90z)| norm 0.2410 (+0.05z)| lr 5.42e-06 | 2532.39 ms | 53.3% bf16 MFU | 206958 tok/s +step 18419/19560 | loss 3.290810 (-0.06z)| norm 0.2378 (-0.06z)| lr 5.41e-06 | 2534.15 ms | 53.3% bf16 MFU | 206955 tok/s +step 18420/19560 | loss 3.258136 (-0.73z)| norm 0.2274 (-0.40z)| lr 5.40e-06 | 2533.10 ms | 53.3% bf16 MFU | 206956 tok/s +step 18421/19560 | loss 3.336322 (+1.05z)| norm 0.2407 (+0.05z)| lr 5.39e-06 | 2532.75 ms | 53.3% bf16 MFU | 206958 tok/s +step 18422/19560 | loss 3.232630 (-1.37z)| norm 0.2358 (-0.11z)| lr 5.38e-06 | 2533.70 ms | 53.3% bf16 MFU | 206957 tok/s +step 18423/19560 | loss 3.321567 (+0.70z)| norm 0.2248 (-0.47z)| lr 5.37e-06 | 2534.09 ms | 53.3% bf16 MFU | 206954 tok/s +step 18424/19560 | loss 3.277353 (-0.33z)| norm 0.2406 (+0.05z)| lr 5.36e-06 | 2534.98 ms | 53.3% bf16 MFU | 206947 tok/s +step 18425/19560 | loss 3.277241 (-0.32z)| norm 0.2351 (-0.14z)| lr 5.36e-06 | 2532.73 ms | 53.3% bf16 MFU | 206950 tok/s +step 18426/19560 | loss 3.328557 (+0.86z)| norm 0.2240 (-0.50z)| lr 5.35e-06 | 2532.54 ms | 53.3% bf16 MFU | 206953 tok/s +step 18427/19560 | loss 3.288157 (-0.09z)| norm 0.2353 (-0.12z)| lr 5.34e-06 | 2533.94 ms | 53.3% bf16 MFU | 206951 tok/s +step 18428/19560 | loss 3.394807 (+2.34z)| norm 0.2982 (+1.93z)| lr 5.33e-06 | 2532.18 ms | 53.3% bf16 MFU | 206956 tok/s +step 18429/19560 | loss 3.325171 (+0.73z)| norm 0.2318 (-0.25z)| lr 5.32e-06 | 2534.99 ms | 53.3% bf16 MFU | 206949 tok/s +step 18430/19560 | loss 3.292305 (-0.02z)| norm 0.2421 (+0.08z)| lr 5.31e-06 | 2531.97 ms | 53.3% bf16 MFU | 206955 tok/s +step 18431/19560 | loss 3.295347 (+0.05z)| norm 0.2387 (-0.02z)| lr 5.30e-06 | 2534.23 ms | 53.3% bf16 MFU | 206951 tok/s +step 18432/19560 | loss 3.285359 (-0.18z)| norm 0.2292 (-0.33z)| lr 5.29e-06 | 2532.35 ms | 53.3% bf16 MFU | 206956 tok/s +step 18433/19560 | loss 3.304975 (+0.27z)| norm 0.2279 (-0.38z)| lr 5.28e-06 | 2532.60 ms | 53.3% bf16 MFU | 206959 tok/s +step 18434/19560 | loss 3.253867 (-0.90z)| norm 0.2318 (-0.25z)| lr 5.27e-06 | 2534.42 ms | 53.3% bf16 MFU | 206954 tok/s +step 18435/19560 | loss 3.283724 (-0.20z)| norm 0.2344 (-0.16z)| lr 5.26e-06 | 2532.14 ms | 53.3% bf16 MFU | 206959 tok/s +step 18436/19560 | loss 3.445264 (+3.35z)| norm 0.3393 (+3.15z)| lr 5.25e-06 | 2530.01 ms | 53.4% bf16 MFU | 206972 tok/s +step 18437/19560 | loss 3.302766 (+0.20z)| norm 0.2372 (-0.09z)| lr 5.24e-06 | 2534.12 ms | 53.3% bf16 MFU | 206968 tok/s +step 18438/19560 | loss 3.220737 (-1.58z)| norm 0.2342 (-0.18z)| lr 5.23e-06 | 2535.76 ms | 53.2% bf16 MFU | 206958 tok/s +step 18439/19560 | loss 3.292575 (-0.02z)| norm 0.2333 (-0.21z)| lr 5.22e-06 | 2532.80 ms | 53.3% bf16 MFU | 206960 tok/s +step 18440/19560 | loss 3.266944 (-0.59z)| norm 0.2428 (+0.09z)| lr 5.22e-06 | 2533.46 ms | 53.3% bf16 MFU | 206959 tok/s +step 18441/19560 | loss 3.278370 (-0.33z)| norm 0.2240 (-0.50z)| lr 5.21e-06 | 2533.82 ms | 53.3% bf16 MFU | 206957 tok/s +step 18442/19560 | loss 3.282155 (-0.25z)| norm 0.2281 (-0.37z)| lr 5.20e-06 | 2532.73 ms | 53.3% bf16 MFU | 206959 tok/s +step 18443/19560 | loss 3.264294 (-0.65z)| norm 0.2381 (-0.05z)| lr 5.19e-06 | 2532.19 ms | 53.3% bf16 MFU | 206964 tok/s +step 18444/19560 | loss 3.310257 (+0.36z)| norm 0.2299 (-0.31z)| lr 5.18e-06 | 2532.85 ms | 53.3% bf16 MFU | 206966 tok/s +step 18445/19560 | loss 3.290009 (-0.08z)| norm 0.2311 (-0.27z)| lr 5.17e-06 | 2530.27 ms | 53.4% bf16 MFU | 206978 tok/s +step 18446/19560 | loss 3.299486 (+0.11z)| norm 0.2217 (-0.57z)| lr 5.16e-06 | 2532.43 ms | 53.3% bf16 MFU | 206980 tok/s +step 18447/19560 | loss 3.383323 (+1.94z)| norm 0.2521 (+0.39z)| lr 5.15e-06 | 2534.72 ms | 53.3% bf16 MFU | 206973 tok/s +step 18448/19560 | loss 3.316348 (+0.48z)| norm 0.2318 (-0.24z)| lr 5.14e-06 | 2533.70 ms | 53.3% bf16 MFU | 206971 tok/s +step 18449/19560 | loss 3.266924 (-0.63z)| norm 0.2356 (-0.12z)| lr 5.13e-06 | 2535.09 ms | 53.3% bf16 MFU | 206963 tok/s +step 18450/19560 | loss 3.300519 (+0.11z)| norm 0.2262 (-0.41z)| lr 5.12e-06 | 2533.61 ms | 53.3% bf16 MFU | 206961 tok/s +step 18451/19560 | loss 3.284084 (-0.24z)| norm 0.2257 (-0.42z)| lr 5.11e-06 | 2533.26 ms | 53.3% bf16 MFU | 206962 tok/s +step 18452/19560 | loss 3.314698 (+0.51z)| norm 0.2319 (-0.22z)| lr 5.10e-06 | 2534.69 ms | 53.3% bf16 MFU | 206956 tok/s +step 18453/19560 | loss 3.211713 (-1.97z)| norm 0.2263 (-0.40z)| lr 5.10e-06 | 2533.79 ms | 53.3% bf16 MFU | 206954 tok/s +step 18454/19560 | loss 3.307999 (+0.37z)| norm 0.2365 (-0.08z)| lr 5.09e-06 | 2534.44 ms | 53.3% bf16 MFU | 206949 tok/s +step 18455/19560 | loss 3.286078 (-0.17z)| norm 0.2280 (-0.35z)| lr 5.08e-06 | 2534.72 ms | 53.3% bf16 MFU | 206944 tok/s +step 18456/19560 | loss 3.271400 (-0.52z)| norm 0.2461 (+0.22z)| lr 5.07e-06 | 2534.11 ms | 53.3% bf16 MFU | 206941 tok/s +step 18457/19560 | loss 3.283651 (-0.23z)| norm 0.2263 (-0.40z)| lr 5.06e-06 | 2534.69 ms | 53.3% bf16 MFU | 206937 tok/s +step 18458/19560 | loss 3.233956 (-1.43z)| norm 0.2252 (-0.43z)| lr 5.05e-06 | 2532.42 ms | 53.3% bf16 MFU | 206941 tok/s +step 18459/19560 | loss 3.306395 (+0.32z)| norm 0.2343 (-0.14z)| lr 5.04e-06 | 2532.23 ms | 53.3% bf16 MFU | 206947 tok/s +step 18460/19560 | loss 3.339137 (+1.11z)| norm 0.2371 (-0.05z)| lr 5.03e-06 | 2533.03 ms | 53.3% bf16 MFU | 206948 tok/s +step 18461/19560 | loss 3.319298 (+0.61z)| norm 0.2361 (-0.08z)| lr 5.02e-06 | 2533.21 ms | 53.3% bf16 MFU | 206949 tok/s +step 18462/19560 | loss 3.281058 (-0.34z)| norm 0.2373 (-0.04z)| lr 5.01e-06 | 2534.49 ms | 53.3% bf16 MFU | 206945 tok/s +step 18463/19560 | loss 3.345990 (+1.26z)| norm 0.2329 (-0.18z)| lr 5.00e-06 | 2531.36 ms | 53.3% bf16 MFU | 206953 tok/s +step 18464/19560 | loss 3.284308 (-0.26z)| norm 0.2486 (+0.31z)| lr 4.99e-06 | 2533.90 ms | 53.3% bf16 MFU | 206951 tok/s +step 18465/19560 | loss 3.344484 (+1.21z)| norm 0.2347 (-0.13z)| lr 4.99e-06 | 2532.81 ms | 53.3% bf16 MFU | 206954 tok/s +step 18466/19560 | loss 3.235482 (-1.45z)| norm 0.2310 (-0.25z)| lr 4.98e-06 | 2531.97 ms | 53.3% bf16 MFU | 206959 tok/s +step 18467/19560 | loss 3.293244 (-0.03z)| norm 0.2312 (-0.24z)| lr 4.97e-06 | 2533.67 ms | 53.3% bf16 MFU | 206958 tok/s +step 18468/19560 | loss 3.322515 (+0.68z)| norm 0.2191 (-0.62z)| lr 4.96e-06 | 2532.76 ms | 53.3% bf16 MFU | 206960 tok/s +step 18469/19560 | loss 3.305276 (+0.27z)| norm 0.2298 (-0.27z)| lr 4.95e-06 | 2534.41 ms | 53.3% bf16 MFU | 206955 tok/s +step 18470/19560 | loss 3.273985 (-0.49z)| norm 0.2278 (-0.34z)| lr 4.94e-06 | 2534.69 ms | 53.3% bf16 MFU | 206950 tok/s +step 18471/19560 | loss 3.312788 (+0.46z)| norm 0.2284 (-0.32z)| lr 4.93e-06 | 2534.09 ms | 53.3% bf16 MFU | 206947 tok/s +step 18472/19560 | loss 3.258904 (-0.86z)| norm 0.2431 (+0.15z)| lr 4.92e-06 | 2532.10 ms | 53.3% bf16 MFU | 206952 tok/s +step 18473/19560 | loss 3.239427 (-1.33z)| norm 0.2310 (-0.23z)| lr 4.91e-06 | 2532.81 ms | 53.3% bf16 MFU | 206955 tok/s +step 18474/19560 | loss 3.240229 (-1.29z)| norm 0.2248 (-0.42z)| lr 4.90e-06 | 2532.40 ms | 53.3% bf16 MFU | 206959 tok/s +step 18475/19560 | loss 3.348487 (+1.32z)| norm 0.2428 (+0.14z)| lr 4.90e-06 | 2532.79 ms | 53.3% bf16 MFU | 206961 tok/s +step 18476/19560 | loss 3.303264 (+0.24z)| norm 0.2268 (-0.36z)| lr 4.89e-06 | 2530.97 ms | 53.3% bf16 MFU | 206970 tok/s +step 18477/19560 | loss 3.218241 (-1.80z)| norm 0.2274 (-0.34z)| lr 4.88e-06 | 2532.58 ms | 53.3% bf16 MFU | 206972 tok/s +step 18478/19560 | loss 3.408599 (+2.68z)| norm 0.2383 (-0.00z)| lr 4.87e-06 | 2531.78 ms | 53.3% bf16 MFU | 206978 tok/s +step 18479/19560 | loss 3.229748 (-1.50z)| norm 0.2221 (-0.51z)| lr 4.86e-06 | 2533.21 ms | 53.3% bf16 MFU | 206977 tok/s +step 18480/19560 | loss 3.313608 (+0.45z)| norm 0.2234 (-0.46z)| lr 4.85e-06 | 2532.49 ms | 53.3% bf16 MFU | 206980 tok/s +step 18481/19560 | loss 3.266893 (-0.67z)| norm 0.2531 (+0.47z)| lr 4.84e-06 | 2533.41 ms | 53.3% bf16 MFU | 206978 tok/s +step 18482/19560 | loss 3.308833 (+0.33z)| norm 0.2375 (-0.02z)| lr 4.83e-06 | 2535.13 ms | 53.3% bf16 MFU | 206970 tok/s +step 18483/19560 | loss 3.249496 (-1.07z)| norm 0.2215 (-0.53z)| lr 4.82e-06 | 2533.67 ms | 53.3% bf16 MFU | 206968 tok/s +step 18484/19560 | loss 3.328247 (+0.79z)| norm 0.2350 (-0.10z)| lr 4.81e-06 | 2533.65 ms | 53.3% bf16 MFU | 206966 tok/s +step 18485/19560 | loss 3.319423 (+0.57z)| norm 0.2298 (-0.26z)| lr 4.81e-06 | 2533.01 ms | 53.3% bf16 MFU | 206967 tok/s +step 18486/19560 | loss 3.285319 (-0.23z)| norm 0.2329 (-0.15z)| lr 4.80e-06 | 2535.60 ms | 53.2% bf16 MFU | 206957 tok/s +step 18487/19560 | loss 3.376735 (+1.95z)| norm 0.2402 (+0.08z)| lr 4.79e-06 | 2533.96 ms | 53.3% bf16 MFU | 206954 tok/s +step 18488/19560 | loss 3.319193 (+0.55z)| norm 0.2756 (+1.20z)| lr 4.78e-06 | 2532.89 ms | 53.3% bf16 MFU | 206956 tok/s +step 18489/19560 | loss 3.364951 (+1.62z)| norm 0.2360 (-0.05z)| lr 4.77e-06 | 2532.18 ms | 53.3% bf16 MFU | 206961 tok/s +step 18490/19560 | loss 3.268954 (-0.66z)| norm 0.2275 (-0.32z)| lr 4.76e-06 | 2532.84 ms | 53.3% bf16 MFU | 206962 tok/s +step 18491/19560 | loss 3.322256 (+0.61z)| norm 0.2332 (-0.14z)| lr 4.75e-06 | 2533.77 ms | 53.3% bf16 MFU | 206960 tok/s +step 18492/19560 | loss 3.424025 (+2.91z)| norm 0.2386 (+0.03z)| lr 4.74e-06 | 2532.70 ms | 53.3% bf16 MFU | 206963 tok/s +step 18493/19560 | loss 3.333473 (+0.82z)| norm 0.2231 (-0.46z)| lr 4.73e-06 | 2533.68 ms | 53.3% bf16 MFU | 206961 tok/s +step 18494/19560 | loss 3.229620 (-1.54z)| norm 0.2283 (-0.29z)| lr 4.73e-06 | 2532.51 ms | 53.3% bf16 MFU | 206964 tok/s +step 18495/19560 | loss 3.326356 (+0.68z)| norm 0.2462 (+0.27z)| lr 4.72e-06 | 2532.51 ms | 53.3% bf16 MFU | 206967 tok/s +step 18496/19560 | loss 3.318655 (+0.49z)| norm 0.2338 (-0.12z)| lr 4.71e-06 | 2534.08 ms | 53.3% bf16 MFU | 206963 tok/s +step 18497/19560 | loss 3.323175 (+0.59z)| norm 0.2228 (-0.47z)| lr 4.70e-06 | 2532.97 ms | 53.3% bf16 MFU | 206965 tok/s +step 18498/19560 | loss 3.282111 (-0.36z)| norm 0.2384 (+0.03z)| lr 4.69e-06 | 2534.30 ms | 53.3% bf16 MFU | 206960 tok/s +step 18499/19560 | loss 3.392444 (+2.12z)| norm 0.2319 (-0.18z)| lr 4.68e-06 | 2533.52 ms | 53.3% bf16 MFU | 206959 tok/s +step 18500/19560 | loss 3.314583 (+0.35z)| norm 0.2379 (+0.02z)| lr 4.67e-06 | 2535.16 ms | 53.3% bf16 MFU | 206952 tok/s +val loss 3.286122 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3027/10042 = 0.301434 +step 18501/19560 | loss 3.331020 (+0.73z)| norm 0.2435 (+0.19z)| lr 4.66e-06 | 2533.71 ms | 53.3% bf16 MFU | 206950 tok/s +step 18502/19560 | loss 3.274031 (-0.56z)| norm 0.2190 (-0.58z)| lr 4.66e-06 | 2532.78 ms | 53.3% bf16 MFU | 206953 tok/s +step 18503/19560 | loss 3.311651 (+0.28z)| norm 0.2175 (-0.63z)| lr 4.65e-06 | 2531.67 ms | 53.3% bf16 MFU | 206960 tok/s +step 18504/19560 | loss 3.345950 (+1.04z)| norm 0.2257 (-0.37z)| lr 4.64e-06 | 2532.81 ms | 53.3% bf16 MFU | 206962 tok/s +step 18505/19560 | loss 3.304102 (+0.09z)| norm 0.2407 (+0.11z)| lr 4.63e-06 | 2533.14 ms | 53.3% bf16 MFU | 206962 tok/s +step 18506/19560 | loss 3.251819 (-1.08z)| norm 0.2274 (-0.31z)| lr 4.62e-06 | 2533.08 ms | 53.3% bf16 MFU | 206963 tok/s +step 18507/19560 | loss 3.389330 (+2.02z)| norm 0.2301 (-0.22z)| lr 4.61e-06 | 2533.33 ms | 53.3% bf16 MFU | 206963 tok/s +step 18508/19560 | loss 3.266175 (-0.75z)| norm 0.2215 (-0.50z)| lr 4.60e-06 | 2534.27 ms | 53.3% bf16 MFU | 206958 tok/s +step 18509/19560 | loss 3.256354 (-0.96z)| norm 0.2206 (-0.52z)| lr 4.59e-06 | 2533.50 ms | 53.3% bf16 MFU | 206958 tok/s +step 18510/19560 | loss 3.265003 (-0.76z)| norm 0.2242 (-0.41z)| lr 4.59e-06 | 2532.49 ms | 53.3% bf16 MFU | 206961 tok/s +step 18511/19560 | loss 3.250839 (-1.06z)| norm 0.2206 (-0.52z)| lr 4.58e-06 | 2534.46 ms | 53.3% bf16 MFU | 206956 tok/s +step 18512/19560 | loss 3.223489 (-1.64z)| norm 0.2455 (+0.27z)| lr 4.57e-06 | 2532.21 ms | 53.3% bf16 MFU | 206961 tok/s +step 18513/19560 | loss 3.311140 (+0.29z)| norm 0.2235 (-0.42z)| lr 4.56e-06 | 2534.23 ms | 53.3% bf16 MFU | 206957 tok/s +step 18514/19560 | loss 3.303469 (+0.11z)| norm 0.2375 (+0.03z)| lr 4.55e-06 | 2533.84 ms | 53.3% bf16 MFU | 206955 tok/s +step 18515/19560 | loss 3.315687 (+0.37z)| norm 0.2268 (-0.31z)| lr 4.54e-06 | 2533.40 ms | 53.3% bf16 MFU | 206954 tok/s +step 18516/19560 | loss 3.256326 (-0.95z)| norm 0.2327 (-0.13z)| lr 4.53e-06 | 2532.63 ms | 53.3% bf16 MFU | 206957 tok/s +step 18517/19560 | loss 3.296815 (-0.05z)| norm 0.2313 (-0.21z)| lr 4.52e-06 | 2533.50 ms | 53.3% bf16 MFU | 206957 tok/s +step 18518/19560 | loss 3.329362 (+0.67z)| norm 0.2286 (-0.39z)| lr 4.52e-06 | 2534.58 ms | 53.3% bf16 MFU | 206951 tok/s +step 18519/19560 | loss 3.266112 (-0.73z)| norm 0.2322 (-0.13z)| lr 4.51e-06 | 2533.30 ms | 53.3% bf16 MFU | 206952 tok/s +step 18520/19560 | loss 3.292268 (-0.15z)| norm 0.2342 (+0.01z)| lr 4.50e-06 | 2533.86 ms | 53.3% bf16 MFU | 206950 tok/s +step 18521/19560 | loss 3.304656 (+0.15z)| norm 0.2233 (-0.77z)| lr 4.49e-06 | 2533.78 ms | 53.3% bf16 MFU | 206948 tok/s +step 18522/19560 | loss 3.247764 (-1.14z)| norm 0.2267 (-0.52z)| lr 4.48e-06 | 2532.50 ms | 53.3% bf16 MFU | 206952 tok/s +step 18523/19560 | loss 3.267627 (-0.69z)| norm 0.2214 (-0.89z)| lr 4.47e-06 | 2531.65 ms | 53.3% bf16 MFU | 206959 tok/s +step 18524/19560 | loss 3.273031 (-0.56z)| norm 0.2202 (-0.98z)| lr 4.46e-06 | 2534.23 ms | 53.3% bf16 MFU | 206955 tok/s +step 18525/19560 | loss 3.301802 (+0.11z)| norm 0.2293 (-0.31z)| lr 4.46e-06 | 2531.87 ms | 53.3% bf16 MFU | 206961 tok/s +step 18526/19560 | loss 3.307250 (+0.23z)| norm 0.2256 (-0.58z)| lr 4.45e-06 | 2533.13 ms | 53.3% bf16 MFU | 206962 tok/s +step 18527/19560 | loss 3.303170 (+0.13z)| norm 0.2312 (-0.17z)| lr 4.44e-06 | 2533.81 ms | 53.3% bf16 MFU | 206960 tok/s +step 18528/19560 | loss 3.276863 (-0.49z)| norm 0.2201 (-0.97z)| lr 4.43e-06 | 2533.01 ms | 53.3% bf16 MFU | 206961 tok/s +step 18529/19560 | loss 3.317485 (+0.45z)| norm 0.2228 (-0.77z)| lr 4.42e-06 | 2532.39 ms | 53.3% bf16 MFU | 206964 tok/s +step 18530/19560 | loss 3.293689 (-0.12z)| norm 0.2365 (+0.22z)| lr 4.41e-06 | 2532.70 ms | 53.3% bf16 MFU | 206967 tok/s +step 18531/19560 | loss 3.295842 (-0.06z)| norm 0.2884 (+3.73z)| lr 4.40e-06 | 2536.21 ms | 53.2% bf16 MFU | 206954 tok/s +step 18532/19560 | loss 3.285597 (-0.29z)| norm 0.2296 (-0.30z)| lr 4.40e-06 | 2534.65 ms | 53.3% bf16 MFU | 206949 tok/s +step 18533/19560 | loss 3.256334 (-0.97z)| norm 0.2315 (-0.17z)| lr 4.39e-06 | 2535.03 ms | 53.3% bf16 MFU | 206942 tok/s +step 18534/19560 | loss 3.272845 (-0.58z)| norm 0.2208 (-0.90z)| lr 4.38e-06 | 2532.67 ms | 53.3% bf16 MFU | 206946 tok/s +step 18535/19560 | loss 3.306154 (+0.21z)| norm 0.2241 (-0.67z)| lr 4.37e-06 | 2533.13 ms | 53.3% bf16 MFU | 206947 tok/s +step 18536/19560 | loss 3.266518 (-0.72z)| norm 0.2323 (-0.10z)| lr 4.36e-06 | 2530.95 ms | 53.3% bf16 MFU | 206957 tok/s +step 18537/19560 | loss 3.217167 (-1.83z)| norm 0.2171 (-1.13z)| lr 4.35e-06 | 2532.74 ms | 53.3% bf16 MFU | 206960 tok/s +step 18538/19560 | loss 3.267069 (-0.67z)| norm 0.2257 (-0.54z)| lr 4.35e-06 | 2531.41 ms | 53.3% bf16 MFU | 206967 tok/s +step 18539/19560 | loss 3.332851 (+0.85z)| norm 0.2275 (-0.41z)| lr 4.34e-06 | 2532.75 ms | 53.3% bf16 MFU | 206969 tok/s +step 18540/19560 | loss 3.265202 (-0.70z)| norm 0.2296 (-0.26z)| lr 4.33e-06 | 2532.88 ms | 53.3% bf16 MFU | 206970 tok/s +step 18541/19560 | loss 3.265953 (-0.69z)| norm 0.2324 (-0.08z)| lr 4.32e-06 | 2534.75 ms | 53.3% bf16 MFU | 206964 tok/s +step 18542/19560 | loss 3.266801 (-0.68z)| norm 0.2278 (-0.39z)| lr 4.31e-06 | 2534.72 ms | 53.3% bf16 MFU | 206958 tok/s +step 18543/19560 | loss 3.320405 (+0.58z)| norm 0.2309 (-0.17z)| lr 4.30e-06 | 2533.11 ms | 53.3% bf16 MFU | 206959 tok/s +step 18544/19560 | loss 3.337085 (+0.96z)| norm 0.2261 (-0.50z)| lr 4.29e-06 | 2533.20 ms | 53.3% bf16 MFU | 206959 tok/s +step 18545/19560 | loss 3.290602 (-0.12z)| norm 0.2364 (+0.20z)| lr 4.29e-06 | 2531.88 ms | 53.3% bf16 MFU | 206965 tok/s +step 18546/19560 | loss 3.310199 (+0.33z)| norm 0.2202 (-0.89z)| lr 4.28e-06 | 2533.68 ms | 53.3% bf16 MFU | 206963 tok/s +step 18547/19560 | loss 3.289335 (-0.17z)| norm 0.2465 (+0.89z)| lr 4.27e-06 | 2533.88 ms | 53.3% bf16 MFU | 206960 tok/s +step 18548/19560 | loss 3.381559 (+1.98z)| norm 0.2681 (+2.29z)| lr 4.26e-06 | 2531.18 ms | 53.3% bf16 MFU | 206969 tok/s +step 18549/19560 | loss 3.360138 (+1.47z)| norm 0.2418 (+0.54z)| lr 4.25e-06 | 2532.58 ms | 53.3% bf16 MFU | 206971 tok/s +step 18550/19560 | loss 3.344796 (+1.09z)| norm 0.2264 (-0.48z)| lr 4.24e-06 | 2533.96 ms | 53.3% bf16 MFU | 206968 tok/s +step 18551/19560 | loss 3.306663 (+0.20z)| norm 0.2308 (-0.19z)| lr 4.24e-06 | 2532.52 ms | 53.3% bf16 MFU | 206971 tok/s +step 18552/19560 | loss 3.366981 (+1.59z)| norm 0.2265 (-0.47z)| lr 4.23e-06 | 2532.59 ms | 53.3% bf16 MFU | 206973 tok/s +step 18553/19560 | loss 3.289821 (-0.21z)| norm 0.2303 (-0.21z)| lr 4.22e-06 | 2532.27 ms | 53.3% bf16 MFU | 206976 tok/s +step 18554/19560 | loss 3.276292 (-0.52z)| norm 0.2432 (+0.63z)| lr 4.21e-06 | 2533.47 ms | 53.3% bf16 MFU | 206975 tok/s +step 18555/19560 | loss 3.317604 (+0.44z)| norm 0.2301 (-0.23z)| lr 4.20e-06 | 2531.55 ms | 53.3% bf16 MFU | 206981 tok/s +step 18556/19560 | loss 3.316026 (+0.42z)| norm 0.2235 (-0.68z)| lr 4.19e-06 | 2532.98 ms | 53.3% bf16 MFU | 206981 tok/s +step 18557/19560 | loss 3.289068 (-0.21z)| norm 0.2334 (+0.03z)| lr 4.19e-06 | 2530.79 ms | 53.3% bf16 MFU | 206990 tok/s +step 18558/19560 | loss 3.337102 (+0.92z)| norm 0.2360 (+0.22z)| lr 4.18e-06 | 2535.03 ms | 53.3% bf16 MFU | 206982 tok/s +step 18559/19560 | loss 3.217590 (-1.88z)| norm 0.2334 (+0.03z)| lr 4.17e-06 | 2533.54 ms | 53.3% bf16 MFU | 206980 tok/s +step 18560/19560 | loss 3.272321 (-0.59z)| norm 0.2402 (+0.52z)| lr 4.16e-06 | 2531.71 ms | 53.3% bf16 MFU | 206985 tok/s +step 18561/19560 | loss 3.273471 (-0.56z)| norm 0.2220 (-0.79z)| lr 4.15e-06 | 2534.44 ms | 53.3% bf16 MFU | 206979 tok/s +step 18562/19560 | loss 3.243473 (-1.26z)| norm 0.2287 (-0.31z)| lr 4.14e-06 | 2533.02 ms | 53.3% bf16 MFU | 206979 tok/s +step 18563/19560 | loss 3.324376 (+0.62z)| norm 0.2226 (-0.74z)| lr 4.14e-06 | 2533.34 ms | 53.3% bf16 MFU | 206978 tok/s +step 18564/19560 | loss 3.290017 (-0.16z)| norm 0.3131 (+6.43z)| lr 4.13e-06 | 2531.18 ms | 53.3% bf16 MFU | 206986 tok/s +step 18565/19560 | loss 3.362201 (+1.58z)| norm 0.2235 (-0.72z)| lr 4.12e-06 | 2534.45 ms | 53.3% bf16 MFU | 206980 tok/s +step 18566/19560 | loss 3.287642 (-0.24z)| norm 0.2438 (+0.89z)| lr 4.11e-06 | 2532.67 ms | 53.3% bf16 MFU | 206981 tok/s +step 18567/19560 | loss 3.315710 (+0.45z)| norm 0.2373 (+0.37z)| lr 4.10e-06 | 2532.76 ms | 53.3% bf16 MFU | 206982 tok/s +step 18568/19560 | loss 3.260690 (-0.90z)| norm 0.2412 (+0.68z)| lr 4.09e-06 | 2532.12 ms | 53.3% bf16 MFU | 206986 tok/s +step 18569/19560 | loss 3.314382 (+0.41z)| norm 0.2312 (-0.12z)| lr 4.09e-06 | 2533.02 ms | 53.3% bf16 MFU | 206986 tok/s +step 18570/19560 | loss 3.250024 (-1.16z)| norm 0.2265 (-0.49z)| lr 4.08e-06 | 2533.35 ms | 53.3% bf16 MFU | 206984 tok/s +step 18571/19560 | loss 3.277961 (-0.48z)| norm 0.2247 (-0.63z)| lr 4.07e-06 | 2533.20 ms | 53.3% bf16 MFU | 206983 tok/s +step 18572/19560 | loss 3.290788 (-0.16z)| norm 0.2329 (+0.02z)| lr 4.06e-06 | 2534.24 ms | 53.3% bf16 MFU | 206978 tok/s +step 18573/19560 | loss 3.414784 (+2.76z)| norm 0.2683 (+2.74z)| lr 4.05e-06 | 2531.56 ms | 53.3% bf16 MFU | 206984 tok/s +step 18574/19560 | loss 3.286770 (-0.27z)| norm 0.2197 (-1.01z)| lr 4.05e-06 | 2533.70 ms | 53.3% bf16 MFU | 206981 tok/s +step 18575/19560 | loss 3.310745 (+0.31z)| norm 0.2283 (-0.35z)| lr 4.04e-06 | 2534.08 ms | 53.3% bf16 MFU | 206977 tok/s +step 18576/19560 | loss 3.316089 (+0.44z)| norm 0.2221 (-0.82z)| lr 4.03e-06 | 2532.15 ms | 53.3% bf16 MFU | 206981 tok/s +step 18577/19560 | loss 3.324795 (+0.64z)| norm 0.2326 (+0.00z)| lr 4.02e-06 | 2533.09 ms | 53.3% bf16 MFU | 206981 tok/s +step 18578/19560 | loss 3.252869 (-1.08z)| norm 0.2256 (-0.54z)| lr 4.01e-06 | 2532.05 ms | 53.3% bf16 MFU | 206985 tok/s +step 18579/19560 | loss 3.312620 (+0.35z)| norm 0.2392 (+0.50z)| lr 4.00e-06 | 2532.58 ms | 53.3% bf16 MFU | 206986 tok/s +step 18580/19560 | loss 3.276856 (-0.50z)| norm 0.2293 (-0.26z)| lr 4.00e-06 | 2532.94 ms | 53.3% bf16 MFU | 206986 tok/s +step 18581/19560 | loss 3.369849 (+1.71z)| norm 0.2693 (+2.73z)| lr 3.99e-06 | 2532.82 ms | 53.3% bf16 MFU | 206987 tok/s +step 18582/19560 | loss 3.288641 (-0.25z)| norm 0.2228 (-0.76z)| lr 3.98e-06 | 2531.90 ms | 53.3% bf16 MFU | 206991 tok/s +step 18583/19560 | loss 3.299786 (+0.02z)| norm 0.2217 (-0.83z)| lr 3.97e-06 | 2531.66 ms | 53.3% bf16 MFU | 206996 tok/s +step 18584/19560 | loss 3.359874 (+1.44z)| norm 0.2389 (+0.46z)| lr 3.96e-06 | 2531.53 ms | 53.3% bf16 MFU | 207002 tok/s +step 18585/19560 | loss 3.364024 (+1.51z)| norm 0.2335 (+0.05z)| lr 3.96e-06 | 2531.45 ms | 53.3% bf16 MFU | 207007 tok/s +step 18586/19560 | loss 3.316133 (+0.36z)| norm 0.2255 (-0.55z)| lr 3.95e-06 | 2532.09 ms | 53.3% bf16 MFU | 207009 tok/s +step 18587/19560 | loss 3.316619 (+0.37z)| norm 0.2329 (+0.01z)| lr 3.94e-06 | 2531.09 ms | 53.3% bf16 MFU | 207016 tok/s +step 18588/19560 | loss 3.271014 (-0.71z)| norm 0.2258 (-0.53z)| lr 3.93e-06 | 2531.80 ms | 53.3% bf16 MFU | 207019 tok/s +step 18589/19560 | loss 3.331707 (+0.75z)| norm 0.2228 (-0.74z)| lr 3.92e-06 | 2531.44 ms | 53.3% bf16 MFU | 207024 tok/s +step 18590/19560 | loss 3.295003 (-0.14z)| norm 0.2263 (-0.47z)| lr 3.92e-06 | 2532.75 ms | 53.3% bf16 MFU | 207023 tok/s +step 18591/19560 | loss 3.428804 (+2.97z)| norm 0.3037 (+4.80z)| lr 3.91e-06 | 2531.32 ms | 53.3% bf16 MFU | 207028 tok/s +step 18592/19560 | loss 3.276886 (-0.57z)| norm 0.2366 (+0.24z)| lr 3.90e-06 | 2532.73 ms | 53.3% bf16 MFU | 207027 tok/s +step 18593/19560 | loss 3.234515 (-1.52z)| norm 0.2526 (+1.31z)| lr 3.89e-06 | 2533.27 ms | 53.3% bf16 MFU | 207023 tok/s +step 18594/19560 | loss 3.263287 (-0.87z)| norm 0.2540 (+1.39z)| lr 3.88e-06 | 2533.29 ms | 53.3% bf16 MFU | 207020 tok/s +step 18595/19560 | loss 3.235344 (-1.50z)| norm 0.2293 (-0.27z)| lr 3.88e-06 | 2533.45 ms | 53.3% bf16 MFU | 207016 tok/s +step 18596/19560 | loss 3.244767 (-1.26z)| norm 0.2548 (+1.42z)| lr 3.87e-06 | 2534.69 ms | 53.3% bf16 MFU | 207008 tok/s +step 18597/19560 | loss 3.277884 (-0.49z)| norm 0.2311 (-0.17z)| lr 3.86e-06 | 2534.74 ms | 53.3% bf16 MFU | 206999 tok/s +step 18598/19560 | loss 3.402200 (+2.29z)| norm 0.2308 (-0.19z)| lr 3.85e-06 | 2533.26 ms | 53.3% bf16 MFU | 206998 tok/s +step 18599/19560 | loss 3.276304 (-0.53z)| norm 0.2348 (+0.08z)| lr 3.84e-06 | 2533.25 ms | 53.3% bf16 MFU | 206996 tok/s +step 18600/19560 | loss 3.293484 (-0.15z)| norm 0.2443 (+0.71z)| lr 3.84e-06 | 2532.06 ms | 53.3% bf16 MFU | 206999 tok/s +step 18601/19560 | loss 3.256669 (-0.99z)| norm 0.2436 (+0.66z)| lr 3.83e-06 | 2533.13 ms | 53.3% bf16 MFU | 206998 tok/s +step 18602/19560 | loss 3.319967 (+0.43z)| norm 0.2372 (+0.22z)| lr 3.82e-06 | 2532.17 ms | 53.3% bf16 MFU | 207000 tok/s +step 18603/19560 | loss 3.215539 (-1.90z)| norm 0.2310 (-0.19z)| lr 3.81e-06 | 2532.63 ms | 53.3% bf16 MFU | 207001 tok/s +step 18604/19560 | loss 3.279135 (-0.47z)| norm 0.2273 (-0.44z)| lr 3.80e-06 | 2530.48 ms | 53.4% bf16 MFU | 207010 tok/s +step 18605/19560 | loss 3.290737 (-0.22z)| norm 0.2257 (-0.54z)| lr 3.80e-06 | 2532.98 ms | 53.3% bf16 MFU | 207009 tok/s +step 18606/19560 | loss 3.241150 (-1.35z)| norm 0.2230 (-0.72z)| lr 3.79e-06 | 2534.11 ms | 53.3% bf16 MFU | 207003 tok/s +step 18607/19560 | loss 3.288004 (-0.27z)| norm 0.2350 (+0.08z)| lr 3.78e-06 | 2534.03 ms | 53.3% bf16 MFU | 206998 tok/s +step 18608/19560 | loss 3.286898 (-0.29z)| norm 0.2283 (-0.37z)| lr 3.77e-06 | 2534.44 ms | 53.3% bf16 MFU | 206991 tok/s +step 18609/19560 | loss 3.263073 (-0.85z)| norm 0.2307 (-0.20z)| lr 3.76e-06 | 2533.68 ms | 53.3% bf16 MFU | 206988 tok/s +step 18610/19560 | loss 3.237975 (-1.41z)| norm 0.2239 (-0.65z)| lr 3.76e-06 | 2532.51 ms | 53.3% bf16 MFU | 206990 tok/s +step 18611/19560 | loss 3.342576 (+1.00z)| norm 0.2396 (+0.40z)| lr 3.75e-06 | 2532.53 ms | 53.3% bf16 MFU | 206992 tok/s +step 18612/19560 | loss 3.315597 (+0.38z)| norm 0.2459 (+0.82z)| lr 3.74e-06 | 2533.56 ms | 53.3% bf16 MFU | 206989 tok/s +step 18613/19560 | loss 3.331457 (+0.74z)| norm 0.2410 (+0.48z)| lr 3.73e-06 | 2533.68 ms | 53.3% bf16 MFU | 206986 tok/s +step 18614/19560 | loss 3.340038 (+0.93z)| norm 0.2386 (+0.32z)| lr 3.72e-06 | 2534.26 ms | 53.3% bf16 MFU | 206980 tok/s +step 18615/19560 | loss 3.255834 (-1.01z)| norm 0.2414 (+0.51z)| lr 3.72e-06 | 2533.84 ms | 53.3% bf16 MFU | 206977 tok/s +step 18616/19560 | loss 3.266495 (-0.75z)| norm 0.2368 (+0.22z)| lr 3.71e-06 | 2532.35 ms | 53.3% bf16 MFU | 206980 tok/s +step 18617/19560 | loss 3.176125 (-2.77z)| norm 0.2287 (-0.34z)| lr 3.70e-06 | 2534.34 ms | 53.3% bf16 MFU | 206975 tok/s +step 18618/19560 | loss 3.217647 (-1.79z)| norm 0.2478 (+0.98z)| lr 3.69e-06 | 2532.45 ms | 53.3% bf16 MFU | 206977 tok/s +step 18619/19560 | loss 3.266310 (-0.68z)| norm 0.2352 (+0.10z)| lr 3.69e-06 | 2533.34 ms | 53.3% bf16 MFU | 206976 tok/s +step 18620/19560 | loss 3.338961 (+1.01z)| norm 0.2346 (+0.06z)| lr 3.68e-06 | 2534.15 ms | 53.3% bf16 MFU | 206972 tok/s +step 18621/19560 | loss 3.252169 (-1.00z)| norm 0.2268 (-0.48z)| lr 3.67e-06 | 2532.80 ms | 53.3% bf16 MFU | 206973 tok/s +step 18622/19560 | loss 3.269325 (-0.61z)| norm 0.2358 (+0.14z)| lr 3.66e-06 | 2533.89 ms | 53.3% bf16 MFU | 206970 tok/s +step 18623/19560 | loss 3.401237 (+2.42z)| norm 0.2353 (+0.11z)| lr 3.65e-06 | 2532.41 ms | 53.3% bf16 MFU | 206973 tok/s +step 18624/19560 | loss 3.279906 (-0.36z)| norm 0.2374 (+0.25z)| lr 3.65e-06 | 2533.39 ms | 53.3% bf16 MFU | 206972 tok/s +step 18625/19560 | loss 3.352999 (+1.31z)| norm 0.2459 (+0.84z)| lr 3.64e-06 | 2534.87 ms | 53.3% bf16 MFU | 206965 tok/s +step 18626/19560 | loss 3.372707 (+1.72z)| norm 0.2389 (+0.35z)| lr 3.63e-06 | 2532.34 ms | 53.3% bf16 MFU | 206969 tok/s +step 18627/19560 | loss 3.283427 (-0.28z)| norm 0.2242 (-0.67z)| lr 3.62e-06 | 2532.79 ms | 53.3% bf16 MFU | 206970 tok/s +step 18628/19560 | loss 3.325469 (+0.68z)| norm 0.2235 (-0.71z)| lr 3.62e-06 | 2531.64 ms | 53.3% bf16 MFU | 206976 tok/s +step 18629/19560 | loss 3.283138 (-0.28z)| norm 0.2287 (-0.34z)| lr 3.61e-06 | 2532.47 ms | 53.3% bf16 MFU | 206979 tok/s +step 18630/19560 | loss 3.262105 (-0.76z)| norm 0.2275 (-0.43z)| lr 3.60e-06 | 2533.69 ms | 53.3% bf16 MFU | 206976 tok/s +step 18631/19560 | loss 3.278608 (-0.38z)| norm 0.2222 (-0.80z)| lr 3.59e-06 | 2530.96 ms | 53.3% bf16 MFU | 206985 tok/s +step 18632/19560 | loss 3.275853 (-0.43z)| norm 0.2329 (-0.06z)| lr 3.58e-06 | 2534.15 ms | 53.3% bf16 MFU | 206980 tok/s +step 18633/19560 | loss 3.244431 (-1.14z)| norm 0.2259 (-0.54z)| lr 3.58e-06 | 2533.47 ms | 53.3% bf16 MFU | 206978 tok/s +step 18634/19560 | loss 3.284166 (-0.23z)| norm 0.2232 (-0.73z)| lr 3.57e-06 | 2532.09 ms | 53.3% bf16 MFU | 206982 tok/s +step 18635/19560 | loss 3.238246 (-1.28z)| norm 0.2210 (-0.88z)| lr 3.56e-06 | 2532.16 ms | 53.3% bf16 MFU | 206986 tok/s +step 18636/19560 | loss 3.285382 (-0.18z)| norm 0.2239 (-0.68z)| lr 3.55e-06 | 2531.84 ms | 53.3% bf16 MFU | 206990 tok/s +step 18637/19560 | loss 3.279179 (-0.33z)| norm 0.2225 (-0.78z)| lr 3.55e-06 | 2531.81 ms | 53.3% bf16 MFU | 206995 tok/s +step 18638/19560 | loss 3.260611 (-0.77z)| norm 0.2221 (-0.80z)| lr 3.54e-06 | 2532.99 ms | 53.3% bf16 MFU | 206994 tok/s +step 18639/19560 | loss 3.279705 (-0.33z)| norm 0.2281 (-0.39z)| lr 3.53e-06 | 2532.26 ms | 53.3% bf16 MFU | 206997 tok/s +step 18640/19560 | loss 3.277904 (-0.38z)| norm 0.2326 (-0.06z)| lr 3.52e-06 | 2533.77 ms | 53.3% bf16 MFU | 206993 tok/s +step 18641/19560 | loss 3.223188 (-1.65z)| norm 0.2208 (-0.89z)| lr 3.52e-06 | 2532.73 ms | 53.3% bf16 MFU | 206994 tok/s +step 18642/19560 | loss 3.202083 (-2.09z)| norm 0.2307 (-0.20z)| lr 3.51e-06 | 2533.07 ms | 53.3% bf16 MFU | 206993 tok/s +step 18643/19560 | loss 3.336111 (+1.00z)| norm 0.2550 (+1.48z)| lr 3.50e-06 | 2533.13 ms | 53.3% bf16 MFU | 206992 tok/s +step 18644/19560 | loss 3.259607 (-0.77z)| norm 0.2216 (-0.83z)| lr 3.49e-06 | 2534.13 ms | 53.3% bf16 MFU | 206987 tok/s +step 18645/19560 | loss 3.293880 (+0.03z)| norm 0.2260 (-0.52z)| lr 3.49e-06 | 2531.96 ms | 53.3% bf16 MFU | 206991 tok/s +step 18646/19560 | loss 3.280515 (-0.27z)| norm 0.2319 (-0.12z)| lr 3.48e-06 | 2533.05 ms | 53.3% bf16 MFU | 206990 tok/s +step 18647/19560 | loss 3.252589 (-0.92z)| norm 0.2184 (-1.04z)| lr 3.47e-06 | 2532.18 ms | 53.3% bf16 MFU | 206993 tok/s +step 18648/19560 | loss 3.269611 (-0.52z)| norm 0.2263 (-0.49z)| lr 3.46e-06 | 2532.45 ms | 53.3% bf16 MFU | 206995 tok/s +step 18649/19560 | loss 3.273271 (-0.43z)| norm 0.2286 (-0.33z)| lr 3.46e-06 | 2531.67 ms | 53.3% bf16 MFU | 207000 tok/s +step 18650/19560 | loss 3.301189 (+0.21z)| norm 0.2379 (+0.30z)| lr 3.45e-06 | 2531.62 ms | 53.3% bf16 MFU | 207004 tok/s +step 18651/19560 | loss 3.252816 (-0.91z)| norm 0.2631 (+1.99z)| lr 3.44e-06 | 2533.19 ms | 53.3% bf16 MFU | 207003 tok/s +step 18652/19560 | loss 3.246226 (-1.05z)| norm 0.2187 (-1.03z)| lr 3.43e-06 | 2530.05 ms | 53.4% bf16 MFU | 207014 tok/s +step 18653/19560 | loss 3.328121 (+0.83z)| norm 0.2293 (-0.31z)| lr 3.42e-06 | 2532.59 ms | 53.3% bf16 MFU | 207014 tok/s +step 18654/19560 | loss 3.263608 (-0.65z)| norm 0.2242 (-0.66z)| lr 3.42e-06 | 2532.10 ms | 53.3% bf16 MFU | 207016 tok/s +step 18655/19560 | loss 3.277118 (-0.33z)| norm 0.2222 (-0.79z)| lr 3.41e-06 | 2532.67 ms | 53.3% bf16 MFU | 207016 tok/s +step 18656/19560 | loss 3.308669 (+0.39z)| norm 0.2471 (+0.89z)| lr 3.40e-06 | 2531.00 ms | 53.3% bf16 MFU | 207022 tok/s +step 18657/19560 | loss 3.283324 (-0.19z)| norm 0.2302 (-0.26z)| lr 3.39e-06 | 2531.98 ms | 53.3% bf16 MFU | 207024 tok/s +step 18658/19560 | loss 3.269927 (-0.49z)| norm 0.2467 (+0.85z)| lr 3.39e-06 | 2531.12 ms | 53.3% bf16 MFU | 207030 tok/s +step 18659/19560 | loss 3.257728 (-0.76z)| norm 0.2301 (-0.26z)| lr 3.38e-06 | 2533.85 ms | 53.3% bf16 MFU | 207024 tok/s +step 18660/19560 | loss 3.256449 (-0.79z)| norm 0.2245 (-0.65z)| lr 3.37e-06 | 2533.23 ms | 53.3% bf16 MFU | 207021 tok/s +step 18661/19560 | loss 3.268429 (-0.51z)| norm 0.2299 (-0.27z)| lr 3.36e-06 | 2532.44 ms | 53.3% bf16 MFU | 207022 tok/s +step 18662/19560 | loss 3.301502 (+0.24z)| norm 0.2274 (-0.45z)| lr 3.36e-06 | 2534.51 ms | 53.3% bf16 MFU | 207014 tok/s +step 18663/19560 | loss 3.275737 (-0.35z)| norm 0.2287 (-0.36z)| lr 3.35e-06 | 2532.21 ms | 53.3% bf16 MFU | 207015 tok/s +step 18664/19560 | loss 3.261388 (-0.67z)| norm 0.2405 (+0.48z)| lr 3.34e-06 | 2532.18 ms | 53.3% bf16 MFU | 207017 tok/s +step 18665/19560 | loss 3.174128 (-2.62z)| norm 0.2365 (+0.19z)| lr 3.34e-06 | 2531.61 ms | 53.3% bf16 MFU | 207021 tok/s +step 18666/19560 | loss 3.243271 (-1.06z)| norm 0.2302 (-0.27z)| lr 3.33e-06 | 2533.81 ms | 53.3% bf16 MFU | 207016 tok/s +step 18667/19560 | loss 3.219305 (-1.56z)| norm 0.2298 (-0.30z)| lr 3.32e-06 | 2532.04 ms | 53.3% bf16 MFU | 207018 tok/s +step 18668/19560 | loss 3.286113 (-0.08z)| norm 0.2334 (-0.04z)| lr 3.31e-06 | 2531.83 ms | 53.3% bf16 MFU | 207021 tok/s +step 18669/19560 | loss 3.248289 (-0.92z)| norm 0.2302 (-0.27z)| lr 3.31e-06 | 2534.27 ms | 53.3% bf16 MFU | 207014 tok/s +step 18670/19560 | loss 3.254916 (-0.77z)| norm 0.2220 (-0.86z)| lr 3.30e-06 | 2533.46 ms | 53.3% bf16 MFU | 207011 tok/s +step 18671/19560 | loss 3.308189 (+0.42z)| norm 0.2181 (-1.13z)| lr 3.29e-06 | 2534.35 ms | 53.3% bf16 MFU | 207004 tok/s +step 18672/19560 | loss 3.255785 (-0.73z)| norm 0.2318 (-0.15z)| lr 3.28e-06 | 2533.06 ms | 53.3% bf16 MFU | 207002 tok/s +step 18673/19560 | loss 3.277807 (-0.24z)| norm 0.2290 (-0.35z)| lr 3.28e-06 | 2531.84 ms | 53.3% bf16 MFU | 207006 tok/s +step 18674/19560 | loss 3.294792 (+0.14z)| norm 0.2369 (+0.22z)| lr 3.27e-06 | 2531.97 ms | 53.3% bf16 MFU | 207009 tok/s +step 18675/19560 | loss 3.309796 (+0.47z)| norm 0.2326 (-0.09z)| lr 3.26e-06 | 2535.19 ms | 53.3% bf16 MFU | 206999 tok/s +step 18676/19560 | loss 3.319861 (+0.72z)| norm 0.2180 (-1.14z)| lr 3.25e-06 | 2532.78 ms | 53.3% bf16 MFU | 206999 tok/s +step 18677/19560 | loss 3.212296 (-1.69z)| norm 0.2529 (+1.42z)| lr 3.25e-06 | 2531.26 ms | 53.3% bf16 MFU | 207005 tok/s +step 18678/19560 | loss 3.271028 (-0.35z)| norm 0.2306 (-0.22z)| lr 3.24e-06 | 2534.51 ms | 53.3% bf16 MFU | 206998 tok/s +step 18679/19560 | loss 3.261945 (-0.55z)| norm 0.2502 (+1.21z)| lr 3.23e-06 | 2531.31 ms | 53.3% bf16 MFU | 207004 tok/s +step 18680/19560 | loss 3.254925 (-0.70z)| norm 0.2230 (-0.78z)| lr 3.22e-06 | 2535.17 ms | 53.3% bf16 MFU | 206994 tok/s +step 18681/19560 | loss 3.347199 (+1.41z)| norm 0.2249 (-0.64z)| lr 3.22e-06 | 2532.15 ms | 53.3% bf16 MFU | 206997 tok/s +step 18682/19560 | loss 3.312992 (+0.62z)| norm 0.2489 (+1.11z)| lr 3.21e-06 | 2535.37 ms | 53.3% bf16 MFU | 206987 tok/s +step 18683/19560 | loss 3.296868 (+0.25z)| norm 0.2244 (-0.67z)| lr 3.20e-06 | 2532.38 ms | 53.3% bf16 MFU | 206989 tok/s +step 18684/19560 | loss 3.316580 (+0.71z)| norm 0.2370 (+0.23z)| lr 3.20e-06 | 2534.36 ms | 53.3% bf16 MFU | 206983 tok/s +step 18685/19560 | loss 3.295205 (+0.22z)| norm 0.2221 (-0.84z)| lr 3.19e-06 | 2530.41 ms | 53.4% bf16 MFU | 206994 tok/s +step 18686/19560 | loss 3.297775 (+0.28z)| norm 0.2334 (-0.02z)| lr 3.18e-06 | 2531.13 ms | 53.3% bf16 MFU | 207001 tok/s +step 18687/19560 | loss 3.309147 (+0.53z)| norm 0.2515 (+1.28z)| lr 3.17e-06 | 2531.92 ms | 53.3% bf16 MFU | 207004 tok/s +step 18688/19560 | loss 3.306019 (+0.45z)| norm 0.2365 (+0.20z)| lr 3.17e-06 | 2533.43 ms | 53.3% bf16 MFU | 207002 tok/s +step 18689/19560 | loss 3.309884 (+0.54z)| norm 0.2347 (+0.06z)| lr 3.16e-06 | 2532.12 ms | 53.3% bf16 MFU | 207004 tok/s +step 18690/19560 | loss 3.337176 (+1.15z)| norm 0.2246 (-0.67z)| lr 3.15e-06 | 2532.34 ms | 53.3% bf16 MFU | 207006 tok/s +step 18691/19560 | loss 3.276561 (-0.24z)| norm 0.2262 (-0.56z)| lr 3.14e-06 | 2531.91 ms | 53.3% bf16 MFU | 207009 tok/s +step 18692/19560 | loss 3.235480 (-1.18z)| norm 0.2268 (-0.53z)| lr 3.14e-06 | 2531.88 ms | 53.3% bf16 MFU | 207012 tok/s +step 18693/19560 | loss 3.297611 (+0.27z)| norm 0.2464 (+1.09z)| lr 3.13e-06 | 2533.08 ms | 53.3% bf16 MFU | 207011 tok/s +step 18694/19560 | loss 3.232171 (-1.24z)| norm 0.2436 (+0.86z)| lr 3.12e-06 | 2533.18 ms | 53.3% bf16 MFU | 207009 tok/s +step 18695/19560 | loss 3.282249 (-0.07z)| norm 0.2237 (-0.79z)| lr 3.12e-06 | 2532.62 ms | 53.3% bf16 MFU | 207009 tok/s +step 18696/19560 | loss 3.295213 (+0.22z)| norm 0.2182 (-1.23z)| lr 3.11e-06 | 2531.03 ms | 53.3% bf16 MFU | 207016 tok/s +step 18697/19560 | loss 3.276956 (-0.20z)| norm 0.2222 (-0.90z)| lr 3.10e-06 | 2533.07 ms | 53.3% bf16 MFU | 207014 tok/s +step 18698/19560 | loss 3.263997 (-0.50z)| norm 0.2381 (+0.42z)| lr 3.09e-06 | 2534.73 ms | 53.3% bf16 MFU | 207005 tok/s +step 18699/19560 | loss 3.257381 (-0.65z)| norm 0.2351 (+0.16z)| lr 3.09e-06 | 2533.39 ms | 53.3% bf16 MFU | 207002 tok/s +step 18700/19560 | loss 3.284079 (-0.03z)| norm 0.2294 (-0.31z)| lr 3.08e-06 | 2533.07 ms | 53.3% bf16 MFU | 207001 tok/s +step 18701/19560 | loss 3.244202 (-0.96z)| norm 0.2276 (-0.45z)| lr 3.07e-06 | 2531.78 ms | 53.3% bf16 MFU | 207005 tok/s +step 18702/19560 | loss 3.243245 (-0.97z)| norm 0.2261 (-0.59z)| lr 3.07e-06 | 2531.89 ms | 53.3% bf16 MFU | 207009 tok/s +step 18703/19560 | loss 3.231318 (-1.24z)| norm 0.2358 (+0.25z)| lr 3.06e-06 | 2531.68 ms | 53.3% bf16 MFU | 207013 tok/s +step 18704/19560 | loss 3.279768 (-0.07z)| norm 0.2440 (+0.94z)| lr 3.05e-06 | 2533.53 ms | 53.3% bf16 MFU | 207009 tok/s +step 18705/19560 | loss 3.306954 (+0.58z)| norm 0.2328 (-0.02z)| lr 3.04e-06 | 2533.07 ms | 53.3% bf16 MFU | 207008 tok/s +step 18706/19560 | loss 3.301149 (+0.44z)| norm 0.2287 (-0.38z)| lr 3.04e-06 | 2533.55 ms | 53.3% bf16 MFU | 207004 tok/s +step 18707/19560 | loss 3.298336 (+0.37z)| norm 0.2266 (-0.55z)| lr 3.03e-06 | 2533.38 ms | 53.3% bf16 MFU | 207001 tok/s +step 18708/19560 | loss 3.301006 (+0.43z)| norm 0.2308 (-0.19z)| lr 3.02e-06 | 2532.15 ms | 53.3% bf16 MFU | 207004 tok/s +step 18709/19560 | loss 3.237417 (-1.09z)| norm 0.2214 (-1.01z)| lr 3.02e-06 | 2533.48 ms | 53.3% bf16 MFU | 207001 tok/s +step 18710/19560 | loss 3.237105 (-1.08z)| norm 0.2337 (+0.09z)| lr 3.01e-06 | 2531.84 ms | 53.3% bf16 MFU | 207005 tok/s +step 18711/19560 | loss 3.309092 (+0.66z)| norm 0.2380 (+0.46z)| lr 3.00e-06 | 2533.25 ms | 53.3% bf16 MFU | 207003 tok/s +step 18712/19560 | loss 3.306949 (+0.63z)| norm 0.2411 (+0.74z)| lr 3.00e-06 | 2533.01 ms | 53.3% bf16 MFU | 207002 tok/s +step 18713/19560 | loss 3.310955 (+0.75z)| norm 0.2260 (-0.61z)| lr 2.99e-06 | 2533.50 ms | 53.3% bf16 MFU | 206999 tok/s +step 18714/19560 | loss 3.314913 (+0.85z)| norm 0.2236 (-0.83z)| lr 2.98e-06 | 2531.72 ms | 53.3% bf16 MFU | 207003 tok/s +step 18715/19560 | loss 3.259791 (-0.52z)| norm 0.2337 (+0.08z)| lr 2.97e-06 | 2532.44 ms | 53.3% bf16 MFU | 207004 tok/s +step 18716/19560 | loss 3.331442 (+1.25z)| norm 0.2248 (-0.73z)| lr 2.97e-06 | 2533.09 ms | 53.3% bf16 MFU | 207003 tok/s +step 18717/19560 | loss 3.253239 (-0.67z)| norm 0.2177 (-1.35z)| lr 2.96e-06 | 2531.88 ms | 53.3% bf16 MFU | 207007 tok/s +step 18718/19560 | loss 3.244502 (-0.88z)| norm 0.2223 (-0.93z)| lr 2.95e-06 | 2532.04 ms | 53.3% bf16 MFU | 207009 tok/s +step 18719/19560 | loss 3.228574 (-1.30z)| norm 0.2187 (-1.43z)| lr 2.95e-06 | 2532.41 ms | 53.3% bf16 MFU | 207010 tok/s +step 18720/19560 | loss 3.315170 (+0.95z)| norm 0.2251 (-0.74z)| lr 2.94e-06 | 2531.31 ms | 53.3% bf16 MFU | 207016 tok/s +step 18721/19560 | loss 3.251470 (-0.71z)| norm 0.2223 (-1.03z)| lr 2.93e-06 | 2534.39 ms | 53.3% bf16 MFU | 207009 tok/s +step 18722/19560 | loss 3.269576 (-0.24z)| norm 0.2282 (-0.37z)| lr 2.92e-06 | 2532.36 ms | 53.3% bf16 MFU | 207010 tok/s +step 18723/19560 | loss 3.279869 (+0.02z)| norm 0.2404 (+0.96z)| lr 2.92e-06 | 2533.63 ms | 53.3% bf16 MFU | 207006 tok/s +step 18724/19560 | loss 3.289616 (+0.27z)| norm 0.2320 (+0.06z)| lr 2.91e-06 | 2532.35 ms | 53.3% bf16 MFU | 207008 tok/s +step 18725/19560 | loss 3.242890 (-0.96z)| norm 0.2181 (-1.49z)| lr 2.90e-06 | 2533.57 ms | 53.3% bf16 MFU | 207004 tok/s +step 18726/19560 | loss 3.306803 (+0.77z)| norm 0.2279 (-0.39z)| lr 2.90e-06 | 2534.25 ms | 53.3% bf16 MFU | 206998 tok/s +step 18727/19560 | loss 3.298541 (+0.54z)| norm 0.2205 (-1.20z)| lr 2.89e-06 | 2534.93 ms | 53.3% bf16 MFU | 206989 tok/s +step 18728/19560 | loss 3.334317 (+1.50z)| norm 0.2205 (-1.19z)| lr 2.88e-06 | 2532.15 ms | 53.3% bf16 MFU | 206992 tok/s +step 18729/19560 | loss 3.336094 (+1.52z)| norm 0.2274 (-0.40z)| lr 2.88e-06 | 2534.21 ms | 53.3% bf16 MFU | 206987 tok/s +step 18730/19560 | loss 3.278472 (-0.02z)| norm 0.2181 (-1.42z)| lr 2.87e-06 | 2533.71 ms | 53.3% bf16 MFU | 206984 tok/s +step 18731/19560 | loss 3.254058 (-0.70z)| norm 0.2295 (-0.14z)| lr 2.86e-06 | 2533.76 ms | 53.3% bf16 MFU | 206981 tok/s +step 18732/19560 | loss 3.278715 (-0.02z)| norm 0.2255 (-0.59z)| lr 2.86e-06 | 2532.35 ms | 53.3% bf16 MFU | 206983 tok/s +step 18733/19560 | loss 3.320612 (+1.11z)| norm 0.2288 (-0.23z)| lr 2.85e-06 | 2533.34 ms | 53.3% bf16 MFU | 206982 tok/s +step 18734/19560 | loss 3.285370 (+0.14z)| norm 0.2353 (+0.50z)| lr 2.84e-06 | 2533.06 ms | 53.3% bf16 MFU | 206982 tok/s +step 18735/19560 | loss 3.292408 (+0.33z)| norm 0.2255 (-0.60z)| lr 2.84e-06 | 2531.30 ms | 53.3% bf16 MFU | 206989 tok/s +step 18736/19560 | loss 3.293909 (+0.37z)| norm 0.2257 (-0.57z)| lr 2.83e-06 | 2531.86 ms | 53.3% bf16 MFU | 206993 tok/s +step 18737/19560 | loss 3.310282 (+0.81z)| norm 0.2228 (-0.89z)| lr 2.82e-06 | 2533.02 ms | 53.3% bf16 MFU | 206993 tok/s +step 18738/19560 | loss 3.363669 (+2.20z)| norm 0.2328 (+0.23z)| lr 2.81e-06 | 2533.49 ms | 53.3% bf16 MFU | 206990 tok/s +step 18739/19560 | loss 3.331920 (+1.36z)| norm 0.2359 (+0.58z)| lr 2.81e-06 | 2532.22 ms | 53.3% bf16 MFU | 206993 tok/s +step 18740/19560 | loss 3.256310 (-0.67z)| norm 0.2437 (+1.46z)| lr 2.80e-06 | 2532.37 ms | 53.3% bf16 MFU | 206995 tok/s +step 18741/19560 | loss 3.280788 (+0.00z)| norm 0.2425 (+1.33z)| lr 2.79e-06 | 2532.37 ms | 53.3% bf16 MFU | 206997 tok/s +step 18742/19560 | loss 3.303638 (+0.64z)| norm 0.2369 (+0.70z)| lr 2.79e-06 | 2532.94 ms | 53.3% bf16 MFU | 206997 tok/s +step 18743/19560 | loss 3.272207 (-0.23z)| norm 0.2268 (-0.44z)| lr 2.78e-06 | 2531.53 ms | 53.3% bf16 MFU | 207002 tok/s +step 18744/19560 | loss 3.308233 (+0.75z)| norm 0.2344 (+0.43z)| lr 2.77e-06 | 2532.57 ms | 53.3% bf16 MFU | 207003 tok/s +step 18745/19560 | loss 3.309799 (+0.79z)| norm 0.2224 (-0.93z)| lr 2.77e-06 | 2532.68 ms | 53.3% bf16 MFU | 207003 tok/s +step 18746/19560 | loss 3.296751 (+0.41z)| norm 0.2261 (-0.50z)| lr 2.76e-06 | 2531.86 ms | 53.3% bf16 MFU | 207007 tok/s +step 18747/19560 | loss 3.224672 (-1.63z)| norm 0.2217 (-0.98z)| lr 2.75e-06 | 2532.61 ms | 53.3% bf16 MFU | 207007 tok/s +step 18748/19560 | loss 3.297762 (+0.46z)| norm 0.2347 (+0.50z)| lr 2.75e-06 | 2532.13 ms | 53.3% bf16 MFU | 207009 tok/s +step 18749/19560 | loss 3.253520 (-0.81z)| norm 0.2214 (-1.01z)| lr 2.74e-06 | 2533.47 ms | 53.3% bf16 MFU | 207006 tok/s +step 18750/19560 | loss 3.315035 (+0.94z)| norm 0.2287 (-0.17z)| lr 2.73e-06 | 2532.28 ms | 53.3% bf16 MFU | 207008 tok/s +val loss 3.285737 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3031/10042 = 0.301832 +step 18751/19560 | loss 3.327172 (+1.35z)| norm 0.2356 (+0.62z)| lr 2.73e-06 | 2532.57 ms | 53.3% bf16 MFU | 207008 tok/s +step 18752/19560 | loss 3.373853 (+2.64z)| norm 0.3579 (+8.91z)| lr 2.72e-06 | 2531.16 ms | 53.3% bf16 MFU | 207015 tok/s +step 18753/19560 | loss 3.219935 (-1.78z)| norm 0.2279 (-0.22z)| lr 2.71e-06 | 2531.60 ms | 53.3% bf16 MFU | 207019 tok/s +step 18754/19560 | loss 3.317834 (+1.10z)| norm 0.2231 (-0.55z)| lr 2.71e-06 | 2533.78 ms | 53.3% bf16 MFU | 207014 tok/s +step 18755/19560 | loss 3.242571 (-1.12z)| norm 0.2216 (-0.65z)| lr 2.70e-06 | 2531.43 ms | 53.3% bf16 MFU | 207019 tok/s +step 18756/19560 | loss 3.293147 (+0.38z)| norm 0.2305 (-0.03z)| lr 2.69e-06 | 2534.29 ms | 53.3% bf16 MFU | 207012 tok/s +step 18757/19560 | loss 3.270001 (-0.30z)| norm 0.2145 (-1.14z)| lr 2.69e-06 | 2531.02 ms | 53.3% bf16 MFU | 207018 tok/s +step 18758/19560 | loss 3.301963 (+0.64z)| norm 0.2183 (-0.87z)| lr 2.68e-06 | 2534.10 ms | 53.3% bf16 MFU | 207012 tok/s +step 18759/19560 | loss 3.301236 (+0.61z)| norm 0.2331 (+0.16z)| lr 2.67e-06 | 2532.21 ms | 53.3% bf16 MFU | 207014 tok/s +step 18760/19560 | loss 3.243359 (-1.10z)| norm 0.2236 (-0.50z)| lr 2.67e-06 | 2532.50 ms | 53.3% bf16 MFU | 207014 tok/s +step 18761/19560 | loss 3.296627 (+0.47z)| norm 0.2217 (-0.63z)| lr 2.66e-06 | 2534.08 ms | 53.3% bf16 MFU | 207008 tok/s +step 18762/19560 | loss 3.316462 (+1.04z)| norm 0.2181 (-0.88z)| lr 2.65e-06 | 2533.77 ms | 53.3% bf16 MFU | 207004 tok/s +step 18763/19560 | loss 3.275692 (-0.17z)| norm 0.2193 (-0.79z)| lr 2.65e-06 | 2533.93 ms | 53.3% bf16 MFU | 206999 tok/s +step 18764/19560 | loss 3.272241 (-0.27z)| norm 0.2360 (+0.37z)| lr 2.64e-06 | 2534.64 ms | 53.3% bf16 MFU | 206992 tok/s +step 18765/19560 | loss 3.345490 (+1.87z)| norm 0.2362 (+0.37z)| lr 2.63e-06 | 2533.35 ms | 53.3% bf16 MFU | 206990 tok/s +step 18766/19560 | loss 3.214926 (-1.92z)| norm 0.2439 (+0.90z)| lr 2.63e-06 | 2535.59 ms | 53.2% bf16 MFU | 206979 tok/s +step 18767/19560 | loss 3.310480 (+0.83z)| norm 0.2274 (-0.25z)| lr 2.62e-06 | 2533.17 ms | 53.3% bf16 MFU | 206978 tok/s +step 18768/19560 | loss 3.231675 (-1.42z)| norm 0.2296 (-0.10z)| lr 2.61e-06 | 2533.84 ms | 53.3% bf16 MFU | 206975 tok/s +step 18769/19560 | loss 3.285463 (+0.10z)| norm 0.2194 (-0.81z)| lr 2.61e-06 | 2532.90 ms | 53.3% bf16 MFU | 206976 tok/s +step 18770/19560 | loss 3.314090 (+0.92z)| norm 0.2458 (+1.02z)| lr 2.60e-06 | 2533.80 ms | 53.3% bf16 MFU | 206973 tok/s +step 18771/19560 | loss 3.278004 (-0.13z)| norm 0.2251 (-0.40z)| lr 2.59e-06 | 2532.99 ms | 53.3% bf16 MFU | 206974 tok/s +step 18772/19560 | loss 3.263167 (-0.57z)| norm 0.2262 (-0.33z)| lr 2.59e-06 | 2533.27 ms | 53.3% bf16 MFU | 206973 tok/s +step 18773/19560 | loss 3.226422 (-1.63z)| norm 0.2279 (-0.21z)| lr 2.58e-06 | 2533.09 ms | 53.3% bf16 MFU | 206973 tok/s +step 18774/19560 | loss 3.269904 (-0.35z)| norm 0.2175 (-0.93z)| lr 2.57e-06 | 2533.03 ms | 53.3% bf16 MFU | 206973 tok/s +step 18775/19560 | loss 3.279566 (-0.07z)| norm 0.2205 (-0.72z)| lr 2.57e-06 | 2533.64 ms | 53.3% bf16 MFU | 206971 tok/s +step 18776/19560 | loss 3.274583 (-0.22z)| norm 0.2244 (-0.45z)| lr 2.56e-06 | 2532.89 ms | 53.3% bf16 MFU | 206972 tok/s +step 18777/19560 | loss 3.339897 (+1.68z)| norm 0.2280 (-0.20z)| lr 2.55e-06 | 2532.81 ms | 53.3% bf16 MFU | 206974 tok/s +step 18778/19560 | loss 3.344254 (+1.78z)| norm 0.2252 (-0.39z)| lr 2.55e-06 | 2532.47 ms | 53.3% bf16 MFU | 206976 tok/s +step 18779/19560 | loss 3.313965 (+0.89z)| norm 0.2191 (-0.80z)| lr 2.54e-06 | 2534.44 ms | 53.3% bf16 MFU | 206971 tok/s +step 18780/19560 | loss 3.270020 (-0.39z)| norm 0.2309 (+0.03z)| lr 2.54e-06 | 2531.61 ms | 53.3% bf16 MFU | 206977 tok/s +step 18781/19560 | loss 3.262687 (-0.59z)| norm 0.2260 (-0.31z)| lr 2.53e-06 | 2534.05 ms | 53.3% bf16 MFU | 206973 tok/s +step 18782/19560 | loss 3.303775 (+0.60z)| norm 0.2283 (-0.15z)| lr 2.52e-06 | 2532.80 ms | 53.3% bf16 MFU | 206974 tok/s +step 18783/19560 | loss 3.324008 (+1.17z)| norm 0.2313 (+0.06z)| lr 2.52e-06 | 2532.86 ms | 53.3% bf16 MFU | 206975 tok/s +step 18784/19560 | loss 3.285039 (+0.05z)| norm 0.2219 (-0.60z)| lr 2.51e-06 | 2532.20 ms | 53.3% bf16 MFU | 206979 tok/s +step 18785/19560 | loss 3.284697 (+0.04z)| norm 0.2396 (+0.66z)| lr 2.50e-06 | 2533.60 ms | 53.3% bf16 MFU | 206977 tok/s +step 18786/19560 | loss 3.289025 (+0.16z)| norm 0.2322 (+0.14z)| lr 2.50e-06 | 2534.61 ms | 53.3% bf16 MFU | 206970 tok/s +step 18787/19560 | loss 3.312055 (+0.81z)| norm 0.2308 (+0.03z)| lr 2.49e-06 | 2532.58 ms | 53.3% bf16 MFU | 206973 tok/s +step 18788/19560 | loss 3.251670 (-0.94z)| norm 0.2791 (+3.34z)| lr 2.48e-06 | 2532.80 ms | 53.3% bf16 MFU | 206974 tok/s +step 18789/19560 | loss 3.309980 (+0.74z)| norm 0.2565 (+1.75z)| lr 2.48e-06 | 2533.02 ms | 53.3% bf16 MFU | 206974 tok/s +step 18790/19560 | loss 3.266487 (-0.51z)| norm 0.2404 (+0.64z)| lr 2.47e-06 | 2532.55 ms | 53.3% bf16 MFU | 206977 tok/s +step 18791/19560 | loss 3.313610 (+0.85z)| norm 0.2526 (+1.45z)| lr 2.46e-06 | 2533.05 ms | 53.3% bf16 MFU | 206977 tok/s +step 18792/19560 | loss 3.276073 (-0.24z)| norm 0.2246 (-0.44z)| lr 2.46e-06 | 2532.28 ms | 53.3% bf16 MFU | 206980 tok/s +step 18793/19560 | loss 3.313457 (+0.84z)| norm 0.2196 (-0.76z)| lr 2.45e-06 | 2533.08 ms | 53.3% bf16 MFU | 206980 tok/s +step 18794/19560 | loss 3.302655 (+0.50z)| norm 0.2256 (-0.36z)| lr 2.45e-06 | 2532.74 ms | 53.3% bf16 MFU | 206981 tok/s +step 18795/19560 | loss 3.287621 (+0.03z)| norm 0.2479 (+1.13z)| lr 2.44e-06 | 2531.05 ms | 53.3% bf16 MFU | 206989 tok/s +step 18796/19560 | loss 3.299971 (+0.41z)| norm 0.2219 (-0.61z)| lr 2.43e-06 | 2535.57 ms | 53.2% bf16 MFU | 206978 tok/s +step 18797/19560 | loss 3.324406 (+1.14z)| norm 0.2252 (-0.39z)| lr 2.43e-06 | 2533.60 ms | 53.3% bf16 MFU | 206976 tok/s +step 18798/19560 | loss 3.315641 (+0.86z)| norm 0.2166 (-0.96z)| lr 2.42e-06 | 2534.42 ms | 53.3% bf16 MFU | 206971 tok/s +step 18799/19560 | loss 3.310906 (+0.71z)| norm 0.2325 (+0.10z)| lr 2.41e-06 | 2535.17 ms | 53.3% bf16 MFU | 206962 tok/s +step 18800/19560 | loss 3.324354 (+1.11z)| norm 0.2370 (+0.40z)| lr 2.41e-06 | 2531.37 ms | 53.3% bf16 MFU | 206970 tok/s +step 18801/19560 | loss 3.307959 (+0.60z)| norm 0.2232 (-0.52z)| lr 2.40e-06 | 2531.30 ms | 53.3% bf16 MFU | 206978 tok/s +step 18802/19560 | loss 3.314127 (+0.78z)| norm 0.2262 (-0.32z)| lr 2.39e-06 | 2532.42 ms | 53.3% bf16 MFU | 206980 tok/s +step 18803/19560 | loss 3.331686 (+1.31z)| norm 0.2273 (-0.24z)| lr 2.39e-06 | 2533.61 ms | 53.3% bf16 MFU | 206978 tok/s +step 18804/19560 | loss 3.337816 (+1.48z)| norm 0.2672 (+2.36z)| lr 2.38e-06 | 2533.66 ms | 53.3% bf16 MFU | 206976 tok/s +step 18805/19560 | loss 3.323855 (+1.05z)| norm 0.2246 (-0.43z)| lr 2.38e-06 | 2531.34 ms | 53.3% bf16 MFU | 206983 tok/s +step 18806/19560 | loss 3.254166 (-1.09z)| norm 0.2365 (+0.36z)| lr 2.37e-06 | 2530.66 ms | 53.4% bf16 MFU | 206992 tok/s +step 18807/19560 | loss 3.368088 (+2.34z)| norm 0.2233 (-0.50z)| lr 2.36e-06 | 2534.14 ms | 53.3% bf16 MFU | 206987 tok/s +step 18808/19560 | loss 3.339404 (+1.45z)| norm 0.2496 (+1.23z)| lr 2.36e-06 | 2531.06 ms | 53.3% bf16 MFU | 206995 tok/s +step 18809/19560 | loss 3.275930 (-0.45z)| norm 0.2225 (-0.57z)| lr 2.35e-06 | 2534.59 ms | 53.3% bf16 MFU | 206988 tok/s +step 18810/19560 | loss 3.348637 (+1.74z)| norm 0.2323 (+0.09z)| lr 2.34e-06 | 2534.65 ms | 53.3% bf16 MFU | 206981 tok/s +step 18811/19560 | loss 3.281847 (-0.27z)| norm 0.2238 (-0.47z)| lr 2.34e-06 | 2531.98 ms | 53.3% bf16 MFU | 206985 tok/s +step 18812/19560 | loss 3.304980 (+0.43z)| norm 0.2384 (+0.50z)| lr 2.33e-06 | 2532.62 ms | 53.3% bf16 MFU | 206987 tok/s +step 18813/19560 | loss 3.247425 (-1.28z)| norm 0.2316 (+0.04z)| lr 2.33e-06 | 2533.54 ms | 53.3% bf16 MFU | 206984 tok/s +step 18814/19560 | loss 3.354843 (+1.89z)| norm 0.2639 (+2.13z)| lr 2.32e-06 | 2532.74 ms | 53.3% bf16 MFU | 206985 tok/s +step 18815/19560 | loss 3.395521 (+2.97z)| norm 0.2286 (-0.16z)| lr 2.31e-06 | 2530.89 ms | 53.3% bf16 MFU | 206994 tok/s +step 18816/19560 | loss 3.276431 (-0.42z)| norm 0.2275 (-0.23z)| lr 2.31e-06 | 2532.57 ms | 53.3% bf16 MFU | 206995 tok/s +step 18817/19560 | loss 3.320990 (+0.84z)| norm 0.2308 (-0.01z)| lr 2.30e-06 | 2532.67 ms | 53.3% bf16 MFU | 206996 tok/s +step 18818/19560 | loss 3.304012 (+0.37z)| norm 0.2243 (-0.44z)| lr 2.29e-06 | 2533.64 ms | 53.3% bf16 MFU | 206992 tok/s +step 18819/19560 | loss 3.320636 (+0.84z)| norm 0.2170 (-0.91z)| lr 2.29e-06 | 2533.41 ms | 53.3% bf16 MFU | 206990 tok/s +step 18820/19560 | loss 3.304876 (+0.37z)| norm 0.2271 (-0.25z)| lr 2.28e-06 | 2532.42 ms | 53.3% bf16 MFU | 206992 tok/s +step 18821/19560 | loss 3.293858 (+0.06z)| norm 0.2259 (-0.32z)| lr 2.28e-06 | 2532.85 ms | 53.3% bf16 MFU | 206992 tok/s +step 18822/19560 | loss 3.330145 (+1.09z)| norm 0.2245 (-0.40z)| lr 2.27e-06 | 2535.63 ms | 53.2% bf16 MFU | 206981 tok/s +step 18823/19560 | loss 3.337861 (+1.29z)| norm 0.2366 (+0.39z)| lr 2.26e-06 | 2531.86 ms | 53.3% bf16 MFU | 206986 tok/s +step 18824/19560 | loss 3.290671 (-0.07z)| norm 0.2284 (-0.16z)| lr 2.26e-06 | 2532.85 ms | 53.3% bf16 MFU | 206986 tok/s +step 18825/19560 | loss 3.319454 (+0.75z)| norm 0.2283 (-0.17z)| lr 2.25e-06 | 2532.50 ms | 53.3% bf16 MFU | 206988 tok/s +step 18826/19560 | loss 3.313389 (+0.57z)| norm 0.2215 (-0.61z)| lr 2.25e-06 | 2531.84 ms | 53.3% bf16 MFU | 206993 tok/s +step 18827/19560 | loss 3.309968 (+0.46z)| norm 0.2179 (-0.84z)| lr 2.24e-06 | 2534.96 ms | 53.3% bf16 MFU | 206984 tok/s +step 18828/19560 | loss 3.320559 (+0.75z)| norm 0.2306 (+0.00z)| lr 2.23e-06 | 2531.59 ms | 53.3% bf16 MFU | 206990 tok/s +step 18829/19560 | loss 3.339146 (+1.27z)| norm 0.2325 (+0.13z)| lr 2.23e-06 | 2532.17 ms | 53.3% bf16 MFU | 206993 tok/s +step 18830/19560 | loss 3.301634 (+0.17z)| norm 0.2213 (-0.61z)| lr 2.22e-06 | 2532.69 ms | 53.3% bf16 MFU | 206994 tok/s +step 18831/19560 | loss 3.305018 (+0.26z)| norm 0.2229 (-0.50z)| lr 2.22e-06 | 2534.17 ms | 53.3% bf16 MFU | 206988 tok/s +step 18832/19560 | loss 3.310821 (+0.42z)| norm 0.2216 (-0.58z)| lr 2.21e-06 | 2534.12 ms | 53.3% bf16 MFU | 206984 tok/s +step 18833/19560 | loss 3.307049 (+0.31z)| norm 0.2255 (-0.32z)| lr 2.20e-06 | 2532.64 ms | 53.3% bf16 MFU | 206985 tok/s +step 18834/19560 | loss 3.256690 (-1.16z)| norm 0.2280 (-0.15z)| lr 2.20e-06 | 2534.79 ms | 53.3% bf16 MFU | 206978 tok/s +step 18835/19560 | loss 3.301162 (+0.15z)| norm 0.2297 (-0.04z)| lr 2.19e-06 | 2533.04 ms | 53.3% bf16 MFU | 206978 tok/s +step 18836/19560 | loss 3.303490 (+0.22z)| norm 0.2198 (-0.68z)| lr 2.19e-06 | 2533.43 ms | 53.3% bf16 MFU | 206976 tok/s +step 18837/19560 | loss 3.412884 (+3.29z)| norm 0.2207 (-0.62z)| lr 2.18e-06 | 2532.33 ms | 53.3% bf16 MFU | 206979 tok/s +step 18838/19560 | loss 3.367389 (+1.96z)| norm 0.2278 (-0.15z)| lr 2.17e-06 | 2533.09 ms | 53.3% bf16 MFU | 206979 tok/s +step 18839/19560 | loss 3.369112 (+1.96z)| norm 0.2282 (-0.12z)| lr 2.17e-06 | 2532.66 ms | 53.3% bf16 MFU | 206981 tok/s +step 18840/19560 | loss 3.283544 (-0.43z)| norm 0.2225 (-0.49z)| lr 2.16e-06 | 2532.51 ms | 53.3% bf16 MFU | 206983 tok/s +step 18841/19560 | loss 3.270919 (-0.77z)| norm 0.2213 (-0.57z)| lr 2.16e-06 | 2533.70 ms | 53.3% bf16 MFU | 206980 tok/s +step 18842/19560 | loss 3.281601 (-0.47z)| norm 0.2320 (+0.14z)| lr 2.15e-06 | 2532.83 ms | 53.3% bf16 MFU | 206981 tok/s +step 18843/19560 | loss 3.284647 (-0.39z)| norm 0.2252 (-0.31z)| lr 2.14e-06 | 2532.08 ms | 53.3% bf16 MFU | 206985 tok/s +step 18844/19560 | loss 3.266567 (-0.88z)| norm 0.2224 (-0.49z)| lr 2.14e-06 | 2533.86 ms | 53.3% bf16 MFU | 206981 tok/s +step 18845/19560 | loss 3.261924 (-1.02z)| norm 0.2347 (+0.31z)| lr 2.13e-06 | 2532.36 ms | 53.3% bf16 MFU | 206984 tok/s +step 18846/19560 | loss 3.275817 (-0.64z)| norm 0.2227 (-0.48z)| lr 2.13e-06 | 2533.17 ms | 53.3% bf16 MFU | 206983 tok/s +step 18847/19560 | loss 3.297647 (-0.03z)| norm 0.2144 (-1.03z)| lr 2.12e-06 | 2532.55 ms | 53.3% bf16 MFU | 206985 tok/s +step 18848/19560 | loss 3.291039 (-0.22z)| norm 0.2289 (-0.07z)| lr 2.11e-06 | 2533.84 ms | 53.3% bf16 MFU | 206981 tok/s +step 18849/19560 | loss 3.316740 (+0.51z)| norm 0.2396 (+0.63z)| lr 2.11e-06 | 2532.13 ms | 53.3% bf16 MFU | 206985 tok/s +step 18850/19560 | loss 3.291374 (-0.23z)| norm 0.2443 (+0.92z)| lr 2.10e-06 | 2535.76 ms | 53.2% bf16 MFU | 206974 tok/s +step 18851/19560 | loss 3.314276 (+0.43z)| norm 0.2264 (-0.24z)| lr 2.10e-06 | 2531.56 ms | 53.3% bf16 MFU | 206980 tok/s +step 18852/19560 | loss 3.302446 (+0.08z)| norm 0.2254 (-0.31z)| lr 2.09e-06 | 2533.27 ms | 53.3% bf16 MFU | 206979 tok/s +step 18853/19560 | loss 3.304357 (+0.12z)| norm 0.2207 (-0.62z)| lr 2.08e-06 | 2534.18 ms | 53.3% bf16 MFU | 206974 tok/s +step 18854/19560 | loss 3.298061 (-0.06z)| norm 0.2205 (-0.63z)| lr 2.08e-06 | 2533.88 ms | 53.3% bf16 MFU | 206971 tok/s +step 18855/19560 | loss 3.222923 (-2.21z)| norm 0.2294 (-0.04z)| lr 2.07e-06 | 2533.67 ms | 53.3% bf16 MFU | 206969 tok/s +step 18856/19560 | loss 3.298828 (-0.01z)| norm 0.2325 (+0.16z)| lr 2.07e-06 | 2534.18 ms | 53.3% bf16 MFU | 206965 tok/s +step 18857/19560 | loss 3.195832 (-2.88z)| norm 0.2379 (+0.51z)| lr 2.06e-06 | 2535.24 ms | 53.3% bf16 MFU | 206957 tok/s +step 18858/19560 | loss 3.371082 (+2.00z)| norm 0.2266 (-0.25z)| lr 2.05e-06 | 2536.04 ms | 53.2% bf16 MFU | 206946 tok/s +step 18859/19560 | loss 3.309197 (+0.28z)| norm 0.2313 (+0.07z)| lr 2.05e-06 | 2531.85 ms | 53.3% bf16 MFU | 206952 tok/s +step 18860/19560 | loss 3.255705 (-1.20z)| norm 0.2342 (+0.25z)| lr 2.04e-06 | 2533.43 ms | 53.3% bf16 MFU | 206952 tok/s +step 18861/19560 | loss 3.290664 (-0.23z)| norm 0.2174 (-0.85z)| lr 2.04e-06 | 2532.71 ms | 53.3% bf16 MFU | 206955 tok/s +step 18862/19560 | loss 3.304738 (+0.16z)| norm 0.2237 (-0.43z)| lr 2.03e-06 | 2534.28 ms | 53.3% bf16 MFU | 206951 tok/s +step 18863/19560 | loss 3.290346 (-0.24z)| norm 0.2189 (-0.74z)| lr 2.03e-06 | 2533.86 ms | 53.3% bf16 MFU | 206949 tok/s +step 18864/19560 | loss 3.353312 (+1.48z)| norm 0.2271 (-0.20z)| lr 2.02e-06 | 2535.35 ms | 53.3% bf16 MFU | 206941 tok/s +step 18865/19560 | loss 3.337800 (+1.05z)| norm 0.2227 (-0.49z)| lr 2.01e-06 | 2532.10 ms | 53.3% bf16 MFU | 206947 tok/s +step 18866/19560 | loss 3.278849 (-0.56z)| norm 0.2302 (-0.00z)| lr 2.01e-06 | 2532.84 ms | 53.3% bf16 MFU | 206949 tok/s +step 18867/19560 | loss 3.274654 (-0.67z)| norm 0.2193 (-0.71z)| lr 2.00e-06 | 2533.38 ms | 53.3% bf16 MFU | 206949 tok/s +step 18868/19560 | loss 3.251425 (-1.31z)| norm 0.2226 (-0.48z)| lr 2.00e-06 | 2532.55 ms | 53.3% bf16 MFU | 206953 tok/s +step 18869/19560 | loss 3.276883 (-0.60z)| norm 0.2201 (-0.64z)| lr 1.99e-06 | 2534.47 ms | 53.3% bf16 MFU | 206948 tok/s +step 18870/19560 | loss 3.331658 (+0.91z)| norm 0.2204 (-0.61z)| lr 1.99e-06 | 2533.09 ms | 53.3% bf16 MFU | 206950 tok/s +step 18871/19560 | loss 3.313962 (+0.41z)| norm 0.2312 (+0.10z)| lr 1.98e-06 | 2533.27 ms | 53.3% bf16 MFU | 206950 tok/s +step 18872/19560 | loss 3.362180 (+1.72z)| norm 0.2333 (+0.24z)| lr 1.97e-06 | 2533.82 ms | 53.3% bf16 MFU | 206949 tok/s +step 18873/19560 | loss 3.267076 (-0.88z)| norm 0.2239 (-0.37z)| lr 1.97e-06 | 2535.61 ms | 53.2% bf16 MFU | 206940 tok/s +step 18874/19560 | loss 3.293952 (-0.14z)| norm 0.2273 (-0.15z)| lr 1.96e-06 | 2533.68 ms | 53.3% bf16 MFU | 206939 tok/s +step 18875/19560 | loss 3.275713 (-0.66z)| norm 0.2255 (-0.28z)| lr 1.96e-06 | 2532.70 ms | 53.3% bf16 MFU | 206942 tok/s +step 18876/19560 | loss 3.321755 (+0.61z)| norm 0.2410 (+0.75z)| lr 1.95e-06 | 2534.04 ms | 53.3% bf16 MFU | 206940 tok/s +step 18877/19560 | loss 3.245902 (-1.48z)| norm 0.2158 (-0.91z)| lr 1.95e-06 | 2532.53 ms | 53.3% bf16 MFU | 206944 tok/s +step 18878/19560 | loss 3.316531 (+0.47z)| norm 0.2275 (-0.14z)| lr 1.94e-06 | 2532.79 ms | 53.3% bf16 MFU | 206947 tok/s +step 18879/19560 | loss 3.300241 (+0.02z)| norm 0.2221 (-0.49z)| lr 1.93e-06 | 2535.27 ms | 53.3% bf16 MFU | 206940 tok/s +step 18880/19560 | loss 3.355315 (+1.56z)| norm 0.2435 (+1.46z)| lr 1.93e-06 | 2534.26 ms | 53.3% bf16 MFU | 206937 tok/s +step 18881/19560 | loss 3.309585 (+0.27z)| norm 0.2180 (-1.04z)| lr 1.92e-06 | 2535.21 ms | 53.3% bf16 MFU | 206930 tok/s +step 18882/19560 | loss 3.272157 (-0.78z)| norm 0.2288 (+0.01z)| lr 1.92e-06 | 2533.59 ms | 53.3% bf16 MFU | 206930 tok/s +step 18883/19560 | loss 3.372848 (+2.03z)| norm 0.2369 (+0.80z)| lr 1.91e-06 | 2532.48 ms | 53.3% bf16 MFU | 206935 tok/s +step 18884/19560 | loss 3.367532 (+1.84z)| norm 0.2302 (+0.14z)| lr 1.91e-06 | 2532.70 ms | 53.3% bf16 MFU | 206939 tok/s +step 18885/19560 | loss 3.351347 (+1.37z)| norm 0.2207 (-0.79z)| lr 1.90e-06 | 2533.70 ms | 53.3% bf16 MFU | 206938 tok/s +step 18886/19560 | loss 3.288606 (-0.36z)| norm 0.2290 (+0.02z)| lr 1.89e-06 | 2535.47 ms | 53.3% bf16 MFU | 206930 tok/s +step 18887/19560 | loss 3.290876 (-0.30z)| norm 0.2223 (-0.64z)| lr 1.89e-06 | 2532.24 ms | 53.3% bf16 MFU | 206936 tok/s +step 18888/19560 | loss 3.287670 (-0.40z)| norm 0.2189 (-0.97z)| lr 1.88e-06 | 2533.00 ms | 53.3% bf16 MFU | 206938 tok/s +step 18889/19560 | loss 3.308279 (+0.17z)| norm 0.2235 (-0.52z)| lr 1.88e-06 | 2533.51 ms | 53.3% bf16 MFU | 206938 tok/s +step 18890/19560 | loss 3.307157 (+0.14z)| norm 0.2257 (-0.31z)| lr 1.87e-06 | 2533.41 ms | 53.3% bf16 MFU | 206939 tok/s +step 18891/19560 | loss 3.308011 (+0.16z)| norm 0.2266 (-0.23z)| lr 1.87e-06 | 2533.89 ms | 53.3% bf16 MFU | 206937 tok/s +step 18892/19560 | loss 3.267389 (-0.98z)| norm 0.2274 (-0.14z)| lr 1.86e-06 | 2533.23 ms | 53.3% bf16 MFU | 206939 tok/s +step 18893/19560 | loss 3.288223 (-0.38z)| norm 0.2189 (-0.97z)| lr 1.86e-06 | 2532.58 ms | 53.3% bf16 MFU | 206943 tok/s +step 18894/19560 | loss 3.295597 (-0.20z)| norm 0.2263 (-0.23z)| lr 1.85e-06 | 2531.85 ms | 53.3% bf16 MFU | 206950 tok/s +step 18895/19560 | loss 3.267460 (-1.00z)| norm 0.2428 (+1.40z)| lr 1.84e-06 | 2532.37 ms | 53.3% bf16 MFU | 206954 tok/s +step 18896/19560 | loss 3.199630 (-2.88z)| norm 0.2196 (-0.89z)| lr 1.84e-06 | 2535.44 ms | 53.3% bf16 MFU | 206945 tok/s +step 18897/19560 | loss 3.288383 (-0.38z)| norm 0.2202 (-0.83z)| lr 1.83e-06 | 2534.13 ms | 53.3% bf16 MFU | 206942 tok/s +step 18898/19560 | loss 3.299637 (-0.06z)| norm 0.2176 (-1.08z)| lr 1.83e-06 | 2535.74 ms | 53.2% bf16 MFU | 206933 tok/s +step 18899/19560 | loss 3.339716 (+1.05z)| norm 0.2201 (-0.82z)| lr 1.82e-06 | 2533.10 ms | 53.3% bf16 MFU | 206935 tok/s +step 18900/19560 | loss 3.261716 (-1.14z)| norm 0.2203 (-0.79z)| lr 1.82e-06 | 2533.37 ms | 53.3% bf16 MFU | 206936 tok/s +step 18901/19560 | loss 3.355440 (+1.48z)| norm 0.2323 (+0.39z)| lr 1.81e-06 | 2533.73 ms | 53.3% bf16 MFU | 206936 tok/s +step 18902/19560 | loss 3.339125 (+1.00z)| norm 0.2188 (-0.94z)| lr 1.81e-06 | 2532.59 ms | 53.3% bf16 MFU | 206940 tok/s +step 18903/19560 | loss 3.380187 (+2.11z)| norm 0.2276 (-0.08z)| lr 1.80e-06 | 2533.46 ms | 53.3% bf16 MFU | 206940 tok/s +step 18904/19560 | loss 3.253559 (-1.41z)| norm 0.2312 (+0.28z)| lr 1.79e-06 | 2533.92 ms | 53.3% bf16 MFU | 206938 tok/s +step 18905/19560 | loss 3.272012 (-0.89z)| norm 0.2167 (-1.16z)| lr 1.79e-06 | 2534.91 ms | 53.3% bf16 MFU | 206933 tok/s +step 18906/19560 | loss 3.309436 (+0.16z)| norm 0.2340 (+0.56z)| lr 1.78e-06 | 2533.32 ms | 53.3% bf16 MFU | 206934 tok/s +step 18907/19560 | loss 3.288538 (-0.42z)| norm 0.2435 (+1.47z)| lr 1.78e-06 | 2535.82 ms | 53.2% bf16 MFU | 206925 tok/s +step 18908/19560 | loss 3.270409 (-0.92z)| norm 0.2183 (-1.01z)| lr 1.77e-06 | 2535.43 ms | 53.3% bf16 MFU | 206918 tok/s +step 18909/19560 | loss 3.298865 (-0.14z)| norm 0.2460 (+1.69z)| lr 1.77e-06 | 2534.58 ms | 53.3% bf16 MFU | 206915 tok/s +step 18910/19560 | loss 3.328365 (+0.69z)| norm 0.2236 (-0.49z)| lr 1.76e-06 | 2534.54 ms | 53.3% bf16 MFU | 206912 tok/s +step 18911/19560 | loss 3.346183 (+1.18z)| norm 0.2223 (-0.61z)| lr 1.76e-06 | 2535.63 ms | 53.2% bf16 MFU | 206905 tok/s +step 18912/19560 | loss 3.330882 (+0.74z)| norm 0.2279 (-0.07z)| lr 1.75e-06 | 2533.71 ms | 53.3% bf16 MFU | 206906 tok/s +step 18913/19560 | loss 3.299260 (-0.15z)| norm 0.2242 (-0.42z)| lr 1.75e-06 | 2533.75 ms | 53.3% bf16 MFU | 206906 tok/s +step 18914/19560 | loss 3.288403 (-0.45z)| norm 0.2269 (-0.15z)| lr 1.74e-06 | 2534.15 ms | 53.3% bf16 MFU | 206906 tok/s +step 18915/19560 | loss 3.291554 (-0.36z)| norm 0.2348 (+0.61z)| lr 1.74e-06 | 2533.33 ms | 53.3% bf16 MFU | 206908 tok/s +step 18916/19560 | loss 3.321012 (+0.45z)| norm 0.2199 (-0.87z)| lr 1.73e-06 | 2533.22 ms | 53.3% bf16 MFU | 206911 tok/s +step 18917/19560 | loss 3.237827 (-1.85z)| norm 0.2247 (-0.35z)| lr 1.72e-06 | 2533.48 ms | 53.3% bf16 MFU | 206913 tok/s +step 18918/19560 | loss 3.409943 (+2.82z)| norm 0.2264 (-0.14z)| lr 1.72e-06 | 2534.58 ms | 53.3% bf16 MFU | 206910 tok/s +step 18919/19560 | loss 3.310523 (+0.14z)| norm 0.2267 (-0.09z)| lr 1.71e-06 | 2533.97 ms | 53.3% bf16 MFU | 206909 tok/s +step 18920/19560 | loss 3.277201 (-0.76z)| norm 0.2180 (-1.10z)| lr 1.71e-06 | 2533.96 ms | 53.3% bf16 MFU | 206909 tok/s +step 18921/19560 | loss 3.295361 (-0.27z)| norm 0.2348 (+0.84z)| lr 1.70e-06 | 2534.66 ms | 53.3% bf16 MFU | 206906 tok/s +step 18922/19560 | loss 3.294705 (-0.29z)| norm 0.2761 (+5.03z)| lr 1.70e-06 | 2535.31 ms | 53.3% bf16 MFU | 206900 tok/s +step 18923/19560 | loss 3.329164 (+0.64z)| norm 0.2158 (-1.25z)| lr 1.69e-06 | 2532.81 ms | 53.3% bf16 MFU | 206905 tok/s +step 18924/19560 | loss 3.183564 (-3.14z)| norm 0.2357 (+0.83z)| lr 1.69e-06 | 2534.15 ms | 53.3% bf16 MFU | 206905 tok/s +step 18925/19560 | loss 3.280776 (-0.61z)| norm 0.2190 (-0.91z)| lr 1.68e-06 | 2534.44 ms | 53.3% bf16 MFU | 206903 tok/s +step 18926/19560 | loss 3.318726 (+0.37z)| norm 0.2187 (-0.95z)| lr 1.68e-06 | 2532.13 ms | 53.3% bf16 MFU | 206910 tok/s +step 18927/19560 | loss 3.346508 (+1.08z)| norm 0.2191 (-0.89z)| lr 1.67e-06 | 2531.70 ms | 53.3% bf16 MFU | 206919 tok/s +step 18928/19560 | loss 3.289274 (-0.39z)| norm 0.2268 (-0.08z)| lr 1.67e-06 | 2533.31 ms | 53.3% bf16 MFU | 206921 tok/s +step 18929/19560 | loss 3.302486 (-0.05z)| norm 0.2162 (-1.19z)| lr 1.66e-06 | 2533.85 ms | 53.3% bf16 MFU | 206921 tok/s +step 18930/19560 | loss 3.366400 (+1.58z)| norm 0.2652 (+3.70z)| lr 1.66e-06 | 2532.86 ms | 53.3% bf16 MFU | 206924 tok/s +step 18931/19560 | loss 3.277156 (-0.69z)| norm 0.2590 (+2.96z)| lr 1.65e-06 | 2532.03 ms | 53.3% bf16 MFU | 206931 tok/s +step 18932/19560 | loss 3.290574 (-0.34z)| norm 0.2196 (-0.82z)| lr 1.65e-06 | 2534.55 ms | 53.3% bf16 MFU | 206928 tok/s +step 18933/19560 | loss 3.236986 (-1.68z)| norm 0.2294 (+0.17z)| lr 1.64e-06 | 2533.73 ms | 53.3% bf16 MFU | 206927 tok/s +step 18934/19560 | loss 3.319363 (+0.40z)| norm 0.2151 (-1.25z)| lr 1.63e-06 | 2532.77 ms | 53.3% bf16 MFU | 206931 tok/s +step 18935/19560 | loss 3.300101 (-0.08z)| norm 0.2209 (-0.67z)| lr 1.63e-06 | 2532.81 ms | 53.3% bf16 MFU | 206934 tok/s +step 18936/19560 | loss 3.251074 (-1.32z)| norm 0.2391 (+1.19z)| lr 1.62e-06 | 2532.41 ms | 53.3% bf16 MFU | 206939 tok/s +step 18937/19560 | loss 3.377486 (+1.88z)| norm 0.2523 (+2.46z)| lr 1.62e-06 | 2532.45 ms | 53.3% bf16 MFU | 206944 tok/s +step 18938/19560 | loss 3.256019 (-1.18z)| norm 0.2703 (+3.96z)| lr 1.61e-06 | 2535.88 ms | 53.2% bf16 MFU | 206934 tok/s +step 18939/19560 | loss 3.295198 (-0.19z)| norm 0.2292 (+0.11z)| lr 1.61e-06 | 2531.84 ms | 53.3% bf16 MFU | 206941 tok/s +step 18940/19560 | loss 3.347092 (+1.11z)| norm 0.2240 (-0.37z)| lr 1.60e-06 | 2530.88 ms | 53.3% bf16 MFU | 206952 tok/s +step 18941/19560 | loss 3.326841 (+0.59z)| norm 0.2152 (-1.17z)| lr 1.60e-06 | 2532.33 ms | 53.3% bf16 MFU | 206956 tok/s +step 18942/19560 | loss 3.309966 (+0.17z)| norm 0.2264 (-0.11z)| lr 1.59e-06 | 2533.63 ms | 53.3% bf16 MFU | 206955 tok/s +step 18943/19560 | loss 3.394936 (+2.34z)| norm 0.2293 (+0.18z)| lr 1.59e-06 | 2532.67 ms | 53.3% bf16 MFU | 206958 tok/s +step 18944/19560 | loss 3.313178 (+0.25z)| norm 0.2272 (-0.03z)| lr 1.58e-06 | 2533.34 ms | 53.3% bf16 MFU | 206958 tok/s +step 18945/19560 | loss 3.345198 (+1.06z)| norm 0.2282 (+0.07z)| lr 1.58e-06 | 2533.30 ms | 53.3% bf16 MFU | 206958 tok/s +step 18946/19560 | loss 3.240926 (-1.58z)| norm 0.2245 (-0.29z)| lr 1.57e-06 | 2533.07 ms | 53.3% bf16 MFU | 206959 tok/s +step 18947/19560 | loss 3.299837 (-0.08z)| norm 0.2582 (+2.88z)| lr 1.57e-06 | 2532.58 ms | 53.3% bf16 MFU | 206962 tok/s +step 18948/19560 | loss 3.279016 (-0.60z)| norm 0.2215 (-0.60z)| lr 1.56e-06 | 2530.58 ms | 53.4% bf16 MFU | 206972 tok/s +step 18949/19560 | loss 3.318257 (+0.38z)| norm 0.2265 (-0.12z)| lr 1.56e-06 | 2531.72 ms | 53.3% bf16 MFU | 206978 tok/s +step 18950/19560 | loss 3.245243 (-1.44z)| norm 0.2226 (-0.48z)| lr 1.55e-06 | 2533.85 ms | 53.3% bf16 MFU | 206975 tok/s +step 18951/19560 | loss 3.387755 (+2.10z)| norm 0.2210 (-0.62z)| lr 1.55e-06 | 2532.76 ms | 53.3% bf16 MFU | 206976 tok/s +step 18952/19560 | loss 3.302465 (-0.01z)| norm 0.2274 (-0.02z)| lr 1.54e-06 | 2533.21 ms | 53.3% bf16 MFU | 206976 tok/s +step 18953/19560 | loss 3.340118 (+0.91z)| norm 0.2347 (+0.66z)| lr 1.54e-06 | 2531.41 ms | 53.3% bf16 MFU | 206983 tok/s +step 18954/19560 | loss 3.295810 (-0.18z)| norm 0.2761 (+4.23z)| lr 1.53e-06 | 2534.34 ms | 53.3% bf16 MFU | 206977 tok/s +step 18955/19560 | loss 3.280112 (-0.56z)| norm 0.2174 (-0.94z)| lr 1.53e-06 | 2533.45 ms | 53.3% bf16 MFU | 206976 tok/s +step 18956/19560 | loss 3.281706 (-0.51z)| norm 0.2236 (-0.39z)| lr 1.52e-06 | 2533.71 ms | 53.3% bf16 MFU | 206973 tok/s +step 18957/19560 | loss 3.273121 (-0.71z)| norm 0.2183 (-0.85z)| lr 1.52e-06 | 2534.38 ms | 53.3% bf16 MFU | 206968 tok/s +step 18958/19560 | loss 3.260458 (-1.01z)| norm 0.2233 (-0.40z)| lr 1.51e-06 | 2531.97 ms | 53.3% bf16 MFU | 206973 tok/s +step 18959/19560 | loss 3.318464 (+0.41z)| norm 0.2250 (-0.26z)| lr 1.51e-06 | 2533.87 ms | 53.3% bf16 MFU | 206970 tok/s +step 18960/19560 | loss 3.258636 (-1.04z)| norm 0.2223 (-0.49z)| lr 1.50e-06 | 2531.48 ms | 53.3% bf16 MFU | 206977 tok/s +step 18961/19560 | loss 3.313781 (+0.30z)| norm 0.2201 (-0.69z)| lr 1.50e-06 | 2532.31 ms | 53.3% bf16 MFU | 206980 tok/s +step 18962/19560 | loss 3.280682 (-0.51z)| norm 0.2261 (-0.16z)| lr 1.49e-06 | 2532.77 ms | 53.3% bf16 MFU | 206981 tok/s +step 18963/19560 | loss 3.300791 (-0.02z)| norm 0.2263 (-0.14z)| lr 1.49e-06 | 2536.09 ms | 53.2% bf16 MFU | 206969 tok/s +step 18964/19560 | loss 3.294664 (-0.17z)| norm 0.2206 (-0.64z)| lr 1.48e-06 | 2534.69 ms | 53.3% bf16 MFU | 206962 tok/s +step 18965/19560 | loss 3.278204 (-0.56z)| norm 0.2204 (-0.66z)| lr 1.48e-06 | 2533.13 ms | 53.3% bf16 MFU | 206963 tok/s +step 18966/19560 | loss 3.306370 (+0.16z)| norm 0.2302 (+0.20z)| lr 1.47e-06 | 2535.17 ms | 53.3% bf16 MFU | 206955 tok/s +step 18967/19560 | loss 3.361922 (+1.59z)| norm 0.2404 (+1.08z)| lr 1.47e-06 | 2533.82 ms | 53.3% bf16 MFU | 206953 tok/s +step 18968/19560 | loss 3.279997 (-0.51z)| norm 0.2374 (+0.81z)| lr 1.46e-06 | 2533.41 ms | 53.3% bf16 MFU | 206953 tok/s +step 18969/19560 | loss 3.286245 (-0.35z)| norm 0.2243 (-0.34z)| lr 1.46e-06 | 2534.21 ms | 53.3% bf16 MFU | 206949 tok/s +step 18970/19560 | loss 3.250654 (-1.25z)| norm 0.2199 (-0.71z)| lr 1.45e-06 | 2535.42 ms | 53.3% bf16 MFU | 206941 tok/s +step 18971/19560 | loss 3.295673 (-0.11z)| norm 0.2338 (+0.49z)| lr 1.45e-06 | 2532.22 ms | 53.3% bf16 MFU | 206947 tok/s +step 18972/19560 | loss 3.275945 (-0.61z)| norm 0.2342 (+0.52z)| lr 1.44e-06 | 2534.26 ms | 53.3% bf16 MFU | 206943 tok/s +step 18973/19560 | loss 3.320322 (+0.51z)| norm 0.2178 (-0.89z)| lr 1.44e-06 | 2533.70 ms | 53.3% bf16 MFU | 206942 tok/s +step 18974/19560 | loss 3.285889 (-0.37z)| norm 0.2189 (-0.79z)| lr 1.43e-06 | 2535.02 ms | 53.3% bf16 MFU | 206936 tok/s +step 18975/19560 | loss 3.293197 (-0.18z)| norm 0.2240 (-0.36z)| lr 1.43e-06 | 2535.82 ms | 53.2% bf16 MFU | 206927 tok/s +step 18976/19560 | loss 3.332173 (+0.81z)| norm 0.2228 (-0.46z)| lr 1.42e-06 | 2535.06 ms | 53.3% bf16 MFU | 206921 tok/s +step 18977/19560 | loss 3.290468 (-0.26z)| norm 0.2618 (+2.84z)| lr 1.42e-06 | 2532.56 ms | 53.3% bf16 MFU | 206926 tok/s +step 18978/19560 | loss 3.289914 (-0.27z)| norm 0.2529 (+2.06z)| lr 1.41e-06 | 2531.68 ms | 53.3% bf16 MFU | 206934 tok/s +step 18979/19560 | loss 3.276044 (-0.62z)| norm 0.2240 (-0.36z)| lr 1.41e-06 | 2531.48 ms | 53.3% bf16 MFU | 206943 tok/s +step 18980/19560 | loss 3.332513 (+0.82z)| norm 0.2300 (+0.14z)| lr 1.40e-06 | 2531.97 ms | 53.3% bf16 MFU | 206949 tok/s +step 18981/19560 | loss 3.271492 (-0.73z)| norm 0.2210 (-0.61z)| lr 1.40e-06 | 2532.99 ms | 53.3% bf16 MFU | 206951 tok/s +step 18982/19560 | loss 3.267353 (-0.83z)| norm 0.2337 (+0.44z)| lr 1.39e-06 | 2532.48 ms | 53.3% bf16 MFU | 206955 tok/s +step 18983/19560 | loss 3.309562 (+0.23z)| norm 0.2227 (-0.48z)| lr 1.39e-06 | 2532.28 ms | 53.3% bf16 MFU | 206959 tok/s +step 18984/19560 | loss 3.256546 (-1.12z)| norm 0.2479 (+1.61z)| lr 1.38e-06 | 2531.87 ms | 53.3% bf16 MFU | 206965 tok/s +step 18985/19560 | loss 3.285606 (-0.41z)| norm 0.2223 (-0.51z)| lr 1.38e-06 | 2532.07 ms | 53.3% bf16 MFU | 206970 tok/s +step 18986/19560 | loss 3.338489 (+1.01z)| norm 0.2204 (-0.66z)| lr 1.38e-06 | 2534.27 ms | 53.3% bf16 MFU | 206965 tok/s +step 18987/19560 | loss 3.252856 (-1.26z)| norm 0.2355 (+0.59z)| lr 1.37e-06 | 2532.09 ms | 53.3% bf16 MFU | 206970 tok/s +step 18988/19560 | loss 3.245835 (-1.44z)| norm 0.2398 (+0.94z)| lr 1.37e-06 | 2531.97 ms | 53.3% bf16 MFU | 206975 tok/s +step 18989/19560 | loss 3.331287 (+0.81z)| norm 0.2284 (-0.01z)| lr 1.36e-06 | 2532.66 ms | 53.3% bf16 MFU | 206976 tok/s +step 18990/19560 | loss 3.351429 (+1.32z)| norm 0.2292 (+0.06z)| lr 1.36e-06 | 2534.10 ms | 53.3% bf16 MFU | 206972 tok/s +step 18991/19560 | loss 3.316714 (+0.41z)| norm 0.2387 (+0.83z)| lr 1.35e-06 | 2532.45 ms | 53.3% bf16 MFU | 206975 tok/s +step 18992/19560 | loss 3.311261 (+0.28z)| norm 0.2300 (+0.11z)| lr 1.35e-06 | 2532.75 ms | 53.3% bf16 MFU | 206976 tok/s +step 18993/19560 | loss 3.300041 (-0.01z)| norm 0.2213 (-0.61z)| lr 1.34e-06 | 2530.86 ms | 53.3% bf16 MFU | 206986 tok/s +step 18994/19560 | loss 3.328711 (+0.74z)| norm 0.2246 (-0.33z)| lr 1.34e-06 | 2534.87 ms | 53.3% bf16 MFU | 206978 tok/s +step 18995/19560 | loss 3.344593 (+1.14z)| norm 0.2176 (-0.91z)| lr 1.33e-06 | 2532.07 ms | 53.3% bf16 MFU | 206982 tok/s +step 18996/19560 | loss 3.333039 (+0.82z)| norm 0.2250 (-0.31z)| lr 1.33e-06 | 2532.30 ms | 53.3% bf16 MFU | 206985 tok/s +step 18997/19560 | loss 3.271861 (-0.80z)| norm 0.2176 (-0.92z)| lr 1.32e-06 | 2532.25 ms | 53.3% bf16 MFU | 206988 tok/s +step 18998/19560 | loss 3.251784 (-1.31z)| norm 0.2269 (-0.15z)| lr 1.32e-06 | 2531.99 ms | 53.3% bf16 MFU | 206992 tok/s +step 18999/19560 | loss 3.331459 (+0.79z)| norm 0.2329 (+0.35z)| lr 1.31e-06 | 2531.60 ms | 53.3% bf16 MFU | 206997 tok/s +step 19000/19560 | loss 3.229005 (-1.87z)| norm 0.2270 (-0.14z)| lr 1.31e-06 | 2534.76 ms | 53.3% bf16 MFU | 206989 tok/s +val loss 3.285374 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3038/10042 = 0.302529 +step 19001/19560 | loss 3.335665 (+0.91z)| norm 0.2169 (-0.97z)| lr 1.30e-06 | 2531.20 ms | 53.3% bf16 MFU | 206996 tok/s +step 19002/19560 | loss 3.307029 (+0.15z)| norm 0.2224 (-0.51z)| lr 1.30e-06 | 2531.27 ms | 53.3% bf16 MFU | 207002 tok/s +step 19003/19560 | loss 3.299442 (-0.05z)| norm 0.2199 (-0.71z)| lr 1.29e-06 | 2533.66 ms | 53.3% bf16 MFU | 206999 tok/s +step 19004/19560 | loss 3.286257 (-0.39z)| norm 0.2268 (-0.13z)| lr 1.29e-06 | 2532.46 ms | 53.3% bf16 MFU | 207000 tok/s +step 19005/19560 | loss 3.286247 (-0.40z)| norm 0.2230 (-0.46z)| lr 1.29e-06 | 2534.84 ms | 53.3% bf16 MFU | 206992 tok/s +step 19006/19560 | loss 3.292264 (-0.23z)| norm 0.2283 (-0.01z)| lr 1.28e-06 | 2532.62 ms | 53.3% bf16 MFU | 206993 tok/s +step 19007/19560 | loss 3.349235 (+1.25z)| norm 0.2190 (-0.78z)| lr 1.28e-06 | 2533.02 ms | 53.3% bf16 MFU | 206992 tok/s +step 19008/19560 | loss 3.270218 (-0.81z)| norm 0.2323 (+0.33z)| lr 1.27e-06 | 2532.95 ms | 53.3% bf16 MFU | 206992 tok/s +step 19009/19560 | loss 3.336232 (+0.93z)| norm 0.2212 (-0.60z)| lr 1.27e-06 | 2531.47 ms | 53.3% bf16 MFU | 206998 tok/s +step 19010/19560 | loss 3.254349 (-1.22z)| norm 0.2267 (-0.14z)| lr 1.26e-06 | 2534.44 ms | 53.3% bf16 MFU | 206991 tok/s +step 19011/19560 | loss 3.327401 (+0.71z)| norm 0.2378 (+0.79z)| lr 1.26e-06 | 2533.05 ms | 53.3% bf16 MFU | 206991 tok/s +step 19012/19560 | loss 3.310193 (+0.27z)| norm 0.2446 (+1.34z)| lr 1.25e-06 | 2533.47 ms | 53.3% bf16 MFU | 206988 tok/s +step 19013/19560 | loss 3.319428 (+0.53z)| norm 0.2377 (+0.75z)| lr 1.25e-06 | 2533.36 ms | 53.3% bf16 MFU | 206987 tok/s +step 19014/19560 | loss 3.329892 (+0.80z)| norm 0.2224 (-0.51z)| lr 1.24e-06 | 2535.33 ms | 53.3% bf16 MFU | 206977 tok/s +step 19015/19560 | loss 3.235522 (-1.71z)| norm 0.2383 (+0.80z)| lr 1.24e-06 | 2532.16 ms | 53.3% bf16 MFU | 206981 tok/s +step 19016/19560 | loss 3.313906 (+0.37z)| norm 0.2266 (-0.18z)| lr 1.24e-06 | 2532.07 ms | 53.3% bf16 MFU | 206985 tok/s +step 19017/19560 | loss 3.304948 (+0.13z)| norm 0.2209 (-0.65z)| lr 1.23e-06 | 2534.67 ms | 53.3% bf16 MFU | 206978 tok/s +step 19018/19560 | loss 3.317688 (+0.47z)| norm 0.2188 (-0.82z)| lr 1.23e-06 | 2533.67 ms | 53.3% bf16 MFU | 206975 tok/s +step 19019/19560 | loss 3.342578 (+1.12z)| norm 0.2194 (-0.76z)| lr 1.22e-06 | 2532.47 ms | 53.3% bf16 MFU | 206978 tok/s +step 19020/19560 | loss 3.253623 (-1.23z)| norm 0.2313 (+0.22z)| lr 1.22e-06 | 2533.42 ms | 53.3% bf16 MFU | 206976 tok/s +step 19021/19560 | loss 3.357153 (+1.48z)| norm 0.2468 (+1.47z)| lr 1.21e-06 | 2532.06 ms | 53.3% bf16 MFU | 206980 tok/s +step 19022/19560 | loss 3.249835 (-1.32z)| norm 0.2259 (-0.24z)| lr 1.21e-06 | 2533.62 ms | 53.3% bf16 MFU | 206978 tok/s +step 19023/19560 | loss 3.437609 (+3.39z)| norm 0.2395 (+0.87z)| lr 1.20e-06 | 2530.38 ms | 53.4% bf16 MFU | 206989 tok/s +step 19024/19560 | loss 3.255339 (-1.19z)| norm 0.2622 (+2.64z)| lr 1.20e-06 | 2532.98 ms | 53.3% bf16 MFU | 206989 tok/s +step 19025/19560 | loss 3.245419 (-1.42z)| norm 0.2265 (-0.22z)| lr 1.19e-06 | 2531.74 ms | 53.3% bf16 MFU | 206994 tok/s +step 19026/19560 | loss 3.322481 (+0.52z)| norm 0.2295 (+0.01z)| lr 1.19e-06 | 2534.46 ms | 53.3% bf16 MFU | 206987 tok/s +step 19027/19560 | loss 3.305480 (+0.10z)| norm 0.2295 (+0.01z)| lr 1.19e-06 | 2534.11 ms | 53.3% bf16 MFU | 206982 tok/s +step 19028/19560 | loss 3.289362 (-0.32z)| norm 0.2156 (-1.10z)| lr 1.18e-06 | 2532.32 ms | 53.3% bf16 MFU | 206985 tok/s +step 19029/19560 | loss 3.241379 (-1.51z)| norm 0.2235 (-0.47z)| lr 1.18e-06 | 2534.63 ms | 53.3% bf16 MFU | 206978 tok/s +step 19030/19560 | loss 3.270249 (-0.77z)| norm 0.2224 (-0.56z)| lr 1.17e-06 | 2535.09 ms | 53.3% bf16 MFU | 206970 tok/s +step 19031/19560 | loss 3.277503 (-0.57z)| norm 0.2237 (-0.45z)| lr 1.17e-06 | 2533.24 ms | 53.3% bf16 MFU | 206970 tok/s +step 19032/19560 | loss 3.293459 (-0.17z)| norm 0.2125 (-1.33z)| lr 1.16e-06 | 2532.69 ms | 53.3% bf16 MFU | 206972 tok/s +step 19033/19560 | loss 3.268603 (-0.81z)| norm 0.2283 (-0.07z)| lr 1.16e-06 | 2533.82 ms | 53.3% bf16 MFU | 206969 tok/s +step 19034/19560 | loss 3.228415 (-1.81z)| norm 0.2230 (-0.49z)| lr 1.16e-06 | 2534.48 ms | 53.3% bf16 MFU | 206964 tok/s +step 19035/19560 | loss 3.277996 (-0.54z)| norm 0.2250 (-0.32z)| lr 1.15e-06 | 2532.47 ms | 53.3% bf16 MFU | 206967 tok/s +step 19036/19560 | loss 3.223564 (-1.90z)| norm 0.2213 (-0.62z)| lr 1.15e-06 | 2532.85 ms | 53.3% bf16 MFU | 206968 tok/s +step 19037/19560 | loss 3.288527 (-0.26z)| norm 0.2368 (+0.63z)| lr 1.14e-06 | 2535.11 ms | 53.3% bf16 MFU | 206960 tok/s +step 19038/19560 | loss 3.303543 (+0.12z)| norm 0.2266 (-0.20z)| lr 1.14e-06 | 2533.13 ms | 53.3% bf16 MFU | 206961 tok/s +step 19039/19560 | loss 3.306211 (+0.20z)| norm 0.2315 (+0.20z)| lr 1.13e-06 | 2534.47 ms | 53.3% bf16 MFU | 206956 tok/s +step 19040/19560 | loss 3.310787 (+0.32z)| norm 0.2213 (-0.63z)| lr 1.13e-06 | 2534.09 ms | 53.3% bf16 MFU | 206953 tok/s +step 19041/19560 | loss 3.281627 (-0.42z)| norm 0.2307 (+0.13z)| lr 1.12e-06 | 2534.99 ms | 53.3% bf16 MFU | 206946 tok/s +step 19042/19560 | loss 3.338893 (+1.03z)| norm 0.2214 (-0.61z)| lr 1.12e-06 | 2533.58 ms | 53.3% bf16 MFU | 206946 tok/s +step 19043/19560 | loss 3.300568 (+0.05z)| norm 0.2225 (-0.51z)| lr 1.12e-06 | 2534.51 ms | 53.3% bf16 MFU | 206941 tok/s +step 19044/19560 | loss 3.329769 (+0.79z)| norm 0.2390 (+0.81z)| lr 1.11e-06 | 2533.79 ms | 53.3% bf16 MFU | 206940 tok/s +step 19045/19560 | loss 3.277542 (-0.55z)| norm 0.2308 (+0.14z)| lr 1.11e-06 | 2532.16 ms | 53.3% bf16 MFU | 206946 tok/s +step 19046/19560 | loss 3.280510 (-0.46z)| norm 0.2319 (+0.23z)| lr 1.10e-06 | 2533.20 ms | 53.3% bf16 MFU | 206947 tok/s +step 19047/19560 | loss 3.290197 (-0.20z)| norm 0.2209 (-0.66z)| lr 1.10e-06 | 2532.01 ms | 53.3% bf16 MFU | 206953 tok/s +step 19048/19560 | loss 3.232381 (-1.70z)| norm 0.2266 (-0.21z)| lr 1.09e-06 | 2532.52 ms | 53.3% bf16 MFU | 206956 tok/s +step 19049/19560 | loss 3.267017 (-0.78z)| norm 0.2138 (-1.23z)| lr 1.09e-06 | 2532.22 ms | 53.3% bf16 MFU | 206961 tok/s +step 19050/19560 | loss 3.311736 (+0.38z)| norm 0.2203 (-0.70z)| lr 1.09e-06 | 2532.70 ms | 53.3% bf16 MFU | 206963 tok/s +step 19051/19560 | loss 3.293336 (-0.10z)| norm 0.2188 (-0.83z)| lr 1.08e-06 | 2533.27 ms | 53.3% bf16 MFU | 206963 tok/s +step 19052/19560 | loss 3.377335 (+2.10z)| norm 0.2691 (+3.30z)| lr 1.08e-06 | 2531.90 ms | 53.3% bf16 MFU | 206968 tok/s +step 19053/19560 | loss 3.250356 (-1.27z)| norm 0.2235 (-0.45z)| lr 1.07e-06 | 2533.71 ms | 53.3% bf16 MFU | 206966 tok/s +step 19054/19560 | loss 3.258777 (-1.03z)| norm 0.2216 (-0.60z)| lr 1.07e-06 | 2531.63 ms | 53.3% bf16 MFU | 206973 tok/s +step 19055/19560 | loss 3.234421 (-1.64z)| norm 0.2524 (+1.89z)| lr 1.06e-06 | 2532.88 ms | 53.3% bf16 MFU | 206974 tok/s +step 19056/19560 | loss 3.288175 (-0.23z)| norm 0.2265 (-0.22z)| lr 1.06e-06 | 2536.26 ms | 53.2% bf16 MFU | 206961 tok/s +step 19057/19560 | loss 3.244188 (-1.36z)| norm 0.2210 (-0.67z)| lr 1.06e-06 | 2534.18 ms | 53.3% bf16 MFU | 206957 tok/s +step 19058/19560 | loss 3.402328 (+2.71z)| norm 0.2326 (+0.31z)| lr 1.05e-06 | 2531.66 ms | 53.3% bf16 MFU | 206964 tok/s +step 19059/19560 | loss 3.302917 (+0.15z)| norm 0.2274 (-0.11z)| lr 1.05e-06 | 2532.19 ms | 53.3% bf16 MFU | 206968 tok/s +step 19060/19560 | loss 3.271880 (-0.64z)| norm 0.2219 (-0.59z)| lr 1.04e-06 | 2532.73 ms | 53.3% bf16 MFU | 206970 tok/s +step 19061/19560 | loss 3.332136 (+0.89z)| norm 0.2207 (-0.69z)| lr 1.04e-06 | 2533.99 ms | 53.3% bf16 MFU | 206967 tok/s +step 19062/19560 | loss 3.415514 (+2.92z)| norm 0.2361 (+0.64z)| lr 1.04e-06 | 2533.20 ms | 53.3% bf16 MFU | 206967 tok/s +step 19063/19560 | loss 3.249185 (-1.21z)| norm 0.2217 (-0.62z)| lr 1.03e-06 | 2532.45 ms | 53.3% bf16 MFU | 206970 tok/s +step 19064/19560 | loss 3.293959 (-0.11z)| norm 0.2124 (-1.40z)| lr 1.03e-06 | 2531.29 ms | 53.3% bf16 MFU | 206977 tok/s +step 19065/19560 | loss 3.356023 (+1.46z)| norm 0.2542 (+2.21z)| lr 1.02e-06 | 2534.44 ms | 53.3% bf16 MFU | 206972 tok/s +step 19066/19560 | loss 3.281597 (-0.42z)| norm 0.2464 (+1.62z)| lr 1.02e-06 | 2533.50 ms | 53.3% bf16 MFU | 206970 tok/s +step 19067/19560 | loss 3.302368 (+0.10z)| norm 0.2204 (-0.72z)| lr 1.02e-06 | 2531.33 ms | 53.3% bf16 MFU | 206978 tok/s +step 19068/19560 | loss 3.312288 (+0.36z)| norm 0.2224 (-0.54z)| lr 1.01e-06 | 2530.81 ms | 53.3% bf16 MFU | 206987 tok/s +step 19069/19560 | loss 3.234976 (-1.57z)| norm 0.2215 (-0.63z)| lr 1.01e-06 | 2532.13 ms | 53.3% bf16 MFU | 206990 tok/s +step 19070/19560 | loss 3.532833 (+5.21z)| norm 0.2697 (+3.51z)| lr 1.00e-06 | 2531.91 ms | 53.3% bf16 MFU | 206994 tok/s +step 19071/19560 | loss 3.278634 (-0.44z)| norm 0.2180 (-0.91z)| lr 9.99e-07 | 2534.50 ms | 53.3% bf16 MFU | 206988 tok/s +step 19072/19560 | loss 3.290133 (-0.18z)| norm 0.2165 (-1.03z)| lr 9.95e-07 | 2532.38 ms | 53.3% bf16 MFU | 206990 tok/s +step 19073/19560 | loss 3.348467 (+1.14z)| norm 0.2216 (-0.59z)| lr 9.91e-07 | 2534.33 ms | 53.3% bf16 MFU | 206984 tok/s +step 19074/19560 | loss 3.212462 (-1.92z)| norm 0.2223 (-0.53z)| lr 9.87e-07 | 2531.51 ms | 53.3% bf16 MFU | 206990 tok/s +step 19075/19560 | loss 3.274376 (-0.52z)| norm 0.2272 (-0.09z)| lr 9.83e-07 | 2535.16 ms | 53.3% bf16 MFU | 206981 tok/s +step 19076/19560 | loss 3.276706 (-0.47z)| norm 0.2191 (-0.80z)| lr 9.78e-07 | 2533.28 ms | 53.3% bf16 MFU | 206980 tok/s +step 19077/19560 | loss 3.262249 (-0.78z)| norm 0.2369 (+0.74z)| lr 9.74e-07 | 2533.69 ms | 53.3% bf16 MFU | 206977 tok/s +step 19078/19560 | loss 3.298936 (+0.03z)| norm 0.2296 (+0.10z)| lr 9.70e-07 | 2535.52 ms | 53.3% bf16 MFU | 206967 tok/s +step 19079/19560 | loss 3.328690 (+0.72z)| norm 0.2207 (-0.66z)| lr 9.66e-07 | 2532.49 ms | 53.3% bf16 MFU | 206970 tok/s +step 19080/19560 | loss 3.323010 (+0.59z)| norm 0.2213 (-0.61z)| lr 9.62e-07 | 2534.70 ms | 53.3% bf16 MFU | 206964 tok/s +step 19081/19560 | loss 3.284362 (-0.29z)| norm 0.2312 (+0.25z)| lr 9.58e-07 | 2533.34 ms | 53.3% bf16 MFU | 206964 tok/s +step 19082/19560 | loss 3.309286 (+0.28z)| norm 0.2266 (-0.13z)| lr 9.54e-07 | 2532.28 ms | 53.3% bf16 MFU | 206967 tok/s +step 19083/19560 | loss 3.361285 (+1.45z)| norm 0.2234 (-0.42z)| lr 9.50e-07 | 2534.19 ms | 53.3% bf16 MFU | 206963 tok/s +step 19084/19560 | loss 3.257810 (-0.90z)| norm 0.2356 (+0.71z)| lr 9.46e-07 | 2534.61 ms | 53.3% bf16 MFU | 206958 tok/s +step 19085/19560 | loss 3.245583 (-1.17z)| norm 0.2296 (+0.13z)| lr 9.43e-07 | 2531.90 ms | 53.3% bf16 MFU | 206963 tok/s +step 19086/19560 | loss 3.306967 (+0.21z)| norm 0.2261 (-0.20z)| lr 9.39e-07 | 2530.67 ms | 53.4% bf16 MFU | 206974 tok/s +step 19087/19560 | loss 3.256356 (-0.92z)| norm 0.2264 (-0.16z)| lr 9.35e-07 | 2533.46 ms | 53.3% bf16 MFU | 206973 tok/s +step 19088/19560 | loss 3.382998 (+1.90z)| norm 0.2279 (-0.03z)| lr 9.31e-07 | 2533.84 ms | 53.3% bf16 MFU | 206970 tok/s +step 19089/19560 | loss 3.273753 (-0.53z)| norm 0.2247 (-0.34z)| lr 9.27e-07 | 2534.69 ms | 53.3% bf16 MFU | 206963 tok/s +step 19090/19560 | loss 3.317321 (+0.43z)| norm 0.2175 (-1.01z)| lr 9.23e-07 | 2533.78 ms | 53.3% bf16 MFU | 206961 tok/s +step 19091/19560 | loss 3.277684 (-0.45z)| norm 0.2278 (-0.03z)| lr 9.19e-07 | 2534.54 ms | 53.3% bf16 MFU | 206956 tok/s +step 19092/19560 | loss 3.239309 (-1.29z)| norm 0.2227 (-0.52z)| lr 9.15e-07 | 2532.43 ms | 53.3% bf16 MFU | 206960 tok/s +step 19093/19560 | loss 3.557384 (+5.10z)| norm 0.2754 (+4.11z)| lr 9.11e-07 | 2531.82 ms | 53.3% bf16 MFU | 206966 tok/s +step 19094/19560 | loss 3.325924 (+0.52z)| norm 0.2220 (-0.58z)| lr 9.07e-07 | 2533.21 ms | 53.3% bf16 MFU | 206966 tok/s +step 19095/19560 | loss 3.268789 (-0.60z)| norm 0.2893 (+4.81z)| lr 9.03e-07 | 2533.33 ms | 53.3% bf16 MFU | 206965 tok/s +step 19096/19560 | loss 3.250129 (-0.96z)| norm 0.2273 (-0.13z)| lr 8.99e-07 | 2534.17 ms | 53.3% bf16 MFU | 206961 tok/s +step 19097/19560 | loss 3.342517 (+0.86z)| norm 0.2411 (+0.96z)| lr 8.96e-07 | 2532.09 ms | 53.3% bf16 MFU | 206966 tok/s +step 19098/19560 | loss 3.290518 (-0.18z)| norm 0.2170 (-0.95z)| lr 8.92e-07 | 2534.45 ms | 53.3% bf16 MFU | 206961 tok/s +step 19099/19560 | loss 3.283503 (-0.31z)| norm 0.2260 (-0.24z)| lr 8.88e-07 | 2534.56 ms | 53.3% bf16 MFU | 206956 tok/s +step 19100/19560 | loss 3.311687 (+0.24z)| norm 0.2183 (-0.83z)| lr 8.84e-07 | 2535.63 ms | 53.2% bf16 MFU | 206946 tok/s +step 19101/19560 | loss 3.277294 (-0.44z)| norm 0.2215 (-0.59z)| lr 8.80e-07 | 2533.11 ms | 53.3% bf16 MFU | 206948 tok/s +step 19102/19560 | loss 3.267972 (-0.62z)| norm 0.2193 (-0.76z)| lr 8.76e-07 | 2534.63 ms | 53.3% bf16 MFU | 206943 tok/s +step 19103/19560 | loss 3.278144 (-0.41z)| norm 0.2146 (-1.12z)| lr 8.73e-07 | 2533.37 ms | 53.3% bf16 MFU | 206943 tok/s +step 19104/19560 | loss 3.205004 (-1.82z)| norm 0.2584 (+2.28z)| lr 8.69e-07 | 2534.03 ms | 53.3% bf16 MFU | 206941 tok/s +step 19105/19560 | loss 3.288085 (-0.19z)| norm 0.2202 (-0.67z)| lr 8.65e-07 | 2534.51 ms | 53.3% bf16 MFU | 206937 tok/s +step 19106/19560 | loss 3.241996 (-1.08z)| norm 0.2294 (+0.07z)| lr 8.61e-07 | 2533.33 ms | 53.3% bf16 MFU | 206938 tok/s +step 19107/19560 | loss 3.348410 (+0.97z)| norm 0.2359 (+0.58z)| lr 8.57e-07 | 2535.06 ms | 53.3% bf16 MFU | 206932 tok/s +step 19108/19560 | loss 3.323809 (+0.50z)| norm 0.2390 (+0.83z)| lr 8.54e-07 | 2533.72 ms | 53.3% bf16 MFU | 206932 tok/s +step 19109/19560 | loss 3.270760 (-0.53z)| norm 0.2220 (-0.54z)| lr 8.50e-07 | 2533.25 ms | 53.3% bf16 MFU | 206933 tok/s +step 19110/19560 | loss 3.290599 (-0.15z)| norm 0.2212 (-0.59z)| lr 8.46e-07 | 2535.42 ms | 53.3% bf16 MFU | 206926 tok/s +step 19111/19560 | loss 3.303638 (+0.10z)| norm 0.2268 (-0.15z)| lr 8.42e-07 | 2532.66 ms | 53.3% bf16 MFU | 206930 tok/s +step 19112/19560 | loss 3.263326 (-0.68z)| norm 0.2136 (-1.19z)| lr 8.39e-07 | 2532.23 ms | 53.3% bf16 MFU | 206936 tok/s +step 19113/19560 | loss 3.234387 (-1.23z)| norm 0.2947 (+4.81z)| lr 8.35e-07 | 2535.76 ms | 53.2% bf16 MFU | 206927 tok/s +step 19114/19560 | loss 3.341285 (+0.84z)| norm 0.2331 (+0.30z)| lr 8.31e-07 | 2533.40 ms | 53.3% bf16 MFU | 206928 tok/s +step 19115/19560 | loss 3.263750 (-0.66z)| norm 0.2210 (-0.58z)| lr 8.28e-07 | 2532.02 ms | 53.3% bf16 MFU | 206935 tok/s +step 19116/19560 | loss 3.267583 (-0.59z)| norm 0.2175 (-0.82z)| lr 8.24e-07 | 2535.26 ms | 53.3% bf16 MFU | 206928 tok/s +step 19117/19560 | loss 3.284686 (-0.26z)| norm 0.2171 (-0.85z)| lr 8.20e-07 | 2534.30 ms | 53.3% bf16 MFU | 206925 tok/s +step 19118/19560 | loss 3.234059 (-1.22z)| norm 0.2155 (-0.95z)| lr 8.16e-07 | 2533.68 ms | 53.3% bf16 MFU | 206926 tok/s +step 19119/19560 | loss 3.295052 (-0.03z)| norm 0.2303 (+0.13z)| lr 8.13e-07 | 2535.16 ms | 53.3% bf16 MFU | 206920 tok/s +step 19120/19560 | loss 3.360480 (+1.22z)| norm 0.2311 (+0.19z)| lr 8.09e-07 | 2533.96 ms | 53.3% bf16 MFU | 206919 tok/s +step 19121/19560 | loss 3.396879 (+1.88z)| norm 0.2904 (+4.16z)| lr 8.05e-07 | 2533.20 ms | 53.3% bf16 MFU | 206921 tok/s +step 19122/19560 | loss 3.293294 (-0.08z)| norm 0.2180 (-0.74z)| lr 8.02e-07 | 2533.82 ms | 53.3% bf16 MFU | 206921 tok/s +step 19123/19560 | loss 3.272535 (-0.47z)| norm 0.2254 (-0.24z)| lr 7.98e-07 | 2533.60 ms | 53.3% bf16 MFU | 206922 tok/s +step 19124/19560 | loss 3.215288 (-1.53z)| norm 0.2206 (-0.57z)| lr 7.94e-07 | 2534.53 ms | 53.3% bf16 MFU | 206918 tok/s +step 19125/19560 | loss 3.330103 (+0.63z)| norm 0.2258 (-0.22z)| lr 7.91e-07 | 2533.17 ms | 53.3% bf16 MFU | 206921 tok/s +step 19126/19560 | loss 3.284213 (-0.24z)| norm 0.2214 (-0.52z)| lr 7.87e-07 | 2531.60 ms | 53.3% bf16 MFU | 206930 tok/s +step 19127/19560 | loss 3.322139 (+0.48z)| norm 0.2260 (-0.20z)| lr 7.84e-07 | 2534.30 ms | 53.3% bf16 MFU | 206927 tok/s +step 19128/19560 | loss 3.316293 (+0.36z)| norm 0.2351 (+0.41z)| lr 7.80e-07 | 2534.65 ms | 53.3% bf16 MFU | 206923 tok/s +step 19129/19560 | loss 3.272024 (-0.48z)| norm 0.2244 (-0.32z)| lr 7.76e-07 | 2532.21 ms | 53.3% bf16 MFU | 206929 tok/s +step 19130/19560 | loss 3.260958 (-0.68z)| norm 0.2260 (-0.21z)| lr 7.73e-07 | 2533.71 ms | 53.3% bf16 MFU | 206929 tok/s +step 19131/19560 | loss 3.266271 (-0.57z)| norm 0.2186 (-0.71z)| lr 7.69e-07 | 2533.53 ms | 53.3% bf16 MFU | 206930 tok/s +step 19132/19560 | loss 3.262347 (-0.65z)| norm 0.2319 (+0.18z)| lr 7.66e-07 | 2534.09 ms | 53.3% bf16 MFU | 206928 tok/s +step 19133/19560 | loss 3.304084 (+0.15z)| norm 0.2129 (-1.09z)| lr 7.62e-07 | 2534.13 ms | 53.3% bf16 MFU | 206926 tok/s +step 19134/19560 | loss 3.269419 (-0.51z)| norm 0.2193 (-0.66z)| lr 7.59e-07 | 2532.62 ms | 53.3% bf16 MFU | 206930 tok/s +step 19135/19560 | loss 3.249946 (-0.87z)| norm 0.2218 (-0.49z)| lr 7.55e-07 | 2534.29 ms | 53.3% bf16 MFU | 206928 tok/s +step 19136/19560 | loss 3.246252 (-0.93z)| norm 0.2324 (+0.23z)| lr 7.51e-07 | 2531.81 ms | 53.3% bf16 MFU | 206935 tok/s +step 19137/19560 | loss 3.295698 (+0.02z)| norm 0.2265 (-0.18z)| lr 7.48e-07 | 2532.98 ms | 53.3% bf16 MFU | 206938 tok/s +step 19138/19560 | loss 3.326900 (+0.60z)| norm 0.2372 (+0.54z)| lr 7.44e-07 | 2532.65 ms | 53.3% bf16 MFU | 206942 tok/s +step 19139/19560 | loss 3.299537 (+0.08z)| norm 0.2201 (-0.61z)| lr 7.41e-07 | 2533.85 ms | 53.3% bf16 MFU | 206940 tok/s +step 19140/19560 | loss 3.346186 (+0.97z)| norm 0.2275 (-0.09z)| lr 7.37e-07 | 2532.65 ms | 53.3% bf16 MFU | 206944 tok/s +step 19141/19560 | loss 3.305588 (+0.19z)| norm 0.2176 (-0.75z)| lr 7.34e-07 | 2533.66 ms | 53.3% bf16 MFU | 206943 tok/s +step 19142/19560 | loss 3.276412 (-0.35z)| norm 0.2278 (-0.07z)| lr 7.30e-07 | 2532.56 ms | 53.3% bf16 MFU | 206947 tok/s +step 19143/19560 | loss 3.275535 (-0.38z)| norm 0.2243 (-0.30z)| lr 7.27e-07 | 2532.88 ms | 53.3% bf16 MFU | 206949 tok/s +step 19144/19560 | loss 3.298418 (+0.06z)| norm 0.2291 (+0.03z)| lr 7.23e-07 | 2532.89 ms | 53.3% bf16 MFU | 206951 tok/s +step 19145/19560 | loss 3.449454 (+2.84z)| norm 0.2399 (+0.75z)| lr 7.20e-07 | 2531.34 ms | 53.3% bf16 MFU | 206960 tok/s +step 19146/19560 | loss 3.372061 (+1.39z)| norm 0.3134 (+5.09z)| lr 7.17e-07 | 2533.02 ms | 53.3% bf16 MFU | 206961 tok/s +step 19147/19560 | loss 3.311229 (+0.27z)| norm 0.2382 (+0.52z)| lr 7.13e-07 | 2534.25 ms | 53.3% bf16 MFU | 206957 tok/s +step 19148/19560 | loss 3.295067 (-0.03z)| norm 0.2298 (+0.01z)| lr 7.10e-07 | 2534.51 ms | 53.3% bf16 MFU | 206952 tok/s +step 19149/19560 | loss 3.303914 (+0.14z)| norm 0.2203 (-0.56z)| lr 7.06e-07 | 2533.63 ms | 53.3% bf16 MFU | 206951 tok/s +step 19150/19560 | loss 3.313274 (+0.31z)| norm 0.2275 (-0.12z)| lr 7.03e-07 | 2532.91 ms | 53.3% bf16 MFU | 206953 tok/s +step 19151/19560 | loss 3.331114 (+0.67z)| norm 0.2200 (-0.57z)| lr 6.99e-07 | 2533.05 ms | 53.3% bf16 MFU | 206954 tok/s +step 19152/19560 | loss 3.383763 (+1.65z)| norm 0.2341 (+0.31z)| lr 6.96e-07 | 2534.48 ms | 53.3% bf16 MFU | 206950 tok/s +step 19153/19560 | loss 3.284324 (-0.25z)| norm 0.2302 (+0.07z)| lr 6.93e-07 | 2534.10 ms | 53.3% bf16 MFU | 206947 tok/s +step 19154/19560 | loss 3.293197 (-0.07z)| norm 0.2260 (-0.19z)| lr 6.89e-07 | 2533.18 ms | 53.3% bf16 MFU | 206948 tok/s +step 19155/19560 | loss 3.278776 (-0.35z)| norm 0.2196 (-0.58z)| lr 6.86e-07 | 2533.60 ms | 53.3% bf16 MFU | 206947 tok/s +step 19156/19560 | loss 3.287369 (-0.18z)| norm 0.2236 (-0.34z)| lr 6.82e-07 | 2532.50 ms | 53.3% bf16 MFU | 206951 tok/s +step 19157/19560 | loss 3.292023 (-0.10z)| norm 0.2241 (-0.31z)| lr 6.79e-07 | 2535.97 ms | 53.2% bf16 MFU | 206940 tok/s +step 19158/19560 | loss 3.262785 (-0.66z)| norm 0.2207 (-0.52z)| lr 6.76e-07 | 2531.71 ms | 53.3% bf16 MFU | 206948 tok/s +step 19159/19560 | loss 3.264950 (-0.62z)| norm 0.2258 (-0.21z)| lr 6.72e-07 | 2534.74 ms | 53.3% bf16 MFU | 206943 tok/s +step 19160/19560 | loss 3.226705 (-1.33z)| norm 0.2252 (-0.25z)| lr 6.69e-07 | 2532.50 ms | 53.3% bf16 MFU | 206947 tok/s +step 19161/19560 | loss 3.281092 (-0.30z)| norm 0.2211 (-0.50z)| lr 6.66e-07 | 2533.61 ms | 53.3% bf16 MFU | 206946 tok/s +step 19162/19560 | loss 3.286937 (-0.20z)| norm 0.2221 (-0.44z)| lr 6.62e-07 | 2532.02 ms | 53.3% bf16 MFU | 206952 tok/s +step 19163/19560 | loss 3.337577 (+0.76z)| norm 0.2251 (-0.25z)| lr 6.59e-07 | 2534.05 ms | 53.3% bf16 MFU | 206949 tok/s +step 19164/19560 | loss 3.350733 (+1.00z)| norm 0.2235 (-0.36z)| lr 6.56e-07 | 2533.20 ms | 53.3% bf16 MFU | 206950 tok/s +step 19165/19560 | loss 3.263834 (-0.66z)| norm 0.2660 (+2.23z)| lr 6.52e-07 | 2532.48 ms | 53.3% bf16 MFU | 206954 tok/s +step 19166/19560 | loss 3.295095 (-0.06z)| norm 0.2231 (-0.38z)| lr 6.49e-07 | 2535.33 ms | 53.3% bf16 MFU | 206946 tok/s +step 19167/19560 | loss 3.305134 (+0.13z)| norm 0.2259 (-0.21z)| lr 6.46e-07 | 2533.98 ms | 53.3% bf16 MFU | 206944 tok/s +step 19168/19560 | loss 3.297593 (-0.01z)| norm 0.2446 (+0.91z)| lr 6.43e-07 | 2533.22 ms | 53.3% bf16 MFU | 206945 tok/s +step 19169/19560 | loss 3.268742 (-0.56z)| norm 0.2236 (-0.35z)| lr 6.39e-07 | 2535.05 ms | 53.3% bf16 MFU | 206938 tok/s +step 19170/19560 | loss 3.251031 (-0.89z)| norm 0.2558 (+1.57z)| lr 6.36e-07 | 2534.21 ms | 53.3% bf16 MFU | 206935 tok/s +step 19171/19560 | loss 3.266500 (-0.59z)| norm 0.2154 (-0.86z)| lr 6.33e-07 | 2535.33 ms | 53.3% bf16 MFU | 206928 tok/s +step 19172/19560 | loss 3.332253 (+0.67z)| norm 0.2355 (+0.35z)| lr 6.30e-07 | 2534.45 ms | 53.3% bf16 MFU | 206925 tok/s +step 19173/19560 | loss 3.222178 (-1.41z)| norm 0.2309 (+0.07z)| lr 6.26e-07 | 2533.15 ms | 53.3% bf16 MFU | 206927 tok/s +step 19174/19560 | loss 3.254790 (-0.79z)| norm 0.2205 (-0.54z)| lr 6.23e-07 | 2535.20 ms | 53.3% bf16 MFU | 206921 tok/s +step 19175/19560 | loss 3.251014 (-0.85z)| norm 0.2196 (-0.60z)| lr 6.20e-07 | 2532.71 ms | 53.3% bf16 MFU | 206925 tok/s +step 19176/19560 | loss 3.266576 (-0.57z)| norm 0.2198 (-0.58z)| lr 6.17e-07 | 2531.79 ms | 53.3% bf16 MFU | 206933 tok/s +step 19177/19560 | loss 3.279477 (-0.33z)| norm 0.2269 (-0.16z)| lr 6.14e-07 | 2531.95 ms | 53.3% bf16 MFU | 206940 tok/s +step 19178/19560 | loss 3.307851 (+0.21z)| norm 0.2330 (+0.20z)| lr 6.10e-07 | 2532.81 ms | 53.3% bf16 MFU | 206943 tok/s +step 19179/19560 | loss 3.261088 (-0.67z)| norm 0.2301 (+0.02z)| lr 6.07e-07 | 2534.56 ms | 53.3% bf16 MFU | 206939 tok/s +step 19180/19560 | loss 3.275539 (-0.38z)| norm 0.2178 (-0.71z)| lr 6.04e-07 | 2533.96 ms | 53.3% bf16 MFU | 206937 tok/s +step 19181/19560 | loss 3.274335 (-0.41z)| norm 0.2270 (-0.15z)| lr 6.01e-07 | 2531.89 ms | 53.3% bf16 MFU | 206944 tok/s +step 19182/19560 | loss 3.252082 (-0.83z)| norm 0.2147 (-0.90z)| lr 5.98e-07 | 2533.65 ms | 53.3% bf16 MFU | 206943 tok/s +step 19183/19560 | loss 3.474328 (+3.25z)| norm 0.3194 (+4.97z)| lr 5.94e-07 | 2532.53 ms | 53.3% bf16 MFU | 206947 tok/s +step 19184/19560 | loss 3.295810 (-0.03z)| norm 0.2247 (-0.29z)| lr 5.91e-07 | 2534.30 ms | 53.3% bf16 MFU | 206944 tok/s +step 19185/19560 | loss 3.266247 (-0.58z)| norm 0.2171 (-0.71z)| lr 5.88e-07 | 2532.56 ms | 53.3% bf16 MFU | 206947 tok/s +step 19186/19560 | loss 3.295884 (-0.02z)| norm 0.2166 (-0.73z)| lr 5.85e-07 | 2533.96 ms | 53.3% bf16 MFU | 206945 tok/s +step 19187/19560 | loss 3.278378 (-0.34z)| norm 0.2158 (-0.77z)| lr 5.82e-07 | 2532.69 ms | 53.3% bf16 MFU | 206948 tok/s +step 19188/19560 | loss 3.259529 (-0.70z)| norm 0.2266 (-0.17z)| lr 5.79e-07 | 2533.16 ms | 53.3% bf16 MFU | 206949 tok/s +step 19189/19560 | loss 3.304568 (+0.15z)| norm 0.2253 (-0.24z)| lr 5.76e-07 | 2531.33 ms | 53.3% bf16 MFU | 206958 tok/s +step 19190/19560 | loss 3.325899 (+0.58z)| norm 0.2188 (-0.60z)| lr 5.73e-07 | 2532.17 ms | 53.3% bf16 MFU | 206963 tok/s +step 19191/19560 | loss 3.278302 (-0.34z)| norm 0.2324 (+0.15z)| lr 5.70e-07 | 2533.14 ms | 53.3% bf16 MFU | 206963 tok/s +step 19192/19560 | loss 3.306381 (+0.20z)| norm 0.2268 (-0.17z)| lr 5.67e-07 | 2532.69 ms | 53.3% bf16 MFU | 206965 tok/s +step 19193/19560 | loss 3.300315 (+0.09z)| norm 0.2263 (-0.18z)| lr 5.63e-07 | 2531.35 ms | 53.3% bf16 MFU | 206973 tok/s +step 19194/19560 | loss 3.298062 (+0.04z)| norm 0.2213 (-0.45z)| lr 5.60e-07 | 2533.82 ms | 53.3% bf16 MFU | 206970 tok/s +step 19195/19560 | loss 3.276474 (-0.37z)| norm 0.2241 (-0.30z)| lr 5.57e-07 | 2532.70 ms | 53.3% bf16 MFU | 206972 tok/s +step 19196/19560 | loss 3.341440 (+0.88z)| norm 0.2256 (-0.22z)| lr 5.54e-07 | 2533.24 ms | 53.3% bf16 MFU | 206971 tok/s +step 19197/19560 | loss 3.283793 (-0.24z)| norm 0.2151 (-0.80z)| lr 5.51e-07 | 2534.31 ms | 53.3% bf16 MFU | 206967 tok/s +step 19198/19560 | loss 3.319246 (+0.52z)| norm 0.2226 (-0.37z)| lr 5.48e-07 | 2532.34 ms | 53.3% bf16 MFU | 206970 tok/s +step 19199/19560 | loss 3.235686 (-1.23z)| norm 0.2298 (+0.04z)| lr 5.45e-07 | 2534.35 ms | 53.3% bf16 MFU | 206965 tok/s +step 19200/19560 | loss 3.261873 (-0.67z)| norm 0.2192 (-0.57z)| lr 5.42e-07 | 2533.17 ms | 53.3% bf16 MFU | 206966 tok/s +step 19201/19560 | loss 3.282738 (-0.23z)| norm 0.2121 (-0.97z)| lr 5.39e-07 | 2535.36 ms | 53.3% bf16 MFU | 206957 tok/s +step 19202/19560 | loss 3.271874 (-0.47z)| norm 0.2528 (+1.33z)| lr 5.36e-07 | 2532.28 ms | 53.3% bf16 MFU | 206961 tok/s +step 19203/19560 | loss 3.277801 (-0.34z)| norm 0.2452 (+0.90z)| lr 5.33e-07 | 2533.98 ms | 53.3% bf16 MFU | 206958 tok/s +step 19204/19560 | loss 3.292119 (-0.04z)| norm 0.2257 (-0.21z)| lr 5.30e-07 | 2532.27 ms | 53.3% bf16 MFU | 206962 tok/s +step 19205/19560 | loss 3.233301 (-1.28z)| norm 0.2240 (-0.30z)| lr 5.27e-07 | 2533.59 ms | 53.3% bf16 MFU | 206961 tok/s +step 19206/19560 | loss 3.281301 (-0.26z)| norm 0.2257 (-0.20z)| lr 5.24e-07 | 2534.56 ms | 53.3% bf16 MFU | 206956 tok/s +step 19207/19560 | loss 3.293200 (-0.00z)| norm 0.2336 (+0.24z)| lr 5.21e-07 | 2533.08 ms | 53.3% bf16 MFU | 206957 tok/s +step 19208/19560 | loss 3.263602 (-0.62z)| norm 0.2213 (-0.46z)| lr 5.18e-07 | 2532.56 ms | 53.3% bf16 MFU | 206960 tok/s +step 19209/19560 | loss 3.284106 (-0.19z)| norm 0.2253 (-0.23z)| lr 5.16e-07 | 2534.48 ms | 53.3% bf16 MFU | 206955 tok/s +step 19210/19560 | loss 3.343016 (+1.05z)| norm 0.2262 (-0.18z)| lr 5.13e-07 | 2532.75 ms | 53.3% bf16 MFU | 206957 tok/s +step 19211/19560 | loss 3.275627 (-0.36z)| norm 0.2138 (-0.88z)| lr 5.10e-07 | 2534.52 ms | 53.3% bf16 MFU | 206952 tok/s +step 19212/19560 | loss 3.280936 (-0.25z)| norm 0.2222 (-0.40z)| lr 5.07e-07 | 2533.47 ms | 53.3% bf16 MFU | 206952 tok/s +step 19213/19560 | loss 3.233669 (-1.25z)| norm 0.2180 (-0.63z)| lr 5.04e-07 | 2533.56 ms | 53.3% bf16 MFU | 206951 tok/s +step 19214/19560 | loss 3.196682 (-1.99z)| norm 0.2260 (-0.18z)| lr 5.01e-07 | 2531.99 ms | 53.3% bf16 MFU | 206957 tok/s +step 19215/19560 | loss 3.266278 (-0.54z)| norm 0.2191 (-0.56z)| lr 4.98e-07 | 2532.90 ms | 53.3% bf16 MFU | 206959 tok/s +step 19216/19560 | loss 3.296428 (+0.11z)| norm 0.2422 (+0.73z)| lr 4.95e-07 | 2535.28 ms | 53.3% bf16 MFU | 206951 tok/s +step 19217/19560 | loss 3.236059 (-1.16z)| norm 0.2137 (-0.86z)| lr 4.92e-07 | 2534.33 ms | 53.3% bf16 MFU | 206947 tok/s +step 19218/19560 | loss 3.303795 (+0.28z)| norm 0.2380 (+0.49z)| lr 4.90e-07 | 2534.51 ms | 53.3% bf16 MFU | 206942 tok/s +step 19219/19560 | loss 3.286239 (-0.10z)| norm 0.2223 (-0.39z)| lr 4.87e-07 | 2535.23 ms | 53.3% bf16 MFU | 206935 tok/s +step 19220/19560 | loss 3.276813 (-0.31z)| norm 0.2288 (-0.02z)| lr 4.84e-07 | 2533.11 ms | 53.3% bf16 MFU | 206937 tok/s +step 19221/19560 | loss 3.321235 (+0.78z)| norm 0.2292 (+0.02z)| lr 4.81e-07 | 2533.96 ms | 53.3% bf16 MFU | 206936 tok/s +step 19222/19560 | loss 3.290759 (+0.04z)| norm 0.2228 (-0.35z)| lr 4.78e-07 | 2531.46 ms | 53.3% bf16 MFU | 206944 tok/s +step 19223/19560 | loss 3.202194 (-2.09z)| norm 0.2231 (-0.32z)| lr 4.75e-07 | 2532.99 ms | 53.3% bf16 MFU | 206946 tok/s +step 19224/19560 | loss 3.334911 (+1.11z)| norm 0.2302 (+0.11z)| lr 4.73e-07 | 2534.79 ms | 53.3% bf16 MFU | 206941 tok/s +step 19225/19560 | loss 3.352309 (+1.52z)| norm 0.2296 (+0.08z)| lr 4.70e-07 | 2535.05 ms | 53.3% bf16 MFU | 206935 tok/s +step 19226/19560 | loss 3.248647 (-0.97z)| norm 0.2220 (-0.39z)| lr 4.67e-07 | 2533.36 ms | 53.3% bf16 MFU | 206936 tok/s +step 19227/19560 | loss 3.280087 (-0.21z)| norm 0.2342 (+0.35z)| lr 4.64e-07 | 2534.76 ms | 53.3% bf16 MFU | 206931 tok/s +step 19228/19560 | loss 3.291180 (+0.06z)| norm 0.2172 (-0.68z)| lr 4.61e-07 | 2535.40 ms | 53.3% bf16 MFU | 206924 tok/s +step 19229/19560 | loss 3.260542 (-0.67z)| norm 0.2243 (-0.25z)| lr 4.59e-07 | 2535.86 ms | 53.2% bf16 MFU | 206915 tok/s +step 19230/19560 | loss 3.280366 (-0.20z)| norm 0.2240 (-0.27z)| lr 4.56e-07 | 2535.43 ms | 53.3% bf16 MFU | 206908 tok/s +step 19231/19560 | loss 3.269932 (-0.45z)| norm 0.2186 (-0.60z)| lr 4.53e-07 | 2533.12 ms | 53.3% bf16 MFU | 206912 tok/s +step 19232/19560 | loss 3.272339 (-0.41z)| norm 0.2243 (-0.25z)| lr 4.50e-07 | 2534.19 ms | 53.3% bf16 MFU | 206910 tok/s +step 19233/19560 | loss 3.232517 (-1.36z)| norm 0.2120 (-0.99z)| lr 4.48e-07 | 2534.79 ms | 53.3% bf16 MFU | 206907 tok/s +step 19234/19560 | loss 3.273191 (-0.38z)| norm 0.2214 (-0.41z)| lr 4.45e-07 | 2536.72 ms | 53.2% bf16 MFU | 206895 tok/s +step 19235/19560 | loss 3.307772 (+0.47z)| norm 0.2297 (+0.10z)| lr 4.42e-07 | 2535.24 ms | 53.3% bf16 MFU | 206890 tok/s +step 19236/19560 | loss 3.261682 (-0.65z)| norm 0.2249 (-0.19z)| lr 4.40e-07 | 2534.69 ms | 53.3% bf16 MFU | 206888 tok/s +step 19237/19560 | loss 3.293331 (+0.12z)| norm 0.2166 (-0.70z)| lr 4.37e-07 | 2534.09 ms | 53.3% bf16 MFU | 206888 tok/s +step 19238/19560 | loss 3.322569 (+0.83z)| norm 0.2192 (-0.53z)| lr 4.34e-07 | 2535.04 ms | 53.3% bf16 MFU | 206885 tok/s +step 19239/19560 | loss 3.238945 (-1.20z)| norm 0.2284 (+0.03z)| lr 4.31e-07 | 2534.79 ms | 53.3% bf16 MFU | 206882 tok/s +step 19240/19560 | loss 3.319102 (+0.75z)| norm 0.2370 (+0.55z)| lr 4.29e-07 | 2535.49 ms | 53.3% bf16 MFU | 206877 tok/s +step 19241/19560 | loss 3.373487 (+2.03z)| norm 0.2553 (+1.80z)| lr 4.26e-07 | 2534.53 ms | 53.3% bf16 MFU | 206876 tok/s +step 19242/19560 | loss 3.315128 (+0.62z)| norm 0.2225 (-0.34z)| lr 4.23e-07 | 2534.36 ms | 53.3% bf16 MFU | 206876 tok/s +step 19243/19560 | loss 3.291642 (+0.05z)| norm 0.2205 (-0.47z)| lr 4.21e-07 | 2533.15 ms | 53.3% bf16 MFU | 206881 tok/s +step 19244/19560 | loss 3.263785 (-0.63z)| norm 0.2340 (+0.40z)| lr 4.18e-07 | 2532.83 ms | 53.3% bf16 MFU | 206887 tok/s +step 19245/19560 | loss 3.225644 (-1.53z)| norm 0.2594 (+2.02z)| lr 4.16e-07 | 2535.03 ms | 53.3% bf16 MFU | 206883 tok/s +step 19246/19560 | loss 3.326537 (+0.89z)| norm 0.2224 (-0.37z)| lr 4.13e-07 | 2533.63 ms | 53.3% bf16 MFU | 206886 tok/s +step 19247/19560 | loss 3.235806 (-1.29z)| norm 0.2203 (-0.51z)| lr 4.10e-07 | 2534.92 ms | 53.3% bf16 MFU | 206883 tok/s +step 19248/19560 | loss 3.255870 (-0.79z)| norm 0.2194 (-0.56z)| lr 4.08e-07 | 2533.31 ms | 53.3% bf16 MFU | 206886 tok/s +step 19249/19560 | loss 3.259946 (-0.69z)| norm 0.2298 (+0.16z)| lr 4.05e-07 | 2534.25 ms | 53.3% bf16 MFU | 206886 tok/s +step 19250/19560 | loss 3.247782 (-0.98z)| norm 0.2201 (-0.52z)| lr 4.02e-07 | 2533.20 ms | 53.3% bf16 MFU | 206890 tok/s +val loss 3.285250 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3030/10042 = 0.301733 +step 19251/19560 | loss 3.302064 (+0.36z)| norm 0.2382 (+0.73z)| lr 4.00e-07 | 2535.02 ms | 53.3% bf16 MFU | 206886 tok/s +step 19252/19560 | loss 3.323846 (+0.89z)| norm 0.2221 (-0.38z)| lr 3.97e-07 | 2532.93 ms | 53.3% bf16 MFU | 206892 tok/s +step 19253/19560 | loss 3.244269 (-1.08z)| norm 0.2224 (-0.36z)| lr 3.95e-07 | 2534.93 ms | 53.3% bf16 MFU | 206888 tok/s +step 19254/19560 | loss 3.233241 (-1.34z)| norm 0.2639 (+2.42z)| lr 3.92e-07 | 2534.23 ms | 53.3% bf16 MFU | 206888 tok/s +step 19255/19560 | loss 3.306590 (+0.49z)| norm 0.2223 (-0.38z)| lr 3.90e-07 | 2533.29 ms | 53.3% bf16 MFU | 206892 tok/s +step 19256/19560 | loss 3.232823 (-1.33z)| norm 0.2086 (-1.28z)| lr 3.87e-07 | 2536.02 ms | 53.2% bf16 MFU | 206884 tok/s +step 19257/19560 | loss 3.421740 (+3.19z)| norm 0.2508 (+1.52z)| lr 3.85e-07 | 2533.84 ms | 53.3% bf16 MFU | 206885 tok/s +step 19258/19560 | loss 3.256496 (-0.74z)| norm 0.2233 (-0.31z)| lr 3.82e-07 | 2534.21 ms | 53.3% bf16 MFU | 206885 tok/s +step 19259/19560 | loss 3.512554 (+4.80z)| norm 0.2965 (+4.20z)| lr 3.80e-07 | 2533.10 ms | 53.3% bf16 MFU | 206890 tok/s +step 19260/19560 | loss 3.295463 (+0.12z)| norm 0.2183 (-0.62z)| lr 3.77e-07 | 2533.90 ms | 53.3% bf16 MFU | 206891 tok/s +step 19261/19560 | loss 3.308086 (+0.40z)| norm 0.2399 (+0.70z)| lr 3.75e-07 | 2532.15 ms | 53.3% bf16 MFU | 206899 tok/s +step 19262/19560 | loss 3.245618 (-0.94z)| norm 0.2222 (-0.40z)| lr 3.72e-07 | 2533.17 ms | 53.3% bf16 MFU | 206902 tok/s +step 19263/19560 | loss 3.250873 (-0.83z)| norm 0.2268 (-0.12z)| lr 3.70e-07 | 2533.87 ms | 53.3% bf16 MFU | 206903 tok/s +step 19264/19560 | loss 3.381079 (+1.92z)| norm 0.2415 (+0.79z)| lr 3.67e-07 | 2534.27 ms | 53.3% bf16 MFU | 206902 tok/s +step 19265/19560 | loss 3.317832 (+0.57z)| norm 0.2244 (-0.27z)| lr 3.65e-07 | 2534.09 ms | 53.3% bf16 MFU | 206901 tok/s +step 19266/19560 | loss 3.282553 (-0.17z)| norm 0.2223 (-0.39z)| lr 3.62e-07 | 2534.65 ms | 53.3% bf16 MFU | 206899 tok/s +step 19267/19560 | loss 3.331950 (+0.87z)| norm 0.2251 (-0.22z)| lr 3.60e-07 | 2532.01 ms | 53.3% bf16 MFU | 206907 tok/s +step 19268/19560 | loss 3.286702 (-0.07z)| norm 0.2135 (-0.93z)| lr 3.57e-07 | 2532.29 ms | 53.3% bf16 MFU | 206914 tok/s +step 19269/19560 | loss 3.263945 (-0.55z)| norm 0.2212 (-0.46z)| lr 3.55e-07 | 2533.21 ms | 53.3% bf16 MFU | 206916 tok/s +step 19270/19560 | loss 3.299114 (+0.19z)| norm 0.2215 (-0.43z)| lr 3.52e-07 | 2533.37 ms | 53.3% bf16 MFU | 206918 tok/s +step 19271/19560 | loss 3.338957 (+1.03z)| norm 0.2246 (-0.24z)| lr 3.50e-07 | 2532.01 ms | 53.3% bf16 MFU | 206925 tok/s +step 19272/19560 | loss 3.265923 (-0.52z)| norm 0.2359 (+0.45z)| lr 3.48e-07 | 2533.84 ms | 53.3% bf16 MFU | 206925 tok/s +step 19273/19560 | loss 3.260648 (-0.63z)| norm 0.2238 (-0.29z)| lr 3.45e-07 | 2532.81 ms | 53.3% bf16 MFU | 206928 tok/s +step 19274/19560 | loss 3.245746 (-0.94z)| norm 0.2234 (-0.30z)| lr 3.43e-07 | 2535.61 ms | 53.2% bf16 MFU | 206920 tok/s +step 19275/19560 | loss 3.318466 (+0.68z)| norm 0.2210 (-0.46z)| lr 3.40e-07 | 2532.68 ms | 53.3% bf16 MFU | 206925 tok/s +step 19276/19560 | loss 3.283082 (-0.11z)| norm 0.2287 (+0.07z)| lr 3.38e-07 | 2533.11 ms | 53.3% bf16 MFU | 206927 tok/s +step 19277/19560 | loss 3.301183 (+0.30z)| norm 0.2321 (+0.30z)| lr 3.36e-07 | 2532.76 ms | 53.3% bf16 MFU | 206931 tok/s +step 19278/19560 | loss 3.272927 (-0.33z)| norm 0.2224 (-0.37z)| lr 3.33e-07 | 2534.70 ms | 53.3% bf16 MFU | 206927 tok/s +step 19279/19560 | loss 3.262106 (-0.56z)| norm 0.2153 (-0.86z)| lr 3.31e-07 | 2534.51 ms | 53.3% bf16 MFU | 206923 tok/s +step 19280/19560 | loss 3.333372 (+1.07z)| norm 0.2244 (-0.22z)| lr 3.29e-07 | 2533.92 ms | 53.3% bf16 MFU | 206923 tok/s +step 19281/19560 | loss 3.329140 (+0.96z)| norm 0.2189 (-0.60z)| lr 3.26e-07 | 2536.10 ms | 53.2% bf16 MFU | 206913 tok/s +step 19282/19560 | loss 3.249524 (-0.84z)| norm 0.2156 (-0.82z)| lr 3.24e-07 | 2534.92 ms | 53.3% bf16 MFU | 206909 tok/s +step 19283/19560 | loss 3.268125 (-0.42z)| norm 0.2276 (+0.01z)| lr 3.22e-07 | 2534.27 ms | 53.3% bf16 MFU | 206907 tok/s +step 19284/19560 | loss 3.311648 (+0.56z)| norm 0.2279 (+0.03z)| lr 3.19e-07 | 2535.76 ms | 53.2% bf16 MFU | 206900 tok/s +step 19285/19560 | loss 3.367384 (+1.79z)| norm 0.2345 (+0.48z)| lr 3.17e-07 | 2533.39 ms | 53.3% bf16 MFU | 206902 tok/s +step 19286/19560 | loss 3.386954 (+2.17z)| norm 0.2487 (+1.44z)| lr 3.15e-07 | 2534.35 ms | 53.3% bf16 MFU | 206901 tok/s +step 19287/19560 | loss 3.277261 (-0.24z)| norm 0.2163 (-0.79z)| lr 3.12e-07 | 2532.84 ms | 53.3% bf16 MFU | 206906 tok/s +step 19288/19560 | loss 3.262434 (-0.58z)| norm 0.2270 (-0.05z)| lr 3.10e-07 | 2534.26 ms | 53.3% bf16 MFU | 206904 tok/s +step 19289/19560 | loss 3.286206 (-0.05z)| norm 0.2336 (+0.40z)| lr 3.08e-07 | 2532.99 ms | 53.3% bf16 MFU | 206908 tok/s +step 19290/19560 | loss 3.330207 (+0.91z)| norm 0.2275 (-0.03z)| lr 3.06e-07 | 2531.77 ms | 53.3% bf16 MFU | 206917 tok/s +step 19291/19560 | loss 3.234996 (-1.17z)| norm 0.2179 (-0.68z)| lr 3.03e-07 | 2532.36 ms | 53.3% bf16 MFU | 206923 tok/s +step 19292/19560 | loss 3.253519 (-0.75z)| norm 0.2151 (-0.87z)| lr 3.01e-07 | 2534.46 ms | 53.3% bf16 MFU | 206920 tok/s +step 19293/19560 | loss 3.283938 (-0.08z)| norm 0.2222 (-0.37z)| lr 2.99e-07 | 2532.96 ms | 53.3% bf16 MFU | 206923 tok/s +step 19294/19560 | loss 3.274186 (-0.29z)| norm 0.2258 (-0.12z)| lr 2.97e-07 | 2533.00 ms | 53.3% bf16 MFU | 206926 tok/s +step 19295/19560 | loss 3.283921 (-0.07z)| norm 0.2348 (+0.52z)| lr 2.94e-07 | 2534.03 ms | 53.3% bf16 MFU | 206925 tok/s +step 19296/19560 | loss 3.271728 (-0.34z)| norm 0.2261 (-0.09z)| lr 2.92e-07 | 2533.03 ms | 53.3% bf16 MFU | 206928 tok/s +step 19297/19560 | loss 3.280264 (-0.15z)| norm 0.2287 (+0.09z)| lr 2.90e-07 | 2532.48 ms | 53.3% bf16 MFU | 206933 tok/s +step 19298/19560 | loss 3.313745 (+0.58z)| norm 0.2198 (-0.52z)| lr 2.88e-07 | 2534.38 ms | 53.3% bf16 MFU | 206929 tok/s +step 19299/19560 | loss 3.319630 (+0.70z)| norm 0.2335 (+0.45z)| lr 2.86e-07 | 2534.76 ms | 53.3% bf16 MFU | 206925 tok/s +step 19300/19560 | loss 3.347210 (+1.30z)| norm 0.2192 (-0.57z)| lr 2.83e-07 | 2533.66 ms | 53.3% bf16 MFU | 206925 tok/s +step 19301/19560 | loss 3.333060 (+0.98z)| norm 0.2312 (+0.30z)| lr 2.81e-07 | 2534.79 ms | 53.3% bf16 MFU | 206921 tok/s +step 19302/19560 | loss 3.288459 (-0.02z)| norm 0.2225 (-0.34z)| lr 2.79e-07 | 2535.17 ms | 53.3% bf16 MFU | 206915 tok/s +step 19303/19560 | loss 3.317603 (+0.62z)| norm 0.2166 (-0.76z)| lr 2.77e-07 | 2532.26 ms | 53.3% bf16 MFU | 206921 tok/s +step 19304/19560 | loss 3.316522 (+0.59z)| norm 0.2319 (+0.34z)| lr 2.75e-07 | 2535.51 ms | 53.3% bf16 MFU | 206914 tok/s +step 19305/19560 | loss 3.321303 (+0.68z)| norm 0.2210 (-0.44z)| lr 2.73e-07 | 2533.26 ms | 53.3% bf16 MFU | 206917 tok/s +step 19306/19560 | loss 3.312821 (+0.49z)| norm 0.2210 (-0.44z)| lr 2.71e-07 | 2534.03 ms | 53.3% bf16 MFU | 206916 tok/s +step 19307/19560 | loss 3.284116 (-0.15z)| norm 0.2180 (-0.65z)| lr 2.68e-07 | 2534.56 ms | 53.3% bf16 MFU | 206913 tok/s +step 19308/19560 | loss 3.253744 (-0.82z)| norm 0.2314 (+0.31z)| lr 2.66e-07 | 2533.44 ms | 53.3% bf16 MFU | 206914 tok/s +step 19309/19560 | loss 3.259891 (-0.68z)| norm 0.2572 (+2.11z)| lr 2.64e-07 | 2535.04 ms | 53.3% bf16 MFU | 206910 tok/s +step 19310/19560 | loss 3.350969 (+1.32z)| norm 0.2506 (+1.62z)| lr 2.62e-07 | 2534.56 ms | 53.3% bf16 MFU | 206907 tok/s +step 19311/19560 | loss 3.279491 (-0.24z)| norm 0.2277 (+0.07z)| lr 2.60e-07 | 2532.04 ms | 53.3% bf16 MFU | 206915 tok/s +step 19312/19560 | loss 3.243902 (-1.07z)| norm 0.2158 (-0.94z)| lr 2.58e-07 | 2532.41 ms | 53.3% bf16 MFU | 206920 tok/s +step 19313/19560 | loss 3.226588 (-1.46z)| norm 0.2177 (-0.78z)| lr 2.56e-07 | 2532.26 ms | 53.3% bf16 MFU | 206927 tok/s +step 19314/19560 | loss 3.272390 (-0.38z)| norm 0.2253 (-0.14z)| lr 2.54e-07 | 2531.71 ms | 53.3% bf16 MFU | 206935 tok/s +step 19315/19560 | loss 3.276591 (-0.29z)| norm 0.2176 (-0.80z)| lr 2.52e-07 | 2534.86 ms | 53.3% bf16 MFU | 206929 tok/s +step 19316/19560 | loss 3.329771 (+0.94z)| norm 0.2372 (+0.87z)| lr 2.50e-07 | 2533.90 ms | 53.3% bf16 MFU | 206928 tok/s +step 19317/19560 | loss 3.358590 (+1.59z)| norm 0.2378 (+0.91z)| lr 2.48e-07 | 2533.17 ms | 53.3% bf16 MFU | 206930 tok/s +step 19318/19560 | loss 3.228728 (-1.39z)| norm 0.2137 (-1.14z)| lr 2.46e-07 | 2533.48 ms | 53.3% bf16 MFU | 206931 tok/s +step 19319/19560 | loss 3.287376 (-0.04z)| norm 0.2188 (-0.69z)| lr 2.44e-07 | 2535.26 ms | 53.3% bf16 MFU | 206924 tok/s +step 19320/19560 | loss 3.318505 (+0.67z)| norm 0.2300 (+0.25z)| lr 2.42e-07 | 2533.67 ms | 53.3% bf16 MFU | 206925 tok/s +step 19321/19560 | loss 3.233365 (-1.26z)| norm 0.2164 (-0.88z)| lr 2.40e-07 | 2533.40 ms | 53.3% bf16 MFU | 206926 tok/s +step 19322/19560 | loss 3.273646 (-0.34z)| norm 0.2166 (-0.87z)| lr 2.38e-07 | 2532.85 ms | 53.3% bf16 MFU | 206929 tok/s +step 19323/19560 | loss 3.313012 (+0.55z)| norm 0.2186 (-0.69z)| lr 2.36e-07 | 2534.30 ms | 53.3% bf16 MFU | 206927 tok/s +step 19324/19560 | loss 3.262878 (-0.58z)| norm 0.2161 (-0.90z)| lr 2.34e-07 | 2533.61 ms | 53.3% bf16 MFU | 206927 tok/s +step 19325/19560 | loss 3.296634 (+0.19z)| norm 0.2230 (-0.32z)| lr 2.32e-07 | 2533.59 ms | 53.3% bf16 MFU | 206928 tok/s +step 19326/19560 | loss 3.315095 (+0.62z)| norm 0.2236 (-0.27z)| lr 2.30e-07 | 2532.68 ms | 53.3% bf16 MFU | 206932 tok/s +step 19327/19560 | loss 3.351476 (+1.43z)| norm 0.2427 (+1.32z)| lr 2.28e-07 | 2532.88 ms | 53.3% bf16 MFU | 206935 tok/s +step 19328/19560 | loss 3.283581 (-0.13z)| norm 0.2159 (-0.92z)| lr 2.26e-07 | 2535.03 ms | 53.3% bf16 MFU | 206929 tok/s +step 19329/19560 | loss 3.275184 (-0.32z)| norm 0.2201 (-0.57z)| lr 2.24e-07 | 2533.63 ms | 53.3% bf16 MFU | 206929 tok/s +step 19330/19560 | loss 3.274951 (-0.33z)| norm 0.2205 (-0.53z)| lr 2.22e-07 | 2536.17 ms | 53.2% bf16 MFU | 206919 tok/s +step 19331/19560 | loss 3.309132 (+0.45z)| norm 0.2297 (+0.27z)| lr 2.20e-07 | 2536.39 ms | 53.2% bf16 MFU | 206908 tok/s +step 19332/19560 | loss 3.303725 (+0.32z)| norm 0.2231 (-0.30z)| lr 2.18e-07 | 2535.41 ms | 53.3% bf16 MFU | 206902 tok/s +step 19333/19560 | loss 3.374072 (+1.89z)| norm 0.2456 (+1.61z)| lr 2.16e-07 | 2533.11 ms | 53.3% bf16 MFU | 206906 tok/s +step 19334/19560 | loss 3.271465 (-0.43z)| norm 0.2193 (-0.62z)| lr 2.14e-07 | 2535.14 ms | 53.3% bf16 MFU | 206901 tok/s +step 19335/19560 | loss 3.245450 (-1.01z)| norm 0.2303 (+0.31z)| lr 2.13e-07 | 2536.07 ms | 53.2% bf16 MFU | 206892 tok/s +step 19336/19560 | loss 3.389292 (+2.18z)| norm 0.2446 (+1.50z)| lr 2.11e-07 | 2532.93 ms | 53.3% bf16 MFU | 206897 tok/s +step 19337/19560 | loss 3.300931 (+0.21z)| norm 0.2173 (-0.80z)| lr 2.09e-07 | 2532.66 ms | 53.3% bf16 MFU | 206903 tok/s +step 19338/19560 | loss 3.206696 (-1.84z)| norm 0.2319 (+0.43z)| lr 2.07e-07 | 2535.21 ms | 53.3% bf16 MFU | 206898 tok/s +step 19339/19560 | loss 3.276420 (-0.30z)| norm 0.2258 (-0.09z)| lr 2.05e-07 | 2533.36 ms | 53.3% bf16 MFU | 206901 tok/s +step 19340/19560 | loss 3.227779 (-1.36z)| norm 0.2307 (+0.32z)| lr 2.03e-07 | 2534.70 ms | 53.3% bf16 MFU | 206898 tok/s +step 19341/19560 | loss 3.433689 (+3.01z)| norm 0.2750 (+3.81z)| lr 2.01e-07 | 2533.30 ms | 53.3% bf16 MFU | 206901 tok/s +step 19342/19560 | loss 3.249213 (-0.91z)| norm 0.2209 (-0.52z)| lr 2.00e-07 | 2532.28 ms | 53.3% bf16 MFU | 206908 tok/s +step 19343/19560 | loss 3.246020 (-0.98z)| norm 0.2106 (-1.33z)| lr 1.98e-07 | 2533.44 ms | 53.3% bf16 MFU | 206910 tok/s +step 19344/19560 | loss 3.292722 (+0.02z)| norm 0.2291 (+0.15z)| lr 1.96e-07 | 2533.79 ms | 53.3% bf16 MFU | 206910 tok/s +step 19345/19560 | loss 3.291611 (-0.01z)| norm 0.2263 (-0.08z)| lr 1.94e-07 | 2533.55 ms | 53.3% bf16 MFU | 206912 tok/s +step 19346/19560 | loss 3.352329 (+1.28z)| norm 0.2239 (-0.27z)| lr 1.92e-07 | 2533.69 ms | 53.3% bf16 MFU | 206912 tok/s +step 19347/19560 | loss 3.312527 (+0.42z)| norm 0.2159 (-0.90z)| lr 1.91e-07 | 2534.10 ms | 53.3% bf16 MFU | 206911 tok/s +step 19348/19560 | loss 3.233112 (-1.26z)| norm 0.2211 (-0.48z)| lr 1.89e-07 | 2532.52 ms | 53.3% bf16 MFU | 206917 tok/s +step 19349/19560 | loss 3.292004 (-0.00z)| norm 0.2171 (-0.79z)| lr 1.87e-07 | 2533.08 ms | 53.3% bf16 MFU | 206920 tok/s +step 19350/19560 | loss 3.361724 (+1.46z)| norm 0.2565 (+2.29z)| lr 1.85e-07 | 2532.92 ms | 53.3% bf16 MFU | 206923 tok/s +step 19351/19560 | loss 3.297460 (+0.09z)| norm 0.2267 (-0.05z)| lr 1.84e-07 | 2535.44 ms | 53.3% bf16 MFU | 206916 tok/s +step 19352/19560 | loss 3.268816 (-0.52z)| norm 0.2177 (-0.74z)| lr 1.82e-07 | 2534.16 ms | 53.3% bf16 MFU | 206915 tok/s +step 19353/19560 | loss 3.318865 (+0.57z)| norm 0.2227 (-0.35z)| lr 1.80e-07 | 2533.87 ms | 53.3% bf16 MFU | 206915 tok/s +step 19354/19560 | loss 3.276765 (-0.35z)| norm 0.2246 (-0.20z)| lr 1.78e-07 | 2533.40 ms | 53.3% bf16 MFU | 206917 tok/s +step 19355/19560 | loss 3.336894 (+0.94z)| norm 0.2227 (-0.34z)| lr 1.77e-07 | 2533.52 ms | 53.3% bf16 MFU | 206918 tok/s +step 19356/19560 | loss 3.282568 (-0.23z)| norm 0.2260 (-0.09z)| lr 1.75e-07 | 2534.24 ms | 53.3% bf16 MFU | 206916 tok/s +step 19357/19560 | loss 3.285107 (-0.18z)| norm 0.2290 (+0.15z)| lr 1.73e-07 | 2534.70 ms | 53.3% bf16 MFU | 206912 tok/s +step 19358/19560 | loss 3.288572 (-0.11z)| norm 0.2155 (-0.91z)| lr 1.72e-07 | 2535.44 ms | 53.3% bf16 MFU | 206906 tok/s +step 19359/19560 | loss 3.371402 (+1.65z)| norm 0.2259 (-0.10z)| lr 1.70e-07 | 2534.09 ms | 53.3% bf16 MFU | 206905 tok/s +step 19360/19560 | loss 3.280447 (-0.30z)| norm 0.2198 (-0.58z)| lr 1.68e-07 | 2532.82 ms | 53.3% bf16 MFU | 206910 tok/s +step 19361/19560 | loss 3.426990 (+2.74z)| norm 0.2211 (-0.48z)| lr 1.66e-07 | 2534.80 ms | 53.3% bf16 MFU | 206906 tok/s +step 19362/19560 | loss 3.277836 (-0.38z)| norm 0.2269 (-0.03z)| lr 1.65e-07 | 2533.17 ms | 53.3% bf16 MFU | 206909 tok/s +step 19363/19560 | loss 3.346569 (+1.05z)| norm 0.2171 (-0.79z)| lr 1.63e-07 | 2534.47 ms | 53.3% bf16 MFU | 206907 tok/s +step 19364/19560 | loss 3.333456 (+0.76z)| norm 0.2278 (+0.05z)| lr 1.62e-07 | 2531.79 ms | 53.3% bf16 MFU | 206916 tok/s +step 19365/19560 | loss 3.285918 (-0.22z)| norm 0.2306 (+0.26z)| lr 1.60e-07 | 2533.49 ms | 53.3% bf16 MFU | 206917 tok/s +step 19366/19560 | loss 3.264602 (-0.66z)| norm 0.2234 (-0.31z)| lr 1.58e-07 | 2531.72 ms | 53.3% bf16 MFU | 206926 tok/s +step 19367/19560 | loss 3.302304 (+0.12z)| norm 0.2353 (+0.63z)| lr 1.57e-07 | 2533.95 ms | 53.3% bf16 MFU | 206925 tok/s +step 19368/19560 | loss 3.389969 (+1.91z)| norm 0.2273 (+0.00z)| lr 1.55e-07 | 2532.47 ms | 53.3% bf16 MFU | 206930 tok/s +step 19369/19560 | loss 3.227925 (-1.41z)| norm 0.2170 (-0.80z)| lr 1.53e-07 | 2535.91 ms | 53.2% bf16 MFU | 206921 tok/s +step 19370/19560 | loss 3.275635 (-0.42z)| norm 0.2131 (-1.11z)| lr 1.52e-07 | 2534.40 ms | 53.3% bf16 MFU | 206918 tok/s +step 19371/19560 | loss 3.322263 (+0.54z)| norm 0.2153 (-0.92z)| lr 1.50e-07 | 2533.68 ms | 53.3% bf16 MFU | 206918 tok/s +step 19372/19560 | loss 3.354436 (+1.19z)| norm 0.2346 (+0.62z)| lr 1.49e-07 | 2532.16 ms | 53.3% bf16 MFU | 206925 tok/s +step 19373/19560 | loss 3.333015 (+0.73z)| norm 0.2178 (-0.72z)| lr 1.47e-07 | 2533.49 ms | 53.3% bf16 MFU | 206926 tok/s +step 19374/19560 | loss 3.321186 (+0.49z)| norm 0.2455 (+1.53z)| lr 1.46e-07 | 2533.39 ms | 53.3% bf16 MFU | 206927 tok/s +step 19375/19560 | loss 3.322430 (+0.50z)| norm 0.2490 (+1.77z)| lr 1.44e-07 | 2534.68 ms | 53.3% bf16 MFU | 206923 tok/s +step 19376/19560 | loss 3.323220 (+0.51z)| norm 0.2172 (-0.78z)| lr 1.42e-07 | 2533.25 ms | 53.3% bf16 MFU | 206925 tok/s +step 19377/19560 | loss 3.250224 (-1.01z)| norm 0.2341 (+0.57z)| lr 1.41e-07 | 2533.56 ms | 53.3% bf16 MFU | 206926 tok/s +step 19378/19560 | loss 3.333879 (+0.72z)| norm 0.2437 (+1.32z)| lr 1.39e-07 | 2532.73 ms | 53.3% bf16 MFU | 206930 tok/s +step 19379/19560 | loss 3.304941 (+0.12z)| norm 0.2260 (-0.08z)| lr 1.38e-07 | 2533.11 ms | 53.3% bf16 MFU | 206932 tok/s +step 19380/19560 | loss 3.345258 (+0.95z)| norm 0.2188 (-0.66z)| lr 1.36e-07 | 2532.75 ms | 53.3% bf16 MFU | 206936 tok/s +step 19381/19560 | loss 3.337188 (+0.77z)| norm 0.2275 (+0.03z)| lr 1.35e-07 | 2534.44 ms | 53.3% bf16 MFU | 206932 tok/s +step 19382/19560 | loss 3.303669 (+0.06z)| norm 0.2209 (-0.48z)| lr 1.33e-07 | 2534.19 ms | 53.3% bf16 MFU | 206930 tok/s +step 19383/19560 | loss 3.318522 (+0.37z)| norm 0.2332 (+0.52z)| lr 1.32e-07 | 2535.40 ms | 53.3% bf16 MFU | 206923 tok/s +step 19384/19560 | loss 3.318009 (+0.35z)| norm 0.2185 (-0.70z)| lr 1.30e-07 | 2533.60 ms | 53.3% bf16 MFU | 206923 tok/s +step 19385/19560 | loss 3.347527 (+1.01z)| norm 0.2267 (-0.00z)| lr 1.29e-07 | 2533.38 ms | 53.3% bf16 MFU | 206925 tok/s +step 19386/19560 | loss 3.308073 (+0.14z)| norm 0.2161 (-0.89z)| lr 1.27e-07 | 2536.47 ms | 53.2% bf16 MFU | 206913 tok/s +step 19387/19560 | loss 3.331775 (+0.76z)| norm 0.2229 (-0.31z)| lr 1.26e-07 | 2532.48 ms | 53.3% bf16 MFU | 206919 tok/s +step 19388/19560 | loss 3.266140 (-0.80z)| norm 0.2327 (+0.65z)| lr 1.25e-07 | 2533.97 ms | 53.3% bf16 MFU | 206918 tok/s +step 19389/19560 | loss 3.397681 (+2.26z)| norm 0.3796 (+9.04z)| lr 1.23e-07 | 2532.76 ms | 53.3% bf16 MFU | 206922 tok/s +step 19390/19560 | loss 3.312706 (+0.27z)| norm 0.2325 (+0.30z)| lr 1.22e-07 | 2534.80 ms | 53.3% bf16 MFU | 206918 tok/s +step 19391/19560 | loss 3.321596 (+0.47z)| norm 0.2205 (-0.40z)| lr 1.20e-07 | 2533.54 ms | 53.3% bf16 MFU | 206919 tok/s +step 19392/19560 | loss 3.271073 (-0.70z)| norm 0.2193 (-0.47z)| lr 1.19e-07 | 2532.73 ms | 53.3% bf16 MFU | 206923 tok/s +step 19393/19560 | loss 3.376952 (+1.78z)| norm 0.2487 (+1.26z)| lr 1.17e-07 | 2532.42 ms | 53.3% bf16 MFU | 206929 tok/s +step 19394/19560 | loss 3.337535 (+0.84z)| norm 0.2266 (-0.05z)| lr 1.16e-07 | 2534.21 ms | 53.3% bf16 MFU | 206927 tok/s +step 19395/19560 | loss 3.335443 (+0.79z)| norm 0.2222 (-0.30z)| lr 1.15e-07 | 2534.01 ms | 53.3% bf16 MFU | 206925 tok/s +step 19396/19560 | loss 3.318923 (+0.40z)| norm 0.2180 (-0.55z)| lr 1.13e-07 | 2533.77 ms | 53.3% bf16 MFU | 206925 tok/s +step 19397/19560 | loss 3.263204 (-0.91z)| norm 0.2152 (-0.72z)| lr 1.12e-07 | 2533.63 ms | 53.3% bf16 MFU | 206925 tok/s +step 19398/19560 | loss 3.348018 (+1.07z)| norm 0.2261 (-0.07z)| lr 1.11e-07 | 2532.70 ms | 53.3% bf16 MFU | 206929 tok/s +step 19399/19560 | loss 3.316634 (+0.34z)| norm 0.2283 (+0.05z)| lr 1.09e-07 | 2534.59 ms | 53.3% bf16 MFU | 206926 tok/s +step 19400/19560 | loss 3.458934 (+3.47z)| norm 0.2369 (+0.56z)| lr 1.08e-07 | 2531.69 ms | 53.3% bf16 MFU | 206934 tok/s +step 19401/19560 | loss 3.299744 (-0.09z)| norm 0.2222 (-0.31z)| lr 1.07e-07 | 2532.43 ms | 53.3% bf16 MFU | 206939 tok/s +step 19402/19560 | loss 3.260705 (-0.97z)| norm 0.2245 (-0.17z)| lr 1.05e-07 | 2533.29 ms | 53.3% bf16 MFU | 206940 tok/s +step 19403/19560 | loss 3.282864 (-0.47z)| norm 0.2229 (-0.27z)| lr 1.04e-07 | 2531.21 ms | 53.3% bf16 MFU | 206949 tok/s +step 19404/19560 | loss 3.364354 (+1.34z)| norm 0.2227 (-0.27z)| lr 1.03e-07 | 2533.25 ms | 53.3% bf16 MFU | 206950 tok/s +step 19405/19560 | loss 3.318740 (+0.32z)| norm 0.2238 (-0.21z)| lr 1.01e-07 | 2532.03 ms | 53.3% bf16 MFU | 206955 tok/s +step 19406/19560 | loss 3.266084 (-0.86z)| norm 0.2185 (-0.52z)| lr 1.00e-07 | 2532.89 ms | 53.3% bf16 MFU | 206957 tok/s +step 19407/19560 | loss 3.367700 (+1.39z)| norm 0.2188 (-0.50z)| lr 9.87e-08 | 2532.92 ms | 53.3% bf16 MFU | 206959 tok/s +step 19408/19560 | loss 3.317806 (+0.28z)| norm 0.2292 (+0.11z)| lr 9.74e-08 | 2533.37 ms | 53.3% bf16 MFU | 206959 tok/s +step 19409/19560 | loss 3.295913 (-0.20z)| norm 0.2176 (-0.57z)| lr 9.61e-08 | 2533.76 ms | 53.3% bf16 MFU | 206957 tok/s +step 19410/19560 | loss 3.203839 (-2.21z)| norm 0.2469 (+1.14z)| lr 9.49e-08 | 2533.42 ms | 53.3% bf16 MFU | 206956 tok/s +step 19411/19560 | loss 3.325641 (+0.45z)| norm 0.2217 (-0.34z)| lr 9.36e-08 | 2536.04 ms | 53.2% bf16 MFU | 206945 tok/s +step 19412/19560 | loss 3.303820 (-0.02z)| norm 0.2242 (-0.19z)| lr 9.24e-08 | 2532.80 ms | 53.3% bf16 MFU | 206948 tok/s +step 19413/19560 | loss 3.318535 (+0.31z)| norm 0.2421 (+0.86z)| lr 9.12e-08 | 2532.51 ms | 53.3% bf16 MFU | 206952 tok/s +step 19414/19560 | loss 3.289614 (-0.32z)| norm 0.2190 (-0.49z)| lr 8.99e-08 | 2534.33 ms | 53.3% bf16 MFU | 206948 tok/s +step 19415/19560 | loss 3.265864 (-0.85z)| norm 0.2197 (-0.45z)| lr 8.87e-08 | 2533.07 ms | 53.3% bf16 MFU | 206949 tok/s +step 19416/19560 | loss 3.306506 (+0.06z)| norm 0.2142 (-0.77z)| lr 8.75e-08 | 2534.79 ms | 53.3% bf16 MFU | 206944 tok/s +step 19417/19560 | loss 3.286015 (-0.40z)| norm 0.2284 (+0.07z)| lr 8.63e-08 | 2534.05 ms | 53.3% bf16 MFU | 206941 tok/s +step 19418/19560 | loss 3.320096 (+0.36z)| norm 0.2270 (-0.01z)| lr 8.51e-08 | 2533.11 ms | 53.3% bf16 MFU | 206943 tok/s +step 19419/19560 | loss 3.330787 (+0.59z)| norm 0.2259 (-0.08z)| lr 8.39e-08 | 2533.29 ms | 53.3% bf16 MFU | 206944 tok/s +step 19420/19560 | loss 3.356527 (+1.16z)| norm 0.2360 (+0.51z)| lr 8.27e-08 | 2533.63 ms | 53.3% bf16 MFU | 206943 tok/s +step 19421/19560 | loss 3.235606 (-1.56z)| norm 0.2234 (-0.24z)| lr 8.16e-08 | 2534.40 ms | 53.3% bf16 MFU | 206939 tok/s +step 19422/19560 | loss 3.321757 (+0.37z)| norm 0.2282 (+0.04z)| lr 8.04e-08 | 2533.89 ms | 53.3% bf16 MFU | 206938 tok/s +step 19423/19560 | loss 3.345102 (+0.88z)| norm 0.2370 (+0.56z)| lr 7.93e-08 | 2534.38 ms | 53.3% bf16 MFU | 206935 tok/s +step 19424/19560 | loss 3.347240 (+0.91z)| norm 0.2175 (-0.58z)| lr 7.81e-08 | 2533.92 ms | 53.3% bf16 MFU | 206933 tok/s +step 19425/19560 | loss 3.284242 (-0.50z)| norm 0.2289 (+0.09z)| lr 7.70e-08 | 2535.25 ms | 53.3% bf16 MFU | 206926 tok/s +step 19426/19560 | loss 3.292120 (-0.32z)| norm 0.2179 (-0.56z)| lr 7.59e-08 | 2532.73 ms | 53.3% bf16 MFU | 206930 tok/s +step 19427/19560 | loss 3.390001 (+1.84z)| norm 0.2201 (-0.42z)| lr 7.47e-08 | 2534.29 ms | 53.3% bf16 MFU | 206928 tok/s +step 19428/19560 | loss 3.303864 (-0.06z)| norm 0.2482 (+1.22z)| lr 7.36e-08 | 2533.19 ms | 53.3% bf16 MFU | 206930 tok/s +step 19429/19560 | loss 3.307957 (+0.03z)| norm 0.3597 (+6.36z)| lr 7.25e-08 | 2532.09 ms | 53.3% bf16 MFU | 206936 tok/s +step 19430/19560 | loss 3.262197 (-0.98z)| norm 0.2183 (-0.49z)| lr 7.14e-08 | 2532.23 ms | 53.3% bf16 MFU | 206942 tok/s +step 19431/19560 | loss 3.330572 (+0.54z)| norm 0.2144 (-0.68z)| lr 7.03e-08 | 2533.52 ms | 53.3% bf16 MFU | 206942 tok/s +step 19432/19560 | loss 3.333874 (+0.61z)| norm 0.2239 (-0.22z)| lr 6.93e-08 | 2534.13 ms | 53.3% bf16 MFU | 206939 tok/s +step 19433/19560 | loss 3.283669 (-0.50z)| norm 0.2272 (-0.06z)| lr 6.82e-08 | 2533.24 ms | 53.3% bf16 MFU | 206940 tok/s +step 19434/19560 | loss 3.284813 (-0.47z)| norm 0.2230 (-0.27z)| lr 6.71e-08 | 2534.94 ms | 53.3% bf16 MFU | 206934 tok/s +step 19435/19560 | loss 3.367528 (+1.34z)| norm 0.2340 (+0.26z)| lr 6.61e-08 | 2534.01 ms | 53.3% bf16 MFU | 206933 tok/s +step 19436/19560 | loss 3.281832 (-0.55z)| norm 0.2172 (-0.54z)| lr 6.50e-08 | 2535.39 ms | 53.3% bf16 MFU | 206926 tok/s +step 19437/19560 | loss 3.324520 (+0.38z)| norm 0.2531 (+1.19z)| lr 6.40e-08 | 2535.62 ms | 53.2% bf16 MFU | 206918 tok/s +step 19438/19560 | loss 3.366385 (+1.30z)| norm 0.2227 (-0.27z)| lr 6.30e-08 | 2533.99 ms | 53.3% bf16 MFU | 206917 tok/s +step 19439/19560 | loss 3.390322 (+1.79z)| norm 0.2429 (+0.71z)| lr 6.19e-08 | 2532.15 ms | 53.3% bf16 MFU | 206924 tok/s +step 19440/19560 | loss 3.363289 (+1.18z)| norm 0.2486 (+0.97z)| lr 6.09e-08 | 2531.96 ms | 53.3% bf16 MFU | 206931 tok/s +step 19441/19560 | loss 3.284719 (-0.55z)| norm 0.2180 (-0.52z)| lr 5.99e-08 | 2529.82 ms | 53.4% bf16 MFU | 206947 tok/s +step 19442/19560 | loss 3.337964 (+0.61z)| norm 0.2264 (-0.11z)| lr 5.89e-08 | 2533.30 ms | 53.3% bf16 MFU | 206947 tok/s +step 19443/19560 | loss 3.309029 (-0.03z)| norm 0.2267 (-0.10z)| lr 5.80e-08 | 2533.81 ms | 53.3% bf16 MFU | 206946 tok/s +step 19444/19560 | loss 3.339228 (+0.64z)| norm 0.2367 (+0.39z)| lr 5.70e-08 | 2533.32 ms | 53.3% bf16 MFU | 206946 tok/s +step 19445/19560 | loss 3.310714 (+0.01z)| norm 0.2253 (-0.16z)| lr 5.60e-08 | 2533.53 ms | 53.3% bf16 MFU | 206946 tok/s +step 19446/19560 | loss 3.290164 (-0.46z)| norm 0.2164 (-0.60z)| lr 5.50e-08 | 2534.38 ms | 53.3% bf16 MFU | 206942 tok/s +step 19447/19560 | loss 3.344861 (+0.76z)| norm 0.2243 (-0.21z)| lr 5.41e-08 | 2532.74 ms | 53.3% bf16 MFU | 206945 tok/s +step 19448/19560 | loss 3.312391 (+0.03z)| norm 0.2215 (-0.34z)| lr 5.31e-08 | 2532.99 ms | 53.3% bf16 MFU | 206947 tok/s +step 19449/19560 | loss 3.273916 (-0.85z)| norm 0.2303 (+0.08z)| lr 5.22e-08 | 2531.71 ms | 53.3% bf16 MFU | 206954 tok/s +step 19450/19560 | loss 3.306147 (-0.13z)| norm 0.2239 (-0.24z)| lr 5.13e-08 | 2533.99 ms | 53.3% bf16 MFU | 206952 tok/s +step 19451/19560 | loss 3.330430 (+0.43z)| norm 0.2217 (-0.34z)| lr 5.04e-08 | 2532.58 ms | 53.3% bf16 MFU | 206955 tok/s +step 19452/19560 | loss 3.266451 (-1.03z)| norm 0.2167 (-0.59z)| lr 4.94e-08 | 2534.14 ms | 53.3% bf16 MFU | 206952 tok/s +step 19453/19560 | loss 3.285202 (-0.60z)| norm 0.2162 (-0.61z)| lr 4.85e-08 | 2532.26 ms | 53.3% bf16 MFU | 206956 tok/s +step 19454/19560 | loss 3.316191 (+0.10z)| norm 0.2229 (-0.28z)| lr 4.77e-08 | 2533.15 ms | 53.3% bf16 MFU | 206957 tok/s +step 19455/19560 | loss 3.285609 (-0.58z)| norm 0.2264 (-0.11z)| lr 4.68e-08 | 2532.85 ms | 53.3% bf16 MFU | 206959 tok/s +step 19456/19560 | loss 3.277706 (-0.76z)| norm 0.2412 (+0.60z)| lr 4.59e-08 | 2533.64 ms | 53.3% bf16 MFU | 206957 tok/s +step 19457/19560 | loss 3.282027 (-0.67z)| norm 0.2350 (+0.30z)| lr 4.50e-08 | 2534.22 ms | 53.3% bf16 MFU | 206954 tok/s +step 19458/19560 | loss 3.237931 (-1.65z)| norm 0.2301 (+0.05z)| lr 4.41e-08 | 2533.54 ms | 53.3% bf16 MFU | 206953 tok/s +step 19459/19560 | loss 3.287818 (-0.52z)| norm 0.2190 (-0.48z)| lr 4.33e-08 | 2534.37 ms | 53.3% bf16 MFU | 206949 tok/s +step 19460/19560 | loss 3.281651 (-0.65z)| norm 0.2214 (-0.37z)| lr 4.25e-08 | 2534.20 ms | 53.3% bf16 MFU | 206946 tok/s +step 19461/19560 | loss 3.299718 (-0.23z)| norm 0.2265 (-0.11z)| lr 4.16e-08 | 2533.70 ms | 53.3% bf16 MFU | 206945 tok/s +step 19462/19560 | loss 3.265101 (-1.02z)| norm 0.2224 (-0.31z)| lr 4.08e-08 | 2534.50 ms | 53.3% bf16 MFU | 206940 tok/s +step 19463/19560 | loss 3.274321 (-0.82z)| norm 0.2201 (-0.42z)| lr 4.00e-08 | 2534.80 ms | 53.3% bf16 MFU | 206935 tok/s +step 19464/19560 | loss 3.331317 (+0.50z)| norm 0.2248 (-0.18z)| lr 3.92e-08 | 2536.02 ms | 53.2% bf16 MFU | 206925 tok/s +step 19465/19560 | loss 3.332737 (+0.53z)| norm 0.2250 (-0.18z)| lr 3.84e-08 | 2535.52 ms | 53.3% bf16 MFU | 206918 tok/s +step 19466/19560 | loss 3.295778 (-0.35z)| norm 0.2196 (-0.44z)| lr 3.76e-08 | 2534.49 ms | 53.3% bf16 MFU | 206915 tok/s +step 19467/19560 | loss 3.286730 (-0.57z)| norm 0.2210 (-0.37z)| lr 3.68e-08 | 2533.18 ms | 53.3% bf16 MFU | 206918 tok/s +step 19468/19560 | loss 3.311769 (+0.01z)| norm 0.2289 (+0.02z)| lr 3.60e-08 | 2534.75 ms | 53.3% bf16 MFU | 206914 tok/s +step 19469/19560 | loss 3.316020 (+0.14z)| norm 0.2213 (-0.33z)| lr 3.52e-08 | 2533.34 ms | 53.3% bf16 MFU | 206916 tok/s +step 19470/19560 | loss 3.383566 (+1.79z)| norm 0.2355 (+0.37z)| lr 3.45e-08 | 2533.16 ms | 53.3% bf16 MFU | 206919 tok/s +step 19471/19560 | loss 3.384001 (+1.77z)| norm 0.2314 (+0.15z)| lr 3.37e-08 | 2534.08 ms | 53.3% bf16 MFU | 206917 tok/s +step 19472/19560 | loss 3.284009 (-0.71z)| norm 0.2186 (-0.48z)| lr 3.30e-08 | 2534.65 ms | 53.3% bf16 MFU | 206914 tok/s +step 19473/19560 | loss 3.318075 (+0.13z)| norm 0.2180 (-0.51z)| lr 3.22e-08 | 2530.40 ms | 53.4% bf16 MFU | 206928 tok/s +step 19474/19560 | loss 3.231994 (-1.96z)| norm 0.2401 (+0.59z)| lr 3.15e-08 | 2532.39 ms | 53.3% bf16 MFU | 206933 tok/s +step 19475/19560 | loss 3.361174 (+1.19z)| norm 0.2230 (-0.27z)| lr 3.08e-08 | 2534.44 ms | 53.3% bf16 MFU | 206930 tok/s +step 19476/19560 | loss 3.271872 (-1.00z)| norm 0.2214 (-0.35z)| lr 3.01e-08 | 2534.36 ms | 53.3% bf16 MFU | 206927 tok/s +step 19477/19560 | loss 3.332053 (+0.47z)| norm 0.2166 (-0.59z)| lr 2.94e-08 | 2530.27 ms | 53.4% bf16 MFU | 206941 tok/s +step 19478/19560 | loss 3.348749 (+0.89z)| norm 0.2827 (+2.66z)| lr 2.87e-08 | 2533.98 ms | 53.3% bf16 MFU | 206939 tok/s +step 19479/19560 | loss 3.334008 (+0.52z)| norm 0.2256 (-0.15z)| lr 2.80e-08 | 2532.83 ms | 53.3% bf16 MFU | 206942 tok/s +step 19480/19560 | loss 3.317616 (+0.11z)| norm 0.2243 (-0.21z)| lr 2.73e-08 | 2533.00 ms | 53.3% bf16 MFU | 206944 tok/s +step 19481/19560 | loss 3.300121 (-0.33z)| norm 0.2185 (-0.49z)| lr 2.66e-08 | 2532.37 ms | 53.3% bf16 MFU | 206949 tok/s +step 19482/19560 | loss 3.282100 (-0.77z)| norm 0.2226 (-0.29z)| lr 2.60e-08 | 2533.37 ms | 53.3% bf16 MFU | 206949 tok/s +step 19483/19560 | loss 3.261569 (-1.26z)| norm 0.2266 (-0.10z)| lr 2.53e-08 | 2532.41 ms | 53.3% bf16 MFU | 206953 tok/s +step 19484/19560 | loss 3.397295 (+2.04z)| norm 0.3217 (+4.21z)| lr 2.47e-08 | 2533.49 ms | 53.3% bf16 MFU | 206952 tok/s +step 19485/19560 | loss 3.356524 (+1.03z)| norm 0.2331 (+0.17z)| lr 2.40e-08 | 2533.41 ms | 53.3% bf16 MFU | 206952 tok/s +step 19486/19560 | loss 3.270199 (-1.06z)| norm 0.2177 (-0.53z)| lr 2.34e-08 | 2531.88 ms | 53.3% bf16 MFU | 206958 tok/s +step 19487/19560 | loss 3.368350 (+1.32z)| norm 0.2258 (-0.16z)| lr 2.28e-08 | 2531.92 ms | 53.3% bf16 MFU | 206964 tok/s +step 19488/19560 | loss 3.310455 (-0.09z)| norm 0.2240 (-0.24z)| lr 2.22e-08 | 2531.89 ms | 53.3% bf16 MFU | 206969 tok/s +step 19489/19560 | loss 3.331915 (+0.46z)| norm 0.2262 (-0.15z)| lr 2.16e-08 | 2532.74 ms | 53.3% bf16 MFU | 206971 tok/s +step 19490/19560 | loss 3.305019 (-0.22z)| norm 0.2250 (-0.20z)| lr 2.10e-08 | 2534.60 ms | 53.3% bf16 MFU | 206965 tok/s +step 19491/19560 | loss 3.354902 (+1.03z)| norm 0.2237 (-0.26z)| lr 2.04e-08 | 2532.33 ms | 53.3% bf16 MFU | 206969 tok/s +step 19492/19560 | loss 3.279612 (-0.84z)| norm 0.2210 (-0.38z)| lr 1.98e-08 | 2533.51 ms | 53.3% bf16 MFU | 206967 tok/s +step 19493/19560 | loss 3.308086 (-0.14z)| norm 0.2227 (-0.30z)| lr 1.92e-08 | 2534.94 ms | 53.3% bf16 MFU | 206960 tok/s +step 19494/19560 | loss 3.287042 (-0.67z)| norm 0.2228 (-0.30z)| lr 1.87e-08 | 2535.75 ms | 53.2% bf16 MFU | 206950 tok/s +step 19495/19560 | loss 3.361011 (+1.17z)| norm 0.2195 (-0.45z)| lr 1.81e-08 | 2531.91 ms | 53.3% bf16 MFU | 206956 tok/s +step 19496/19560 | loss 3.262949 (-1.27z)| norm 0.2211 (-0.37z)| lr 1.76e-08 | 2533.89 ms | 53.3% bf16 MFU | 206954 tok/s +step 19497/19560 | loss 3.341977 (+0.72z)| norm 0.2242 (-0.23z)| lr 1.70e-08 | 2535.06 ms | 53.3% bf16 MFU | 206947 tok/s +step 19498/19560 | loss 3.325642 (+0.29z)| norm 0.2187 (-0.48z)| lr 1.65e-08 | 2534.53 ms | 53.3% bf16 MFU | 206943 tok/s +step 19499/19560 | loss 3.339894 (+0.65z)| norm 0.2281 (-0.06z)| lr 1.60e-08 | 2534.61 ms | 53.3% bf16 MFU | 206938 tok/s +step 19500/19560 | loss 3.349694 (+0.91z)| norm 0.2230 (-0.29z)| lr 1.55e-08 | 2532.46 ms | 53.3% bf16 MFU | 206942 tok/s +val loss 3.285180 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3030/10042 = 0.301733 +step 19501/19560 | loss 3.310617 (-0.10z)| norm 0.2155 (-0.63z)| lr 1.50e-08 | 2532.18 ms | 53.3% bf16 MFU | 206948 tok/s +step 19502/19560 | loss 3.303820 (-0.27z)| norm 0.2230 (-0.28z)| lr 1.45e-08 | 2531.23 ms | 53.3% bf16 MFU | 206957 tok/s +step 19503/19560 | loss 3.248018 (-1.67z)| norm 0.2149 (-0.64z)| lr 1.40e-08 | 2534.27 ms | 53.3% bf16 MFU | 206953 tok/s +step 19504/19560 | loss 3.254200 (-1.49z)| norm 0.2238 (-0.24z)| lr 1.35e-08 | 2531.34 ms | 53.3% bf16 MFU | 206961 tok/s +step 19505/19560 | loss 3.312983 (-0.01z)| norm 0.2260 (-0.13z)| lr 1.31e-08 | 2532.21 ms | 53.3% bf16 MFU | 206966 tok/s +step 19506/19560 | loss 3.317055 (+0.09z)| norm 0.2241 (-0.21z)| lr 1.26e-08 | 2535.94 ms | 53.2% bf16 MFU | 206955 tok/s +step 19507/19560 | loss 3.270969 (-1.07z)| norm 0.2285 (-0.01z)| lr 1.21e-08 | 2533.26 ms | 53.3% bf16 MFU | 206955 tok/s +step 19508/19560 | loss 3.307769 (-0.13z)| norm 0.2317 (+0.13z)| lr 1.17e-08 | 2533.64 ms | 53.3% bf16 MFU | 206954 tok/s +step 19509/19560 | loss 3.337241 (+0.62z)| norm 0.2236 (-0.24z)| lr 1.12e-08 | 2534.08 ms | 53.3% bf16 MFU | 206951 tok/s +step 19510/19560 | loss 3.291303 (-0.55z)| norm 0.2205 (-0.38z)| lr 1.08e-08 | 2533.15 ms | 53.3% bf16 MFU | 206952 tok/s +step 19511/19560 | loss 3.413887 (+2.49z)| norm 0.2286 (-0.01z)| lr 1.04e-08 | 2532.54 ms | 53.3% bf16 MFU | 206955 tok/s +step 19512/19560 | loss 3.297271 (-0.40z)| norm 0.2399 (+0.51z)| lr 1.00e-08 | 2532.31 ms | 53.3% bf16 MFU | 206959 tok/s +step 19513/19560 | loss 3.368051 (+1.35z)| norm 0.2213 (-0.35z)| lr 9.58e-09 | 2531.42 ms | 53.3% bf16 MFU | 206967 tok/s +step 19514/19560 | loss 3.316531 (+0.07z)| norm 0.2253 (-0.17z)| lr 9.19e-09 | 2533.43 ms | 53.3% bf16 MFU | 206966 tok/s +step 19515/19560 | loss 3.335388 (+0.54z)| norm 0.2522 (+1.06z)| lr 8.82e-09 | 2533.57 ms | 53.3% bf16 MFU | 206965 tok/s +step 19516/19560 | loss 3.257920 (-1.37z)| norm 0.2257 (-0.16z)| lr 8.42e-09 | 2534.46 ms | 53.3% bf16 MFU | 206960 tok/s +step 19517/19560 | loss 3.251552 (-1.51z)| norm 0.2226 (-0.30z)| lr 8.06e-09 | 2534.38 ms | 53.3% bf16 MFU | 206955 tok/s +step 19518/19560 | loss 3.321980 (+0.24z)| norm 0.2223 (-0.32z)| lr 7.69e-09 | 2534.95 ms | 53.3% bf16 MFU | 206948 tok/s +step 19519/19560 | loss 3.330657 (+0.45z)| norm 0.2165 (-0.65z)| lr 7.35e-09 | 2533.25 ms | 53.3% bf16 MFU | 206949 tok/s +step 19520/19560 | loss 3.327482 (+0.36z)| norm 0.2232 (-0.26z)| lr 6.99e-09 | 2533.98 ms | 53.3% bf16 MFU | 206947 tok/s +step 19521/19560 | loss 3.314029 (+0.04z)| norm 0.2243 (-0.19z)| lr 6.65e-09 | 2531.22 ms | 53.3% bf16 MFU | 206956 tok/s +step 19522/19560 | loss 3.311019 (-0.03z)| norm 0.2219 (-0.33z)| lr 6.33e-09 | 2532.78 ms | 53.3% bf16 MFU | 206958 tok/s +step 19523/19560 | loss 3.283302 (-0.72z)| norm 0.2338 (+0.36z)| lr 6.01e-09 | 2533.50 ms | 53.3% bf16 MFU | 206957 tok/s +step 19524/19560 | loss 3.292767 (-0.48z)| norm 0.2226 (-0.29z)| lr 5.70e-09 | 2534.32 ms | 53.3% bf16 MFU | 206953 tok/s +step 19525/19560 | loss 3.258957 (-1.33z)| norm 0.2152 (-0.73z)| lr 5.40e-09 | 2534.95 ms | 53.3% bf16 MFU | 206947 tok/s +step 19526/19560 | loss 3.432324 (+2.93z)| norm 0.3140 (+4.57z)| lr 5.10e-09 | 2534.05 ms | 53.3% bf16 MFU | 206944 tok/s +step 19527/19560 | loss 3.369690 (+1.38z)| norm 0.2230 (-0.29z)| lr 4.81e-09 | 2534.10 ms | 53.3% bf16 MFU | 206942 tok/s +step 19528/19560 | loss 3.289921 (-0.55z)| norm 0.2252 (-0.16z)| lr 4.52e-09 | 2534.53 ms | 53.3% bf16 MFU | 206938 tok/s +step 19529/19560 | loss 3.369823 (+1.46z)| norm 0.2210 (-0.38z)| lr 4.26e-09 | 2535.46 ms | 53.3% bf16 MFU | 206930 tok/s +step 19530/19560 | loss 3.243058 (-1.73z)| norm 0.2352 (+0.37z)| lr 4.01e-09 | 2533.62 ms | 53.3% bf16 MFU | 206930 tok/s +step 19531/19560 | loss 3.252074 (-1.49z)| norm 0.2287 (+0.02z)| lr 3.74e-09 | 2533.91 ms | 53.3% bf16 MFU | 206929 tok/s +step 19532/19560 | loss 3.268495 (-1.06z)| norm 0.2670 (+2.01z)| lr 3.50e-09 | 2534.20 ms | 53.3% bf16 MFU | 206927 tok/s +step 19533/19560 | loss 3.330939 (+0.50z)| norm 0.2284 (-0.02z)| lr 3.25e-09 | 2533.01 ms | 53.3% bf16 MFU | 206929 tok/s +step 19534/19560 | loss 3.300328 (-0.27z)| norm 0.2300 (+0.06z)| lr 3.04e-09 | 2532.32 ms | 53.3% bf16 MFU | 206935 tok/s +step 19535/19560 | loss 3.248812 (-1.54z)| norm 0.2281 (-0.04z)| lr 2.81e-09 | 2533.02 ms | 53.3% bf16 MFU | 206937 tok/s +step 19536/19560 | loss 3.319240 (+0.22z)| norm 0.2195 (-0.49z)| lr 2.59e-09 | 2533.02 ms | 53.3% bf16 MFU | 206939 tok/s +step 19537/19560 | loss 3.295946 (-0.36z)| norm 0.2182 (-0.56z)| lr 2.40e-09 | 2532.31 ms | 53.3% bf16 MFU | 206944 tok/s +step 19538/19560 | loss 3.264894 (-1.18z)| norm 0.2189 (-0.51z)| lr 2.20e-09 | 2533.29 ms | 53.3% bf16 MFU | 206945 tok/s +step 19539/19560 | loss 3.322773 (+0.31z)| norm 0.2324 (+0.20z)| lr 2.02e-09 | 2533.05 ms | 53.3% bf16 MFU | 206947 tok/s +step 19540/19560 | loss 3.335407 (+0.63z)| norm 0.2218 (-0.36z)| lr 1.84e-09 | 2533.93 ms | 53.3% bf16 MFU | 206945 tok/s +step 19541/19560 | loss 3.364981 (+1.37z)| norm 0.2204 (-0.43z)| lr 1.66e-09 | 2534.87 ms | 53.3% bf16 MFU | 206939 tok/s +step 19542/19560 | loss 3.291565 (-0.50z)| norm 0.2179 (-0.56z)| lr 1.50e-09 | 2532.43 ms | 53.3% bf16 MFU | 206944 tok/s +step 19543/19560 | loss 3.258425 (-1.34z)| norm 0.2271 (-0.08z)| lr 1.34e-09 | 2534.25 ms | 53.3% bf16 MFU | 206941 tok/s +step 19544/19560 | loss 3.305693 (-0.14z)| norm 0.2142 (-0.76z)| lr 1.20e-09 | 2535.48 ms | 53.3% bf16 MFU | 206933 tok/s +step 19545/19560 | loss 3.281474 (-0.76z)| norm 0.2297 (+0.06z)| lr 1.07e-09 | 2534.47 ms | 53.3% bf16 MFU | 206929 tok/s +step 19546/19560 | loss 3.252035 (-1.48z)| norm 0.2236 (-0.26z)| lr 9.30e-10 | 2534.84 ms | 53.3% bf16 MFU | 206924 tok/s +step 19547/19560 | loss 3.320108 (+0.24z)| norm 0.2265 (-0.11z)| lr 8.23e-10 | 2533.74 ms | 53.3% bf16 MFU | 206924 tok/s +step 19548/19560 | loss 3.401761 (+2.25z)| norm 0.2244 (-0.22z)| lr 6.97e-10 | 2533.84 ms | 53.3% bf16 MFU | 206924 tok/s +step 19549/19560 | loss 3.366529 (+1.36z)| norm 0.2189 (-0.50z)| lr 6.08e-10 | 2534.62 ms | 53.3% bf16 MFU | 206920 tok/s +step 19550/19560 | loss 3.277678 (-0.85z)| norm 0.2220 (-0.34z)| lr 5.01e-10 | 2533.32 ms | 53.3% bf16 MFU | 206922 tok/s +step 19551/19560 | loss 3.281588 (-0.74z)| norm 0.2413 (+0.68z)| lr 4.11e-10 | 2533.29 ms | 53.3% bf16 MFU | 206924 tok/s +step 19552/19560 | loss 3.262738 (-1.19z)| norm 0.2288 (+0.01z)| lr 3.40e-10 | 2532.32 ms | 53.3% bf16 MFU | 206930 tok/s +step 19553/19560 | loss 3.268528 (-1.04z)| norm 0.2318 (+0.17z)| lr 2.68e-10 | 2532.27 ms | 53.3% bf16 MFU | 206935 tok/s +step 19554/19560 | loss 3.231813 (-1.91z)| norm 0.2582 (+1.54z)| lr 1.97e-10 | 2532.42 ms | 53.3% bf16 MFU | 206940 tok/s +step 19555/19560 | loss 3.284984 (-0.60z)| norm 0.2379 (+0.47z)| lr 1.43e-10 | 2532.72 ms | 53.3% bf16 MFU | 206943 tok/s +step 19556/19560 | loss 3.331341 (+0.55z)| norm 0.2328 (+0.21z)| lr 1.07e-10 | 2532.38 ms | 53.3% bf16 MFU | 206948 tok/s +step 19557/19560 | loss 3.314257 (+0.12z)| norm 0.2307 (+0.19z)| lr 7.15e-11 | 2532.15 ms | 53.3% bf16 MFU | 206953 tok/s +step 19558/19560 | loss 3.297Error: Token out of vocabulary at train_gpt2.cu:675 +Error details: + File: train_gpt2.cu + Line: 675 + Token: 1047150199 + Position: 0 + Vocab: 50257 +475 (-0.30z)| norm 0.2259 (-0.13z)| lr 3.58e-11 | 2532.00 ms | 53.3% bf16 MFU | 206959 tok/s +step 19559/19560 | loss 3.275387 (-0.84z)| norm 0.2303 (+0.15z)| lr 1.79e-11 | 2532.45 ms | 53.3% bf16 MFU | 206962 tok/s +step 19560/19560 | loss 3.252716 (-1.38z)| norm 0.2172 (-0.72z)| lr 0.00e+00 | 2532.94 ms | 53.3% bf16 MFU | 206963 tok/s +val loss 3.285180 +evaluating HellaSwag: 0/628 evaluating HellaSwag: 10/628 evaluating HellaSwag: 20/628 evaluating HellaSwag: 30/628 evaluating HellaSwag: 40/628 evaluating HellaSwag: 50/628 evaluating HellaSwag: 60/628 evaluating HellaSwag: 70/628 evaluating HellaSwag: 80/628 evaluating HellaSwag: 90/628 evaluating HellaSwag: 100/628 evaluating HellaSwag: 110/628 evaluating HellaSwag: 120/628 evaluating HellaSwag: 130/628 evaluating HellaSwag: 140/628 evaluating HellaSwag: 150/628 evaluating HellaSwag: 160/628 evaluating HellaSwag: 170/628 evaluating HellaSwag: 180/628 evaluating HellaSwag: 190/628 evaluating HellaSwag: 200/628 evaluating HellaSwag: 210/628 evaluating HellaSwag: 220/628 evaluating HellaSwag: 230/628 evaluating HellaSwag: 240/628 evaluating HellaSwag: 250/628 evaluating HellaSwag: 260/628 evaluating HellaSwag: 270/628 evaluating HellaSwag: 280/628 evaluating HellaSwag: 290/628 evaluating HellaSwag: 300/628 evaluating HellaSwag: 310/628 evaluating HellaSwag: 320/628 evaluating HellaSwag: 330/628 evaluating HellaSwag: 340/628 evaluating HellaSwag: 350/628 evaluating HellaSwag: 360/628 evaluating HellaSwag: 370/628 evaluating HellaSwag: 380/628 evaluating HellaSwag: 390/628 evaluating HellaSwag: 400/628 evaluating HellaSwag: 410/628 evaluating HellaSwag: 420/628 evaluating HellaSwag: 430/628 evaluating HellaSwag: 440/628 evaluating HellaSwag: 450/628 evaluating HellaSwag: 460/628 evaluating HellaSwag: 470/628 evaluating HellaSwag: 480/628 evaluating HellaSwag: 490/628 evaluating HellaSwag: 500/628 evaluating HellaSwag: 510/628 evaluating HellaSwag: 520/628 evaluating HellaSwag: 530/628 evaluating HellaSwag: 540/628 evaluating HellaSwag: 550/628 evaluating HellaSwag: 560/628 evaluating HellaSwag: 570/628 evaluating HellaSwag: 580/628 evaluating HellaSwag: 590/628 evaluating HellaSwag: 600/628 evaluating HellaSwag: 610/628 evaluating HellaSwag: 620/628 HellaSwag: 3022/10042 = 0.300936 +generating: +---