diff --git "a/1x_A10G_24GB/nohup.out" "b/1x_A10G_24GB/nohup.out" --- "a/1x_A10G_24GB/nohup.out" +++ "b/1x_A10G_24GB/nohup.out" @@ -11067,4 +11067,7489 @@ step 10906/19560 | loss 3.347153 (-0.63z)| norm 0.2522 (-0.18z)| lr 2.61e-04 | 8 step 10907/19560 | loss 3.446925 (+1.51z)| norm 0.2666 (-0.16z)| lr 2.61e-04 | 8449.85 ms | -100.0% bf16 MFU | 62048 tok/s step 10908/19560 | loss 3.413799 (+0.78z)| norm 0.2456 (-0.18z)| lr 2.61e-04 | 8448.29 ms | -100.0% bf16 MFU | 62049 tok/s step 10909/19560 | loss 3.384904 (+0.17z)| norm 0.2783 (-0.14z)| lr 2.61e-04 | 8447.14 ms | -100.0% bf16 MFU | 62050 tok/s -step 10910/19560 | loss 3.392893 (+0.34z)| norm 0.2512 (-0.18z)| lr 2.61e-04 \ No newline at end of file +step 10910/19560 | loss 3.392893 (+0.34z)| norm 0.2512 (-0.18z)| lr 2.61e-04 | 8444.13 ms | -100.0% bf16 MFU | 62052 tok/s +step 10911/19560 | loss 3.389444 (+0.27z)| norm 0.2660 (-0.16z)| lr 2.61e-04 | 8444.85 ms | -100.0% bf16 MFU | 62053 tok/s +step 10912/19560 | loss 3.416796 (+0.85z)| norm 0.2704 (-0.15z)| lr 2.61e-04 | 8446.39 ms | -100.0% bf16 MFU | 62054 tok/s +step 10913/19560 | loss 3.403760 (+0.59z)| norm 0.2726 (-0.15z)| lr 2.61e-04 | 8450.18 ms | -100.0% bf16 MFU | 62054 tok/s +step 10914/19560 | loss 3.332393 (-0.96z)| norm 0.2593 (-0.16z)| lr 2.61e-04 | 8445.85 ms | -100.0% bf16 MFU | 62055 tok/s +step 10915/19560 | loss 3.372683 (-0.08z)| norm 0.2807 (-0.14z)| lr 2.61e-04 | 8448.46 ms | -100.0% bf16 MFU | 62055 tok/s +step 10916/19560 | loss 3.343349 (-0.72z)| norm 0.2595 (-0.16z)| lr 2.61e-04 | 8446.78 ms | -100.0% bf16 MFU | 62056 tok/s +step 10917/19560 | loss 3.384692 (+0.17z)| norm 0.2688 (-0.15z)| lr 2.61e-04 | 8448.36 ms | -100.0% bf16 MFU | 62056 tok/s +step 10918/19560 | loss 3.383450 (+0.14z)| norm 0.2568 (-0.17z)| lr 2.61e-04 | 8448.90 ms | -100.0% bf16 MFU | 62056 tok/s +step 10919/19560 | loss 3.344144 (-0.73z)| norm 0.2669 (-0.15z)| lr 2.61e-04 | 8446.88 ms | -100.0% bf16 MFU | 62056 tok/s +step 10920/19560 | loss 3.360804 (-0.36z)| norm 0.2581 (-0.17z)| lr 2.61e-04 | 8450.75 ms | -100.0% bf16 MFU | 62056 tok/s +step 10921/19560 | loss 3.381530 (+0.09z)| norm 0.2703 (-0.15z)| lr 2.61e-04 | 8447.32 ms | -100.0% bf16 MFU | 62056 tok/s +step 10922/19560 | loss 3.333292 (-0.96z)| norm 0.2703 (-0.15z)| lr 2.61e-04 | 8448.70 ms | -100.0% bf16 MFU | 62056 tok/s +step 10923/19560 | loss 3.372064 (-0.11z)| norm 0.2615 (-0.16z)| lr 2.61e-04 | 8448.21 ms | -100.0% bf16 MFU | 62056 tok/s +step 10924/19560 | loss 3.333439 (-0.94z)| norm 0.2759 (-0.14z)| lr 2.60e-04 | 8447.16 ms | -100.0% bf16 MFU | 62057 tok/s +step 10925/19560 | loss 3.342129 (-0.75z)| norm 0.2590 (-0.16z)| lr 2.60e-04 | 8448.29 ms | -100.0% bf16 MFU | 62057 tok/s +step 10926/19560 | loss 3.420571 (+0.95z)| norm 0.2841 (-0.13z)| lr 2.60e-04 | 8445.00 ms | -100.0% bf16 MFU | 62058 tok/s +step 10927/19560 | loss 3.372353 (-0.11z)| norm 0.2783 (-0.14z)| lr 2.60e-04 | 8445.15 ms | -100.0% bf16 MFU | 62059 tok/s +step 10928/19560 | loss 3.349082 (-0.62z)| norm 0.2775 (-0.14z)| lr 2.60e-04 | 8445.39 ms | -100.0% bf16 MFU | 62060 tok/s +step 10929/19560 | loss 3.349356 (-0.62z)| norm 0.2690 (-0.15z)| lr 2.60e-04 | 8444.07 ms | -100.0% bf16 MFU | 62062 tok/s +step 10930/19560 | loss 3.426662 (+1.06z)| norm 0.2572 (-0.16z)| lr 2.60e-04 | 8447.28 ms | -100.0% bf16 MFU | 62062 tok/s +step 10931/19560 | loss 3.291591 (-1.87z)| norm 0.2713 (-0.15z)| lr 2.60e-04 | 8447.70 ms | -100.0% bf16 MFU | 62062 tok/s +step 10932/19560 | loss 3.322472 (-1.21z)| norm 0.2638 (-0.16z)| lr 2.60e-04 | 8451.01 ms | -100.0% bf16 MFU | 62061 tok/s +step 10933/19560 | loss 3.369003 (-0.18z)| norm 0.2729 (-0.14z)| lr 2.60e-04 | 8448.79 ms | -100.0% bf16 MFU | 62061 tok/s +step 10934/19560 | loss 3.329471 (-1.04z)| norm 0.2672 (-0.15z)| lr 2.60e-04 | 8448.89 ms | -100.0% bf16 MFU | 62060 tok/s +step 10935/19560 | loss 3.356557 (-0.45z)| norm 0.2779 (-0.14z)| lr 2.60e-04 | 8449.65 ms | -100.0% bf16 MFU | 62060 tok/s +step 10936/19560 | loss 3.339944 (-0.81z)| norm 0.7315 (+0.46z)| lr 2.60e-04 | 8443.73 ms | -100.0% bf16 MFU | 62061 tok/s +step 10937/19560 | loss 3.323101 (-1.17z)| norm 0.3189 (-0.09z)| lr 2.60e-04 | 8450.15 ms | -100.0% bf16 MFU | 62060 tok/s +step 10938/19560 | loss 3.349122 (-0.60z)| norm 0.3092 (-0.10z)| lr 2.60e-04 | 8446.59 ms | -100.0% bf16 MFU | 62061 tok/s +step 10939/19560 | loss 3.366832 (-0.21z)| norm 0.2913 (-0.12z)| lr 2.60e-04 | 8444.74 ms | -100.0% bf16 MFU | 62062 tok/s +step 10940/19560 | loss 3.454621 (+1.66z)| norm 0.2983 (-0.12z)| lr 2.60e-04 | 8447.23 ms | -100.0% bf16 MFU | 62062 tok/s +step 10941/19560 | loss 3.365339 (-0.25z)| norm 0.2851 (-0.13z)| lr 2.60e-04 | 8444.71 ms | -100.0% bf16 MFU | 62063 tok/s +step 10942/19560 | loss 3.332830 (-0.95z)| norm 0.2812 (-0.14z)| lr 2.60e-04 | 8444.63 ms | -100.0% bf16 MFU | 62065 tok/s +step 10943/19560 | loss 3.413586 (+0.78z)| norm 0.3006 (-0.11z)| lr 2.60e-04 | 8445.56 ms | -100.0% bf16 MFU | 62065 tok/s +step 10944/19560 | loss 3.443649 (+1.40z)| norm 0.2898 (-0.13z)| lr 2.59e-04 | 8449.16 ms | -100.0% bf16 MFU | 62065 tok/s +step 10945/19560 | loss 3.313935 (-1.36z)| norm 0.2825 (-0.14z)| lr 2.59e-04 | 8447.14 ms | -100.0% bf16 MFU | 62065 tok/s +step 10946/19560 | loss 3.347801 (-0.64z)| norm 0.2870 (-0.13z)| lr 2.59e-04 | 8445.23 ms | -100.0% bf16 MFU | 62066 tok/s +step 10947/19560 | loss 3.360909 (-0.35z)| norm 0.2843 (-0.14z)| lr 2.59e-04 | 8447.04 ms | -100.0% bf16 MFU | 62066 tok/s +step 10948/19560 | loss 3.335811 (-0.90z)| norm 0.2789 (-0.14z)| lr 2.59e-04 | 8449.85 ms | -100.0% bf16 MFU | 62065 tok/s +step 10949/19560 | loss 3.391962 (+0.35z)| norm 0.2917 (-0.12z)| lr 2.59e-04 | 8446.13 ms | -100.0% bf16 MFU | 62065 tok/s +step 10950/19560 | loss 3.393537 (+0.39z)| norm 1.3377 (+1.33z)| lr 2.59e-04 | 8448.29 ms | -100.0% bf16 MFU | 62065 tok/s +step 10951/19560 | loss 3.395903 (+0.44z)| norm 0.3411 (-0.03z)| lr 2.59e-04 | 8449.70 ms | -100.0% bf16 MFU | 62064 tok/s +step 10952/19560 | loss 3.362103 (-0.32z)| norm 0.2755 (-0.12z)| lr 2.59e-04 | 8446.35 ms | -100.0% bf16 MFU | 62064 tok/s +step 10953/19560 | loss 3.384855 (+0.20z)| norm 0.3048 (-0.08z)| lr 2.59e-04 | 8447.56 ms | -100.0% bf16 MFU | 62064 tok/s +step 10954/19560 | loss 3.347099 (-0.65z)| norm 0.2767 (-0.12z)| lr 2.59e-04 | 8443.43 ms | -100.0% bf16 MFU | 62066 tok/s +step 10955/19560 | loss 3.327277 (-1.08z)| norm 0.2800 (-0.12z)| lr 2.59e-04 | 8444.66 ms | -100.0% bf16 MFU | 62067 tok/s +step 10956/19560 | loss 3.362725 (-0.29z)| norm 0.2689 (-0.13z)| lr 2.59e-04 | 8446.46 ms | -100.0% bf16 MFU | 62067 tok/s +step 10957/19560 | loss 3.409898 (+0.76z)| norm 0.2851 (-0.11z)| lr 2.59e-04 | 8447.58 ms | -100.0% bf16 MFU | 62067 tok/s +step 10958/19560 | loss 3.420653 (+1.02z)| norm 0.2752 (-0.12z)| lr 2.59e-04 | 8448.29 ms | -100.0% bf16 MFU | 62067 tok/s +step 10959/19560 | loss 3.455212 (+1.77z)| norm 2.0084 (+2.20z)| lr 2.59e-04 | 8444.29 ms | -100.0% bf16 MFU | 62068 tok/s +step 10960/19560 | loss 3.367589 (-0.18z)| norm 0.3114 (-0.09z)| lr 2.59e-04 | 8443.81 ms | -100.0% bf16 MFU | 62069 tok/s +step 10961/19560 | loss 3.385567 (+0.22z)| norm 0.2981 (-0.11z)| lr 2.59e-04 | 8445.89 ms | -100.0% bf16 MFU | 62069 tok/s +step 10962/19560 | loss 3.375144 (-0.00z)| norm 0.2945 (-0.11z)| lr 2.59e-04 | 8445.21 ms | -100.0% bf16 MFU | 62070 tok/s +step 10963/19560 | loss 3.394404 (+0.44z)| norm 0.3092 (-0.09z)| lr 2.59e-04 | 8448.13 ms | -100.0% bf16 MFU | 62069 tok/s +step 10964/19560 | loss 3.385109 (+0.22z)| norm 0.2788 (-0.13z)| lr 2.59e-04 | 8445.27 ms | -100.0% bf16 MFU | 62070 tok/s +step 10965/19560 | loss 3.343039 (-0.74z)| norm 0.2977 (-0.11z)| lr 2.58e-04 | 8447.10 ms | -100.0% bf16 MFU | 62070 tok/s +step 10966/19560 | loss 3.389749 (+0.32z)| norm 0.3438 (-0.05z)| lr 2.58e-04 | 8446.43 ms | -100.0% bf16 MFU | 62070 tok/s +step 10967/19560 | loss 3.371950 (-0.08z)| norm 0.2579 (-0.16z)| lr 2.58e-04 | 8447.16 ms | -100.0% bf16 MFU | 62070 tok/s +step 10968/19560 | loss 3.331453 (-0.99z)| norm 0.3087 (-0.10z)| lr 2.58e-04 | 8442.68 ms | -100.0% bf16 MFU | 62071 tok/s +step 10969/19560 | loss 3.460320 (+1.93z)| norm 0.3159 (-0.09z)| lr 2.58e-04 | 8444.48 ms | -100.0% bf16 MFU | 62072 tok/s +step 10970/19560 | loss 3.420727 (+1.02z)| norm 0.2495 (-0.18z)| lr 2.58e-04 | 8445.11 ms | -100.0% bf16 MFU | 62072 tok/s +step 10971/19560 | loss 3.377016 (+0.03z)| norm 0.3071 (-0.10z)| lr 2.58e-04 | 8443.70 ms | -100.0% bf16 MFU | 62073 tok/s +step 10972/19560 | loss 3.420167 (+1.00z)| norm 0.2504 (-0.17z)| lr 2.58e-04 | 8445.23 ms | -100.0% bf16 MFU | 62074 tok/s +step 10973/19560 | loss 3.326344 (-1.11z)| norm 0.3165 (-0.09z)| lr 2.58e-04 | 8444.83 ms | -100.0% bf16 MFU | 62074 tok/s +step 10974/19560 | loss 3.436373 (+1.34z)| norm 0.3044 (-0.10z)| lr 2.58e-04 | 8444.04 ms | -100.0% bf16 MFU | 62075 tok/s +step 10975/19560 | loss 3.390194 (+0.30z)| norm 0.2797 (-0.14z)| lr 2.58e-04 | 8455.12 ms | -100.0% bf16 MFU | 62072 tok/s +step 10976/19560 | loss 3.364694 (-0.28z)| norm 0.3646 (-0.02z)| lr 2.58e-04 | 8475.44 ms | -100.0% bf16 MFU | 62061 tok/s +step 10977/19560 | loss 3.392020 (+0.34z)| norm 0.2979 (-0.11z)| lr 2.58e-04 | 8478.55 ms | -100.0% bf16 MFU | 62050 tok/s +step 10978/19560 | loss 3.415349 (+0.90z)| norm 0.3056 (-0.10z)| lr 2.58e-04 | 8474.72 ms | -100.0% bf16 MFU | 62041 tok/s +step 10979/19560 | loss 3.392095 (+0.38z)| norm 0.3078 (-0.10z)| lr 2.58e-04 | 8475.14 ms | -100.0% bf16 MFU | 62032 tok/s +step 10980/19560 | loss 3.336923 (-0.90z)| norm 0.2685 (-0.15z)| lr 2.58e-04 | 8475.22 ms | -100.0% bf16 MFU | 62023 tok/s +step 10981/19560 | loss 3.333310 (-0.98z)| norm 0.2898 (-0.12z)| lr 2.58e-04 | 8473.25 ms | -100.0% bf16 MFU | 62016 tok/s +step 10982/19560 | loss 3.409250 (+0.78z)| norm 0.2750 (-0.14z)| lr 2.58e-04 | 8471.76 ms | -100.0% bf16 MFU | 62009 tok/s +step 10983/19560 | loss 3.424180 (+1.14z)| norm 0.2813 (-0.13z)| lr 2.58e-04 | 8471.92 ms | -100.0% bf16 MFU | 62003 tok/s +step 10984/19560 | loss 3.389310 (+0.32z)| norm 0.2807 (-0.13z)| lr 2.58e-04 | 8476.76 ms | -100.0% bf16 MFU | 61995 tok/s +step 10985/19560 | loss 3.414545 (+0.90z)| norm 0.2863 (-0.12z)| lr 2.57e-04 | 8476.33 ms | -100.0% bf16 MFU | 61988 tok/s +step 10986/19560 | loss 3.382368 (+0.15z)| norm 0.2743 (-0.14z)| lr 2.57e-04 | 8468.27 ms | -100.0% bf16 MFU | 61985 tok/s +step 10987/19560 | loss 3.380691 (+0.10z)| norm 0.2736 (-0.14z)| lr 2.57e-04 | 8468.86 ms | -100.0% bf16 MFU | 61981 tok/s +step 10988/19560 | loss 3.422610 (+1.07z)| norm 0.2853 (-0.13z)| lr 2.57e-04 | 8472.21 ms | -100.0% bf16 MFU | 61976 tok/s +step 10989/19560 | loss 3.439175 (+1.44z)| norm 0.2546 (-0.17z)| lr 2.57e-04 | 8471.71 ms | -100.0% bf16 MFU | 61971 tok/s +step 10990/19560 | loss 3.413462 (+0.82z)| norm 0.2812 (-0.13z)| lr 2.57e-04 | 8468.80 ms | -100.0% bf16 MFU | 61968 tok/s +step 10991/19560 | loss 3.383605 (+0.12z)| norm 0.2513 (-0.17z)| lr 2.57e-04 | 8471.03 ms | -100.0% bf16 MFU | 61964 tok/s +step 10992/19560 | loss 3.487161 (+2.46z)| norm 0.2699 (-0.15z)| lr 2.57e-04 | 8467.42 ms | -100.0% bf16 MFU | 61962 tok/s +step 10993/19560 | loss 3.408371 (+0.65z)| norm 0.2612 (-0.16z)| lr 2.57e-04 | 8471.62 ms | -100.0% bf16 MFU | 61958 tok/s +step 10994/19560 | loss 3.532958 (+3.31z)| norm 0.2595 (-0.16z)| lr 2.57e-04 | 8471.46 ms | -100.0% bf16 MFU | 61955 tok/s +step 10995/19560 | loss 3.470892 (+1.91z)| norm 0.2768 (-0.14z)| lr 2.57e-04 | 8464.95 ms | -100.0% bf16 MFU | 61954 tok/s +step 10996/19560 | loss 3.396790 (+0.41z)| norm 0.2616 (-0.27z)| lr 2.57e-04 | 8465.21 ms | -100.0% bf16 MFU | 61953 tok/s +step 10997/19560 | loss 3.381262 (+0.03z)| norm 0.2735 (-0.21z)| lr 2.57e-04 | 8470.11 ms | -100.0% bf16 MFU | 61950 tok/s +step 10998/19560 | loss 3.454427 (+1.82z)| norm 0.2995 (-0.07z)| lr 2.57e-04 | 8469.40 ms | -100.0% bf16 MFU | 61948 tok/s +step 10999/19560 | loss 3.404727 (+0.59z)| norm 0.3146 (+0.01z)| lr 2.57e-04 | 8470.21 ms | -100.0% bf16 MFU | 61945 tok/s +step 11000/19560 | loss 3.365928 (-0.36z)| norm 0.2575 (-0.29z)| lr 2.57e-04 | 8466.71 ms | -100.0% bf16 MFU | 61944 tok/s +val loss 3.366960 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2915/10042 = 0.290281 +step 11001/19560 | loss 3.419828 (+0.95z)| norm 0.3231 (+0.05z)| lr 2.57e-04 | 8462.38 ms | -100.0% bf16 MFU | 61945 tok/s +step 11002/19560 | loss 3.357410 (-0.60z)| norm 0.2630 (-0.26z)| lr 2.57e-04 | 8470.39 ms | -100.0% bf16 MFU | 61942 tok/s +step 11003/19560 | loss 3.373596 (-0.20z)| norm 0.2970 (-0.08z)| lr 2.57e-04 | 8468.80 ms | -100.0% bf16 MFU | 61941 tok/s +step 11004/19560 | loss 3.400552 (+0.46z)| norm 0.2552 (-0.30z)| lr 2.57e-04 | 8467.43 ms | -100.0% bf16 MFU | 61940 tok/s +step 11005/19560 | loss 3.342828 (-1.00z)| norm 0.2723 (-0.20z)| lr 2.56e-04 | 8460.47 ms | -100.0% bf16 MFU | 61941 tok/s +step 11006/19560 | loss 3.418175 (+0.89z)| norm 0.2745 (-0.19z)| lr 2.56e-04 | 8463.81 ms | -100.0% bf16 MFU | 61941 tok/s +step 11007/19560 | loss 3.406971 (+0.61z)| norm 0.2673 (-0.23z)| lr 2.56e-04 | 8462.53 ms | -100.0% bf16 MFU | 61942 tok/s +step 11008/19560 | loss 3.425563 (+1.06z)| norm 0.2841 (-0.14z)| lr 2.56e-04 | 8466.30 ms | -100.0% bf16 MFU | 61941 tok/s +step 11009/19560 | loss 3.381885 (-0.04z)| norm 0.2674 (-0.22z)| lr 2.56e-04 | 8467.09 ms | -100.0% bf16 MFU | 61940 tok/s +step 11010/19560 | loss 3.379055 (-0.10z)| norm 0.2618 (-0.25z)| lr 2.56e-04 | 8463.80 ms | -100.0% bf16 MFU | 61940 tok/s +step 11011/19560 | loss 3.534711 (+3.62z)| norm 0.2811 (-0.15z)| lr 2.56e-04 | 8463.44 ms | -100.0% bf16 MFU | 61941 tok/s +step 11012/19560 | loss 3.373363 (-0.26z)| norm 0.2746 (-0.19z)| lr 2.56e-04 | 8467.28 ms | -100.0% bf16 MFU | 61940 tok/s +step 11013/19560 | loss 3.345644 (-0.93z)| norm 0.2802 (-0.16z)| lr 2.56e-04 | 8468.73 ms | -100.0% bf16 MFU | 61938 tok/s +step 11014/19560 | loss 3.411795 (+0.66z)| norm 0.2856 (-0.13z)| lr 2.56e-04 | 8461.75 ms | -100.0% bf16 MFU | 61939 tok/s +step 11015/19560 | loss 3.432941 (+1.15z)| norm 0.2595 (-0.26z)| lr 2.56e-04 | 8463.07 ms | -100.0% bf16 MFU | 61940 tok/s +step 11016/19560 | loss 3.375919 (-0.23z)| norm 0.2734 (-0.19z)| lr 2.56e-04 | 8465.34 ms | -100.0% bf16 MFU | 61939 tok/s +step 11017/19560 | loss 3.405458 (+0.48z)| norm 0.2582 (-0.25z)| lr 2.56e-04 | 8464.05 ms | -100.0% bf16 MFU | 61940 tok/s +step 11018/19560 | loss 3.376729 (-0.21z)| norm 0.2650 (-0.22z)| lr 2.56e-04 | 8463.04 ms | -100.0% bf16 MFU | 61940 tok/s +step 11019/19560 | loss 3.348303 (-0.89z)| norm 0.2699 (-0.19z)| lr 2.56e-04 | 8460.22 ms | -100.0% bf16 MFU | 61942 tok/s +step 11020/19560 | loss 3.386953 (+0.04z)| norm 0.2641 (-0.22z)| lr 2.56e-04 | 8460.51 ms | -100.0% bf16 MFU | 61943 tok/s +step 11021/19560 | loss 3.392809 (+0.19z)| norm 0.2659 (-0.21z)| lr 2.56e-04 | 8460.26 ms | -100.0% bf16 MFU | 61944 tok/s +step 11022/19560 | loss 3.435232 (+1.19z)| norm 0.2580 (-0.25z)| lr 2.56e-04 | 8460.11 ms | -100.0% bf16 MFU | 61946 tok/s +step 11023/19560 | loss 3.473474 (+2.06z)| norm 0.2851 (-0.10z)| lr 2.56e-04 | 8458.11 ms | -100.0% bf16 MFU | 61948 tok/s +step 11024/19560 | loss 3.377748 (-0.21z)| norm 0.2618 (-0.23z)| lr 2.56e-04 | 8461.77 ms | -100.0% bf16 MFU | 61948 tok/s +step 11025/19560 | loss 3.433583 (+1.10z)| norm 0.2961 (-0.05z)| lr 2.55e-04 | 8457.44 ms | -100.0% bf16 MFU | 61951 tok/s +step 11026/19560 | loss 3.405811 (+0.46z)| norm 0.2910 (-0.07z)| lr 2.55e-04 | 8456.76 ms | -100.0% bf16 MFU | 61953 tok/s +step 11027/19560 | loss 3.353700 (-0.81z)| norm 0.3060 (+0.01z)| lr 2.55e-04 | 8459.43 ms | -100.0% bf16 MFU | 61954 tok/s +step 11028/19560 | loss 3.390447 (+0.08z)| norm 0.2537 (-0.28z)| lr 2.55e-04 | 8464.64 ms | -100.0% bf16 MFU | 61953 tok/s +step 11029/19560 | loss 3.461010 (+1.76z)| norm 0.2844 (-0.11z)| lr 2.55e-04 | 8460.86 ms | -100.0% bf16 MFU | 61954 tok/s +step 11030/19560 | loss 3.447353 (+1.41z)| norm 0.2504 (-0.30z)| lr 2.55e-04 | 8451.58 ms | -100.0% bf16 MFU | 61958 tok/s +step 11031/19560 | loss 3.444060 (+1.32z)| norm 0.3104 (+0.03z)| lr 2.55e-04 | 8461.25 ms | -100.0% bf16 MFU | 61958 tok/s +step 11032/19560 | loss 3.398512 (+0.23z)| norm 0.2557 (-0.27z)| lr 2.55e-04 | 8459.40 ms | -100.0% bf16 MFU | 61959 tok/s +step 11033/19560 | loss 3.374357 (-0.33z)| norm 0.2977 (-0.04z)| lr 2.55e-04 | 8460.82 ms | -100.0% bf16 MFU | 61960 tok/s +step 11034/19560 | loss 3.468516 (+1.85z)| norm 0.2681 (-0.20z)| lr 2.55e-04 | 8463.04 ms | -100.0% bf16 MFU | 61959 tok/s +step 11035/19560 | loss 3.460526 (+1.66z)| norm 0.2850 (-0.11z)| lr 2.55e-04 | 8456.74 ms | -100.0% bf16 MFU | 61961 tok/s +step 11036/19560 | loss 3.416363 (+0.63z)| norm 0.2776 (-0.15z)| lr 2.55e-04 | 8456.62 ms | -100.0% bf16 MFU | 61963 tok/s +step 11037/19560 | loss 3.419850 (+0.70z)| norm 0.2593 (-0.25z)| lr 2.55e-04 | 8454.87 ms | -100.0% bf16 MFU | 61965 tok/s +step 11038/19560 | loss 3.457601 (+1.55z)| norm 0.2747 (-0.17z)| lr 2.55e-04 | 8461.02 ms | -100.0% bf16 MFU | 61965 tok/s +step 11039/19560 | loss 3.412545 (+0.51z)| norm 0.2750 (-0.17z)| lr 2.55e-04 | 8458.32 ms | -100.0% bf16 MFU | 61966 tok/s +step 11040/19560 | loss 3.442542 (+1.19z)| norm 0.2608 (-0.25z)| lr 2.55e-04 | 8462.85 ms | -100.0% bf16 MFU | 61965 tok/s +step 11041/19560 | loss 3.416182 (+0.58z)| norm 0.2766 (-0.16z)| lr 2.55e-04 | 8455.32 ms | -100.0% bf16 MFU | 61967 tok/s +step 11042/19560 | loss 3.414891 (+0.54z)| norm 0.2688 (-0.20z)| lr 2.55e-04 | 8454.41 ms | -100.0% bf16 MFU | 61970 tok/s +step 11043/19560 | loss 3.355820 (-0.82z)| norm 0.2592 (-0.25z)| lr 2.55e-04 | 8457.71 ms | -100.0% bf16 MFU | 61971 tok/s +step 11044/19560 | loss 3.365370 (-0.60z)| norm 0.2676 (-0.21z)| lr 2.55e-04 | 8458.64 ms | -100.0% bf16 MFU | 61971 tok/s +step 11045/19560 | loss 3.449243 (+1.31z)| norm 0.2698 (-0.20z)| lr 2.55e-04 | 8461.07 ms | -100.0% bf16 MFU | 61971 tok/s +step 11046/19560 | loss 3.374772 (-0.39z)| norm 0.2804 (-0.14z)| lr 2.54e-04 | 8453.08 ms | -100.0% bf16 MFU | 61974 tok/s +step 11047/19560 | loss 3.348493 (-1.00z)| norm 0.2684 (-0.21z)| lr 2.54e-04 | 8455.77 ms | -100.0% bf16 MFU | 61975 tok/s +step 11048/19560 | loss 3.362689 (-0.67z)| norm 0.2636 (-0.23z)| lr 2.54e-04 | 8457.77 ms | -100.0% bf16 MFU | 61976 tok/s +step 11049/19560 | loss 3.405727 (+0.31z)| norm 0.2744 (-0.17z)| lr 2.54e-04 | 8461.99 ms | -100.0% bf16 MFU | 61975 tok/s +step 11050/19560 | loss 3.365577 (-0.62z)| norm 0.3005 (-0.03z)| lr 2.54e-04 | 8454.36 ms | -100.0% bf16 MFU | 61977 tok/s +step 11051/19560 | loss 3.419586 (+0.62z)| norm 0.2660 (-0.22z)| lr 2.54e-04 | 8455.75 ms | -100.0% bf16 MFU | 61978 tok/s +step 11052/19560 | loss 3.480612 (+1.98z)| norm 0.3059 (-0.00z)| lr 2.54e-04 | 8455.83 ms | -100.0% bf16 MFU | 61979 tok/s +step 11053/19560 | loss 3.433582 (+0.89z)| norm 0.2939 (-0.07z)| lr 2.54e-04 | 8456.14 ms | -100.0% bf16 MFU | 61981 tok/s +step 11054/19560 | loss 3.468164 (+1.66z)| norm 0.2959 (-0.06z)| lr 2.54e-04 | 8456.66 ms | -100.0% bf16 MFU | 61981 tok/s +step 11055/19560 | loss 3.403999 (+0.20z)| norm 0.3160 (+0.05z)| lr 2.54e-04 | 8457.12 ms | -100.0% bf16 MFU | 61982 tok/s +step 11056/19560 | loss 3.390912 (-0.11z)| norm 0.3042 (-0.02z)| lr 2.54e-04 | 8462.63 ms | -100.0% bf16 MFU | 61981 tok/s +step 11057/19560 | loss 3.416123 (+0.46z)| norm 0.2984 (-0.05z)| lr 2.54e-04 | 8452.78 ms | -100.0% bf16 MFU | 61983 tok/s +step 11058/19560 | loss 3.464085 (+1.54z)| norm 0.2694 (-0.21z)| lr 2.54e-04 | 8456.23 ms | -100.0% bf16 MFU | 61984 tok/s +step 11059/19560 | loss 3.394707 (-0.06z)| norm 0.2823 (-0.14z)| lr 2.54e-04 | 8451.92 ms | -100.0% bf16 MFU | 61986 tok/s +step 11060/19560 | loss 3.436208 (+0.89z)| norm 0.2815 (-0.14z)| lr 2.54e-04 | 8452.25 ms | -100.0% bf16 MFU | 61988 tok/s +step 11061/19560 | loss 3.379805 (-0.43z)| norm 0.2756 (-0.18z)| lr 2.54e-04 | 8452.33 ms | -100.0% bf16 MFU | 61990 tok/s +step 11062/19560 | loss 3.386475 (-0.29z)| norm 0.3070 (-0.01z)| lr 2.54e-04 | 8457.48 ms | -100.0% bf16 MFU | 61990 tok/s +step 11063/19560 | loss 3.433254 (+0.81z)| norm 0.2869 (-0.12z)| lr 2.54e-04 | 8459.34 ms | -100.0% bf16 MFU | 61990 tok/s +step 11064/19560 | loss 3.386789 (-0.31z)| norm 0.3033 (-0.01z)| lr 2.54e-04 | 8455.20 ms | -100.0% bf16 MFU | 61991 tok/s +step 11065/19560 | loss 3.387016 (-0.32z)| norm 0.2922 (-0.07z)| lr 2.54e-04 | 8453.73 ms | -100.0% bf16 MFU | 61992 tok/s +step 11066/19560 | loss 3.420065 (+0.47z)| norm 0.2954 (-0.05z)| lr 2.53e-04 | 8453.90 ms | -100.0% bf16 MFU | 61993 tok/s +step 11067/19560 | loss 3.446273 (+1.09z)| norm 0.2962 (-0.05z)| lr 2.53e-04 | 8454.66 ms | -100.0% bf16 MFU | 61994 tok/s +step 11068/19560 | loss 3.339897 (-1.47z)| norm 0.2709 (-0.19z)| lr 2.53e-04 | 8454.32 ms | -100.0% bf16 MFU | 61995 tok/s +step 11069/19560 | loss 3.405287 (+0.11z)| norm 0.2826 (-0.12z)| lr 2.53e-04 | 8457.92 ms | -100.0% bf16 MFU | 61995 tok/s +step 11070/19560 | loss 3.429099 (+0.68z)| norm 0.2752 (-0.16z)| lr 2.53e-04 | 8456.54 ms | -100.0% bf16 MFU | 61995 tok/s +step 11071/19560 | loss 3.393007 (-0.20z)| norm 0.2893 (-0.08z)| lr 2.53e-04 | 8456.64 ms | -100.0% bf16 MFU | 61995 tok/s +step 11072/19560 | loss 3.391718 (-0.23z)| norm 0.2647 (-0.22z)| lr 2.53e-04 | 8452.13 ms | -100.0% bf16 MFU | 61997 tok/s +step 11073/19560 | loss 3.401007 (-0.01z)| norm 0.3073 (+0.02z)| lr 2.53e-04 | 8453.99 ms | -100.0% bf16 MFU | 61998 tok/s +step 11074/19560 | loss 3.406994 (+0.13z)| norm 0.2643 (-0.22z)| lr 2.53e-04 | 8457.48 ms | -100.0% bf16 MFU | 61997 tok/s +step 11075/19560 | loss 3.409167 (+0.17z)| norm 0.3082 (+0.02z)| lr 2.53e-04 | 8446.66 ms | -100.0% bf16 MFU | 62001 tok/s +step 11076/19560 | loss 3.390199 (-0.32z)| norm 0.2572 (-0.26z)| lr 2.53e-04 | 8455.36 ms | -100.0% bf16 MFU | 62001 tok/s +step 11077/19560 | loss 3.455691 (+1.34z)| norm 0.3076 (+0.02z)| lr 2.53e-04 | 8451.24 ms | -100.0% bf16 MFU | 62003 tok/s +step 11078/19560 | loss 3.393833 (-0.24z)| norm 0.2795 (-0.11z)| lr 2.53e-04 | 8456.73 ms | -100.0% bf16 MFU | 62003 tok/s +step 11079/19560 | loss 3.433742 (+0.77z)| norm 0.2624 (-0.22z)| lr 2.53e-04 | 8454.93 ms | -100.0% bf16 MFU | 62003 tok/s +step 11080/19560 | loss 3.412479 (+0.22z)| norm 0.2928 (-0.02z)| lr 2.53e-04 | 8456.04 ms | -100.0% bf16 MFU | 62003 tok/s +step 11081/19560 | loss 3.358043 (-1.16z)| norm 0.2493 (-0.30z)| lr 2.53e-04 | 8455.03 ms | -100.0% bf16 MFU | 62003 tok/s +step 11082/19560 | loss 3.448842 (+1.13z)| norm 0.3079 (+0.08z)| lr 2.53e-04 | 8456.13 ms | -100.0% bf16 MFU | 62003 tok/s +step 11083/19560 | loss 3.396107 (-0.23z)| norm 0.2642 (-0.20z)| lr 2.53e-04 | 8453.33 ms | -100.0% bf16 MFU | 62004 tok/s +step 11084/19560 | loss 3.489421 (+2.13z)| norm 0.2798 (-0.10z)| lr 2.53e-04 | 8453.82 ms | -100.0% bf16 MFU | 62005 tok/s +step 11085/19560 | loss 3.377009 (-0.74z)| norm 0.2826 (-0.08z)| lr 2.53e-04 | 8456.75 ms | -100.0% bf16 MFU | 62004 tok/s +step 11086/19560 | loss 3.407964 (+0.06z)| norm 0.2677 (-0.18z)| lr 2.52e-04 | 8451.41 ms | -100.0% bf16 MFU | 62006 tok/s +step 11087/19560 | loss 3.383866 (-0.55z)| norm 0.2679 (-0.68z)| lr 2.52e-04 | 8453.42 ms | -100.0% bf16 MFU | 62007 tok/s +step 11088/19560 | loss 3.424895 (+0.50z)| norm 0.2527 (-1.40z)| lr 2.52e-04 | 8454.08 ms | -100.0% bf16 MFU | 62007 tok/s +step 11089/19560 | loss 3.421391 (+0.40z)| norm 0.2759 (-0.25z)| lr 2.52e-04 | 8453.35 ms | -100.0% bf16 MFU | 62008 tok/s +step 11090/19560 | loss 3.394309 (-0.30z)| norm 0.2691 (-0.58z)| lr 2.52e-04 | 8453.58 ms | -100.0% bf16 MFU | 62008 tok/s +step 11091/19560 | loss 3.397933 (-0.21z)| norm 0.2735 (-0.35z)| lr 2.52e-04 | 8450.91 ms | -100.0% bf16 MFU | 62010 tok/s +step 11092/19560 | loss 3.429773 (+0.60z)| norm 0.2802 (-0.02z)| lr 2.52e-04 | 8455.35 ms | -100.0% bf16 MFU | 62010 tok/s +step 11093/19560 | loss 3.418175 (+0.29z)| norm 0.2536 (-1.32z)| lr 2.52e-04 | 8451.95 ms | -100.0% bf16 MFU | 62011 tok/s +step 11094/19560 | loss 3.414100 (+0.18z)| norm 0.2512 (-1.44z)| lr 2.52e-04 | 8454.52 ms | -100.0% bf16 MFU | 62011 tok/s +step 11095/19560 | loss 3.388552 (-0.49z)| norm 0.2753 (-0.22z)| lr 2.52e-04 | 8450.68 ms | -100.0% bf16 MFU | 62013 tok/s +step 11096/19560 | loss 3.361241 (-1.22z)| norm 0.2501 (-1.49z)| lr 2.52e-04 | 8450.72 ms | -100.0% bf16 MFU | 62014 tok/s +step 11097/19560 | loss 3.462701 (+1.45z)| norm 0.2515 (-1.40z)| lr 2.52e-04 | 8454.35 ms | -100.0% bf16 MFU | 62014 tok/s +step 11098/19560 | loss 3.438653 (+0.81z)| norm 0.2694 (-0.49z)| lr 2.52e-04 | 8453.25 ms | -100.0% bf16 MFU | 62014 tok/s +step 11099/19560 | loss 3.514161 (+2.70z)| norm 0.2697 (-0.46z)| lr 2.52e-04 | 8450.42 ms | -100.0% bf16 MFU | 62016 tok/s +step 11100/19560 | loss 3.364910 (-1.11z)| norm 0.3084 (+1.54z)| lr 2.52e-04 | 8453.23 ms | -100.0% bf16 MFU | 62016 tok/s +step 11101/19560 | loss 3.411125 (+0.05z)| norm 0.2801 (+0.08z)| lr 2.52e-04 | 8437.57 ms | -100.0% bf16 MFU | 62022 tok/s +step 11102/19560 | loss 3.385455 (-0.60z)| norm 0.2984 (+1.05z)| lr 2.52e-04 | 8438.12 ms | -100.0% bf16 MFU | 62028 tok/s +step 11103/19560 | loss 3.407169 (-0.04z)| norm 0.2774 (-0.06z)| lr 2.52e-04 | 8437.05 ms | -100.0% bf16 MFU | 62033 tok/s +step 11104/19560 | loss 3.357349 (-1.33z)| norm 0.2596 (-1.06z)| lr 2.52e-04 | 8438.82 ms | -100.0% bf16 MFU | 62038 tok/s +step 11105/19560 | loss 3.374701 (-0.88z)| norm 0.2698 (-0.46z)| lr 2.52e-04 | 8436.71 ms | -100.0% bf16 MFU | 62043 tok/s +step 11106/19560 | loss 3.458171 (+1.27z)| norm 0.2833 (+0.35z)| lr 2.51e-04 | 8433.92 ms | -100.0% bf16 MFU | 62049 tok/s +step 11107/19560 | loss 3.383862 (-0.64z)| norm 0.2853 (+0.48z)| lr 2.51e-04 | 8435.08 ms | -100.0% bf16 MFU | 62055 tok/s +step 11108/19560 | loss 3.408965 (-0.01z)| norm 0.2748 (-0.15z)| lr 2.51e-04 | 8434.65 ms | -100.0% bf16 MFU | 62060 tok/s +step 11109/19560 | loss 3.376488 (-0.88z)| norm 0.2840 (+0.41z)| lr 2.51e-04 | 8435.96 ms | -100.0% bf16 MFU | 62064 tok/s +step 11110/19560 | loss 3.436520 (+0.70z)| norm 0.2967 (+1.15z)| lr 2.51e-04 | 8436.56 ms | -100.0% bf16 MFU | 62068 tok/s +step 11111/19560 | loss 3.451127 (+1.07z)| norm 0.2938 (+0.97z)| lr 2.51e-04 | 8436.49 ms | -100.0% bf16 MFU | 62072 tok/s +step 11112/19560 | loss 3.411073 (+0.02z)| norm 0.2650 (-0.73z)| lr 2.51e-04 | 8434.95 ms | -100.0% bf16 MFU | 62076 tok/s +step 11113/19560 | loss 3.426307 (+0.42z)| norm 0.2883 (+0.65z)| lr 2.51e-04 | 8437.57 ms | -100.0% bf16 MFU | 62080 tok/s +step 11114/19560 | loss 3.407824 (-0.07z)| norm 0.2962 (+1.10z)| lr 2.51e-04 | 8435.39 ms | -100.0% bf16 MFU | 62083 tok/s +step 11115/19560 | loss 3.404519 (-0.17z)| norm 0.2788 (+0.07z)| lr 2.51e-04 | 8439.86 ms | -100.0% bf16 MFU | 62085 tok/s +step 11116/19560 | loss 3.426122 (+0.40z)| norm 0.2794 (+0.11z)| lr 2.51e-04 | 8436.44 ms | -100.0% bf16 MFU | 62088 tok/s +step 11117/19560 | loss 3.389402 (-0.56z)| norm 0.2772 (-0.03z)| lr 2.51e-04 | 8435.66 ms | -100.0% bf16 MFU | 62091 tok/s +step 11118/19560 | loss 3.459298 (+1.27z)| norm 0.2698 (-0.47z)| lr 2.51e-04 | 8435.60 ms | -100.0% bf16 MFU | 62094 tok/s +step 11119/19560 | loss 3.425850 (+0.39z)| norm 0.2708 (-0.42z)| lr 2.51e-04 | 8435.69 ms | -100.0% bf16 MFU | 62097 tok/s +step 11120/19560 | loss 3.410235 (-0.01z)| norm 0.2862 (+0.49z)| lr 2.51e-04 | 8434.69 ms | -100.0% bf16 MFU | 62100 tok/s +step 11121/19560 | loss 3.432190 (+0.57z)| norm 0.2680 (-0.60z)| lr 2.51e-04 | 8435.30 ms | -100.0% bf16 MFU | 62103 tok/s +step 11122/19560 | loss 3.375995 (-0.93z)| norm 0.2799 (+0.11z)| lr 2.51e-04 | 8437.05 ms | -100.0% bf16 MFU | 62105 tok/s +step 11123/19560 | loss 3.399353 (-0.27z)| norm 0.2752 (-0.18z)| lr 2.51e-04 | 8442.45 ms | -100.0% bf16 MFU | 62105 tok/s +step 11124/19560 | loss 3.537583 (+3.41z)| norm 0.2784 (+0.01z)| lr 2.51e-04 | 8439.73 ms | -100.0% bf16 MFU | 62105 tok/s +step 11125/19560 | loss 3.420074 (+0.26z)| norm 0.2564 (-1.31z)| lr 2.51e-04 | 8441.78 ms | -100.0% bf16 MFU | 62106 tok/s +step 11126/19560 | loss 3.384035 (-0.69z)| norm 0.2742 (-0.23z)| lr 2.51e-04 | 8439.15 ms | -100.0% bf16 MFU | 62107 tok/s +step 11127/19560 | loss 3.395347 (-0.39z)| norm 0.2493 (-1.72z)| lr 2.50e-04 | 8436.09 ms | -100.0% bf16 MFU | 62109 tok/s +step 11128/19560 | loss 3.422360 (+0.33z)| norm 0.2853 (+0.47z)| lr 2.50e-04 | 8439.13 ms | -100.0% bf16 MFU | 62109 tok/s +step 11129/19560 | loss 3.392897 (-0.46z)| norm 0.2504 (-1.67z)| lr 2.50e-04 | 8441.55 ms | -100.0% bf16 MFU | 62109 tok/s +step 11130/19560 | loss 3.392231 (-0.49z)| norm 0.2550 (-1.37z)| lr 2.50e-04 | 8447.54 ms | -100.0% bf16 MFU | 62107 tok/s +step 11131/19560 | loss 3.408728 (-0.05z)| norm 0.2794 (+0.16z)| lr 2.50e-04 | 8441.28 ms | -100.0% bf16 MFU | 62107 tok/s +step 11132/19560 | loss 3.393555 (-0.46z)| norm 0.2600 (-1.07z)| lr 2.50e-04 | 8444.30 ms | -100.0% bf16 MFU | 62106 tok/s +step 11133/19560 | loss 3.440910 (+0.82z)| norm 0.2691 (-0.49z)| lr 2.50e-04 | 8440.68 ms | -100.0% bf16 MFU | 62107 tok/s +step 11134/19560 | loss 3.400868 (-0.28z)| norm 0.2513 (-1.58z)| lr 2.50e-04 | 8445.08 ms | -100.0% bf16 MFU | 62105 tok/s +step 11135/19560 | loss 3.447974 (+1.01z)| norm 0.2622 (-0.90z)| lr 2.50e-04 | 8444.82 ms | -100.0% bf16 MFU | 62104 tok/s +step 11136/19560 | loss 3.412651 (+0.04z)| norm 0.2525 (-1.48z)| lr 2.50e-04 | 8444.57 ms | -100.0% bf16 MFU | 62103 tok/s +step 11137/19560 | loss 3.428722 (+0.47z)| norm 0.2798 (+0.21z)| lr 2.50e-04 | 8445.06 ms | -100.0% bf16 MFU | 62102 tok/s +step 11138/19560 | loss 3.322623 (-2.39z)| norm 0.2838 (+0.44z)| lr 2.50e-04 | 8442.91 ms | -100.0% bf16 MFU | 62102 tok/s +step 11139/19560 | loss 3.398384 (-0.33z)| norm 0.2604 (-0.99z)| lr 2.50e-04 | 8446.71 ms | -100.0% bf16 MFU | 62101 tok/s +step 11140/19560 | loss 3.413685 (+0.09z)| norm 0.2739 (-0.16z)| lr 2.50e-04 | 8445.11 ms | -100.0% bf16 MFU | 62100 tok/s +step 11141/19560 | loss 3.390639 (-0.58z)| norm 0.3058 (+1.78z)| lr 2.50e-04 | 8448.48 ms | -100.0% bf16 MFU | 62098 tok/s +step 11142/19560 | loss 3.400507 (-0.29z)| norm 0.2741 (-0.15z)| lr 2.50e-04 | 8444.19 ms | -100.0% bf16 MFU | 62097 tok/s +step 11143/19560 | loss 3.397470 (-0.37z)| norm 0.2546 (-1.34z)| lr 2.50e-04 | 8442.38 ms | -100.0% bf16 MFU | 62097 tok/s +step 11144/19560 | loss 3.393320 (-0.50z)| norm 0.2613 (-0.92z)| lr 2.50e-04 | 8443.92 ms | -100.0% bf16 MFU | 62097 tok/s +step 11145/19560 | loss 3.336786 (-2.08z)| norm 0.2586 (-1.08z)| lr 2.50e-04 | 8445.45 ms | -100.0% bf16 MFU | 62096 tok/s +step 11146/19560 | loss 3.405784 (-0.13z)| norm 0.2695 (-0.43z)| lr 2.50e-04 | 8445.46 ms | -100.0% bf16 MFU | 62095 tok/s +step 11147/19560 | loss 3.423662 (+0.37z)| norm 0.2657 (-0.65z)| lr 2.49e-04 | 8446.62 ms | -100.0% bf16 MFU | 62094 tok/s +step 11148/19560 | loss 3.422211 (+0.32z)| norm 0.2726 (-0.24z)| lr 2.49e-04 | 8448.89 ms | -100.0% bf16 MFU | 62092 tok/s +step 11149/19560 | loss 3.375242 (-1.03z)| norm 0.2807 (+0.24z)| lr 2.49e-04 | 8445.46 ms | -100.0% bf16 MFU | 62091 tok/s +step 11150/19560 | loss 3.395705 (-0.43z)| norm 0.2816 (+0.29z)| lr 2.49e-04 | 8443.20 ms | -100.0% bf16 MFU | 62092 tok/s +step 11151/19560 | loss 3.459289 (+1.41z)| norm 0.2663 (-0.63z)| lr 2.49e-04 | 8447.26 ms | -100.0% bf16 MFU | 62090 tok/s +step 11152/19560 | loss 3.384543 (-0.76z)| norm 0.2777 (+0.05z)| lr 2.49e-04 | 8447.34 ms | -100.0% bf16 MFU | 62089 tok/s +step 11153/19560 | loss 3.387811 (-0.65z)| norm 0.2720 (-0.29z)| lr 2.49e-04 | 8447.44 ms | -100.0% bf16 MFU | 62088 tok/s +step 11154/19560 | loss 3.339945 (-1.99z)| norm 0.2767 (+0.01z)| lr 2.49e-04 | 8447.58 ms | -100.0% bf16 MFU | 62087 tok/s +step 11155/19560 | loss 3.407508 (-0.08z)| norm 0.2701 (-0.38z)| lr 2.49e-04 | 8443.59 ms | -100.0% bf16 MFU | 62087 tok/s +step 11156/19560 | loss 3.447084 (+1.05z)| norm 0.2625 (-0.87z)| lr 2.49e-04 | 8446.58 ms | -100.0% bf16 MFU | 62086 tok/s +step 11157/19560 | loss 3.395821 (-0.41z)| norm 0.2645 (-0.74z)| lr 2.49e-04 | 8452.19 ms | -100.0% bf16 MFU | 62083 tok/s +step 11158/19560 | loss 3.380490 (-0.84z)| norm 0.2719 (-0.28z)| lr 2.49e-04 | 8449.47 ms | -100.0% bf16 MFU | 62082 tok/s +step 11159/19560 | loss 3.414083 (+0.14z)| norm 0.2547 (-1.37z)| lr 2.49e-04 | 8448.57 ms | -100.0% bf16 MFU | 62080 tok/s +step 11160/19560 | loss 3.390633 (-0.54z)| norm 0.2611 (-0.96z)| lr 2.49e-04 | 8448.77 ms | -100.0% bf16 MFU | 62079 tok/s +step 11161/19560 | loss 3.416130 (+0.19z)| norm 0.2556 (-1.29z)| lr 2.49e-04 | 8450.31 ms | -100.0% bf16 MFU | 62077 tok/s +step 11162/19560 | loss 3.372350 (-1.07z)| norm 0.2644 (-0.72z)| lr 2.49e-04 | 8448.88 ms | -100.0% bf16 MFU | 62076 tok/s +step 11163/19560 | loss 3.397008 (-0.34z)| norm 0.2643 (-0.72z)| lr 2.49e-04 | 8450.08 ms | -100.0% bf16 MFU | 62075 tok/s +step 11164/19560 | loss 3.438320 (+0.88z)| norm 0.2555 (-1.26z)| lr 2.49e-04 | 8449.12 ms | -100.0% bf16 MFU | 62074 tok/s +step 11165/19560 | loss 3.384521 (-0.70z)| norm 0.2793 (+0.25z)| lr 2.49e-04 | 8451.74 ms | -100.0% bf16 MFU | 62072 tok/s +step 11166/19560 | loss 3.453775 (+1.35z)| norm 0.2750 (-0.03z)| lr 2.49e-04 | 8467.30 ms | -100.0% bf16 MFU | 62064 tok/s +step 11167/19560 | loss 3.411382 (+0.09z)| norm 0.2646 (-0.69z)| lr 2.48e-04 | 8478.16 ms | -100.0% bf16 MFU | 62053 tok/s +step 11168/19560 | loss 3.363946 (-1.29z)| norm 0.2740 (-0.09z)| lr 2.48e-04 | 8473.82 ms | -100.0% bf16 MFU | 62044 tok/s +step 11169/19560 | loss 3.437741 (+0.88z)| norm 0.2817 (+0.40z)| lr 2.48e-04 | 8483.71 ms | -100.0% bf16 MFU | 62031 tok/s +step 11170/19560 | loss 3.372260 (-1.03z)| norm 0.2863 (+0.69z)| lr 2.48e-04 | 8477.25 ms | -100.0% bf16 MFU | 62022 tok/s +step 11171/19560 | loss 3.448114 (+1.17z)| norm 0.2783 (+0.16z)| lr 2.48e-04 | 8481.92 ms | -100.0% bf16 MFU | 62012 tok/s +step 11172/19560 | loss 3.374266 (-1.00z)| norm 0.2667 (-0.58z)| lr 2.48e-04 | 8475.23 ms | -100.0% bf16 MFU | 62004 tok/s +step 11173/19560 | loss 3.401325 (-0.19z)| norm 0.2555 (-1.29z)| lr 2.48e-04 | 8477.03 ms | -100.0% bf16 MFU | 61996 tok/s +step 11174/19560 | loss 3.369989 (-1.12z)| norm 0.2648 (-0.69z)| lr 2.48e-04 | 8477.59 ms | -100.0% bf16 MFU | 61989 tok/s +step 11175/19560 | loss 3.395909 (-0.37z)| norm 0.2543 (-1.34z)| lr 2.48e-04 | 8474.40 ms | -100.0% bf16 MFU | 61983 tok/s +step 11176/19560 | loss 3.441695 (+0.99z)| norm 0.2764 (+0.06z)| lr 2.48e-04 | 8474.29 ms | -100.0% bf16 MFU | 61977 tok/s +step 11177/19560 | loss 3.383574 (-0.75z)| norm 0.2680 (-0.48z)| lr 2.48e-04 | 8474.89 ms | -100.0% bf16 MFU | 61971 tok/s +step 11178/19560 | loss 3.479155 (+2.07z)| norm 0.2611 (-0.90z)| lr 2.48e-04 | 8471.93 ms | -100.0% bf16 MFU | 61967 tok/s +step 11179/19560 | loss 3.373376 (-1.06z)| norm 0.2676 (-0.48z)| lr 2.48e-04 | 8468.62 ms | -100.0% bf16 MFU | 61964 tok/s +step 11180/19560 | loss 3.350676 (-1.71z)| norm 0.2640 (-0.71z)| lr 2.48e-04 | 8472.35 ms | -100.0% bf16 MFU | 61960 tok/s +step 11181/19560 | loss 3.364737 (-1.27z)| norm 0.2843 (+0.62z)| lr 2.48e-04 | 8465.96 ms | -100.0% bf16 MFU | 61958 tok/s +step 11182/19560 | loss 3.414967 (+0.23z)| norm 0.2474 (-1.75z)| lr 2.48e-04 | 8466.81 ms | -100.0% bf16 MFU | 61957 tok/s +step 11183/19560 | loss 3.453294 (+1.36z)| norm 0.3065 (+2.12z)| lr 2.48e-04 | 8469.92 ms | -100.0% bf16 MFU | 61954 tok/s +step 11184/19560 | loss 3.433270 (+0.75z)| norm 0.2596 (-0.96z)| lr 2.48e-04 | 8469.05 ms | -100.0% bf16 MFU | 61951 tok/s +step 11185/19560 | loss 3.343926 (-1.86z)| norm 0.2903 (+1.10z)| lr 2.48e-04 | 8468.70 ms | -100.0% bf16 MFU | 61949 tok/s +step 11186/19560 | loss 3.312130 (-2.71z)| norm 0.2571 (-1.11z)| lr 2.48e-04 | 8466.33 ms | -100.0% bf16 MFU | 61948 tok/s +step 11187/19560 | loss 3.389947 (-0.47z)| norm 0.2646 (-0.60z)| lr 2.48e-04 | 8467.27 ms | -100.0% bf16 MFU | 61947 tok/s +step 11188/19560 | loss 3.457705 (+1.47z)| norm 0.2648 (-0.58z)| lr 2.47e-04 | 8466.07 ms | -100.0% bf16 MFU | 61946 tok/s +step 11189/19560 | loss 3.345098 (-1.73z)| norm 0.2812 (+0.51z)| lr 2.47e-04 | 8469.89 ms | -100.0% bf16 MFU | 61944 tok/s +step 11190/19560 | loss 3.357746 (-1.35z)| norm 0.2823 (+0.60z)| lr 2.47e-04 | 8467.20 ms | -100.0% bf16 MFU | 61942 tok/s +step 11191/19560 | loss 3.359179 (-1.29z)| norm 0.2883 (+1.01z)| lr 2.47e-04 | 8462.18 ms | -100.0% bf16 MFU | 61943 tok/s +step 11192/19560 | loss 3.358699 (-1.29z)| norm 0.2507 (-1.52z)| lr 2.47e-04 | 8468.09 ms | -100.0% bf16 MFU | 61942 tok/s +step 11193/19560 | loss 3.314658 (-2.45z)| norm 0.2933 (+1.38z)| lr 2.47e-04 | 8470.11 ms | -100.0% bf16 MFU | 61939 tok/s +step 11194/19560 | loss 3.400494 (-0.10z)| norm 0.2625 (-0.70z)| lr 2.47e-04 | 8473.06 ms | -100.0% bf16 MFU | 61936 tok/s +step 11195/19560 | loss 3.501615 (+2.58z)| norm 0.2918 (+1.32z)| lr 2.47e-04 | 8463.39 ms | -100.0% bf16 MFU | 61937 tok/s +step 11196/19560 | loss 3.378055 (-0.73z)| norm 0.2993 (+1.79z)| lr 2.47e-04 | 8467.60 ms | -100.0% bf16 MFU | 61936 tok/s +step 11197/19560 | loss 3.340070 (-1.72z)| norm 0.2623 (-0.72z)| lr 2.47e-04 | 8467.69 ms | -100.0% bf16 MFU | 61935 tok/s +step 11198/19560 | loss 3.421007 (+0.44z)| norm 0.3057 (+2.18z)| lr 2.47e-04 | 8462.51 ms | -100.0% bf16 MFU | 61936 tok/s +step 11199/19560 | loss 3.450117 (+1.20z)| norm 0.2832 (+0.69z)| lr 2.47e-04 | 8465.90 ms | -100.0% bf16 MFU | 61936 tok/s +step 11200/19560 | loss 3.388057 (-0.45z)| norm 0.3199 (+3.01z)| lr 2.47e-04 | 8466.09 ms | -100.0% bf16 MFU | 61935 tok/s +step 11201/19560 | loss 3.433451 (+0.75z)| norm 0.2821 (+0.59z)| lr 2.47e-04 | 8470.21 ms | -100.0% bf16 MFU | 61933 tok/s +step 11202/19560 | loss 3.415214 (+0.26z)| norm 0.2993 (+1.68z)| lr 2.47e-04 | 8463.29 ms | -100.0% bf16 MFU | 61934 tok/s +step 11203/19560 | loss 3.354977 (-1.31z)| norm 0.2653 (-0.52z)| lr 2.47e-04 | 8464.58 ms | -100.0% bf16 MFU | 61934 tok/s +step 11204/19560 | loss 3.351620 (-1.38z)| norm 0.2802 (+0.46z)| lr 2.47e-04 | 8463.54 ms | -100.0% bf16 MFU | 61935 tok/s +step 11205/19560 | loss 3.394215 (-0.26z)| norm 0.2676 (-0.37z)| lr 2.47e-04 | 8460.71 ms | -100.0% bf16 MFU | 61937 tok/s +step 11206/19560 | loss 3.419788 (+0.41z)| norm 0.2792 (+0.42z)| lr 2.47e-04 | 8463.18 ms | -100.0% bf16 MFU | 61937 tok/s +step 11207/19560 | loss 3.362071 (-1.09z)| norm 0.2840 (+0.74z)| lr 2.47e-04 | 8460.61 ms | -100.0% bf16 MFU | 61939 tok/s +step 11208/19560 | loss 3.421694 (+0.47z)| norm 0.2641 (-0.60z)| lr 2.46e-04 | 8457.48 ms | -100.0% bf16 MFU | 61941 tok/s +step 11209/19560 | loss 3.374775 (-0.76z)| norm 0.2784 (+0.36z)| lr 2.46e-04 | 8467.39 ms | -100.0% bf16 MFU | 61940 tok/s +step 11210/19560 | loss 3.412657 (+0.24z)| norm 0.2567 (-1.13z)| lr 2.46e-04 | 8461.27 ms | -100.0% bf16 MFU | 61941 tok/s +step 11211/19560 | loss 3.399549 (-0.11z)| norm 0.2763 (+0.25z)| lr 2.46e-04 | 8462.97 ms | -100.0% bf16 MFU | 61942 tok/s +step 11212/19560 | loss 3.368630 (-0.91z)| norm 0.2491 (-1.64z)| lr 2.46e-04 | 8456.96 ms | -100.0% bf16 MFU | 61944 tok/s +step 11213/19560 | loss 3.376390 (-0.71z)| norm 0.2721 (-0.03z)| lr 2.46e-04 | 8459.86 ms | -100.0% bf16 MFU | 61946 tok/s +step 11214/19560 | loss 3.421019 (+0.49z)| norm 0.2737 (+0.08z)| lr 2.46e-04 | 8459.17 ms | -100.0% bf16 MFU | 61948 tok/s +step 11215/19560 | loss 3.333764 (-1.82z)| norm 0.2846 (+0.83z)| lr 2.46e-04 | 8461.92 ms | -100.0% bf16 MFU | 61948 tok/s +step 11216/19560 | loss 3.367335 (-0.91z)| norm 0.2886 (+1.09z)| lr 2.46e-04 | 8459.75 ms | -100.0% bf16 MFU | 61949 tok/s +step 11217/19560 | loss 3.390622 (-0.29z)| norm 0.2788 (+0.40z)| lr 2.46e-04 | 8465.24 ms | -100.0% bf16 MFU | 61949 tok/s +step 11218/19560 | loss 3.392492 (-0.24z)| norm 0.2853 (+0.85z)| lr 2.46e-04 | 8458.73 ms | -100.0% bf16 MFU | 61950 tok/s +step 11219/19560 | loss 3.388617 (-0.34z)| norm 0.2751 (+0.14z)| lr 2.46e-04 | 8465.11 ms | -100.0% bf16 MFU | 61950 tok/s +step 11220/19560 | loss 3.392853 (-0.22z)| norm 0.2719 (-0.09z)| lr 2.46e-04 | 8455.55 ms | -100.0% bf16 MFU | 61952 tok/s +step 11221/19560 | loss 3.339685 (-1.60z)| norm 0.2771 (+0.27z)| lr 2.46e-04 | 8464.14 ms | -100.0% bf16 MFU | 61952 tok/s +step 11222/19560 | loss 3.424853 (+0.63z)| norm 0.2825 (+0.64z)| lr 2.46e-04 | 8462.33 ms | -100.0% bf16 MFU | 61952 tok/s +step 11223/19560 | loss 3.372708 (-0.73z)| norm 0.2816 (+0.57z)| lr 2.46e-04 | 8460.30 ms | -100.0% bf16 MFU | 61953 tok/s +step 11224/19560 | loss 3.411207 (+0.27z)| norm 0.2731 (-0.05z)| lr 2.46e-04 | 8458.08 ms | -100.0% bf16 MFU | 61955 tok/s +step 11225/19560 | loss 3.425861 (+0.67z)| norm 0.3083 (+2.41z)| lr 2.46e-04 | 8457.49 ms | -100.0% bf16 MFU | 61956 tok/s +step 11226/19560 | loss 3.355409 (-1.18z)| norm 0.2657 (-0.60z)| lr 2.46e-04 | 8467.23 ms | -100.0% bf16 MFU | 61955 tok/s +step 11227/19560 | loss 3.363579 (-0.97z)| norm 0.3050 (+2.12z)| lr 2.46e-04 | 8465.12 ms | -100.0% bf16 MFU | 61954 tok/s +step 11228/19560 | loss 3.423081 (+0.65z)| norm 0.2735 (-0.05z)| lr 2.45e-04 | 8460.81 ms | -100.0% bf16 MFU | 61954 tok/s +step 11229/19560 | loss 3.605900 (+5.04z)| norm 0.3036 (+2.04z)| lr 2.45e-04 | 8455.09 ms | -100.0% bf16 MFU | 61957 tok/s +step 11230/19560 | loss 3.370034 (-0.75z)| norm 0.2859 (+0.82z)| lr 2.45e-04 | 8464.11 ms | -100.0% bf16 MFU | 61956 tok/s +step 11231/19560 | loss 3.522624 (+2.87z)| norm 0.3588 (+5.23z)| lr 2.45e-04 | 8460.47 ms | -100.0% bf16 MFU | 61957 tok/s +step 11232/19560 | loss 3.407763 (+0.14z)| norm 0.3264 (+3.07z)| lr 2.45e-04 | 8459.98 ms | -100.0% bf16 MFU | 61958 tok/s +step 11233/19560 | loss 3.375364 (-0.64z)| norm 0.2940 (+1.11z)| lr 2.45e-04 | 8458.57 ms | -100.0% bf16 MFU | 61959 tok/s +step 11234/19560 | loss 3.350892 (-1.20z)| norm 0.3186 (+2.50z)| lr 2.45e-04 | 8461.96 ms | -100.0% bf16 MFU | 61959 tok/s +step 11235/19560 | loss 3.322520 (-1.84z)| norm 0.2767 (+0.05z)| lr 2.45e-04 | 8456.69 ms | -100.0% bf16 MFU | 61961 tok/s +step 11236/19560 | loss 3.400656 (-0.00z)| norm 0.3338 (+3.23z)| lr 2.45e-04 | 8455.10 ms | -100.0% bf16 MFU | 61963 tok/s +step 11237/19560 | loss 3.317554 (-1.92z)| norm 0.2626 (-0.76z)| lr 2.45e-04 | 8450.59 ms | -100.0% bf16 MFU | 61967 tok/s +step 11238/19560 | loss 3.355393 (-1.03z)| norm 0.3057 (+1.64z)| lr 2.45e-04 | 8442.59 ms | -100.0% bf16 MFU | 61974 tok/s +step 11239/19560 | loss 3.317678 (-1.86z)| norm 0.2818 (+0.32z)| lr 2.45e-04 | 8445.60 ms | -100.0% bf16 MFU | 61979 tok/s +step 11240/19560 | loss 3.322124 (-1.72z)| norm 0.2862 (+0.56z)| lr 2.45e-04 | 8442.30 ms | -100.0% bf16 MFU | 61985 tok/s +step 11241/19560 | loss 3.380208 (-0.39z)| norm 0.2815 (+0.29z)| lr 2.45e-04 | 8447.10 ms | -100.0% bf16 MFU | 61989 tok/s +step 11242/19560 | loss 3.371745 (-0.58z)| norm 0.2581 (-1.00z)| lr 2.45e-04 | 8448.67 ms | -100.0% bf16 MFU | 61993 tok/s +step 11243/19560 | loss 3.405220 (+0.18z)| norm 0.2739 (-0.11z)| lr 2.45e-04 | 8449.38 ms | -100.0% bf16 MFU | 61995 tok/s +step 11244/19560 | loss 3.396562 (-0.01z)| norm 0.2611 (-0.82z)| lr 2.45e-04 | 8444.19 ms | -100.0% bf16 MFU | 62000 tok/s +step 11245/19560 | loss 3.375946 (-0.48z)| norm 0.2666 (-0.50z)| lr 2.45e-04 | 8445.22 ms | -100.0% bf16 MFU | 62004 tok/s +step 11246/19560 | loss 3.364824 (-0.72z)| norm 0.2606 (-0.84z)| lr 2.45e-04 | 8448.41 ms | -100.0% bf16 MFU | 62007 tok/s +step 11247/19560 | loss 3.374242 (-0.49z)| norm 0.2932 (+0.97z)| lr 2.45e-04 | 8446.10 ms | -100.0% bf16 MFU | 62010 tok/s +step 11248/19560 | loss 3.452622 (+1.29z)| norm 0.2733 (-0.13z)| lr 2.45e-04 | 8447.43 ms | -100.0% bf16 MFU | 62013 tok/s +step 11249/19560 | loss 3.414506 (+0.42z)| norm 0.2714 (-0.24z)| lr 2.44e-04 | 8451.29 ms | -100.0% bf16 MFU | 62014 tok/s +step 11250/19560 | loss 3.407722 (+0.26z)| norm 0.2757 (+0.00z)| lr 2.44e-04 | 8449.91 ms | -100.0% bf16 MFU | 62016 tok/s +val loss 3.359588 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2929/10042 = 0.291675 +step 11251/19560 | loss 3.383568 (-0.29z)| norm 0.2576 (-0.99z)| lr 2.44e-04 | 8456.99 ms | -100.0% bf16 MFU | 62015 tok/s +step 11252/19560 | loss 3.379124 (-0.37z)| norm 0.2605 (-0.82z)| lr 2.44e-04 | 8450.07 ms | -100.0% bf16 MFU | 62016 tok/s +step 11253/19560 | loss 3.386904 (-0.18z)| norm 0.2510 (-1.34z)| lr 2.44e-04 | 8459.26 ms | -100.0% bf16 MFU | 62014 tok/s +step 11254/19560 | loss 3.375231 (-0.46z)| norm 0.2516 (-1.29z)| lr 2.44e-04 | 8454.19 ms | -100.0% bf16 MFU | 62014 tok/s +step 11255/19560 | loss 3.403888 (+0.22z)| norm 0.2511 (-1.32z)| lr 2.44e-04 | 8452.20 ms | -100.0% bf16 MFU | 62015 tok/s +step 11256/19560 | loss 3.356810 (-0.89z)| norm 0.2627 (-0.67z)| lr 2.44e-04 | 8454.19 ms | -100.0% bf16 MFU | 62015 tok/s +step 11257/19560 | loss 3.406987 (+0.31z)| norm 0.2623 (-0.70z)| lr 2.44e-04 | 8456.46 ms | -100.0% bf16 MFU | 62014 tok/s +step 11258/19560 | loss 3.513007 (+2.72z)| norm 0.2398 (-1.92z)| lr 2.44e-04 | 8458.43 ms | -100.0% bf16 MFU | 62013 tok/s +step 11259/19560 | loss 3.336971 (-1.32z)| norm 0.2630 (-0.64z)| lr 2.44e-04 | 8453.47 ms | -100.0% bf16 MFU | 62013 tok/s +step 11260/19560 | loss 3.335980 (-1.32z)| norm 0.2614 (-0.73z)| lr 2.44e-04 | 8461.22 ms | -100.0% bf16 MFU | 62011 tok/s +step 11261/19560 | loss 3.352233 (-0.94z)| norm 0.2648 (-0.54z)| lr 2.44e-04 | 8454.30 ms | -100.0% bf16 MFU | 62011 tok/s +step 11262/19560 | loss 3.370432 (-0.52z)| norm 0.2570 (-0.97z)| lr 2.44e-04 | 8454.60 ms | -100.0% bf16 MFU | 62011 tok/s +step 11263/19560 | loss 3.497640 (+2.34z)| norm 0.2675 (-0.40z)| lr 2.44e-04 | 8452.13 ms | -100.0% bf16 MFU | 62012 tok/s +step 11264/19560 | loss 3.388632 (-0.11z)| norm 0.2799 (+0.26z)| lr 2.44e-04 | 8458.32 ms | -100.0% bf16 MFU | 62011 tok/s +step 11265/19560 | loss 3.417229 (+0.54z)| norm 0.2947 (+1.07z)| lr 2.44e-04 | 8458.30 ms | -100.0% bf16 MFU | 62009 tok/s +step 11266/19560 | loss 3.350458 (-0.97z)| norm 0.2524 (-1.23z)| lr 2.44e-04 | 8454.99 ms | -100.0% bf16 MFU | 62009 tok/s +step 11267/19560 | loss 3.380857 (-0.28z)| norm 0.2641 (-0.59z)| lr 2.44e-04 | 8450.62 ms | -100.0% bf16 MFU | 62011 tok/s +step 11268/19560 | loss 3.473509 (+1.78z)| norm 0.2962 (+1.14z)| lr 2.44e-04 | 8452.86 ms | -100.0% bf16 MFU | 62012 tok/s +step 11269/19560 | loss 3.395786 (+0.04z)| norm 0.2602 (-0.80z)| lr 2.43e-04 | 8456.01 ms | -100.0% bf16 MFU | 62011 tok/s +step 11270/19560 | loss 3.344522 (-1.09z)| norm 0.2806 (+0.31z)| lr 2.43e-04 | 8453.97 ms | -100.0% bf16 MFU | 62011 tok/s +step 11271/19560 | loss 3.337049 (-1.24z)| norm 0.2579 (-0.93z)| lr 2.43e-04 | 8458.81 ms | -100.0% bf16 MFU | 62010 tok/s +step 11272/19560 | loss 3.378510 (-0.32z)| norm 0.2568 (-0.99z)| lr 2.43e-04 | 8453.24 ms | -100.0% bf16 MFU | 62010 tok/s +step 11273/19560 | loss 3.346476 (-1.03z)| norm 0.2578 (-0.93z)| lr 2.43e-04 | 8455.23 ms | -100.0% bf16 MFU | 62010 tok/s +step 11274/19560 | loss 3.384321 (-0.19z)| norm 0.2854 (+0.57z)| lr 2.43e-04 | 8451.94 ms | -100.0% bf16 MFU | 62011 tok/s +step 11275/19560 | loss 3.380105 (-0.27z)| norm 0.2746 (-0.02z)| lr 2.43e-04 | 8457.52 ms | -100.0% bf16 MFU | 62010 tok/s +step 11276/19560 | loss 3.371788 (-0.45z)| norm 0.2619 (-0.71z)| lr 2.43e-04 | 8454.14 ms | -100.0% bf16 MFU | 62011 tok/s +step 11277/19560 | loss 3.368646 (-0.52z)| norm 0.2691 (-0.32z)| lr 2.43e-04 | 8452.61 ms | -100.0% bf16 MFU | 62011 tok/s +step 11278/19560 | loss 3.349417 (-0.93z)| norm 0.2766 (+0.10z)| lr 2.43e-04 | 8451.40 ms | -100.0% bf16 MFU | 62013 tok/s +step 11279/19560 | loss 3.377062 (-0.31z)| norm 0.2869 (+0.65z)| lr 2.43e-04 | 8458.57 ms | -100.0% bf16 MFU | 62011 tok/s +step 11280/19560 | loss 3.354664 (-0.80z)| norm 0.2582 (-0.90z)| lr 2.43e-04 | 8452.24 ms | -100.0% bf16 MFU | 62012 tok/s +step 11281/19560 | loss 3.359042 (-0.70z)| norm 0.2631 (-0.63z)| lr 2.43e-04 | 8452.73 ms | -100.0% bf16 MFU | 62013 tok/s +step 11282/19560 | loss 3.370569 (-0.45z)| norm 0.2762 (+0.08z)| lr 2.43e-04 | 8450.48 ms | -100.0% bf16 MFU | 62014 tok/s +step 11283/19560 | loss 3.383065 (-0.17z)| norm 0.2889 (+0.76z)| lr 2.43e-04 | 8451.88 ms | -100.0% bf16 MFU | 62015 tok/s +step 11284/19560 | loss 3.367428 (-0.50z)| norm 0.2691 (-0.32z)| lr 2.43e-04 | 8451.67 ms | -100.0% bf16 MFU | 62016 tok/s +step 11285/19560 | loss 3.362007 (-0.62z)| norm 0.2429 (-1.71z)| lr 2.43e-04 | 8449.48 ms | -100.0% bf16 MFU | 62018 tok/s +step 11286/19560 | loss 3.395371 (+0.13z)| norm 0.2566 (-0.97z)| lr 2.43e-04 | 8453.68 ms | -100.0% bf16 MFU | 62018 tok/s +step 11287/19560 | loss 3.485665 (+2.10z)| norm 0.2694 (-0.29z)| lr 2.43e-04 | 8452.41 ms | -100.0% bf16 MFU | 62018 tok/s +step 11288/19560 | loss 3.391994 (+0.04z)| norm 0.2494 (-1.36z)| lr 2.43e-04 | 8452.44 ms | -100.0% bf16 MFU | 62019 tok/s +step 11289/19560 | loss 3.453764 (+1.38z)| norm 0.2648 (-0.53z)| lr 2.42e-04 | 8452.53 ms | -100.0% bf16 MFU | 62019 tok/s +step 11290/19560 | loss 3.353173 (-0.82z)| norm 0.2697 (-0.28z)| lr 2.42e-04 | 8454.45 ms | -100.0% bf16 MFU | 62019 tok/s +step 11291/19560 | loss 3.342566 (-1.03z)| norm 0.2583 (-0.88z)| lr 2.42e-04 | 8451.64 ms | -100.0% bf16 MFU | 62020 tok/s +step 11292/19560 | loss 3.386300 (-0.07z)| norm 0.2647 (-0.55z)| lr 2.42e-04 | 8454.15 ms | -100.0% bf16 MFU | 62019 tok/s +step 11293/19560 | loss 3.312947 (-1.65z)| norm 0.2867 (+0.63z)| lr 2.42e-04 | 8451.21 ms | -100.0% bf16 MFU | 62020 tok/s +step 11294/19560 | loss 3.443281 (+1.18z)| norm 0.2482 (-1.41z)| lr 2.42e-04 | 8455.23 ms | -100.0% bf16 MFU | 62020 tok/s +step 11295/19560 | loss 3.425729 (+0.79z)| norm 0.2762 (+0.07z)| lr 2.42e-04 | 8455.81 ms | -100.0% bf16 MFU | 62019 tok/s +step 11296/19560 | loss 3.371838 (-0.38z)| norm 0.2609 (-0.73z)| lr 2.42e-04 | 8453.10 ms | -100.0% bf16 MFU | 62019 tok/s +step 11297/19560 | loss 3.386253 (-0.06z)| norm 0.2721 (-0.13z)| lr 2.42e-04 | 8453.65 ms | -100.0% bf16 MFU | 62019 tok/s +step 11298/19560 | loss 3.395659 (+0.15z)| norm 0.2898 (+0.80z)| lr 2.42e-04 | 8452.71 ms | -100.0% bf16 MFU | 62019 tok/s +step 11299/19560 | loss 3.308692 (-1.71z)| norm 0.2672 (-0.39z)| lr 2.42e-04 | 8454.26 ms | -100.0% bf16 MFU | 62019 tok/s +step 11300/19560 | loss 3.399738 (+0.25z)| norm 0.2675 (-0.37z)| lr 2.42e-04 | 8451.74 ms | -100.0% bf16 MFU | 62020 tok/s +step 11301/19560 | loss 3.335857 (-1.11z)| norm 0.2693 (-0.29z)| lr 2.42e-04 | 8451.22 ms | -100.0% bf16 MFU | 62021 tok/s +step 11302/19560 | loss 3.386014 (-0.04z)| norm 0.2511 (-1.24z)| lr 2.42e-04 | 8452.49 ms | -100.0% bf16 MFU | 62021 tok/s +step 11303/19560 | loss 3.350743 (-0.79z)| norm 0.2668 (-0.42z)| lr 2.42e-04 | 8452.64 ms | -100.0% bf16 MFU | 62021 tok/s +step 11304/19560 | loss 3.318413 (-1.46z)| norm 0.2383 (-1.89z)| lr 2.42e-04 | 8450.01 ms | -100.0% bf16 MFU | 62023 tok/s +step 11305/19560 | loss 3.367163 (-0.41z)| norm 0.2707 (-0.19z)| lr 2.42e-04 | 8448.91 ms | -100.0% bf16 MFU | 62024 tok/s +step 11306/19560 | loss 3.382691 (-0.06z)| norm 0.2608 (-0.71z)| lr 2.42e-04 | 8451.60 ms | -100.0% bf16 MFU | 62025 tok/s +step 11307/19560 | loss 3.363061 (-0.49z)| norm 0.2818 (+0.39z)| lr 2.42e-04 | 8450.29 ms | -100.0% bf16 MFU | 62026 tok/s +step 11308/19560 | loss 3.562668 (+3.63z)| norm 0.3147 (+2.06z)| lr 2.42e-04 | 8449.95 ms | -100.0% bf16 MFU | 62027 tok/s +step 11309/19560 | loss 3.389961 (+0.06z)| norm 0.2743 (-0.02z)| lr 2.42e-04 | 8456.09 ms | -100.0% bf16 MFU | 62025 tok/s +step 11310/19560 | loss 3.399691 (+0.26z)| norm 0.2867 (+0.60z)| lr 2.41e-04 | 8455.06 ms | -100.0% bf16 MFU | 62025 tok/s +step 11311/19560 | loss 3.334003 (-1.08z)| norm 0.2783 (+0.18z)| lr 2.41e-04 | 8452.42 ms | -100.0% bf16 MFU | 62025 tok/s +step 11312/19560 | loss 3.358755 (-0.56z)| norm 0.2696 (-0.28z)| lr 2.41e-04 | 8452.33 ms | -100.0% bf16 MFU | 62025 tok/s +step 11313/19560 | loss 3.395960 (+0.21z)| norm 0.2740 (-0.04z)| lr 2.41e-04 | 8452.74 ms | -100.0% bf16 MFU | 62025 tok/s +step 11314/19560 | loss 3.330602 (-1.16z)| norm 0.2818 (+0.36z)| lr 2.41e-04 | 8450.02 ms | -100.0% bf16 MFU | 62026 tok/s +step 11315/19560 | loss 3.403875 (+0.37z)| norm 0.2909 (+0.83z)| lr 2.41e-04 | 8450.01 ms | -100.0% bf16 MFU | 62027 tok/s +step 11316/19560 | loss 3.383399 (-0.05z)| norm 0.2688 (-0.34z)| lr 2.41e-04 | 8453.39 ms | -100.0% bf16 MFU | 62027 tok/s +step 11317/19560 | loss 3.444424 (+1.22z)| norm 0.2840 (+0.46z)| lr 2.41e-04 | 8448.99 ms | -100.0% bf16 MFU | 62028 tok/s +step 11318/19560 | loss 3.407087 (+0.43z)| norm 0.2838 (+0.45z)| lr 2.41e-04 | 8457.70 ms | -100.0% bf16 MFU | 62026 tok/s +step 11319/19560 | loss 3.497590 (+2.27z)| norm 0.2925 (+0.91z)| lr 2.41e-04 | 8449.48 ms | -100.0% bf16 MFU | 62027 tok/s +step 11320/19560 | loss 3.366144 (-0.45z)| norm 0.2641 (-0.60z)| lr 2.41e-04 | 8449.13 ms | -100.0% bf16 MFU | 62029 tok/s +step 11321/19560 | loss 3.380275 (-0.17z)| norm 0.3115 (+1.89z)| lr 2.41e-04 | 8451.43 ms | -100.0% bf16 MFU | 62029 tok/s +step 11322/19560 | loss 3.414218 (+0.53z)| norm 0.2635 (-0.63z)| lr 2.41e-04 | 8451.02 ms | -100.0% bf16 MFU | 62029 tok/s +step 11323/19560 | loss 3.407077 (+0.41z)| norm 0.2846 (+0.48z)| lr 2.41e-04 | 8450.70 ms | -100.0% bf16 MFU | 62030 tok/s +step 11324/19560 | loss 3.361297 (-0.56z)| norm 0.2526 (-1.19z)| lr 2.41e-04 | 8454.62 ms | -100.0% bf16 MFU | 62029 tok/s +step 11325/19560 | loss 3.447649 (+1.25z)| norm 0.3109 (+1.84z)| lr 2.41e-04 | 8453.22 ms | -100.0% bf16 MFU | 62029 tok/s +step 11326/19560 | loss 3.384509 (-0.08z)| norm 0.2727 (-0.13z)| lr 2.41e-04 | 8448.74 ms | -100.0% bf16 MFU | 62030 tok/s +step 11327/19560 | loss 3.409163 (+0.45z)| norm 0.2924 (+0.90z)| lr 2.41e-04 | 8451.45 ms | -100.0% bf16 MFU | 62030 tok/s +step 11328/19560 | loss 3.329689 (-1.23z)| norm 0.2514 (-1.25z)| lr 2.41e-04 | 8451.46 ms | -100.0% bf16 MFU | 62031 tok/s +step 11329/19560 | loss 3.411826 (+0.52z)| norm 0.2749 (+0.01z)| lr 2.41e-04 | 8451.57 ms | -100.0% bf16 MFU | 62031 tok/s +step 11330/19560 | loss 3.403176 (+0.34z)| norm 0.2499 (-1.31z)| lr 2.40e-04 | 8450.97 ms | -100.0% bf16 MFU | 62031 tok/s +step 11331/19560 | loss 3.374185 (-0.28z)| norm 0.2776 (+0.17z)| lr 2.40e-04 | 8447.30 ms | -100.0% bf16 MFU | 62033 tok/s +step 11332/19560 | loss 3.372407 (-0.33z)| norm 0.2541 (-1.07z)| lr 2.40e-04 | 8449.95 ms | -100.0% bf16 MFU | 62034 tok/s +step 11333/19560 | loss 3.360080 (-0.58z)| norm 0.2765 (+0.11z)| lr 2.40e-04 | 8447.15 ms | -100.0% bf16 MFU | 62035 tok/s +step 11334/19560 | loss 3.384066 (-0.06z)| norm 0.2746 (+0.02z)| lr 2.40e-04 | 8448.73 ms | -100.0% bf16 MFU | 62036 tok/s +step 11335/19560 | loss 3.414191 (+0.57z)| norm 0.2678 (-0.34z)| lr 2.40e-04 | 8450.09 ms | -100.0% bf16 MFU | 62037 tok/s +step 11336/19560 | loss 3.348799 (-0.82z)| norm 0.2731 (-0.06z)| lr 2.40e-04 | 8450.15 ms | -100.0% bf16 MFU | 62037 tok/s +step 11337/19560 | loss 3.337059 (-1.06z)| norm 0.2660 (-0.43z)| lr 2.40e-04 | 8450.38 ms | -100.0% bf16 MFU | 62037 tok/s +step 11338/19560 | loss 3.339613 (-0.99z)| norm 0.2570 (-0.91z)| lr 2.40e-04 | 8450.97 ms | -100.0% bf16 MFU | 62037 tok/s +step 11339/19560 | loss 3.592085 (+4.06z)| norm 0.2863 (+0.64z)| lr 2.40e-04 | 8449.52 ms | -100.0% bf16 MFU | 62038 tok/s +step 11340/19560 | loss 3.390286 (+0.05z)| norm 0.2632 (-0.59z)| lr 2.40e-04 | 8447.22 ms | -100.0% bf16 MFU | 62039 tok/s +step 11341/19560 | loss 3.396681 (+0.18z)| norm 0.2718 (-0.13z)| lr 2.40e-04 | 8452.63 ms | -100.0% bf16 MFU | 62039 tok/s +step 11342/19560 | loss 3.407578 (+0.39z)| norm 0.2733 (-0.06z)| lr 2.40e-04 | 8452.60 ms | -100.0% bf16 MFU | 62038 tok/s +step 11343/19560 | loss 3.359812 (-0.56z)| norm 0.2606 (-0.73z)| lr 2.40e-04 | 8449.95 ms | -100.0% bf16 MFU | 62039 tok/s +step 11344/19560 | loss 3.322000 (-1.30z)| norm 0.2671 (-0.37z)| lr 2.40e-04 | 8450.78 ms | -100.0% bf16 MFU | 62039 tok/s +step 11345/19560 | loss 3.360216 (-0.54z)| norm 0.2657 (-0.44z)| lr 2.40e-04 | 8449.38 ms | -100.0% bf16 MFU | 62039 tok/s +step 11346/19560 | loss 3.423033 (+0.70z)| norm 0.2653 (-0.45z)| lr 2.40e-04 | 8448.62 ms | -100.0% bf16 MFU | 62040 tok/s +step 11347/19560 | loss 3.376585 (-0.22z)| norm 0.2587 (-0.80z)| lr 2.40e-04 | 8450.08 ms | -100.0% bf16 MFU | 62040 tok/s +step 11348/19560 | loss 3.438692 (+1.00z)| norm 0.2561 (-0.93z)| lr 2.40e-04 | 8451.10 ms | -100.0% bf16 MFU | 62040 tok/s +step 11349/19560 | loss 3.343723 (-0.87z)| norm 0.2830 (+0.50z)| lr 2.40e-04 | 8449.72 ms | -100.0% bf16 MFU | 62041 tok/s +step 11350/19560 | loss 3.391994 (+0.09z)| norm 0.2800 (+0.34z)| lr 2.40e-04 | 8452.07 ms | -100.0% bf16 MFU | 62040 tok/s +step 11351/19560 | loss 3.357725 (-0.59z)| norm 0.2739 (+0.02z)| lr 2.39e-04 | 8450.81 ms | -100.0% bf16 MFU | 62040 tok/s +step 11352/19560 | loss 3.340759 (-0.91z)| norm 0.2833 (+0.52z)| lr 2.39e-04 | 8450.58 ms | -100.0% bf16 MFU | 62040 tok/s +step 11353/19560 | loss 3.447029 (+1.18z)| norm 0.3079 (+1.83z)| lr 2.39e-04 | 8451.17 ms | -100.0% bf16 MFU | 62040 tok/s +step 11354/19560 | loss 3.428169 (+0.79z)| norm 0.2814 (+0.42z)| lr 2.39e-04 | 8450.21 ms | -100.0% bf16 MFU | 62040 tok/s +step 11355/19560 | loss 3.353168 (-0.67z)| norm 0.2816 (+0.44z)| lr 2.39e-04 | 8449.35 ms | -100.0% bf16 MFU | 62041 tok/s +step 11356/19560 | loss 3.290987 (-1.85z)| norm 0.2737 (+0.02z)| lr 2.39e-04 | 8452.34 ms | -100.0% bf16 MFU | 62040 tok/s +step 11357/19560 | loss 3.368087 (-0.35z)| norm 0.2519 (-1.14z)| lr 2.39e-04 | 8474.43 ms | -100.0% bf16 MFU | 62032 tok/s +step 11358/19560 | loss 3.484436 (+2.03z)| norm 0.2904 (+0.94z)| lr 2.39e-04 | 8476.57 ms | -100.0% bf16 MFU | 62023 tok/s +step 11359/19560 | loss 3.424490 (+0.84z)| norm 0.2666 (-0.34z)| lr 2.39e-04 | 8477.38 ms | -100.0% bf16 MFU | 62014 tok/s +step 11360/19560 | loss 3.364491 (-0.42z)| norm 0.2576 (-0.88z)| lr 2.39e-04 | 8478.92 ms | -100.0% bf16 MFU | 62005 tok/s +step 11361/19560 | loss 3.434608 (+1.05z)| norm 0.2770 (+0.33z)| lr 2.39e-04 | 8471.97 ms | -100.0% bf16 MFU | 61999 tok/s +step 11362/19560 | loss 3.368902 (-0.34z)| norm 0.3094 (+2.38z)| lr 2.39e-04 | 8474.38 ms | -100.0% bf16 MFU | 61992 tok/s +step 11363/19560 | loss 3.348912 (-0.77z)| norm 0.2827 (+0.70z)| lr 2.39e-04 | 8474.36 ms | -100.0% bf16 MFU | 61986 tok/s +step 11364/19560 | loss 3.373480 (-0.25z)| norm 0.2714 (+0.01z)| lr 2.39e-04 | 8475.65 ms | -100.0% bf16 MFU | 61980 tok/s +step 11365/19560 | loss 3.405176 (+0.41z)| norm 0.2695 (-0.12z)| lr 2.39e-04 | 8471.65 ms | -100.0% bf16 MFU | 61975 tok/s +step 11366/19560 | loss 3.389349 (+0.07z)| norm 0.2641 (-0.46z)| lr 2.39e-04 | 8475.02 ms | -100.0% bf16 MFU | 61969 tok/s +step 11367/19560 | loss 3.374622 (-0.26z)| norm 0.2807 (+0.67z)| lr 2.39e-04 | 8477.28 ms | -100.0% bf16 MFU | 61963 tok/s +step 11368/19560 | loss 3.354160 (-0.71z)| norm 0.2601 (-0.73z)| lr 2.39e-04 | 8473.57 ms | -100.0% bf16 MFU | 61959 tok/s +step 11369/19560 | loss 3.368198 (-0.40z)| norm 0.2766 (+0.41z)| lr 2.39e-04 | 8475.09 ms | -100.0% bf16 MFU | 61954 tok/s +step 11370/19560 | loss 3.384143 (-0.06z)| norm 0.2961 (+1.71z)| lr 2.39e-04 | 8467.14 ms | -100.0% bf16 MFU | 61952 tok/s +step 11371/19560 | loss 3.374590 (-0.26z)| norm 0.3044 (+2.22z)| lr 2.38e-04 | 8475.06 ms | -100.0% bf16 MFU | 61948 tok/s +step 11372/19560 | loss 3.461412 (+1.60z)| norm 0.2744 (+0.21z)| lr 2.38e-04 | 8475.63 ms | -100.0% bf16 MFU | 61943 tok/s +step 11373/19560 | loss 3.368243 (-0.40z)| norm 0.2847 (+0.88z)| lr 2.38e-04 | 8469.67 ms | -100.0% bf16 MFU | 61941 tok/s +step 11374/19560 | loss 3.389569 (+0.05z)| norm 0.2849 (+0.89z)| lr 2.38e-04 | 8472.93 ms | -100.0% bf16 MFU | 61938 tok/s +step 11375/19560 | loss 3.347329 (-0.85z)| norm 0.2681 (-0.22z)| lr 2.38e-04 | 8471.20 ms | -100.0% bf16 MFU | 61936 tok/s +step 11376/19560 | loss 3.362199 (-0.52z)| norm 0.2689 (-0.17z)| lr 2.38e-04 | 8471.42 ms | -100.0% bf16 MFU | 61933 tok/s +step 11377/19560 | loss 3.536131 (+3.10z)| norm 0.2994 (+1.84z)| lr 2.38e-04 | 8473.36 ms | -100.0% bf16 MFU | 61930 tok/s +step 11378/19560 | loss 3.442261 (+1.13z)| norm 0.2725 (+0.06z)| lr 2.38e-04 | 8466.74 ms | -100.0% bf16 MFU | 61930 tok/s +step 11379/19560 | loss 3.357779 (-0.61z)| norm 0.2901 (+1.21z)| lr 2.38e-04 | 8460.24 ms | -100.0% bf16 MFU | 61932 tok/s +step 11380/19560 | loss 3.380833 (-0.13z)| norm 0.2822 (+0.67z)| lr 2.38e-04 | 8455.51 ms | -100.0% bf16 MFU | 61936 tok/s +step 11381/19560 | loss 3.340318 (-0.96z)| norm 0.2890 (+1.11z)| lr 2.38e-04 | 8453.69 ms | -100.0% bf16 MFU | 61940 tok/s +step 11382/19560 | loss 3.495210 (+2.17z)| norm 0.2826 (+0.67z)| lr 2.38e-04 | 8450.28 ms | -100.0% bf16 MFU | 61945 tok/s +step 11383/19560 | loss 3.387459 (-0.01z)| norm 0.3159 (+2.79z)| lr 2.38e-04 | 8448.03 ms | -100.0% bf16 MFU | 61951 tok/s +step 11384/19560 | loss 3.364118 (-0.48z)| norm 0.2875 (+0.92z)| lr 2.38e-04 | 8448.47 ms | -100.0% bf16 MFU | 61956 tok/s +step 11385/19560 | loss 3.499139 (+2.20z)| norm 0.2869 (+0.87z)| lr 2.38e-04 | 8450.86 ms | -100.0% bf16 MFU | 61960 tok/s +step 11386/19560 | loss 3.396202 (+0.18z)| norm 0.2921 (+1.20z)| lr 2.38e-04 | 8442.46 ms | -100.0% bf16 MFU | 61967 tok/s +step 11387/19560 | loss 3.433286 (+0.92z)| norm 0.2795 (+0.36z)| lr 2.38e-04 | 8448.06 ms | -100.0% bf16 MFU | 61972 tok/s +step 11388/19560 | loss 3.422682 (+0.69z)| norm 0.2880 (+0.91z)| lr 2.38e-04 | 8449.24 ms | -100.0% bf16 MFU | 61976 tok/s +step 11389/19560 | loss 3.408873 (+0.40z)| norm 0.2829 (+0.56z)| lr 2.38e-04 | 8441.46 ms | -100.0% bf16 MFU | 61983 tok/s +step 11390/19560 | loss 3.360007 (-0.60z)| norm 0.2491 (-1.65z)| lr 2.38e-04 | 8448.83 ms | -100.0% bf16 MFU | 61986 tok/s +step 11391/19560 | loss 3.354235 (-0.71z)| norm 0.2779 (+0.23z)| lr 2.37e-04 | 8450.78 ms | -100.0% bf16 MFU | 61989 tok/s +step 11392/19560 | loss 3.357822 (-0.63z)| norm 0.2535 (-1.34z)| lr 2.37e-04 | 8448.24 ms | -100.0% bf16 MFU | 61992 tok/s +step 11393/19560 | loss 3.408143 (+0.42z)| norm 0.2685 (-0.36z)| lr 2.37e-04 | 8442.96 ms | -100.0% bf16 MFU | 61998 tok/s +step 11394/19560 | loss 3.369951 (-0.38z)| norm 0.2669 (-0.47z)| lr 2.37e-04 | 8449.48 ms | -100.0% bf16 MFU | 62000 tok/s +step 11395/19560 | loss 3.362508 (-0.53z)| norm 0.2765 (+0.16z)| lr 2.37e-04 | 8449.08 ms | -100.0% bf16 MFU | 62003 tok/s +step 11396/19560 | loss 3.335680 (-1.07z)| norm 0.2491 (-1.62z)| lr 2.37e-04 | 8441.14 ms | -100.0% bf16 MFU | 62008 tok/s +step 11397/19560 | loss 3.405148 (+0.38z)| norm 0.2592 (-0.96z)| lr 2.37e-04 | 8447.92 ms | -100.0% bf16 MFU | 62011 tok/s +step 11398/19560 | loss 3.385762 (-0.03z)| norm 0.2857 (+0.78z)| lr 2.37e-04 | 8446.04 ms | -100.0% bf16 MFU | 62014 tok/s +step 11399/19560 | loss 3.400619 (+0.27z)| norm 0.2717 (-0.14z)| lr 2.37e-04 | 8446.99 ms | -100.0% bf16 MFU | 62017 tok/s +step 11400/19560 | loss 3.379225 (-0.18z)| norm 0.2580 (-1.05z)| lr 2.37e-04 | 8451.98 ms | -100.0% bf16 MFU | 62018 tok/s +step 11401/19560 | loss 3.341310 (-0.98z)| norm 0.2993 (+1.64z)| lr 2.37e-04 | 8450.42 ms | -100.0% bf16 MFU | 62019 tok/s +step 11402/19560 | loss 3.409087 (+0.45z)| norm 0.2565 (-1.15z)| lr 2.37e-04 | 8450.26 ms | -100.0% bf16 MFU | 62020 tok/s +step 11403/19560 | loss 3.339139 (-1.02z)| norm 0.2726 (-0.09z)| lr 2.37e-04 | 8446.30 ms | -100.0% bf16 MFU | 62023 tok/s +step 11404/19560 | loss 3.309340 (-1.62z)| norm 0.2663 (-0.51z)| lr 2.37e-04 | 8455.33 ms | -100.0% bf16 MFU | 62022 tok/s +step 11405/19560 | loss 3.369246 (-0.37z)| norm 0.2542 (-1.29z)| lr 2.37e-04 | 8454.75 ms | -100.0% bf16 MFU | 62021 tok/s +step 11406/19560 | loss 3.384017 (-0.07z)| norm 0.2730 (-0.06z)| lr 2.37e-04 | 8457.51 ms | -100.0% bf16 MFU | 62020 tok/s +step 11407/19560 | loss 3.399640 (+0.25z)| norm 0.2493 (-1.58z)| lr 2.37e-04 | 8451.11 ms | -100.0% bf16 MFU | 62021 tok/s +step 11408/19560 | loss 3.292677 (-1.94z)| norm 0.3022 (+1.81z)| lr 2.37e-04 | 8448.80 ms | -100.0% bf16 MFU | 62022 tok/s +step 11409/19560 | loss 3.333242 (-1.10z)| norm 0.2714 (-0.17z)| lr 2.37e-04 | 8454.39 ms | -100.0% bf16 MFU | 62022 tok/s +step 11410/19560 | loss 3.400490 (+0.27z)| norm 0.2927 (+1.19z)| lr 2.37e-04 | 8453.82 ms | -100.0% bf16 MFU | 62022 tok/s +step 11411/19560 | loss 3.380700 (-0.13z)| norm 0.2699 (-0.26z)| lr 2.37e-04 | 8453.81 ms | -100.0% bf16 MFU | 62022 tok/s +step 11412/19560 | loss 3.372017 (-0.31z)| norm 0.2785 (+0.29z)| lr 2.36e-04 | 8457.68 ms | -100.0% bf16 MFU | 62020 tok/s +step 11413/19560 | loss 3.411510 (+0.49z)| norm 0.2881 (+0.89z)| lr 2.36e-04 | 8453.23 ms | -100.0% bf16 MFU | 62020 tok/s +step 11414/19560 | loss 3.409514 (+0.45z)| norm 0.2809 (+0.41z)| lr 2.36e-04 | 8453.56 ms | -100.0% bf16 MFU | 62020 tok/s +step 11415/19560 | loss 3.406636 (+0.41z)| norm 0.2702 (-0.29z)| lr 2.36e-04 | 8459.61 ms | -100.0% bf16 MFU | 62018 tok/s +step 11416/19560 | loss 3.464580 (+1.59z)| norm 0.2868 (+0.78z)| lr 2.36e-04 | 8454.44 ms | -100.0% bf16 MFU | 62018 tok/s +step 11417/19560 | loss 3.332734 (-1.11z)| norm 0.2794 (+0.29z)| lr 2.36e-04 | 8458.40 ms | -100.0% bf16 MFU | 62016 tok/s +step 11418/19560 | loss 3.362526 (-0.50z)| norm 0.2879 (+0.84z)| lr 2.36e-04 | 8459.39 ms | -100.0% bf16 MFU | 62014 tok/s +step 11419/19560 | loss 3.386515 (-0.01z)| norm 0.2894 (+0.92z)| lr 2.36e-04 | 8457.22 ms | -100.0% bf16 MFU | 62013 tok/s +step 11420/19560 | loss 3.472853 (+1.75z)| norm 0.2853 (+0.64z)| lr 2.36e-04 | 8456.84 ms | -100.0% bf16 MFU | 62012 tok/s +step 11421/19560 | loss 3.414960 (+0.55z)| norm 0.2923 (+1.10z)| lr 2.36e-04 | 8453.77 ms | -100.0% bf16 MFU | 62012 tok/s +step 11422/19560 | loss 3.488561 (+2.04z)| norm 0.3142 (+2.48z)| lr 2.36e-04 | 8463.32 ms | -100.0% bf16 MFU | 62009 tok/s +step 11423/19560 | loss 3.375316 (-0.27z)| norm 0.3049 (+1.83z)| lr 2.36e-04 | 8459.64 ms | -100.0% bf16 MFU | 62008 tok/s +step 11424/19560 | loss 3.325281 (-1.28z)| norm 0.2868 (+0.66z)| lr 2.36e-04 | 8452.83 ms | -100.0% bf16 MFU | 62008 tok/s +step 11425/19560 | loss 3.485343 (+1.93z)| norm 0.3015 (+1.57z)| lr 2.36e-04 | 8459.62 ms | -100.0% bf16 MFU | 62007 tok/s +step 11426/19560 | loss 3.435141 (+0.92z)| norm 0.2898 (+0.83z)| lr 2.36e-04 | 8459.14 ms | -100.0% bf16 MFU | 62005 tok/s +step 11427/19560 | loss 3.309998 (-1.58z)| norm 0.2742 (-0.17z)| lr 2.36e-04 | 8458.58 ms | -100.0% bf16 MFU | 62004 tok/s +step 11428/19560 | loss 3.383362 (-0.11z)| norm 0.3041 (+1.70z)| lr 2.36e-04 | 8461.92 ms | -100.0% bf16 MFU | 62002 tok/s +step 11429/19560 | loss 3.468280 (+1.55z)| norm 0.2822 (+0.32z)| lr 2.36e-04 | 8460.68 ms | -100.0% bf16 MFU | 62000 tok/s +step 11430/19560 | loss 3.390757 (+0.01z)| norm 0.2878 (+0.66z)| lr 2.36e-04 | 8459.17 ms | -100.0% bf16 MFU | 61999 tok/s +step 11431/19560 | loss 3.386566 (-0.08z)| norm 0.2953 (+1.12z)| lr 2.36e-04 | 8457.92 ms | -100.0% bf16 MFU | 61999 tok/s +step 11432/19560 | loss 3.420586 (+0.59z)| norm 0.2803 (+0.15z)| lr 2.35e-04 | 8455.33 ms | -100.0% bf16 MFU | 61999 tok/s +step 11433/19560 | loss 3.343446 (-0.95z)| norm 0.2950 (+1.09z)| lr 2.35e-04 | 8463.80 ms | -100.0% bf16 MFU | 61996 tok/s +step 11434/19560 | loss 3.330985 (-1.19z)| norm 0.2675 (-0.71z)| lr 2.35e-04 | 8457.57 ms | -100.0% bf16 MFU | 61996 tok/s +step 11435/19560 | loss 3.317173 (-1.45z)| norm 0.2700 (-0.53z)| lr 2.35e-04 | 8462.09 ms | -100.0% bf16 MFU | 61994 tok/s +step 11436/19560 | loss 3.340203 (-1.00z)| norm 0.2735 (-0.29z)| lr 2.35e-04 | 8455.42 ms | -100.0% bf16 MFU | 61995 tok/s +step 11437/19560 | loss 3.501804 (+2.27z)| norm 0.2815 (+0.24z)| lr 2.35e-04 | 8457.32 ms | -100.0% bf16 MFU | 61995 tok/s +step 11438/19560 | loss 3.328925 (-1.21z)| norm 0.2691 (-0.58z)| lr 2.35e-04 | 8456.03 ms | -100.0% bf16 MFU | 61995 tok/s +step 11439/19560 | loss 3.367653 (-0.43z)| norm 0.2720 (-0.38z)| lr 2.35e-04 | 8464.70 ms | -100.0% bf16 MFU | 61992 tok/s +step 11440/19560 | loss 3.399151 (+0.20z)| norm 0.2729 (-0.33z)| lr 2.35e-04 | 8458.33 ms | -100.0% bf16 MFU | 61992 tok/s +step 11441/19560 | loss 3.393306 (+0.08z)| norm 0.3083 (+1.98z)| lr 2.35e-04 | 8454.91 ms | -100.0% bf16 MFU | 61993 tok/s +step 11442/19560 | loss 3.359514 (-0.61z)| norm 0.2730 (-0.33z)| lr 2.35e-04 | 8459.77 ms | -100.0% bf16 MFU | 61992 tok/s +step 11443/19560 | loss 3.387257 (-0.05z)| norm 0.3135 (+2.27z)| lr 2.35e-04 | 8459.65 ms | -100.0% bf16 MFU | 61991 tok/s +step 11444/19560 | loss 3.413475 (+0.48z)| norm 0.2954 (+1.09z)| lr 2.35e-04 | 8459.42 ms | -100.0% bf16 MFU | 61990 tok/s +step 11445/19560 | loss 3.344478 (-0.91z)| norm 0.2796 (+0.08z)| lr 2.35e-04 | 8459.77 ms | -100.0% bf16 MFU | 61989 tok/s +step 11446/19560 | loss 3.396096 (+0.15z)| norm 0.2749 (-0.22z)| lr 2.35e-04 | 8458.64 ms | -100.0% bf16 MFU | 61989 tok/s +step 11447/19560 | loss 3.367861 (-0.42z)| norm 0.2636 (-0.93z)| lr 2.35e-04 | 8457.55 ms | -100.0% bf16 MFU | 61989 tok/s +step 11448/19560 | loss 3.399601 (+0.24z)| norm 0.2986 (+1.30z)| lr 2.35e-04 | 8461.32 ms | -100.0% bf16 MFU | 61988 tok/s +step 11449/19560 | loss 3.435162 (+0.96z)| norm 0.2608 (-1.11z)| lr 2.35e-04 | 8464.95 ms | -100.0% bf16 MFU | 61985 tok/s +step 11450/19560 | loss 3.337034 (-1.05z)| norm 0.2884 (+0.67z)| lr 2.35e-04 | 8456.59 ms | -100.0% bf16 MFU | 61986 tok/s +step 11451/19560 | loss 3.373146 (-0.30z)| norm 0.2567 (-1.37z)| lr 2.35e-04 | 8455.59 ms | -100.0% bf16 MFU | 61987 tok/s +step 11452/19560 | loss 3.354688 (-0.68z)| norm 0.2800 (+0.13z)| lr 2.35e-04 | 8458.14 ms | -100.0% bf16 MFU | 61987 tok/s +step 11453/19560 | loss 3.378836 (-0.17z)| norm 0.2946 (+1.10z)| lr 2.34e-04 | 8459.00 ms | -100.0% bf16 MFU | 61986 tok/s +step 11454/19560 | loss 3.430380 (+0.89z)| norm 0.2450 (-2.13z)| lr 2.34e-04 | 8456.50 ms | -100.0% bf16 MFU | 61987 tok/s +step 11455/19560 | loss 3.347976 (-0.80z)| norm 0.2936 (+1.03z)| lr 2.34e-04 | 8459.00 ms | -100.0% bf16 MFU | 61987 tok/s +step 11456/19560 | loss 3.411796 (+0.50z)| norm 0.2515 (-1.70z)| lr 2.34e-04 | 8457.84 ms | -100.0% bf16 MFU | 61987 tok/s +step 11457/19560 | loss 3.346738 (-0.84z)| norm 0.2796 (+0.12z)| lr 2.34e-04 | 8462.34 ms | -100.0% bf16 MFU | 61985 tok/s +step 11458/19560 | loss 3.438996 (+1.06z)| norm 0.3033 (+1.63z)| lr 2.34e-04 | 8456.92 ms | -100.0% bf16 MFU | 61986 tok/s +step 11459/19560 | loss 3.354625 (-0.67z)| norm 0.2581 (-1.29z)| lr 2.34e-04 | 8457.00 ms | -100.0% bf16 MFU | 61986 tok/s +step 11460/19560 | loss 3.385657 (-0.04z)| norm 0.2662 (-0.78z)| lr 2.34e-04 | 8455.68 ms | -100.0% bf16 MFU | 61987 tok/s +step 11461/19560 | loss 3.341634 (-0.94z)| norm 0.2608 (-1.12z)| lr 2.34e-04 | 8454.88 ms | -100.0% bf16 MFU | 61988 tok/s +step 11462/19560 | loss 3.322409 (-1.31z)| norm 0.2766 (-0.10z)| lr 2.34e-04 | 8456.86 ms | -100.0% bf16 MFU | 61989 tok/s +step 11463/19560 | loss 3.393425 (+0.14z)| norm 0.2606 (-1.12z)| lr 2.34e-04 | 8454.21 ms | -100.0% bf16 MFU | 61990 tok/s +step 11464/19560 | loss 3.343628 (-0.88z)| norm 0.2713 (-0.43z)| lr 2.34e-04 | 8457.95 ms | -100.0% bf16 MFU | 61990 tok/s +step 11465/19560 | loss 3.302285 (-1.70z)| norm 0.2497 (-1.80z)| lr 2.34e-04 | 8461.79 ms | -100.0% bf16 MFU | 61988 tok/s +step 11466/19560 | loss 3.361765 (-0.50z)| norm 0.2710 (-0.45z)| lr 2.34e-04 | 8454.00 ms | -100.0% bf16 MFU | 61990 tok/s +step 11467/19560 | loss 3.353067 (-0.69z)| norm 0.2579 (-1.27z)| lr 2.34e-04 | 8459.65 ms | -100.0% bf16 MFU | 61989 tok/s +step 11468/19560 | loss 3.530460 (+3.04z)| norm 0.2655 (-0.79z)| lr 2.34e-04 | 8454.91 ms | -100.0% bf16 MFU | 61990 tok/s +step 11469/19560 | loss 3.331209 (-1.13z)| norm 0.2476 (-1.89z)| lr 2.34e-04 | 8458.18 ms | -100.0% bf16 MFU | 61990 tok/s +step 11470/19560 | loss 3.402936 (+0.37z)| norm 0.2692 (-0.52z)| lr 2.34e-04 | 8455.59 ms | -100.0% bf16 MFU | 61991 tok/s +step 11471/19560 | loss 3.402036 (+0.35z)| norm 0.2857 (+0.50z)| lr 2.34e-04 | 8456.13 ms | -100.0% bf16 MFU | 61991 tok/s +step 11472/19560 | loss 3.358170 (-0.58z)| norm 0.2751 (-0.17z)| lr 2.34e-04 | 8455.76 ms | -100.0% bf16 MFU | 61992 tok/s +step 11473/19560 | loss 3.359818 (-0.54z)| norm 0.2860 (+0.51z)| lr 2.33e-04 | 8458.96 ms | -100.0% bf16 MFU | 61991 tok/s +step 11474/19560 | loss 3.395407 (+0.21z)| norm 0.2620 (-1.01z)| lr 2.33e-04 | 8457.73 ms | -100.0% bf16 MFU | 61991 tok/s +step 11475/19560 | loss 3.388077 (+0.05z)| norm 0.2538 (-1.52z)| lr 2.33e-04 | 8454.21 ms | -100.0% bf16 MFU | 61992 tok/s +step 11476/19560 | loss 3.371288 (-0.29z)| norm 0.2728 (-0.33z)| lr 2.33e-04 | 8459.24 ms | -100.0% bf16 MFU | 61991 tok/s +step 11477/19560 | loss 3.336714 (-1.02z)| norm 0.2725 (-0.35z)| lr 2.33e-04 | 8457.07 ms | -100.0% bf16 MFU | 61992 tok/s +step 11478/19560 | loss 3.387737 (+0.06z)| norm 0.2758 (-0.14z)| lr 2.33e-04 | 8454.23 ms | -100.0% bf16 MFU | 61993 tok/s +step 11479/19560 | loss 3.408134 (+0.48z)| norm 0.2899 (+0.75z)| lr 2.33e-04 | 8458.88 ms | -100.0% bf16 MFU | 61992 tok/s +step 11480/19560 | loss 3.331880 (-1.13z)| norm 0.2881 (+0.64z)| lr 2.33e-04 | 8457.62 ms | -100.0% bf16 MFU | 61992 tok/s +step 11481/19560 | loss 3.352243 (-0.69z)| norm 0.2919 (+0.89z)| lr 2.33e-04 | 8456.64 ms | -100.0% bf16 MFU | 61992 tok/s +step 11482/19560 | loss 3.408259 (+0.51z)| norm 0.2978 (+1.26z)| lr 2.33e-04 | 8455.05 ms | -100.0% bf16 MFU | 61993 tok/s +step 11483/19560 | loss 3.426993 (+0.89z)| norm 0.2827 (+0.29z)| lr 2.33e-04 | 8457.30 ms | -100.0% bf16 MFU | 61993 tok/s +step 11484/19560 | loss 3.406904 (+0.45z)| norm 0.2959 (+1.12z)| lr 2.33e-04 | 8455.57 ms | -100.0% bf16 MFU | 61994 tok/s +step 11485/19560 | loss 3.392525 (+0.14z)| norm 0.2719 (-0.42z)| lr 2.33e-04 | 8459.32 ms | -100.0% bf16 MFU | 61993 tok/s +step 11486/19560 | loss 3.367984 (-0.38z)| norm 0.3267 (+2.97z)| lr 2.33e-04 | 8456.92 ms | -100.0% bf16 MFU | 61993 tok/s +step 11487/19560 | loss 3.307199 (-1.68z)| norm 0.2699 (-0.54z)| lr 2.33e-04 | 8458.08 ms | -100.0% bf16 MFU | 61993 tok/s +step 11488/19560 | loss 3.371832 (-0.27z)| norm 0.2795 (+0.04z)| lr 2.33e-04 | 8461.66 ms | -100.0% bf16 MFU | 61991 tok/s +step 11489/19560 | loss 3.374354 (-0.21z)| norm 0.2682 (-0.66z)| lr 2.33e-04 | 8456.74 ms | -100.0% bf16 MFU | 61991 tok/s +step 11490/19560 | loss 3.378456 (-0.12z)| norm 0.2970 (+1.15z)| lr 2.33e-04 | 8457.88 ms | -100.0% bf16 MFU | 61991 tok/s +step 11491/19560 | loss 3.309927 (-1.60z)| norm 0.2650 (-0.86z)| lr 2.33e-04 | 8457.04 ms | -100.0% bf16 MFU | 61991 tok/s +step 11492/19560 | loss 3.372100 (-0.25z)| norm 0.2896 (+0.68z)| lr 2.33e-04 | 8457.23 ms | -100.0% bf16 MFU | 61991 tok/s +step 11493/19560 | loss 3.379305 (-0.09z)| norm 0.2489 (-1.84z)| lr 2.33e-04 | 8453.36 ms | -100.0% bf16 MFU | 61993 tok/s +step 11494/19560 | loss 3.389937 (+0.14z)| norm 0.2877 (+0.55z)| lr 2.32e-04 | 8456.20 ms | -100.0% bf16 MFU | 61993 tok/s +step 11495/19560 | loss 3.388808 (+0.12z)| norm 0.2780 (-0.04z)| lr 2.32e-04 | 8455.45 ms | -100.0% bf16 MFU | 61994 tok/s +step 11496/19560 | loss 3.392009 (+0.18z)| norm 0.2628 (-0.99z)| lr 2.32e-04 | 8454.98 ms | -100.0% bf16 MFU | 61995 tok/s +step 11497/19560 | loss 3.368459 (-0.33z)| norm 0.2734 (-0.33z)| lr 2.32e-04 | 8453.46 ms | -100.0% bf16 MFU | 61996 tok/s +step 11498/19560 | loss 3.316558 (-1.44z)| norm 0.2657 (-0.79z)| lr 2.32e-04 | 8456.75 ms | -100.0% bf16 MFU | 61996 tok/s +step 11499/19560 | loss 3.390229 (+0.15z)| norm 0.2660 (-0.77z)| lr 2.32e-04 | 8456.15 ms | -100.0% bf16 MFU | 61996 tok/s +step 11500/19560 | loss 3.436050 (+1.15z)| norm 0.2598 (-1.14z)| lr 2.32e-04 | 8457.97 ms | -100.0% bf16 MFU | 61996 tok/s +val loss 3.353679 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2882/10042 = 0.286995 +step 11501/19560 | loss 3.383172 (-0.00z)| norm 0.2801 (+0.13z)| lr 2.32e-04 | 8453.22 ms | -100.0% bf16 MFU | 61997 tok/s +step 11502/19560 | loss 3.310953 (-1.55z)| norm 0.2663 (-0.73z)| lr 2.32e-04 | 8453.02 ms | -100.0% bf16 MFU | 61998 tok/s +step 11503/19560 | loss 3.344440 (-0.82z)| norm 0.2725 (-0.34z)| lr 2.32e-04 | 8459.12 ms | -100.0% bf16 MFU | 61997 tok/s +step 11504/19560 | loss 3.377007 (-0.13z)| norm 0.2587 (-1.19z)| lr 2.32e-04 | 8454.23 ms | -100.0% bf16 MFU | 61998 tok/s +step 11505/19560 | loss 3.392966 (+0.25z)| norm 0.2886 (+0.67z)| lr 2.32e-04 | 8453.21 ms | -100.0% bf16 MFU | 62000 tok/s +step 11506/19560 | loss 3.341364 (-0.90z)| norm 0.2808 (+0.18z)| lr 2.32e-04 | 8454.97 ms | -100.0% bf16 MFU | 62000 tok/s +step 11507/19560 | loss 3.377562 (-0.08z)| norm 0.2721 (-0.35z)| lr 2.32e-04 | 8455.02 ms | -100.0% bf16 MFU | 62000 tok/s +step 11508/19560 | loss 3.322208 (-1.31z)| norm 0.2675 (-0.63z)| lr 2.32e-04 | 8454.65 ms | -100.0% bf16 MFU | 62001 tok/s +step 11509/19560 | loss 3.390865 (+0.22z)| norm 0.2802 (+0.17z)| lr 2.32e-04 | 8452.78 ms | -100.0% bf16 MFU | 62002 tok/s +step 11510/19560 | loss 3.343956 (-0.83z)| norm 0.2993 (+1.35z)| lr 2.32e-04 | 8456.20 ms | -100.0% bf16 MFU | 62002 tok/s +step 11511/19560 | loss 3.409511 (+0.68z)| norm 0.2385 (-2.40z)| lr 2.32e-04 | 8453.87 ms | -100.0% bf16 MFU | 62003 tok/s +step 11512/19560 | loss 3.336684 (-0.99z)| norm 0.2708 (-0.38z)| lr 2.32e-04 | 8454.50 ms | -100.0% bf16 MFU | 62003 tok/s +step 11513/19560 | loss 3.371553 (-0.17z)| norm 0.2617 (-0.93z)| lr 2.32e-04 | 8452.49 ms | -100.0% bf16 MFU | 62005 tok/s +step 11514/19560 | loss 3.396739 (+0.43z)| norm 0.2587 (-1.10z)| lr 2.31e-04 | 8455.91 ms | -100.0% bf16 MFU | 62005 tok/s +step 11515/19560 | loss 3.400382 (+0.52z)| norm 0.2620 (-0.89z)| lr 2.31e-04 | 8455.40 ms | -100.0% bf16 MFU | 62005 tok/s +step 11516/19560 | loss 3.385144 (+0.17z)| norm 0.2634 (-0.79z)| lr 2.31e-04 | 8459.64 ms | -100.0% bf16 MFU | 62003 tok/s +step 11517/19560 | loss 3.393234 (+0.36z)| norm 0.2911 (+0.92z)| lr 2.31e-04 | 8456.75 ms | -100.0% bf16 MFU | 62003 tok/s +step 11518/19560 | loss 3.404550 (+0.63z)| norm 0.2572 (-1.18z)| lr 2.31e-04 | 8457.44 ms | -100.0% bf16 MFU | 62002 tok/s +step 11519/19560 | loss 3.355952 (-0.54z)| norm 0.2825 (+0.39z)| lr 2.31e-04 | 8456.93 ms | -100.0% bf16 MFU | 62002 tok/s +step 11520/19560 | loss 3.344411 (-0.81z)| norm 0.2594 (-1.06z)| lr 2.31e-04 | 8453.84 ms | -100.0% bf16 MFU | 62003 tok/s +step 11521/19560 | loss 3.400399 (+0.53z)| norm 0.2790 (+0.16z)| lr 2.31e-04 | 8454.29 ms | -100.0% bf16 MFU | 62003 tok/s +step 11522/19560 | loss 3.336135 (-1.00z)| norm 0.2667 (-0.61z)| lr 2.31e-04 | 8455.19 ms | -100.0% bf16 MFU | 62004 tok/s +step 11523/19560 | loss 3.385702 (+0.18z)| norm 0.2592 (-1.06z)| lr 2.31e-04 | 8455.11 ms | -100.0% bf16 MFU | 62004 tok/s +step 11524/19560 | loss 3.345631 (-0.78z)| norm 0.2691 (-0.46z)| lr 2.31e-04 | 8451.70 ms | -100.0% bf16 MFU | 62005 tok/s +step 11525/19560 | loss 3.307749 (-1.65z)| norm 0.2755 (-0.07z)| lr 2.31e-04 | 8456.94 ms | -100.0% bf16 MFU | 62005 tok/s +step 11526/19560 | loss 3.371929 (-0.13z)| norm 0.2804 (+0.25z)| lr 2.31e-04 | 8455.85 ms | -100.0% bf16 MFU | 62005 tok/s +step 11527/19560 | loss 3.396354 (+0.45z)| norm 0.2678 (-0.55z)| lr 2.31e-04 | 8455.68 ms | -100.0% bf16 MFU | 62005 tok/s +step 11528/19560 | loss 3.390774 (+0.32z)| norm 0.2756 (-0.06z)| lr 2.31e-04 | 8453.64 ms | -100.0% bf16 MFU | 62005 tok/s +step 11529/19560 | loss 3.390558 (+0.30z)| norm 0.2488 (-1.73z)| lr 2.31e-04 | 8455.56 ms | -100.0% bf16 MFU | 62005 tok/s +step 11530/19560 | loss 3.362272 (-0.36z)| norm 0.2721 (-0.27z)| lr 2.31e-04 | 8456.77 ms | -100.0% bf16 MFU | 62005 tok/s +step 11531/19560 | loss 3.384534 (+0.16z)| norm 0.2493 (-1.69z)| lr 2.31e-04 | 8453.54 ms | -100.0% bf16 MFU | 62006 tok/s +step 11532/19560 | loss 3.366346 (-0.29z)| norm 0.2737 (-0.16z)| lr 2.31e-04 | 8456.52 ms | -100.0% bf16 MFU | 62005 tok/s +step 11533/19560 | loss 3.351427 (-0.64z)| norm 0.2649 (-0.72z)| lr 2.31e-04 | 8455.73 ms | -100.0% bf16 MFU | 62005 tok/s +step 11534/19560 | loss 3.406653 (+0.68z)| norm 0.2598 (-1.03z)| lr 2.31e-04 | 8453.62 ms | -100.0% bf16 MFU | 62006 tok/s +step 11535/19560 | loss 3.343635 (-0.82z)| norm 0.2647 (-0.74z)| lr 2.30e-04 | 8454.13 ms | -100.0% bf16 MFU | 62006 tok/s +step 11536/19560 | loss 3.390291 (+0.28z)| norm 0.2649 (-0.71z)| lr 2.30e-04 | 8455.80 ms | -100.0% bf16 MFU | 62006 tok/s +step 11537/19560 | loss 3.346588 (-0.79z)| norm 0.2739 (-0.14z)| lr 2.30e-04 | 8454.40 ms | -100.0% bf16 MFU | 62007 tok/s +step 11538/19560 | loss 3.377648 (-0.02z)| norm 0.2522 (-1.51z)| lr 2.30e-04 | 8455.32 ms | -100.0% bf16 MFU | 62007 tok/s +step 11539/19560 | loss 3.331537 (-1.14z)| norm 0.2672 (-0.54z)| lr 2.30e-04 | 8452.21 ms | -100.0% bf16 MFU | 62008 tok/s +step 11540/19560 | loss 3.431574 (+1.28z)| norm 0.2693 (-0.41z)| lr 2.30e-04 | 8455.91 ms | -100.0% bf16 MFU | 62007 tok/s +step 11541/19560 | loss 3.366964 (-0.28z)| norm 0.2724 (-0.20z)| lr 2.30e-04 | 8455.88 ms | -100.0% bf16 MFU | 62007 tok/s +step 11542/19560 | loss 3.342807 (-0.85z)| norm 0.2748 (-0.04z)| lr 2.30e-04 | 8457.04 ms | -100.0% bf16 MFU | 62007 tok/s +step 11543/19560 | loss 3.404326 (+0.64z)| norm 0.2777 (+0.14z)| lr 2.30e-04 | 8453.54 ms | -100.0% bf16 MFU | 62007 tok/s +step 11544/19560 | loss 3.360048 (-0.42z)| norm 0.2565 (-1.20z)| lr 2.30e-04 | 8456.13 ms | -100.0% bf16 MFU | 62007 tok/s +step 11545/19560 | loss 3.430924 (+1.31z)| norm 0.2942 (+1.19z)| lr 2.30e-04 | 8453.15 ms | -100.0% bf16 MFU | 62008 tok/s +step 11546/19560 | loss 3.341977 (-0.87z)| norm 0.2848 (+0.60z)| lr 2.30e-04 | 8455.18 ms | -100.0% bf16 MFU | 62008 tok/s +step 11547/19560 | loss 3.377270 (-0.01z)| norm 0.2828 (+0.47z)| lr 2.30e-04 | 8463.73 ms | -100.0% bf16 MFU | 62005 tok/s +step 11548/19560 | loss 3.354907 (-0.54z)| norm 0.2827 (+0.47z)| lr 2.30e-04 | 8485.90 ms | -100.0% bf16 MFU | 61994 tok/s +step 11549/19560 | loss 3.369511 (-0.17z)| norm 0.2653 (-0.62z)| lr 2.30e-04 | 8481.21 ms | -100.0% bf16 MFU | 61985 tok/s +step 11550/19560 | loss 3.393346 (+0.46z)| norm 0.2872 (+0.80z)| lr 2.30e-04 | 8482.13 ms | -100.0% bf16 MFU | 61976 tok/s +step 11551/19560 | loss 3.318330 (-1.46z)| norm 0.2707 (-0.26z)| lr 2.30e-04 | 8480.68 ms | -100.0% bf16 MFU | 61968 tok/s +step 11552/19560 | loss 3.320799 (-1.40z)| norm 0.2990 (+1.60z)| lr 2.30e-04 | 8481.96 ms | -100.0% bf16 MFU | 61961 tok/s +step 11553/19560 | loss 3.317473 (-1.48z)| norm 0.2738 (-0.05z)| lr 2.30e-04 | 8478.31 ms | -100.0% bf16 MFU | 61954 tok/s +step 11554/19560 | loss 3.363811 (-0.25z)| norm 0.2782 (+0.25z)| lr 2.30e-04 | 8479.11 ms | -100.0% bf16 MFU | 61948 tok/s +step 11555/19560 | loss 3.379931 (+0.17z)| norm 0.2709 (-0.23z)| lr 2.30e-04 | 8478.85 ms | -100.0% bf16 MFU | 61943 tok/s +step 11556/19560 | loss 3.379639 (+0.16z)| norm 0.2522 (-1.47z)| lr 2.29e-04 | 8481.63 ms | -100.0% bf16 MFU | 61936 tok/s +step 11557/19560 | loss 3.368145 (-0.13z)| norm 0.2811 (+0.48z)| lr 2.29e-04 | 8477.54 ms | -100.0% bf16 MFU | 61932 tok/s +step 11558/19560 | loss 3.392170 (+0.53z)| norm 0.2593 (-0.97z)| lr 2.29e-04 | 8477.21 ms | -100.0% bf16 MFU | 61927 tok/s +step 11559/19560 | loss 3.354718 (-0.50z)| norm 0.2620 (-0.78z)| lr 2.29e-04 | 8470.06 ms | -100.0% bf16 MFU | 61926 tok/s +step 11560/19560 | loss 3.294854 (-2.10z)| norm 0.2654 (-0.54z)| lr 2.29e-04 | 8473.94 ms | -100.0% bf16 MFU | 61923 tok/s +step 11561/19560 | loss 3.355126 (-0.46z)| norm 0.2696 (-0.24z)| lr 2.29e-04 | 8470.04 ms | -100.0% bf16 MFU | 61922 tok/s +step 11562/19560 | loss 3.361406 (-0.29z)| norm 0.2681 (-0.35z)| lr 2.29e-04 | 8471.41 ms | -100.0% bf16 MFU | 61920 tok/s +step 11563/19560 | loss 3.362519 (-0.27z)| norm 0.2539 (-1.30z)| lr 2.29e-04 | 8473.44 ms | -100.0% bf16 MFU | 61918 tok/s +step 11564/19560 | loss 3.389660 (+0.47z)| norm 0.2659 (-0.48z)| lr 2.29e-04 | 8471.75 ms | -100.0% bf16 MFU | 61917 tok/s +step 11565/19560 | loss 3.315946 (-1.61z)| norm 0.2485 (-1.63z)| lr 2.29e-04 | 8474.85 ms | -100.0% bf16 MFU | 61914 tok/s +step 11566/19560 | loss 3.372625 (+0.03z)| norm 0.2602 (-0.83z)| lr 2.29e-04 | 8474.83 ms | -100.0% bf16 MFU | 61911 tok/s +step 11567/19560 | loss 3.343226 (-0.82z)| norm 0.2583 (-0.95z)| lr 2.29e-04 | 8476.45 ms | -100.0% bf16 MFU | 61908 tok/s +step 11568/19560 | loss 3.358323 (-0.37z)| norm 0.2729 (+0.02z)| lr 2.29e-04 | 8473.66 ms | -100.0% bf16 MFU | 61907 tok/s +step 11569/19560 | loss 3.381177 (+0.30z)| norm 0.2739 (+0.11z)| lr 2.29e-04 | 8471.96 ms | -100.0% bf16 MFU | 61906 tok/s +step 11570/19560 | loss 3.343659 (-0.79z)| norm 0.2637 (-0.58z)| lr 2.29e-04 | 8473.20 ms | -100.0% bf16 MFU | 61904 tok/s +step 11571/19560 | loss 3.385404 (+0.42z)| norm 0.2783 (+0.45z)| lr 2.29e-04 | 8471.31 ms | -100.0% bf16 MFU | 61903 tok/s +step 11572/19560 | loss 3.424297 (+1.55z)| norm 0.2771 (+0.38z)| lr 2.29e-04 | 8469.86 ms | -100.0% bf16 MFU | 61903 tok/s +step 11573/19560 | loss 3.326938 (-1.27z)| norm 0.2858 (+0.98z)| lr 2.29e-04 | 8465.21 ms | -100.0% bf16 MFU | 61905 tok/s +step 11574/19560 | loss 3.350543 (-0.58z)| norm 0.2715 (-0.02z)| lr 2.29e-04 | 8466.52 ms | -100.0% bf16 MFU | 61906 tok/s +step 11575/19560 | loss 3.415705 (+1.29z)| norm 0.3483 (+4.85z)| lr 2.29e-04 | 8466.16 ms | -100.0% bf16 MFU | 61907 tok/s +step 11576/19560 | loss 3.381424 (+0.31z)| norm 0.3231 (+3.13z)| lr 2.28e-04 | 8469.38 ms | -100.0% bf16 MFU | 61907 tok/s +step 11577/19560 | loss 3.404906 (+1.01z)| norm 0.2821 (+0.58z)| lr 2.28e-04 | 8473.15 ms | -100.0% bf16 MFU | 61905 tok/s +step 11578/19560 | loss 3.386917 (+0.47z)| norm 0.3074 (+2.11z)| lr 2.28e-04 | 8475.92 ms | -100.0% bf16 MFU | 61903 tok/s +step 11579/19560 | loss 3.331850 (-1.13z)| norm 0.2664 (-0.41z)| lr 2.28e-04 | 8466.14 ms | -100.0% bf16 MFU | 61904 tok/s +step 11580/19560 | loss 3.358297 (-0.36z)| norm 0.2829 (+0.60z)| lr 2.28e-04 | 8473.87 ms | -100.0% bf16 MFU | 61902 tok/s +step 11581/19560 | loss 3.347404 (-0.67z)| norm 0.2809 (+0.49z)| lr 2.28e-04 | 8465.39 ms | -100.0% bf16 MFU | 61904 tok/s +step 11582/19560 | loss 3.341975 (-0.81z)| norm 0.2663 (-0.43z)| lr 2.28e-04 | 8458.95 ms | -100.0% bf16 MFU | 61908 tok/s +step 11583/19560 | loss 3.345609 (-0.70z)| norm 0.2854 (+0.77z)| lr 2.28e-04 | 8452.63 ms | -100.0% bf16 MFU | 61914 tok/s +step 11584/19560 | loss 3.379110 (+0.29z)| norm 0.2705 (-0.17z)| lr 2.28e-04 | 8451.74 ms | -100.0% bf16 MFU | 61920 tok/s +step 11585/19560 | loss 3.324255 (-1.32z)| norm 0.2700 (-0.20z)| lr 2.28e-04 | 8442.15 ms | -100.0% bf16 MFU | 61929 tok/s +step 11586/19560 | loss 3.344061 (-0.73z)| norm 0.2760 (+0.20z)| lr 2.28e-04 | 8440.94 ms | -100.0% bf16 MFU | 61938 tok/s +step 11587/19560 | loss 3.414539 (+1.35z)| norm 0.2755 (+0.15z)| lr 2.28e-04 | 8448.35 ms | -100.0% bf16 MFU | 61944 tok/s +step 11588/19560 | loss 3.344957 (-0.70z)| norm 0.2768 (+0.23z)| lr 2.28e-04 | 8451.54 ms | -100.0% bf16 MFU | 61949 tok/s +step 11589/19560 | loss 3.354142 (-0.43z)| norm 0.2916 (+1.17z)| lr 2.28e-04 | 8454.48 ms | -100.0% bf16 MFU | 61952 tok/s +step 11590/19560 | loss 3.493784 (+3.51z)| norm 0.3204 (+2.90z)| lr 2.28e-04 | 8447.64 ms | -100.0% bf16 MFU | 61957 tok/s +step 11591/19560 | loss 3.307411 (-1.74z)| norm 0.2919 (+1.11z)| lr 2.28e-04 | 8448.47 ms | -100.0% bf16 MFU | 61962 tok/s +step 11592/19560 | loss 3.362302 (-0.20z)| norm 0.2876 (+0.83z)| lr 2.28e-04 | 8452.42 ms | -100.0% bf16 MFU | 61966 tok/s +step 11593/19560 | loss 3.316583 (-1.50z)| norm 0.2722 (-0.13z)| lr 2.28e-04 | 8441.69 ms | -100.0% bf16 MFU | 61973 tok/s +step 11594/19560 | loss 3.363632 (-0.17z)| norm 0.3077 (+2.04z)| lr 2.28e-04 | 8448.69 ms | -100.0% bf16 MFU | 61977 tok/s +step 11595/19560 | loss 3.345141 (-0.69z)| norm 0.2641 (-0.65z)| lr 2.28e-04 | 8447.16 ms | -100.0% bf16 MFU | 61981 tok/s +step 11596/19560 | loss 3.358696 (-0.29z)| norm 0.2789 (+0.25z)| lr 2.28e-04 | 8446.93 ms | -100.0% bf16 MFU | 61986 tok/s +step 11597/19560 | loss 3.400158 (+0.97z)| norm 0.2666 (-0.52z)| lr 2.27e-04 | 8446.17 ms | -100.0% bf16 MFU | 61990 tok/s +step 11598/19560 | loss 3.355079 (-0.41z)| norm 0.2629 (-0.74z)| lr 2.27e-04 | 8448.83 ms | -100.0% bf16 MFU | 61993 tok/s +step 11599/19560 | loss 3.307997 (-1.83z)| norm 0.2871 (+0.76z)| lr 2.27e-04 | 8448.07 ms | -100.0% bf16 MFU | 61997 tok/s +step 11600/19560 | loss 3.408107 (+1.23z)| norm 0.2719 (-0.18z)| lr 2.27e-04 | 8447.16 ms | -100.0% bf16 MFU | 62000 tok/s +step 11601/19560 | loss 3.306555 (-1.84z)| norm 0.2701 (-0.28z)| lr 2.27e-04 | 8445.89 ms | -100.0% bf16 MFU | 62004 tok/s +step 11602/19560 | loss 3.377914 (+0.32z)| norm 0.2888 (+0.86z)| lr 2.27e-04 | 8446.68 ms | -100.0% bf16 MFU | 62007 tok/s +step 11603/19560 | loss 3.317327 (-1.49z)| norm 0.2518 (-1.44z)| lr 2.27e-04 | 8446.60 ms | -100.0% bf16 MFU | 62010 tok/s +step 11604/19560 | loss 3.357756 (-0.27z)| norm 0.2686 (-0.39z)| lr 2.27e-04 | 8447.88 ms | -100.0% bf16 MFU | 62013 tok/s +step 11605/19560 | loss 3.368229 (+0.04z)| norm 0.2688 (-0.38z)| lr 2.27e-04 | 8448.91 ms | -100.0% bf16 MFU | 62015 tok/s +step 11606/19560 | loss 3.349718 (-0.51z)| norm 0.2810 (+0.38z)| lr 2.27e-04 | 8449.15 ms | -100.0% bf16 MFU | 62017 tok/s +step 11607/19560 | loss 3.385740 (+0.58z)| norm 0.2879 (+0.81z)| lr 2.27e-04 | 8449.24 ms | -100.0% bf16 MFU | 62019 tok/s +step 11608/19560 | loss 3.356645 (-0.31z)| norm 0.2597 (-0.93z)| lr 2.27e-04 | 8451.35 ms | -100.0% bf16 MFU | 62020 tok/s +step 11609/19560 | loss 3.379139 (+0.37z)| norm 0.2621 (-0.77z)| lr 2.27e-04 | 8453.13 ms | -100.0% bf16 MFU | 62020 tok/s +step 11610/19560 | loss 3.378119 (+0.35z)| norm 0.2695 (-0.29z)| lr 2.27e-04 | 8458.61 ms | -100.0% bf16 MFU | 62018 tok/s +step 11611/19560 | loss 3.388162 (+0.67z)| norm 0.2804 (+0.39z)| lr 2.27e-04 | 8463.71 ms | -100.0% bf16 MFU | 62014 tok/s +step 11612/19560 | loss 3.369507 (+0.10z)| norm 0.2885 (+0.90z)| lr 2.27e-04 | 8452.90 ms | -100.0% bf16 MFU | 62015 tok/s +step 11613/19560 | loss 3.373905 (+0.25z)| norm 0.2547 (-1.21z)| lr 2.27e-04 | 8459.20 ms | -100.0% bf16 MFU | 62013 tok/s +step 11614/19560 | loss 3.310181 (-1.71z)| norm 0.2603 (-0.86z)| lr 2.27e-04 | 8460.87 ms | -100.0% bf16 MFU | 62011 tok/s +step 11615/19560 | loss 3.367491 (+0.05z)| norm 0.2659 (-0.49z)| lr 2.27e-04 | 8464.99 ms | -100.0% bf16 MFU | 62007 tok/s +step 11616/19560 | loss 3.332723 (-1.03z)| norm 0.2708 (-0.16z)| lr 2.27e-04 | 8456.96 ms | -100.0% bf16 MFU | 62006 tok/s +step 11617/19560 | loss 3.382797 (+0.53z)| norm 0.2634 (-0.65z)| lr 2.26e-04 | 8461.91 ms | -100.0% bf16 MFU | 62004 tok/s +step 11618/19560 | loss 3.437529 (+2.18z)| norm 0.2565 (-1.08z)| lr 2.26e-04 | 8462.17 ms | -100.0% bf16 MFU | 62002 tok/s +step 11619/19560 | loss 3.370773 (+0.13z)| norm 0.2727 (-0.02z)| lr 2.26e-04 | 8461.47 ms | -100.0% bf16 MFU | 62000 tok/s +step 11620/19560 | loss 3.346709 (-0.61z)| norm 0.2602 (-0.83z)| lr 2.26e-04 | 8460.47 ms | -100.0% bf16 MFU | 61998 tok/s +step 11621/19560 | loss 3.365379 (-0.03z)| norm 0.2581 (-0.98z)| lr 2.26e-04 | 8468.38 ms | -100.0% bf16 MFU | 61994 tok/s +step 11622/19560 | loss 3.313762 (-1.59z)| norm 0.2553 (-1.15z)| lr 2.26e-04 | 8458.44 ms | -100.0% bf16 MFU | 61993 tok/s +step 11623/19560 | loss 3.411974 (+1.40z)| norm 0.2776 (+0.33z)| lr 2.26e-04 | 8461.91 ms | -100.0% bf16 MFU | 61991 tok/s +step 11624/19560 | loss 3.319139 (-1.40z)| norm 0.2506 (-1.44z)| lr 2.26e-04 | 8456.42 ms | -100.0% bf16 MFU | 61992 tok/s +step 11625/19560 | loss 3.340655 (-0.74z)| norm 0.2634 (-0.60z)| lr 2.26e-04 | 8456.69 ms | -100.0% bf16 MFU | 61992 tok/s +step 11626/19560 | loss 3.412107 (+1.40z)| norm 0.2761 (+0.23z)| lr 2.26e-04 | 8455.39 ms | -100.0% bf16 MFU | 61993 tok/s +step 11627/19560 | loss 3.326587 (-1.17z)| norm 0.2751 (+0.16z)| lr 2.26e-04 | 8459.11 ms | -100.0% bf16 MFU | 61992 tok/s +step 11628/19560 | loss 3.392352 (+0.83z)| norm 0.2661 (-0.43z)| lr 2.26e-04 | 8460.06 ms | -100.0% bf16 MFU | 61991 tok/s +step 11629/19560 | loss 3.426970 (+1.86z)| norm 0.2803 (+0.50z)| lr 2.26e-04 | 8461.38 ms | -100.0% bf16 MFU | 61990 tok/s +step 11630/19560 | loss 3.372888 (+0.21z)| norm 0.2976 (+1.61z)| lr 2.26e-04 | 8460.27 ms | -100.0% bf16 MFU | 61989 tok/s +step 11631/19560 | loss 3.417006 (+1.53z)| norm 0.2700 (-0.19z)| lr 2.26e-04 | 8456.31 ms | -100.0% bf16 MFU | 61989 tok/s +step 11632/19560 | loss 3.426013 (+1.77z)| norm 0.3001 (+1.73z)| lr 2.26e-04 | 8459.96 ms | -100.0% bf16 MFU | 61988 tok/s +step 11633/19560 | loss 3.358179 (-0.25z)| norm 0.2732 (+0.00z)| lr 2.26e-04 | 8457.85 ms | -100.0% bf16 MFU | 61988 tok/s +step 11634/19560 | loss 3.346121 (-0.62z)| norm 0.2904 (+1.11z)| lr 2.26e-04 | 8457.92 ms | -100.0% bf16 MFU | 61988 tok/s +step 11635/19560 | loss 3.348056 (-0.55z)| norm 0.2714 (-0.12z)| lr 2.26e-04 | 8460.48 ms | -100.0% bf16 MFU | 61987 tok/s +step 11636/19560 | loss 3.349502 (-0.52z)| norm 0.2946 (+1.36z)| lr 2.26e-04 | 8460.57 ms | -100.0% bf16 MFU | 61986 tok/s +step 11637/19560 | loss 3.333324 (-0.99z)| norm 0.2933 (+1.26z)| lr 2.26e-04 | 8462.33 ms | -100.0% bf16 MFU | 61985 tok/s +step 11638/19560 | loss 3.357783 (-0.26z)| norm 0.2740 (+0.04z)| lr 2.25e-04 | 8456.85 ms | -100.0% bf16 MFU | 61985 tok/s +step 11639/19560 | loss 3.313694 (-1.56z)| norm 0.2905 (+1.10z)| lr 2.25e-04 | 8461.38 ms | -100.0% bf16 MFU | 61984 tok/s +step 11640/19560 | loss 3.310536 (-1.64z)| norm 0.2373 (-2.32z)| lr 2.25e-04 | 8459.11 ms | -100.0% bf16 MFU | 61984 tok/s +step 11641/19560 | loss 3.395416 (+0.89z)| norm 0.2633 (-0.65z)| lr 2.25e-04 | 8457.26 ms | -100.0% bf16 MFU | 61984 tok/s +step 11642/19560 | loss 3.333721 (-0.93z)| norm 0.2738 (+0.01z)| lr 2.25e-04 | 8459.74 ms | -100.0% bf16 MFU | 61984 tok/s +step 11643/19560 | loss 3.326820 (-1.12z)| norm 0.2552 (-1.17z)| lr 2.25e-04 | 8457.65 ms | -100.0% bf16 MFU | 61984 tok/s +step 11644/19560 | loss 3.400358 (+1.06z)| norm 0.2421 (-1.97z)| lr 2.25e-04 | 8455.56 ms | -100.0% bf16 MFU | 61985 tok/s +step 11645/19560 | loss 3.297924 (-1.93z)| norm 0.2540 (-1.21z)| lr 2.25e-04 | 8462.07 ms | -100.0% bf16 MFU | 61984 tok/s +step 11646/19560 | loss 3.318718 (-1.30z)| norm 0.2376 (-2.19z)| lr 2.25e-04 | 8459.22 ms | -100.0% bf16 MFU | 61984 tok/s +step 11647/19560 | loss 3.373315 (+0.29z)| norm 0.2773 (+0.28z)| lr 2.25e-04 | 8460.13 ms | -100.0% bf16 MFU | 61983 tok/s +step 11648/19560 | loss 3.306425 (-1.64z)| norm 0.2425 (-1.86z)| lr 2.25e-04 | 8456.47 ms | -100.0% bf16 MFU | 61984 tok/s +step 11649/19560 | loss 3.354191 (-0.25z)| norm 0.2611 (-0.70z)| lr 2.25e-04 | 8462.09 ms | -100.0% bf16 MFU | 61982 tok/s +step 11650/19560 | loss 3.347559 (-0.44z)| norm 0.2517 (-1.27z)| lr 2.25e-04 | 8459.90 ms | -100.0% bf16 MFU | 61982 tok/s +step 11651/19560 | loss 3.318243 (-1.28z)| norm 0.2557 (-1.02z)| lr 2.25e-04 | 8458.66 ms | -100.0% bf16 MFU | 61982 tok/s +step 11652/19560 | loss 3.408381 (+1.32z)| norm 0.2662 (-0.38z)| lr 2.25e-04 | 8453.67 ms | -100.0% bf16 MFU | 61984 tok/s +step 11653/19560 | loss 3.376799 (+0.39z)| norm 0.2592 (-0.80z)| lr 2.25e-04 | 8453.96 ms | -100.0% bf16 MFU | 61986 tok/s +step 11654/19560 | loss 3.338886 (-0.70z)| norm 0.2631 (-0.55z)| lr 2.25e-04 | 8458.50 ms | -100.0% bf16 MFU | 61985 tok/s +step 11655/19560 | loss 3.339810 (-0.66z)| norm 0.2896 (+1.05z)| lr 2.25e-04 | 8460.17 ms | -100.0% bf16 MFU | 61985 tok/s +step 11656/19560 | loss 3.391923 (+0.86z)| norm 0.2888 (+0.99z)| lr 2.25e-04 | 8459.09 ms | -100.0% bf16 MFU | 61984 tok/s +step 11657/19560 | loss 3.404179 (+1.21z)| norm 0.2611 (-0.69z)| lr 2.25e-04 | 8458.86 ms | -100.0% bf16 MFU | 61984 tok/s +step 11658/19560 | loss 3.301067 (-1.75z)| norm 0.2837 (+0.67z)| lr 2.25e-04 | 8452.45 ms | -100.0% bf16 MFU | 61986 tok/s +step 11659/19560 | loss 3.388167 (+0.74z)| norm 0.2587 (-0.85z)| lr 2.24e-04 | 8456.82 ms | -100.0% bf16 MFU | 61987 tok/s +step 11660/19560 | loss 3.362147 (-0.00z)| norm 0.2667 (-0.36z)| lr 2.24e-04 | 8463.68 ms | -100.0% bf16 MFU | 61985 tok/s +step 11661/19560 | loss 3.357303 (-0.14z)| norm 0.2921 (+1.17z)| lr 2.24e-04 | 8462.12 ms | -100.0% bf16 MFU | 61983 tok/s +step 11662/19560 | loss 3.324632 (-1.06z)| norm 0.2872 (+0.86z)| lr 2.24e-04 | 8456.92 ms | -100.0% bf16 MFU | 61984 tok/s +step 11663/19560 | loss 3.342610 (-0.55z)| norm 0.2709 (-0.13z)| lr 2.24e-04 | 8452.77 ms | -100.0% bf16 MFU | 61986 tok/s +step 11664/19560 | loss 3.346161 (-0.43z)| norm 0.2920 (+1.13z)| lr 2.24e-04 | 8457.89 ms | -100.0% bf16 MFU | 61986 tok/s +step 11665/19560 | loss 3.374742 (+0.38z)| norm 0.2753 (+0.12z)| lr 2.24e-04 | 8457.77 ms | -100.0% bf16 MFU | 61986 tok/s +step 11666/19560 | loss 3.338786 (-0.64z)| norm 0.2615 (-0.73z)| lr 2.24e-04 | 8457.04 ms | -100.0% bf16 MFU | 61987 tok/s +step 11667/19560 | loss 3.330635 (-0.88z)| norm 0.2740 (+0.03z)| lr 2.24e-04 | 8455.46 ms | -100.0% bf16 MFU | 61988 tok/s +step 11668/19560 | loss 3.417815 (+1.64z)| norm 0.2670 (-0.39z)| lr 2.24e-04 | 8457.78 ms | -100.0% bf16 MFU | 61988 tok/s +step 11669/19560 | loss 3.377423 (+0.47z)| norm 0.2461 (-1.63z)| lr 2.24e-04 | 8456.47 ms | -100.0% bf16 MFU | 61988 tok/s +step 11670/19560 | loss 3.333667 (-0.79z)| norm 0.2818 (+0.51z)| lr 2.24e-04 | 8460.90 ms | -100.0% bf16 MFU | 61987 tok/s +step 11671/19560 | loss 3.387481 (+0.77z)| norm 0.2591 (-0.84z)| lr 2.24e-04 | 8458.50 ms | -100.0% bf16 MFU | 61987 tok/s +step 11672/19560 | loss 3.335379 (-0.73z)| norm 0.2577 (-0.93z)| lr 2.24e-04 | 8458.75 ms | -100.0% bf16 MFU | 61987 tok/s +step 11673/19560 | loss 3.349063 (-0.32z)| norm 0.2551 (-1.06z)| lr 2.24e-04 | 8459.21 ms | -100.0% bf16 MFU | 61986 tok/s +step 11674/19560 | loss 3.364168 (+0.12z)| norm 0.2599 (-0.77z)| lr 2.24e-04 | 8456.39 ms | -100.0% bf16 MFU | 61987 tok/s +step 11675/19560 | loss 3.374897 (+0.43z)| norm 0.2819 (+0.55z)| lr 2.24e-04 | 8457.29 ms | -100.0% bf16 MFU | 61987 tok/s +step 11676/19560 | loss 3.346864 (-0.39z)| norm 0.2520 (-1.22z)| lr 2.24e-04 | 8460.04 ms | -100.0% bf16 MFU | 61986 tok/s +step 11677/19560 | loss 3.429457 (+1.99z)| norm 0.3074 (+2.04z)| lr 2.24e-04 | 8458.04 ms | -100.0% bf16 MFU | 61986 tok/s +step 11678/19560 | loss 3.327898 (-0.93z)| norm 0.2607 (-0.69z)| lr 2.24e-04 | 8454.66 ms | -100.0% bf16 MFU | 61988 tok/s +step 11679/19560 | loss 3.339131 (-0.62z)| norm 0.2723 (-0.01z)| lr 2.23e-04 | 8460.99 ms | -100.0% bf16 MFU | 61987 tok/s +step 11680/19560 | loss 3.375225 (+0.42z)| norm 0.2835 (+0.66z)| lr 2.23e-04 | 8458.57 ms | -100.0% bf16 MFU | 61986 tok/s +step 11681/19560 | loss 3.356541 (-0.13z)| norm 0.2649 (-0.44z)| lr 2.23e-04 | 8455.13 ms | -100.0% bf16 MFU | 61988 tok/s +step 11682/19560 | loss 3.330819 (-0.88z)| norm 0.2954 (+1.35z)| lr 2.23e-04 | 8457.04 ms | -100.0% bf16 MFU | 61988 tok/s +step 11683/19560 | loss 3.344802 (-0.46z)| norm 0.2721 (-0.02z)| lr 2.23e-04 | 8456.05 ms | -100.0% bf16 MFU | 61989 tok/s +step 11684/19560 | loss 3.332477 (-0.81z)| norm 0.2790 (+0.37z)| lr 2.23e-04 | 8457.48 ms | -100.0% bf16 MFU | 61989 tok/s +step 11685/19560 | loss 3.333748 (-0.76z)| norm 0.2788 (+0.36z)| lr 2.23e-04 | 8453.22 ms | -100.0% bf16 MFU | 61990 tok/s +step 11686/19560 | loss 3.319526 (-1.16z)| norm 0.2543 (-1.08z)| lr 2.23e-04 | 8454.70 ms | -100.0% bf16 MFU | 61991 tok/s +step 11687/19560 | loss 3.437039 (+2.21z)| norm 0.2704 (-0.14z)| lr 2.23e-04 | 8460.37 ms | -100.0% bf16 MFU | 61990 tok/s +step 11688/19560 | loss 3.337915 (-0.65z)| norm 0.2774 (+0.27z)| lr 2.23e-04 | 8454.52 ms | -100.0% bf16 MFU | 61991 tok/s +step 11689/19560 | loss 3.372252 (+0.34z)| norm 0.2755 (+0.16z)| lr 2.23e-04 | 8455.03 ms | -100.0% bf16 MFU | 61992 tok/s +step 11690/19560 | loss 3.371099 (+0.31z)| norm 0.2860 (+0.77z)| lr 2.23e-04 | 8456.63 ms | -100.0% bf16 MFU | 61993 tok/s +step 11691/19560 | loss 3.418018 (+1.64z)| norm 0.2627 (-0.62z)| lr 2.23e-04 | 8454.58 ms | -100.0% bf16 MFU | 61994 tok/s +step 11692/19560 | loss 3.367528 (+0.20z)| norm 0.2936 (+1.20z)| lr 2.23e-04 | 8454.92 ms | -100.0% bf16 MFU | 61994 tok/s +step 11693/19560 | loss 3.399283 (+1.09z)| norm 0.2521 (-1.25z)| lr 2.23e-04 | 8454.43 ms | -100.0% bf16 MFU | 61995 tok/s +step 11694/19560 | loss 3.346947 (-0.41z)| norm 0.2784 (+0.29z)| lr 2.23e-04 | 8455.15 ms | -100.0% bf16 MFU | 61996 tok/s +step 11695/19560 | loss 3.352034 (-0.27z)| norm 0.2498 (-1.39z)| lr 2.23e-04 | 8456.43 ms | -100.0% bf16 MFU | 61996 tok/s +step 11696/19560 | loss 3.361900 (+0.02z)| norm 0.2790 (+0.33z)| lr 2.23e-04 | 8452.88 ms | -100.0% bf16 MFU | 61998 tok/s +step 11697/19560 | loss 3.455733 (+2.63z)| norm 0.2638 (-0.56z)| lr 2.23e-04 | 8455.12 ms | -100.0% bf16 MFU | 61998 tok/s +step 11698/19560 | loss 3.370428 (+0.23z)| norm 0.3112 (+2.17z)| lr 2.23e-04 | 8458.45 ms | -100.0% bf16 MFU | 61997 tok/s +step 11699/19560 | loss 3.379707 (+0.50z)| norm 0.2578 (-0.91z)| lr 2.23e-04 | 8456.48 ms | -100.0% bf16 MFU | 61997 tok/s +step 11700/19560 | loss 3.370542 (+0.25z)| norm 0.3083 (+1.96z)| lr 2.22e-04 | 8451.22 ms | -100.0% bf16 MFU | 61999 tok/s +step 11701/19560 | loss 3.458719 (+2.66z)| norm 0.2830 (+0.52z)| lr 2.22e-04 | 8452.03 ms | -100.0% bf16 MFU | 62001 tok/s +step 11702/19560 | loss 3.395714 (+0.90z)| norm 0.2725 (-0.07z)| lr 2.22e-04 | 8456.94 ms | -100.0% bf16 MFU | 62001 tok/s +step 11703/19560 | loss 3.339064 (-0.65z)| norm 0.2790 (+0.35z)| lr 2.22e-04 | 8454.45 ms | -100.0% bf16 MFU | 62001 tok/s +step 11704/19560 | loss 3.346618 (-0.43z)| norm 0.2655 (-0.46z)| lr 2.22e-04 | 8453.64 ms | -100.0% bf16 MFU | 62002 tok/s +step 11705/19560 | loss 3.300593 (-1.68z)| norm 0.2783 (+0.35z)| lr 2.22e-04 | 8455.91 ms | -100.0% bf16 MFU | 62002 tok/s +step 11706/19560 | loss 3.391585 (+0.84z)| norm 0.2654 (-0.46z)| lr 2.22e-04 | 8452.88 ms | -100.0% bf16 MFU | 62003 tok/s +step 11707/19560 | loss 3.375390 (+0.38z)| norm 0.2734 (+0.06z)| lr 2.22e-04 | 8456.56 ms | -100.0% bf16 MFU | 62003 tok/s +step 11708/19560 | loss 3.322937 (-1.06z)| norm 0.2753 (+0.18z)| lr 2.22e-04 | 8453.19 ms | -100.0% bf16 MFU | 62004 tok/s +step 11709/19560 | loss 3.338958 (-0.62z)| norm 0.2520 (-1.31z)| lr 2.22e-04 | 8454.65 ms | -100.0% bf16 MFU | 62004 tok/s +step 11710/19560 | loss 3.419706 (+1.58z)| norm 0.2592 (-0.84z)| lr 2.22e-04 | 8444.99 ms | -100.0% bf16 MFU | 62008 tok/s +step 11711/19560 | loss 3.379486 (+0.47z)| norm 0.2806 (+0.55z)| lr 2.22e-04 | 8439.06 ms | -100.0% bf16 MFU | 62014 tok/s +step 11712/19560 | loss 3.353518 (-0.23z)| norm 0.2901 (+1.15z)| lr 2.22e-04 | 8439.46 ms | -100.0% bf16 MFU | 62020 tok/s +step 11713/19560 | loss 3.331768 (-0.83z)| norm 0.2844 (+0.77z)| lr 2.22e-04 | 8436.02 ms | -100.0% bf16 MFU | 62026 tok/s +step 11714/19560 | loss 3.352922 (-0.25z)| norm 0.2727 (+0.02z)| lr 2.22e-04 | 8434.73 ms | -100.0% bf16 MFU | 62033 tok/s +step 11715/19560 | loss 3.307423 (-1.48z)| norm 0.2845 (+0.77z)| lr 2.22e-04 | 8431.91 ms | -100.0% bf16 MFU | 62040 tok/s +step 11716/19560 | loss 3.312785 (-1.31z)| norm 0.2640 (-0.53z)| lr 2.22e-04 | 8434.65 ms | -100.0% bf16 MFU | 62046 tok/s +step 11717/19560 | loss 3.355973 (-0.14z)| norm 0.2757 (+0.23z)| lr 2.22e-04 | 8432.87 ms | -100.0% bf16 MFU | 62052 tok/s +step 11718/19560 | loss 3.382292 (+0.63z)| norm 0.2565 (-1.01z)| lr 2.22e-04 | 8432.12 ms | -100.0% bf16 MFU | 62059 tok/s +step 11719/19560 | loss 3.362514 (+0.05z)| norm 0.2750 (+0.23z)| lr 2.22e-04 | 8430.30 ms | -100.0% bf16 MFU | 62065 tok/s +step 11720/19560 | loss 3.415694 (+1.57z)| norm 0.2684 (-0.20z)| lr 2.22e-04 | 8435.66 ms | -100.0% bf16 MFU | 62069 tok/s +step 11721/19560 | loss 3.459028 (+2.71z)| norm 0.2742 (+0.18z)| lr 2.21e-04 | 8433.69 ms | -100.0% bf16 MFU | 62074 tok/s +step 11722/19560 | loss 3.356097 (-0.17z)| norm 0.2654 (-0.39z)| lr 2.21e-04 | 8433.44 ms | -100.0% bf16 MFU | 62079 tok/s +step 11723/19560 | loss 3.379907 (+0.49z)| norm 0.2623 (-0.61z)| lr 2.21e-04 | 8435.39 ms | -100.0% bf16 MFU | 62083 tok/s +step 11724/19560 | loss 3.344348 (-0.50z)| norm 0.2663 (-0.33z)| lr 2.21e-04 | 8435.55 ms | -100.0% bf16 MFU | 62086 tok/s +step 11725/19560 | loss 3.351635 (-0.29z)| norm 0.2726 (+0.11z)| lr 2.21e-04 | 8434.41 ms | -100.0% bf16 MFU | 62090 tok/s +step 11726/19560 | loss 3.341943 (-0.56z)| norm 0.3062 (+2.36z)| lr 2.21e-04 | 8436.62 ms | -100.0% bf16 MFU | 62093 tok/s +step 11727/19560 | loss 3.371774 (+0.27z)| norm 0.2828 (+0.78z)| lr 2.21e-04 | 8439.37 ms | -100.0% bf16 MFU | 62094 tok/s +step 11728/19560 | loss 3.345100 (-0.48z)| norm 0.2980 (+1.77z)| lr 2.21e-04 | 8434.85 ms | -100.0% bf16 MFU | 62097 tok/s +step 11729/19560 | loss 3.333371 (-0.82z)| norm 0.2662 (-0.36z)| lr 2.21e-04 | 8435.74 ms | -100.0% bf16 MFU | 62100 tok/s +step 11730/19560 | loss 3.371700 (+0.28z)| norm 0.2989 (+1.81z)| lr 2.21e-04 | 8436.61 ms | -100.0% bf16 MFU | 62102 tok/s +step 11731/19560 | loss 3.372384 (+0.29z)| norm 0.2837 (+0.79z)| lr 2.21e-04 | 8438.29 ms | -100.0% bf16 MFU | 62104 tok/s +step 11732/19560 | loss 3.345350 (-0.49z)| norm 0.2863 (+0.95z)| lr 2.21e-04 | 8440.00 ms | -100.0% bf16 MFU | 62104 tok/s +step 11733/19560 | loss 3.328243 (-0.97z)| norm 0.2635 (-0.56z)| lr 2.21e-04 | 8437.66 ms | -100.0% bf16 MFU | 62106 tok/s +step 11734/19560 | loss 3.322616 (-1.12z)| norm 0.2910 (+1.25z)| lr 2.21e-04 | 8441.69 ms | -100.0% bf16 MFU | 62106 tok/s +step 11735/19560 | loss 3.361838 (+0.01z)| norm 0.2880 (+1.06z)| lr 2.21e-04 | 8442.99 ms | -100.0% bf16 MFU | 62106 tok/s +step 11736/19560 | loss 3.408843 (+1.33z)| norm 0.2852 (+0.86z)| lr 2.21e-04 | 8444.02 ms | -100.0% bf16 MFU | 62105 tok/s +step 11737/19560 | loss 3.343719 (-0.51z)| norm 0.2985 (+1.70z)| lr 2.21e-04 | 8444.44 ms | -100.0% bf16 MFU | 62104 tok/s +step 11738/19560 | loss 3.395524 (+0.96z)| norm 0.2835 (+0.71z)| lr 2.21e-04 | 8462.59 ms | -100.0% bf16 MFU | 62096 tok/s +step 11739/19560 | loss 3.407058 (+1.27z)| norm 0.2637 (-0.57z)| lr 2.21e-04 | 8470.96 ms | -100.0% bf16 MFU | 62086 tok/s +step 11740/19560 | loss 3.338174 (-0.67z)| norm 0.2919 (+1.26z)| lr 2.21e-04 | 8470.39 ms | -100.0% bf16 MFU | 62077 tok/s +step 11741/19560 | loss 3.319548 (-1.17z)| norm 0.2638 (-0.57z)| lr 2.21e-04 | 8479.04 ms | -100.0% bf16 MFU | 62065 tok/s +step 11742/19560 | loss 3.317009 (-1.25z)| norm 0.2608 (-0.77z)| lr 2.20e-04 | 8473.30 ms | -100.0% bf16 MFU | 62055 tok/s +step 11743/19560 | loss 3.353574 (-0.22z)| norm 0.2621 (-0.68z)| lr 2.20e-04 | 8479.98 ms | -100.0% bf16 MFU | 62044 tok/s +step 11744/19560 | loss 3.371304 (+0.27z)| norm 0.2702 (-0.15z)| lr 2.20e-04 | 8474.65 ms | -100.0% bf16 MFU | 62035 tok/s +step 11745/19560 | loss 3.381910 (+0.57z)| norm 0.2584 (-0.92z)| lr 2.20e-04 | 8476.64 ms | -100.0% bf16 MFU | 62026 tok/s +step 11746/19560 | loss 3.427097 (+1.86z)| norm 0.2599 (-0.83z)| lr 2.20e-04 | 8474.57 ms | -100.0% bf16 MFU | 62018 tok/s +step 11747/19560 | loss 3.391129 (+0.83z)| norm 0.2877 (+0.98z)| lr 2.20e-04 | 8475.26 ms | -100.0% bf16 MFU | 62010 tok/s +step 11748/19560 | loss 3.424265 (+1.73z)| norm 0.2655 (-0.47z)| lr 2.20e-04 | 8476.23 ms | -100.0% bf16 MFU | 62002 tok/s +step 11749/19560 | loss 3.429161 (+1.83z)| norm 0.2859 (+0.85z)| lr 2.20e-04 | 8477.12 ms | -100.0% bf16 MFU | 61994 tok/s +step 11750/19560 | loss 3.288495 (-2.02z)| norm 0.2559 (-1.11z)| lr 2.20e-04 | 8477.17 ms | -100.0% bf16 MFU | 61987 tok/s +val loss 3.348552 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2905/10042 = 0.289285 +step 11751/19560 | loss 3.350084 (-0.33z)| norm 0.2999 (+1.73z)| lr 2.20e-04 | 8473.77 ms | -100.0% bf16 MFU | 61981 tok/s +step 11752/19560 | loss 3.366093 (+0.10z)| norm 0.2718 (-0.10z)| lr 2.20e-04 | 8476.66 ms | -100.0% bf16 MFU | 61975 tok/s +step 11753/19560 | loss 3.369452 (+0.19z)| norm 0.3019 (+1.83z)| lr 2.20e-04 | 8474.78 ms | -100.0% bf16 MFU | 61969 tok/s +step 11754/19560 | loss 3.413313 (+1.40z)| norm 0.2894 (+1.01z)| lr 2.20e-04 | 8469.60 ms | -100.0% bf16 MFU | 61966 tok/s +step 11755/19560 | loss 3.362311 (-0.02z)| norm 0.3064 (+2.04z)| lr 2.20e-04 | 8467.33 ms | -100.0% bf16 MFU | 61963 tok/s +step 11756/19560 | loss 3.377913 (+0.42z)| norm 0.2825 (+0.53z)| lr 2.20e-04 | 8469.45 ms | -100.0% bf16 MFU | 61960 tok/s +step 11757/19560 | loss 3.378705 (+0.46z)| norm 0.2851 (+0.70z)| lr 2.20e-04 | 8475.94 ms | -100.0% bf16 MFU | 61955 tok/s +step 11758/19560 | loss 3.354070 (-0.23z)| norm 0.2734 (-0.03z)| lr 2.20e-04 | 8477.14 ms | -100.0% bf16 MFU | 61950 tok/s +step 11759/19560 | loss 3.330124 (-0.89z)| norm 0.2920 (+1.13z)| lr 2.20e-04 | 8472.34 ms | -100.0% bf16 MFU | 61946 tok/s +step 11760/19560 | loss 3.411981 (+1.44z)| norm 0.2917 (+1.13z)| lr 2.20e-04 | 8465.51 ms | -100.0% bf16 MFU | 61946 tok/s +step 11761/19560 | loss 3.440109 (+2.18z)| norm 0.2557 (-1.15z)| lr 2.20e-04 | 8466.55 ms | -100.0% bf16 MFU | 61945 tok/s +step 11762/19560 | loss 3.378123 (+0.44z)| norm 0.2882 (+0.91z)| lr 2.19e-04 | 8473.57 ms | -100.0% bf16 MFU | 61941 tok/s +step 11763/19560 | loss 3.359778 (-0.08z)| norm 0.3023 (+1.77z)| lr 2.19e-04 | 8474.15 ms | -100.0% bf16 MFU | 61938 tok/s +step 11764/19560 | loss 3.330940 (-0.88z)| norm 0.2730 (-0.06z)| lr 2.19e-04 | 8471.36 ms | -100.0% bf16 MFU | 61935 tok/s +step 11765/19560 | loss 3.380523 (+0.50z)| norm 0.2577 (-1.00z)| lr 2.19e-04 | 8464.91 ms | -100.0% bf16 MFU | 61935 tok/s +step 11766/19560 | loss 3.404780 (+1.16z)| norm 0.3091 (+2.17z)| lr 2.19e-04 | 8459.16 ms | -100.0% bf16 MFU | 61937 tok/s +step 11767/19560 | loss 3.374501 (+0.31z)| norm 0.2762 (+0.15z)| lr 2.19e-04 | 8468.59 ms | -100.0% bf16 MFU | 61936 tok/s +step 11768/19560 | loss 3.331696 (-0.90z)| norm 0.2685 (-0.35z)| lr 2.19e-04 | 8469.50 ms | -100.0% bf16 MFU | 61934 tok/s +step 11769/19560 | loss 3.385073 (+0.61z)| norm 0.2881 (+0.88z)| lr 2.19e-04 | 8472.13 ms | -100.0% bf16 MFU | 61932 tok/s +step 11770/19560 | loss 3.373170 (+0.26z)| norm 0.2974 (+1.44z)| lr 2.19e-04 | 8466.11 ms | -100.0% bf16 MFU | 61932 tok/s +step 11771/19560 | loss 3.330454 (-0.95z)| norm 0.2574 (-1.07z)| lr 2.19e-04 | 8465.86 ms | -100.0% bf16 MFU | 61932 tok/s +step 11772/19560 | loss 3.330063 (-0.95z)| norm 0.2585 (-1.02z)| lr 2.19e-04 | 8470.00 ms | -100.0% bf16 MFU | 61930 tok/s +step 11773/19560 | loss 3.343434 (-0.58z)| norm 0.2854 (+0.68z)| lr 2.19e-04 | 8469.07 ms | -100.0% bf16 MFU | 61929 tok/s +step 11774/19560 | loss 3.366685 (+0.07z)| norm 0.2573 (-1.15z)| lr 2.19e-04 | 8468.26 ms | -100.0% bf16 MFU | 61928 tok/s +step 11775/19560 | loss 3.376389 (+0.35z)| norm 0.2754 (+0.03z)| lr 2.19e-04 | 8473.77 ms | -100.0% bf16 MFU | 61925 tok/s +step 11776/19560 | loss 3.423110 (+1.68z)| norm 0.3373 (+3.84z)| lr 2.19e-04 | 8471.07 ms | -100.0% bf16 MFU | 61923 tok/s +step 11777/19560 | loss 3.325552 (-1.13z)| norm 0.2658 (-0.62z)| lr 2.19e-04 | 8462.06 ms | -100.0% bf16 MFU | 61925 tok/s +step 11778/19560 | loss 3.368361 (+0.10z)| norm 0.2807 (+0.30z)| lr 2.19e-04 | 8463.23 ms | -100.0% bf16 MFU | 61926 tok/s +step 11779/19560 | loss 3.380973 (+0.45z)| norm 0.2820 (+0.37z)| lr 2.19e-04 | 8463.85 ms | -100.0% bf16 MFU | 61927 tok/s +step 11780/19560 | loss 3.369028 (+0.11z)| norm 0.2574 (-1.18z)| lr 2.19e-04 | 8466.66 ms | -100.0% bf16 MFU | 61927 tok/s +step 11781/19560 | loss 3.432810 (+1.93z)| norm 0.2773 (+0.06z)| lr 2.19e-04 | 8467.01 ms | -100.0% bf16 MFU | 61927 tok/s +step 11782/19560 | loss 3.371965 (+0.17z)| norm 0.2832 (+0.43z)| lr 2.19e-04 | 8466.46 ms | -100.0% bf16 MFU | 61927 tok/s +step 11783/19560 | loss 3.370509 (+0.13z)| norm 0.3042 (+1.74z)| lr 2.18e-04 | 8469.40 ms | -100.0% bf16 MFU | 61926 tok/s +step 11784/19560 | loss 3.349369 (-0.47z)| norm 0.2975 (+1.31z)| lr 2.18e-04 | 8469.46 ms | -100.0% bf16 MFU | 61924 tok/s +step 11785/19560 | loss 3.357908 (-0.22z)| norm 0.2675 (-0.57z)| lr 2.18e-04 | 8464.12 ms | -100.0% bf16 MFU | 61925 tok/s +step 11786/19560 | loss 3.466545 (+2.84z)| norm 0.2974 (+1.29z)| lr 2.18e-04 | 8464.84 ms | -100.0% bf16 MFU | 61926 tok/s +step 11787/19560 | loss 3.440870 (+2.07z)| norm 0.2689 (-0.50z)| lr 2.18e-04 | 8469.01 ms | -100.0% bf16 MFU | 61925 tok/s +step 11788/19560 | loss 3.385952 (+0.52z)| norm 0.2787 (+0.11z)| lr 2.18e-04 | 8468.22 ms | -100.0% bf16 MFU | 61924 tok/s +step 11789/19560 | loss 3.327410 (-1.11z)| norm 0.2768 (+0.00z)| lr 2.18e-04 | 8465.18 ms | -100.0% bf16 MFU | 61925 tok/s +step 11790/19560 | loss 3.308752 (-1.62z)| norm 0.2811 (+0.28z)| lr 2.18e-04 | 8467.30 ms | -100.0% bf16 MFU | 61925 tok/s +step 11791/19560 | loss 3.364842 (-0.06z)| norm 0.2765 (-0.02z)| lr 2.18e-04 | 8467.39 ms | -100.0% bf16 MFU | 61924 tok/s +step 11792/19560 | loss 3.436610 (+1.89z)| norm 0.2627 (-0.87z)| lr 2.18e-04 | 8467.32 ms | -100.0% bf16 MFU | 61924 tok/s +step 11793/19560 | loss 3.347264 (-0.56z)| norm 0.2770 (+0.03z)| lr 2.18e-04 | 8469.00 ms | -100.0% bf16 MFU | 61923 tok/s +step 11794/19560 | loss 3.382089 (+0.39z)| norm 0.2807 (+0.25z)| lr 2.18e-04 | 8465.53 ms | -100.0% bf16 MFU | 61924 tok/s +step 11795/19560 | loss 3.343514 (-0.68z)| norm 0.2597 (-1.06z)| lr 2.18e-04 | 8459.39 ms | -100.0% bf16 MFU | 61926 tok/s +step 11796/19560 | loss 3.405058 (+1.02z)| norm 0.2835 (+0.43z)| lr 2.18e-04 | 8451.24 ms | -100.0% bf16 MFU | 61932 tok/s +step 11797/19560 | loss 3.370388 (+0.07z)| norm 0.2841 (+0.45z)| lr 2.18e-04 | 8449.17 ms | -100.0% bf16 MFU | 61938 tok/s +step 11798/19560 | loss 3.395312 (+0.74z)| norm 0.2758 (-0.07z)| lr 2.18e-04 | 8448.14 ms | -100.0% bf16 MFU | 61944 tok/s +step 11799/19560 | loss 3.425478 (+1.56z)| norm 0.2873 (+0.65z)| lr 2.18e-04 | 8448.47 ms | -100.0% bf16 MFU | 61950 tok/s +step 11800/19560 | loss 3.351945 (-0.46z)| norm 0.2719 (-0.35z)| lr 2.18e-04 | 8451.08 ms | -100.0% bf16 MFU | 61954 tok/s +step 11801/19560 | loss 3.374209 (+0.14z)| norm 0.2799 (+0.16z)| lr 2.18e-04 | 8451.96 ms | -100.0% bf16 MFU | 61958 tok/s +step 11802/19560 | loss 3.347584 (-0.59z)| norm 0.2888 (+0.72z)| lr 2.18e-04 | 8439.08 ms | -100.0% bf16 MFU | 61966 tok/s +step 11803/19560 | loss 3.369608 (+0.02z)| norm 0.2645 (-0.85z)| lr 2.18e-04 | 8443.93 ms | -100.0% bf16 MFU | 61972 tok/s +step 11804/19560 | loss 3.367178 (-0.05z)| norm 0.3529 (+4.50z)| lr 2.17e-04 | 8444.30 ms | -100.0% bf16 MFU | 61978 tok/s +step 11805/19560 | loss 3.403307 (+0.96z)| norm 0.2918 (+0.83z)| lr 2.17e-04 | 8445.29 ms | -100.0% bf16 MFU | 61983 tok/s +step 11806/19560 | loss 3.313833 (-1.52z)| norm 0.2906 (+0.74z)| lr 2.17e-04 | 8446.24 ms | -100.0% bf16 MFU | 61988 tok/s +step 11807/19560 | loss 3.374613 (+0.16z)| norm 0.2798 (+0.08z)| lr 2.17e-04 | 8439.40 ms | -100.0% bf16 MFU | 61995 tok/s +step 11808/19560 | loss 3.372466 (+0.10z)| norm 0.2732 (-0.32z)| lr 2.17e-04 | 8444.13 ms | -100.0% bf16 MFU | 61999 tok/s +step 11809/19560 | loss 3.305366 (-1.73z)| norm 0.2693 (-0.57z)| lr 2.17e-04 | 8439.52 ms | -100.0% bf16 MFU | 62006 tok/s +step 11810/19560 | loss 3.435519 (+1.79z)| norm 0.2629 (-0.94z)| lr 2.17e-04 | 8440.63 ms | -100.0% bf16 MFU | 62011 tok/s +step 11811/19560 | loss 3.389582 (+0.54z)| norm 0.2933 (+0.91z)| lr 2.17e-04 | 8441.07 ms | -100.0% bf16 MFU | 62016 tok/s +step 11812/19560 | loss 3.370369 (+0.01z)| norm 0.3055 (+1.62z)| lr 2.17e-04 | 8446.75 ms | -100.0% bf16 MFU | 62019 tok/s +step 11813/19560 | loss 3.331034 (-1.06z)| norm 0.2687 (-0.59z)| lr 2.17e-04 | 8444.22 ms | -100.0% bf16 MFU | 62022 tok/s +step 11814/19560 | loss 3.422002 (+1.39z)| norm 0.2565 (-1.34z)| lr 2.17e-04 | 8447.58 ms | -100.0% bf16 MFU | 62024 tok/s +step 11815/19560 | loss 3.316633 (-1.46z)| norm 0.2644 (-0.85z)| lr 2.17e-04 | 8441.65 ms | -100.0% bf16 MFU | 62028 tok/s +step 11816/19560 | loss 3.407176 (+1.01z)| norm 0.2695 (-0.54z)| lr 2.17e-04 | 8447.15 ms | -100.0% bf16 MFU | 62030 tok/s +step 11817/19560 | loss 3.364554 (-0.16z)| norm 0.2919 (+0.80z)| lr 2.17e-04 | 8443.59 ms | -100.0% bf16 MFU | 62033 tok/s +step 11818/19560 | loss 3.350570 (-0.54z)| norm 0.2790 (+0.03z)| lr 2.17e-04 | 8445.41 ms | -100.0% bf16 MFU | 62036 tok/s +step 11819/19560 | loss 3.382675 (+0.35z)| norm 0.2682 (-0.63z)| lr 2.17e-04 | 8444.08 ms | -100.0% bf16 MFU | 62038 tok/s +step 11820/19560 | loss 3.356987 (-0.35z)| norm 0.2656 (-0.77z)| lr 2.17e-04 | 8445.93 ms | -100.0% bf16 MFU | 62040 tok/s +step 11821/19560 | loss 3.328137 (-1.13z)| norm 0.2543 (-1.46z)| lr 2.17e-04 | 8441.95 ms | -100.0% bf16 MFU | 62044 tok/s +step 11822/19560 | loss 3.304370 (-1.75z)| norm 0.2535 (-1.48z)| lr 2.17e-04 | 8450.01 ms | -100.0% bf16 MFU | 62044 tok/s +step 11823/19560 | loss 3.380339 (+0.30z)| norm 0.2472 (-1.85z)| lr 2.17e-04 | 8449.17 ms | -100.0% bf16 MFU | 62044 tok/s +step 11824/19560 | loss 3.352028 (-0.46z)| norm 0.2746 (-0.21z)| lr 2.17e-04 | 8449.71 ms | -100.0% bf16 MFU | 62044 tok/s +step 11825/19560 | loss 3.372547 (+0.11z)| norm 0.2648 (-0.80z)| lr 2.16e-04 | 8452.67 ms | -100.0% bf16 MFU | 62043 tok/s +step 11826/19560 | loss 3.417695 (+1.34z)| norm 0.2562 (-1.30z)| lr 2.16e-04 | 8453.21 ms | -100.0% bf16 MFU | 62042 tok/s +step 11827/19560 | loss 3.429531 (+1.64z)| norm 0.2773 (-0.03z)| lr 2.16e-04 | 8451.00 ms | -100.0% bf16 MFU | 62042 tok/s +step 11828/19560 | loss 3.320747 (-1.30z)| norm 0.2553 (-1.35z)| lr 2.16e-04 | 8460.71 ms | -100.0% bf16 MFU | 62038 tok/s +step 11829/19560 | loss 3.473949 (+2.82z)| norm 0.2750 (-0.15z)| lr 2.16e-04 | 8455.74 ms | -100.0% bf16 MFU | 62037 tok/s +step 11830/19560 | loss 3.412389 (+1.16z)| norm 0.2616 (-0.95z)| lr 2.16e-04 | 8464.11 ms | -100.0% bf16 MFU | 62032 tok/s +step 11831/19560 | loss 3.305481 (-1.68z)| norm 0.2915 (+0.85z)| lr 2.16e-04 | 8454.60 ms | -100.0% bf16 MFU | 62031 tok/s +step 11832/19560 | loss 3.310814 (-1.52z)| norm 0.2619 (-0.94z)| lr 2.16e-04 | 8460.00 ms | -100.0% bf16 MFU | 62028 tok/s +step 11833/19560 | loss 3.332512 (-0.96z)| norm 0.2861 (+0.53z)| lr 2.16e-04 | 8454.98 ms | -100.0% bf16 MFU | 62027 tok/s +step 11834/19560 | loss 3.325599 (-1.13z)| norm 0.2773 (-0.01z)| lr 2.16e-04 | 8457.65 ms | -100.0% bf16 MFU | 62025 tok/s +step 11835/19560 | loss 3.424142 (+1.46z)| norm 0.2889 (+0.68z)| lr 2.16e-04 | 8455.20 ms | -100.0% bf16 MFU | 62024 tok/s +step 11836/19560 | loss 3.289871 (-2.04z)| norm 0.2674 (-0.62z)| lr 2.16e-04 | 8459.66 ms | -100.0% bf16 MFU | 62022 tok/s +step 11837/19560 | loss 3.343990 (-0.63z)| norm 0.2883 (+0.64z)| lr 2.16e-04 | 8460.95 ms | -100.0% bf16 MFU | 62019 tok/s +step 11838/19560 | loss 3.345246 (-0.59z)| norm 0.2813 (+0.20z)| lr 2.16e-04 | 8457.78 ms | -100.0% bf16 MFU | 62018 tok/s +step 11839/19560 | loss 3.350897 (-0.44z)| norm 0.2978 (+1.20z)| lr 2.16e-04 | 8456.35 ms | -100.0% bf16 MFU | 62017 tok/s +step 11840/19560 | loss 3.350400 (-0.45z)| norm 0.2621 (-0.96z)| lr 2.16e-04 | 8458.87 ms | -100.0% bf16 MFU | 62015 tok/s +step 11841/19560 | loss 3.463128 (+2.42z)| norm 0.2942 (+0.98z)| lr 2.16e-04 | 8459.37 ms | -100.0% bf16 MFU | 62013 tok/s +step 11842/19560 | loss 3.328513 (-1.02z)| norm 0.2617 (-0.98z)| lr 2.16e-04 | 8462.82 ms | -100.0% bf16 MFU | 62010 tok/s +step 11843/19560 | loss 3.397360 (+0.72z)| norm 0.2736 (-0.26z)| lr 2.16e-04 | 8458.73 ms | -100.0% bf16 MFU | 62009 tok/s +step 11844/19560 | loss 3.397852 (+0.72z)| norm 0.2554 (-1.35z)| lr 2.16e-04 | 8457.83 ms | -100.0% bf16 MFU | 62008 tok/s +step 11845/19560 | loss 3.353981 (-0.41z)| norm 0.2700 (-0.47z)| lr 2.16e-04 | 8458.48 ms | -100.0% bf16 MFU | 62006 tok/s +step 11846/19560 | loss 3.373508 (+0.10z)| norm 0.2715 (-0.39z)| lr 2.15e-04 | 8457.30 ms | -100.0% bf16 MFU | 62006 tok/s +step 11847/19560 | loss 3.405545 (+0.91z)| norm 0.2685 (-0.56z)| lr 2.15e-04 | 8458.28 ms | -100.0% bf16 MFU | 62005 tok/s +step 11848/19560 | loss 3.347917 (-0.56z)| norm 0.2946 (+1.00z)| lr 2.15e-04 | 8459.60 ms | -100.0% bf16 MFU | 62003 tok/s +step 11849/19560 | loss 3.311607 (-1.49z)| norm 0.2588 (-1.15z)| lr 2.15e-04 | 8458.79 ms | -100.0% bf16 MFU | 62002 tok/s +step 11850/19560 | loss 3.335060 (-0.87z)| norm 0.2864 (+0.50z)| lr 2.15e-04 | 8457.11 ms | -100.0% bf16 MFU | 62002 tok/s +step 11851/19560 | loss 3.373302 (+0.13z)| norm 0.2602 (-1.07z)| lr 2.15e-04 | 8458.87 ms | -100.0% bf16 MFU | 62001 tok/s +step 11852/19560 | loss 3.374635 (+0.16z)| norm 0.2802 (+0.12z)| lr 2.15e-04 | 8457.58 ms | -100.0% bf16 MFU | 62000 tok/s +step 11853/19560 | loss 3.499928 (+3.27z)| norm 0.3069 (+1.70z)| lr 2.15e-04 | 8448.34 ms | -100.0% bf16 MFU | 62003 tok/s +step 11854/19560 | loss 3.328216 (-1.03z)| norm 0.2951 (+1.00z)| lr 2.15e-04 | 8441.75 ms | -100.0% bf16 MFU | 62008 tok/s +step 11855/19560 | loss 3.423605 (+1.34z)| norm 0.2918 (+0.80z)| lr 2.15e-04 | 8438.71 ms | -100.0% bf16 MFU | 62014 tok/s +step 11856/19560 | loss 3.354565 (-0.38z)| norm 0.2996 (+1.27z)| lr 2.15e-04 | 8437.78 ms | -100.0% bf16 MFU | 62020 tok/s +step 11857/19560 | loss 3.337014 (-0.82z)| norm 0.2692 (-0.55z)| lr 2.15e-04 | 8438.63 ms | -100.0% bf16 MFU | 62026 tok/s +step 11858/19560 | loss 3.360289 (-0.24z)| norm 0.2794 (+0.07z)| lr 2.15e-04 | 8431.64 ms | -100.0% bf16 MFU | 62034 tok/s +step 11859/19560 | loss 3.347499 (-0.55z)| norm 0.2625 (-0.94z)| lr 2.15e-04 | 8440.74 ms | -100.0% bf16 MFU | 62038 tok/s +step 11860/19560 | loss 3.357520 (-0.30z)| norm 0.2657 (-0.73z)| lr 2.15e-04 | 8436.67 ms | -100.0% bf16 MFU | 62043 tok/s +step 11861/19560 | loss 3.350511 (-0.48z)| norm 0.2679 (-0.60z)| lr 2.15e-04 | 8434.11 ms | -100.0% bf16 MFU | 62049 tok/s +step 11862/19560 | loss 3.447330 (+1.89z)| norm 0.2692 (-0.52z)| lr 2.15e-04 | 8437.75 ms | -100.0% bf16 MFU | 62053 tok/s +step 11863/19560 | loss 3.407615 (+0.90z)| norm 0.2833 (+0.33z)| lr 2.15e-04 | 8432.91 ms | -100.0% bf16 MFU | 62059 tok/s +step 11864/19560 | loss 3.324630 (-1.13z)| norm 0.2706 (-0.42z)| lr 2.15e-04 | 8437.50 ms | -100.0% bf16 MFU | 62063 tok/s +step 11865/19560 | loss 3.328417 (-1.03z)| norm 0.2811 (+0.21z)| lr 2.15e-04 | 8441.31 ms | -100.0% bf16 MFU | 62065 tok/s +step 11866/19560 | loss 3.365511 (-0.12z)| norm 0.2618 (-0.94z)| lr 2.14e-04 | 8437.45 ms | -100.0% bf16 MFU | 62069 tok/s +step 11867/19560 | loss 3.399111 (+0.71z)| norm 0.3083 (+1.83z)| lr 2.14e-04 | 8439.15 ms | -100.0% bf16 MFU | 62072 tok/s +step 11868/19560 | loss 3.389303 (+0.46z)| norm 0.2694 (-0.49z)| lr 2.14e-04 | 8435.70 ms | -100.0% bf16 MFU | 62076 tok/s +step 11869/19560 | loss 3.310781 (-1.47z)| norm 0.2758 (-0.11z)| lr 2.14e-04 | 8436.23 ms | -100.0% bf16 MFU | 62079 tok/s +step 11870/19560 | loss 3.355269 (-0.38z)| norm 0.2763 (-0.09z)| lr 2.14e-04 | 8438.10 ms | -100.0% bf16 MFU | 62082 tok/s +step 11871/19560 | loss 3.401165 (+0.74z)| norm 0.2948 (+1.01z)| lr 2.14e-04 | 8437.67 ms | -100.0% bf16 MFU | 62085 tok/s +step 11872/19560 | loss 3.352825 (-0.45z)| norm 0.2633 (-0.88z)| lr 2.14e-04 | 8439.85 ms | -100.0% bf16 MFU | 62087 tok/s +step 11873/19560 | loss 3.332798 (-0.93z)| norm 0.2855 (+0.44z)| lr 2.14e-04 | 8440.25 ms | -100.0% bf16 MFU | 62088 tok/s +step 11874/19560 | loss 3.335646 (-0.85z)| norm 0.2574 (-1.25z)| lr 2.14e-04 | 8439.99 ms | -100.0% bf16 MFU | 62090 tok/s +step 11875/19560 | loss 3.324701 (-1.10z)| norm 0.2713 (-0.41z)| lr 2.14e-04 | 8442.16 ms | -100.0% bf16 MFU | 62090 tok/s +step 11876/19560 | loss 3.381454 (+0.31z)| norm 0.2631 (-0.90z)| lr 2.14e-04 | 8446.21 ms | -100.0% bf16 MFU | 62090 tok/s +step 11877/19560 | loss 3.298046 (-1.73z)| norm 0.2683 (-0.58z)| lr 2.14e-04 | 8440.93 ms | -100.0% bf16 MFU | 62091 tok/s +step 11878/19560 | loss 3.376583 (+0.20z)| norm 0.2726 (-0.33z)| lr 2.14e-04 | 8445.04 ms | -100.0% bf16 MFU | 62090 tok/s +step 11879/19560 | loss 3.379828 (+0.27z)| norm 0.2898 (+0.72z)| lr 2.14e-04 | 8444.71 ms | -100.0% bf16 MFU | 62090 tok/s +step 11880/19560 | loss 3.395231 (+0.65z)| norm 0.3529 (+4.20z)| lr 2.14e-04 | 8447.58 ms | -100.0% bf16 MFU | 62089 tok/s +step 11881/19560 | loss 3.347153 (-0.55z)| norm 0.3335 (+3.00z)| lr 2.14e-04 | 8444.21 ms | -100.0% bf16 MFU | 62089 tok/s +step 11882/19560 | loss 3.372084 (+0.08z)| norm 0.2835 (+0.26z)| lr 2.14e-04 | 8447.26 ms | -100.0% bf16 MFU | 62088 tok/s +step 11883/19560 | loss 3.343634 (-0.63z)| norm 0.2897 (+0.61z)| lr 2.14e-04 | 8450.39 ms | -100.0% bf16 MFU | 62085 tok/s +step 11884/19560 | loss 3.321558 (-1.17z)| norm 0.2985 (+1.09z)| lr 2.14e-04 | 8452.37 ms | -100.0% bf16 MFU | 62082 tok/s +step 11885/19560 | loss 3.413080 (+1.11z)| norm 0.2908 (+0.66z)| lr 2.14e-04 | 8455.43 ms | -100.0% bf16 MFU | 62079 tok/s +step 11886/19560 | loss 3.400006 (+0.78z)| norm 0.2902 (+0.62z)| lr 2.14e-04 | 8453.75 ms | -100.0% bf16 MFU | 62076 tok/s +step 11887/19560 | loss 3.392986 (+0.59z)| norm 0.2818 (+0.16z)| lr 2.13e-04 | 8455.55 ms | -100.0% bf16 MFU | 62072 tok/s +step 11888/19560 | loss 3.369034 (+0.00z)| norm 0.2863 (+0.41z)| lr 2.13e-04 | 8454.44 ms | -100.0% bf16 MFU | 62069 tok/s +step 11889/19560 | loss 3.457132 (+2.19z)| norm 0.2988 (+1.08z)| lr 2.13e-04 | 8453.63 ms | -100.0% bf16 MFU | 62067 tok/s +step 11890/19560 | loss 3.367036 (-0.05z)| norm 0.2770 (-0.11z)| lr 2.13e-04 | 8455.84 ms | -100.0% bf16 MFU | 62064 tok/s +step 11891/19560 | loss 3.383620 (+0.36z)| norm 0.2996 (+1.14z)| lr 2.13e-04 | 8453.91 ms | -100.0% bf16 MFU | 62061 tok/s +step 11892/19560 | loss 3.376001 (+0.16z)| norm 0.2917 (+0.69z)| lr 2.13e-04 | 8452.05 ms | -100.0% bf16 MFU | 62060 tok/s +step 11893/19560 | loss 3.368995 (-0.01z)| norm 0.2846 (+0.29z)| lr 2.13e-04 | 8456.69 ms | -100.0% bf16 MFU | 62057 tok/s +step 11894/19560 | loss 3.328084 (-1.02z)| norm 0.2771 (-0.11z)| lr 2.13e-04 | 8454.22 ms | -100.0% bf16 MFU | 62054 tok/s +step 11895/19560 | loss 3.409955 (+1.01z)| norm 0.2855 (+0.35z)| lr 2.13e-04 | 8455.37 ms | -100.0% bf16 MFU | 62052 tok/s +step 11896/19560 | loss 3.358531 (-0.27z)| norm 0.2977 (+1.02z)| lr 2.13e-04 | 8458.13 ms | -100.0% bf16 MFU | 62049 tok/s +step 11897/19560 | loss 3.248463 (-2.89z)| norm 0.2830 (+0.20z)| lr 2.13e-04 | 8456.82 ms | -100.0% bf16 MFU | 62046 tok/s +step 11898/19560 | loss 3.333889 (-0.82z)| norm 0.3090 (+1.64z)| lr 2.13e-04 | 8455.15 ms | -100.0% bf16 MFU | 62044 tok/s +step 11899/19560 | loss 3.338469 (-0.71z)| norm 0.2611 (-1.03z)| lr 2.13e-04 | 8453.65 ms | -100.0% bf16 MFU | 62043 tok/s +step 11900/19560 | loss 3.295923 (-1.71z)| norm 0.2984 (+1.03z)| lr 2.13e-04 | 8454.57 ms | -100.0% bf16 MFU | 62041 tok/s +step 11901/19560 | loss 3.356040 (-0.28z)| norm 0.2542 (-1.41z)| lr 2.13e-04 | 8454.13 ms | -100.0% bf16 MFU | 62040 tok/s +step 11902/19560 | loss 3.375237 (+0.18z)| norm 0.3010 (+1.17z)| lr 2.13e-04 | 8454.40 ms | -100.0% bf16 MFU | 62039 tok/s +step 11903/19560 | loss 3.368247 (+0.01z)| norm 0.2763 (-0.20z)| lr 2.13e-04 | 8454.77 ms | -100.0% bf16 MFU | 62037 tok/s +step 11904/19560 | loss 3.378703 (+0.27z)| norm 0.2799 (+0.02z)| lr 2.13e-04 | 8454.93 ms | -100.0% bf16 MFU | 62036 tok/s +step 11905/19560 | loss 3.321383 (-1.11z)| norm 0.2877 (+0.47z)| lr 2.13e-04 | 8453.42 ms | -100.0% bf16 MFU | 62035 tok/s +step 11906/19560 | loss 3.316364 (-1.21z)| norm 0.2720 (-0.44z)| lr 2.13e-04 | 8454.07 ms | -100.0% bf16 MFU | 62034 tok/s +step 11907/19560 | loss 3.343691 (-0.55z)| norm 0.2802 (+0.04z)| lr 2.13e-04 | 8451.76 ms | -100.0% bf16 MFU | 62034 tok/s +step 11908/19560 | loss 3.362292 (-0.10z)| norm 0.2814 (+0.10z)| lr 2.12e-04 | 8455.96 ms | -100.0% bf16 MFU | 62033 tok/s +step 11909/19560 | loss 3.332839 (-0.80z)| norm 0.2810 (+0.07z)| lr 2.12e-04 | 8453.84 ms | -100.0% bf16 MFU | 62032 tok/s +step 11910/19560 | loss 3.323856 (-1.00z)| norm 0.2753 (-0.26z)| lr 2.12e-04 | 8454.00 ms | -100.0% bf16 MFU | 62031 tok/s +step 11911/19560 | loss 3.326598 (-0.92z)| norm 0.3000 (+1.19z)| lr 2.12e-04 | 8454.37 ms | -100.0% bf16 MFU | 62030 tok/s +step 11912/19560 | loss 3.405031 (+0.94z)| norm 0.3022 (+1.31z)| lr 2.12e-04 | 8451.84 ms | -100.0% bf16 MFU | 62030 tok/s +step 11913/19560 | loss 3.301991 (-1.49z)| norm 0.2812 (+0.08z)| lr 2.12e-04 | 8453.49 ms | -100.0% bf16 MFU | 62030 tok/s +step 11914/19560 | loss 3.375294 (+0.26z)| norm 0.2810 (+0.07z)| lr 2.12e-04 | 8451.94 ms | -100.0% bf16 MFU | 62030 tok/s +step 11915/19560 | loss 3.309997 (-1.30z)| norm 0.2891 (+0.54z)| lr 2.12e-04 | 8452.73 ms | -100.0% bf16 MFU | 62030 tok/s +step 11916/19560 | loss 3.328664 (-0.83z)| norm 0.2648 (-0.87z)| lr 2.12e-04 | 8455.02 ms | -100.0% bf16 MFU | 62029 tok/s +step 11917/19560 | loss 3.331213 (-0.77z)| norm 0.2656 (-0.82z)| lr 2.12e-04 | 8454.71 ms | -100.0% bf16 MFU | 62028 tok/s +step 11918/19560 | loss 3.425777 (+1.50z)| norm 0.2829 (+0.19z)| lr 2.12e-04 | 8451.51 ms | -100.0% bf16 MFU | 62028 tok/s +step 11919/19560 | loss 3.372303 (+0.20z)| norm 0.2676 (-0.70z)| lr 2.12e-04 | 8454.08 ms | -100.0% bf16 MFU | 62028 tok/s +step 11920/19560 | loss 3.386740 (+0.57z)| norm 0.2817 (+0.12z)| lr 2.12e-04 | 8455.70 ms | -100.0% bf16 MFU | 62026 tok/s +step 11921/19560 | loss 3.340337 (-0.57z)| norm 0.2805 (+0.04z)| lr 2.12e-04 | 8457.38 ms | -100.0% bf16 MFU | 62025 tok/s +step 11922/19560 | loss 3.398838 (+0.86z)| norm 0.2712 (-0.50z)| lr 2.12e-04 | 8455.92 ms | -100.0% bf16 MFU | 62024 tok/s +step 11923/19560 | loss 3.402376 (+0.93z)| norm 0.2525 (-1.58z)| lr 2.12e-04 | 8453.06 ms | -100.0% bf16 MFU | 62024 tok/s +step 11924/19560 | loss 3.310229 (-1.29z)| norm 0.2692 (-0.60z)| lr 2.12e-04 | 8455.49 ms | -100.0% bf16 MFU | 62023 tok/s +step 11925/19560 | loss 3.319564 (-1.05z)| norm 0.2423 (-2.11z)| lr 2.12e-04 | 8455.29 ms | -100.0% bf16 MFU | 62022 tok/s +step 11926/19560 | loss 3.317944 (-1.07z)| norm 0.2689 (-0.58z)| lr 2.12e-04 | 8451.19 ms | -100.0% bf16 MFU | 62023 tok/s +step 11927/19560 | loss 3.336595 (-0.61z)| norm 0.2540 (-1.41z)| lr 2.12e-04 | 8455.88 ms | -100.0% bf16 MFU | 62022 tok/s +step 11928/19560 | loss 3.360038 (-0.04z)| norm 0.2626 (-0.92z)| lr 2.12e-04 | 8454.29 ms | -100.0% bf16 MFU | 62021 tok/s +step 11929/19560 | loss 3.342989 (-0.45z)| norm 0.2594 (-1.08z)| lr 2.11e-04 | 8476.19 ms | -100.0% bf16 MFU | 62013 tok/s +step 11930/19560 | loss 3.317210 (-1.07z)| norm 0.2608 (-0.99z)| lr 2.11e-04 | 8482.99 ms | -100.0% bf16 MFU | 62003 tok/s +step 11931/19560 | loss 3.330529 (-0.74z)| norm 0.2641 (-0.80z)| lr 2.11e-04 | 8480.40 ms | -100.0% bf16 MFU | 61994 tok/s +step 11932/19560 | loss 3.357338 (-0.09z)| norm 0.2706 (-0.44z)| lr 2.11e-04 | 8473.90 ms | -100.0% bf16 MFU | 61987 tok/s +step 11933/19560 | loss 3.367289 (+0.16z)| norm 0.2642 (-0.81z)| lr 2.11e-04 | 8477.71 ms | -100.0% bf16 MFU | 61980 tok/s +step 11934/19560 | loss 3.349726 (-0.27z)| norm 0.2775 (-0.00z)| lr 2.11e-04 | 8478.07 ms | -100.0% bf16 MFU | 61973 tok/s +step 11935/19560 | loss 3.373782 (+0.31z)| norm 0.2882 (+0.64z)| lr 2.11e-04 | 8479.39 ms | -100.0% bf16 MFU | 61966 tok/s +step 11936/19560 | loss 3.354713 (-0.15z)| norm 0.2803 (+0.16z)| lr 2.11e-04 | 8477.75 ms | -100.0% bf16 MFU | 61960 tok/s +step 11937/19560 | loss 3.343071 (-0.44z)| norm 0.2675 (-0.61z)| lr 2.11e-04 | 8474.08 ms | -100.0% bf16 MFU | 61955 tok/s +step 11938/19560 | loss 3.356353 (-0.10z)| norm 0.2873 (+0.57z)| lr 2.11e-04 | 8474.03 ms | -100.0% bf16 MFU | 61951 tok/s +step 11939/19560 | loss 3.369202 (+0.22z)| norm 0.2654 (-0.73z)| lr 2.11e-04 | 8477.27 ms | -100.0% bf16 MFU | 61946 tok/s +step 11940/19560 | loss 3.407866 (+1.17z)| norm 0.2862 (+0.53z)| lr 2.11e-04 | 8474.58 ms | -100.0% bf16 MFU | 61942 tok/s +step 11941/19560 | loss 3.353540 (-0.18z)| norm 0.2644 (-0.79z)| lr 2.11e-04 | 8473.80 ms | -100.0% bf16 MFU | 61938 tok/s +step 11942/19560 | loss 3.396956 (+0.91z)| norm 0.2758 (-0.10z)| lr 2.11e-04 | 8474.88 ms | -100.0% bf16 MFU | 61935 tok/s +step 11943/19560 | loss 3.339229 (-0.54z)| norm 0.2667 (-0.67z)| lr 2.11e-04 | 8475.42 ms | -100.0% bf16 MFU | 61931 tok/s +step 11944/19560 | loss 3.396209 (+0.89z)| norm 0.2703 (-0.45z)| lr 2.11e-04 | 8473.50 ms | -100.0% bf16 MFU | 61928 tok/s +step 11945/19560 | loss 3.345204 (-0.38z)| norm 0.2814 (+0.24z)| lr 2.11e-04 | 8468.40 ms | -100.0% bf16 MFU | 61927 tok/s +step 11946/19560 | loss 3.318844 (-1.04z)| norm 0.2713 (-0.38z)| lr 2.11e-04 | 8472.28 ms | -100.0% bf16 MFU | 61925 tok/s +step 11947/19560 | loss 3.310966 (-1.21z)| norm 0.2834 (+0.36z)| lr 2.11e-04 | 8466.06 ms | -100.0% bf16 MFU | 61925 tok/s +step 11948/19560 | loss 3.396139 (+0.90z)| norm 0.2872 (+0.59z)| lr 2.11e-04 | 8474.34 ms | -100.0% bf16 MFU | 61922 tok/s +step 11949/19560 | loss 3.390794 (+0.75z)| norm 0.2713 (-0.41z)| lr 2.11e-04 | 8470.57 ms | -100.0% bf16 MFU | 61921 tok/s +step 11950/19560 | loss 3.410516 (+1.23z)| norm 0.2539 (-1.49z)| lr 2.10e-04 | 8473.96 ms | -100.0% bf16 MFU | 61918 tok/s +step 11951/19560 | loss 3.369521 (+0.21z)| norm 0.2799 (+0.11z)| lr 2.10e-04 | 8474.51 ms | -100.0% bf16 MFU | 61916 tok/s +step 11952/19560 | loss 3.338372 (-0.57z)| norm 0.2648 (-0.83z)| lr 2.10e-04 | 8472.21 ms | -100.0% bf16 MFU | 61914 tok/s +step 11953/19560 | loss 3.357358 (-0.09z)| norm 0.2956 (+1.09z)| lr 2.10e-04 | 8471.60 ms | -100.0% bf16 MFU | 61913 tok/s +step 11954/19560 | loss 3.360909 (+0.01z)| norm 0.2650 (-0.84z)| lr 2.10e-04 | 8468.65 ms | -100.0% bf16 MFU | 61913 tok/s +step 11955/19560 | loss 3.302218 (-1.44z)| norm 0.2793 (+0.06z)| lr 2.10e-04 | 8467.37 ms | -100.0% bf16 MFU | 61913 tok/s +step 11956/19560 | loss 3.389851 (+0.75z)| norm 0.2785 (+0.00z)| lr 2.10e-04 | 8477.65 ms | -100.0% bf16 MFU | 61910 tok/s +step 11957/19560 | loss 3.352159 (-0.18z)| norm 0.2622 (-1.03z)| lr 2.10e-04 | 8467.67 ms | -100.0% bf16 MFU | 61910 tok/s +step 11958/19560 | loss 3.370170 (+0.30z)| norm 0.2824 (+0.24z)| lr 2.10e-04 | 8466.70 ms | -100.0% bf16 MFU | 61911 tok/s +step 11959/19560 | loss 3.327165 (-0.84z)| norm 0.2598 (-1.18z)| lr 2.10e-04 | 8473.52 ms | -100.0% bf16 MFU | 61909 tok/s +step 11960/19560 | loss 3.394076 (+0.91z)| norm 0.2887 (+0.65z)| lr 2.10e-04 | 8473.67 ms | -100.0% bf16 MFU | 61907 tok/s +step 11961/19560 | loss 3.358264 (-0.04z)| norm 0.2609 (-1.10z)| lr 2.10e-04 | 8468.40 ms | -100.0% bf16 MFU | 61907 tok/s +step 11962/19560 | loss 3.365105 (+0.13z)| norm 0.3051 (+1.66z)| lr 2.10e-04 | 8470.06 ms | -100.0% bf16 MFU | 61907 tok/s +step 11963/19560 | loss 3.339743 (-0.53z)| norm 0.2650 (-0.84z)| lr 2.10e-04 | 8466.00 ms | -100.0% bf16 MFU | 61908 tok/s +step 11964/19560 | loss 3.332295 (-0.75z)| norm 0.2803 (+0.11z)| lr 2.10e-04 | 8473.68 ms | -100.0% bf16 MFU | 61906 tok/s +step 11965/19560 | loss 3.381809 (+0.59z)| norm 0.3043 (+1.60z)| lr 2.10e-04 | 8465.38 ms | -100.0% bf16 MFU | 61907 tok/s +step 11966/19560 | loss 3.354148 (-0.16z)| norm 0.2752 (-0.21z)| lr 2.10e-04 | 8465.34 ms | -100.0% bf16 MFU | 61909 tok/s +step 11967/19560 | loss 3.308635 (-1.38z)| norm 0.2983 (+1.22z)| lr 2.10e-04 | 8468.99 ms | -100.0% bf16 MFU | 61909 tok/s +step 11968/19560 | loss 3.388254 (+0.76z)| norm 0.2680 (-0.66z)| lr 2.10e-04 | 8469.43 ms | -100.0% bf16 MFU | 61908 tok/s +step 11969/19560 | loss 3.383928 (+0.68z)| norm 0.2957 (+1.06z)| lr 2.10e-04 | 8468.09 ms | -100.0% bf16 MFU | 61909 tok/s +step 11970/19560 | loss 3.340508 (-0.53z)| norm 0.2805 (+0.11z)| lr 2.10e-04 | 8462.81 ms | -100.0% bf16 MFU | 61911 tok/s +step 11971/19560 | loss 3.302455 (-1.56z)| norm 0.2852 (+0.40z)| lr 2.09e-04 | 8465.65 ms | -100.0% bf16 MFU | 61912 tok/s +step 11972/19560 | loss 3.365640 (+0.19z)| norm 0.2826 (+0.22z)| lr 2.09e-04 | 8465.27 ms | -100.0% bf16 MFU | 61913 tok/s +step 11973/19560 | loss 3.345129 (-0.37z)| norm 0.2963 (+1.07z)| lr 2.09e-04 | 8467.14 ms | -100.0% bf16 MFU | 61913 tok/s +step 11974/19560 | loss 3.582221 (+5.42z)| norm 0.2931 (+0.86z)| lr 2.09e-04 | 8466.41 ms | -100.0% bf16 MFU | 61914 tok/s +step 11975/19560 | loss 3.347919 (-0.29z)| norm 0.2643 (-0.95z)| lr 2.09e-04 | 8464.36 ms | -100.0% bf16 MFU | 61915 tok/s +step 11976/19560 | loss 3.367481 (+0.19z)| norm 0.2759 (-0.21z)| lr 2.09e-04 | 8469.21 ms | -100.0% bf16 MFU | 61915 tok/s +step 11977/19560 | loss 3.381923 (+0.53z)| norm 0.2705 (-0.56z)| lr 2.09e-04 | 8465.13 ms | -100.0% bf16 MFU | 61916 tok/s +step 11978/19560 | loss 3.334833 (-0.63z)| norm 0.2595 (-1.24z)| lr 2.09e-04 | 8465.46 ms | -100.0% bf16 MFU | 61917 tok/s +step 11979/19560 | loss 3.365541 (+0.13z)| norm 0.2709 (-0.52z)| lr 2.09e-04 | 8463.05 ms | -100.0% bf16 MFU | 61918 tok/s +step 11980/19560 | loss 3.391159 (+0.75z)| norm 0.2700 (-0.58z)| lr 2.09e-04 | 8464.10 ms | -100.0% bf16 MFU | 61919 tok/s +step 11981/19560 | loss 3.295953 (-1.61z)| norm 0.3846 (+5.76z)| lr 2.09e-04 | 8466.48 ms | -100.0% bf16 MFU | 61920 tok/s +step 11982/19560 | loss 3.390184 (+0.79z)| norm 0.2705 (-0.50z)| lr 2.09e-04 | 8462.71 ms | -100.0% bf16 MFU | 61921 tok/s +step 11983/19560 | loss 3.361691 (+0.07z)| norm 0.2939 (+0.79z)| lr 2.09e-04 | 8467.80 ms | -100.0% bf16 MFU | 61921 tok/s +step 11984/19560 | loss 3.339847 (-0.49z)| norm 0.2621 (-0.95z)| lr 2.09e-04 | 8461.77 ms | -100.0% bf16 MFU | 61923 tok/s +step 11985/19560 | loss 3.451648 (+2.32z)| norm 0.2906 (+0.61z)| lr 2.09e-04 | 8460.70 ms | -100.0% bf16 MFU | 61925 tok/s +step 11986/19560 | loss 3.357499 (-0.06z)| norm 0.2717 (-0.43z)| lr 2.09e-04 | 8466.07 ms | -100.0% bf16 MFU | 61925 tok/s +step 11987/19560 | loss 3.353661 (-0.15z)| norm 0.2898 (+0.56z)| lr 2.09e-04 | 8461.22 ms | -100.0% bf16 MFU | 61927 tok/s +step 11988/19560 | loss 3.381692 (+0.55z)| norm 0.2608 (-1.03z)| lr 2.09e-04 | 8464.39 ms | -100.0% bf16 MFU | 61928 tok/s +step 11989/19560 | loss 3.415119 (+1.37z)| norm 0.2889 (+0.50z)| lr 2.09e-04 | 8463.17 ms | -100.0% bf16 MFU | 61929 tok/s +step 11990/19560 | loss 3.323265 (-0.92z)| norm 0.2694 (-0.57z)| lr 2.09e-04 | 8459.74 ms | -100.0% bf16 MFU | 61931 tok/s +step 11991/19560 | loss 3.401686 (+1.08z)| norm 0.3023 (+1.23z)| lr 2.09e-04 | 8457.28 ms | -100.0% bf16 MFU | 61934 tok/s +step 11992/19560 | loss 3.370920 (+0.28z)| norm 0.2623 (-0.95z)| lr 2.08e-04 | 8458.87 ms | -100.0% bf16 MFU | 61937 tok/s +step 11993/19560 | loss 3.337662 (-0.57z)| norm 0.3038 (+1.29z)| lr 2.08e-04 | 8463.71 ms | -100.0% bf16 MFU | 61937 tok/s +step 11994/19560 | loss 3.302994 (-1.43z)| norm 0.2546 (-1.37z)| lr 2.08e-04 | 8469.52 ms | -100.0% bf16 MFU | 61935 tok/s +step 11995/19560 | loss 3.307937 (-1.28z)| norm 0.2800 (+0.01z)| lr 2.08e-04 | 8463.61 ms | -100.0% bf16 MFU | 61936 tok/s +step 11996/19560 | loss 3.315950 (-1.06z)| norm 0.2595 (-1.09z)| lr 2.08e-04 | 8462.29 ms | -100.0% bf16 MFU | 61937 tok/s +step 11997/19560 | loss 3.388972 (+0.77z)| norm 0.2713 (-0.45z)| lr 2.08e-04 | 8467.69 ms | -100.0% bf16 MFU | 61936 tok/s +step 11998/19560 | loss 3.344949 (-0.35z)| norm 0.2546 (-1.34z)| lr 2.08e-04 | 8464.12 ms | -100.0% bf16 MFU | 61936 tok/s +step 11999/19560 | loss 3.345219 (-0.33z)| norm 0.2617 (-0.94z)| lr 2.08e-04 | 8463.17 ms | -100.0% bf16 MFU | 61937 tok/s +step 12000/19560 | loss 3.304989 (-1.33z)| norm 0.2896 (+0.55z)| lr 2.08e-04 | 8464.97 ms | -100.0% bf16 MFU | 61937 tok/s +val loss 3.343576 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2934/10042 = 0.292173 +step 12001/19560 | loss 3.358626 (+0.02z)| norm 0.2866 (+0.39z)| lr 2.08e-04 | 8461.48 ms | -100.0% bf16 MFU | 61938 tok/s +step 12002/19560 | loss 3.397197 (+0.98z)| norm 0.2667 (-0.69z)| lr 2.08e-04 | 8458.54 ms | -100.0% bf16 MFU | 61940 tok/s +step 12003/19560 | loss 3.340933 (-0.45z)| norm 0.2924 (+0.69z)| lr 2.08e-04 | 8459.35 ms | -100.0% bf16 MFU | 61942 tok/s +step 12004/19560 | loss 3.370703 (+0.31z)| norm 0.2502 (-1.58z)| lr 2.08e-04 | 8464.28 ms | -100.0% bf16 MFU | 61942 tok/s +step 12005/19560 | loss 3.420994 (+1.56z)| norm 0.2876 (+0.43z)| lr 2.08e-04 | 8465.36 ms | -100.0% bf16 MFU | 61942 tok/s +step 12006/19560 | loss 3.317956 (-1.04z)| norm 0.2506 (-1.54z)| lr 2.08e-04 | 8460.29 ms | -100.0% bf16 MFU | 61943 tok/s +step 12007/19560 | loss 3.396030 (+0.93z)| norm 0.2621 (-0.91z)| lr 2.08e-04 | 8461.63 ms | -100.0% bf16 MFU | 61944 tok/s +step 12008/19560 | loss 3.337743 (-0.53z)| norm 0.2566 (-1.24z)| lr 2.08e-04 | 8462.24 ms | -100.0% bf16 MFU | 61945 tok/s +step 12009/19560 | loss 3.442351 (+2.06z)| norm 0.3049 (+1.54z)| lr 2.08e-04 | 8462.80 ms | -100.0% bf16 MFU | 61945 tok/s +step 12010/19560 | loss 3.307929 (-1.26z)| norm 0.2800 (+0.10z)| lr 2.08e-04 | 8461.77 ms | -100.0% bf16 MFU | 61946 tok/s +step 12011/19560 | loss 3.358437 (-0.02z)| norm 0.2619 (-0.94z)| lr 2.08e-04 | 8458.00 ms | -100.0% bf16 MFU | 61948 tok/s +step 12012/19560 | loss 3.364448 (+0.13z)| norm 0.2652 (-0.73z)| lr 2.08e-04 | 8463.15 ms | -100.0% bf16 MFU | 61948 tok/s +step 12013/19560 | loss 3.325309 (-0.83z)| norm 0.2577 (-1.15z)| lr 2.07e-04 | 8460.86 ms | -100.0% bf16 MFU | 61949 tok/s +step 12014/19560 | loss 3.340366 (-0.45z)| norm 0.2487 (-1.64z)| lr 2.07e-04 | 8463.14 ms | -100.0% bf16 MFU | 61949 tok/s +step 12015/19560 | loss 3.313127 (-1.11z)| norm 0.2624 (-0.84z)| lr 2.07e-04 | 8467.47 ms | -100.0% bf16 MFU | 61947 tok/s +step 12016/19560 | loss 3.338319 (-0.47z)| norm 0.2597 (-0.98z)| lr 2.07e-04 | 8459.68 ms | -100.0% bf16 MFU | 61949 tok/s +step 12017/19560 | loss 3.362238 (+0.14z)| norm 0.2576 (-1.09z)| lr 2.07e-04 | 8459.47 ms | -100.0% bf16 MFU | 61950 tok/s +step 12018/19560 | loss 3.342732 (-0.35z)| norm 0.2764 (-0.01z)| lr 2.07e-04 | 8457.65 ms | -100.0% bf16 MFU | 61952 tok/s +step 12019/19560 | loss 3.362760 (+0.17z)| norm 0.2749 (-0.08z)| lr 2.07e-04 | 8461.87 ms | -100.0% bf16 MFU | 61952 tok/s +step 12020/19560 | loss 3.400317 (+1.12z)| norm 0.2483 (-1.59z)| lr 2.07e-04 | 8458.07 ms | -100.0% bf16 MFU | 61954 tok/s +step 12021/19560 | loss 3.379426 (+0.58z)| norm 0.2595 (-0.93z)| lr 2.07e-04 | 8460.67 ms | -100.0% bf16 MFU | 61955 tok/s +step 12022/19560 | loss 3.373506 (+0.42z)| norm 0.2669 (-0.51z)| lr 2.07e-04 | 8465.93 ms | -100.0% bf16 MFU | 61954 tok/s +step 12023/19560 | loss 3.333857 (-0.58z)| norm 0.2747 (-0.06z)| lr 2.07e-04 | 8461.07 ms | -100.0% bf16 MFU | 61954 tok/s +step 12024/19560 | loss 3.451838 (+2.37z)| norm 0.2574 (-1.03z)| lr 2.07e-04 | 8464.06 ms | -100.0% bf16 MFU | 61954 tok/s +step 12025/19560 | loss 3.371431 (+0.35z)| norm 0.2693 (-0.34z)| lr 2.07e-04 | 8458.72 ms | -100.0% bf16 MFU | 61955 tok/s +step 12026/19560 | loss 3.398604 (+1.03z)| norm 0.2736 (-0.08z)| lr 2.07e-04 | 8457.20 ms | -100.0% bf16 MFU | 61957 tok/s +step 12027/19560 | loss 3.293971 (-1.64z)| norm 0.2614 (-0.79z)| lr 2.07e-04 | 8457.68 ms | -100.0% bf16 MFU | 61958 tok/s +step 12028/19560 | loss 3.399441 (+1.04z)| norm 0.2847 (+0.58z)| lr 2.07e-04 | 8460.62 ms | -100.0% bf16 MFU | 61959 tok/s +step 12029/19560 | loss 3.326217 (-0.83z)| norm 0.2652 (-0.57z)| lr 2.07e-04 | 8459.25 ms | -100.0% bf16 MFU | 61960 tok/s +step 12030/19560 | loss 3.329934 (-0.73z)| norm 0.2635 (-0.66z)| lr 2.07e-04 | 8456.73 ms | -100.0% bf16 MFU | 61962 tok/s +step 12031/19560 | loss 3.420374 (+1.56z)| norm 0.2767 (+0.12z)| lr 2.07e-04 | 8455.62 ms | -100.0% bf16 MFU | 61964 tok/s +step 12032/19560 | loss 3.346213 (-0.31z)| norm 0.2658 (-0.52z)| lr 2.07e-04 | 8462.18 ms | -100.0% bf16 MFU | 61964 tok/s +step 12033/19560 | loss 3.283011 (-1.88z)| norm 0.2800 (+0.33z)| lr 2.07e-04 | 8459.89 ms | -100.0% bf16 MFU | 61964 tok/s +step 12034/19560 | loss 3.466264 (+2.61z)| norm 0.2773 (+0.16z)| lr 2.06e-04 | 8455.11 ms | -100.0% bf16 MFU | 61966 tok/s +step 12035/19560 | loss 3.415760 (+1.36z)| norm 0.2605 (-0.82z)| lr 2.06e-04 | 8458.60 ms | -100.0% bf16 MFU | 61967 tok/s +step 12036/19560 | loss 3.371785 (+0.29z)| norm 0.2990 (+1.44z)| lr 2.06e-04 | 8462.24 ms | -100.0% bf16 MFU | 61967 tok/s +step 12037/19560 | loss 3.444248 (+2.00z)| norm 0.2802 (+0.33z)| lr 2.06e-04 | 8459.73 ms | -100.0% bf16 MFU | 61967 tok/s +step 12038/19560 | loss 3.311213 (-1.18z)| norm 0.3177 (+2.46z)| lr 2.06e-04 | 8455.21 ms | -100.0% bf16 MFU | 61969 tok/s +step 12039/19560 | loss 3.319346 (-0.99z)| norm 0.2830 (+0.48z)| lr 2.06e-04 | 8456.89 ms | -100.0% bf16 MFU | 61970 tok/s +step 12040/19560 | loss 3.379360 (+0.45z)| norm 0.2993 (+1.42z)| lr 2.06e-04 | 8460.85 ms | -100.0% bf16 MFU | 61970 tok/s +step 12041/19560 | loss 3.426601 (+1.56z)| norm 0.2852 (+0.61z)| lr 2.06e-04 | 8454.81 ms | -100.0% bf16 MFU | 61972 tok/s +step 12042/19560 | loss 3.400681 (+0.93z)| norm 0.2774 (+0.16z)| lr 2.06e-04 | 8453.72 ms | -100.0% bf16 MFU | 61974 tok/s +step 12043/19560 | loss 3.329771 (-0.77z)| norm 0.2725 (-0.12z)| lr 2.06e-04 | 8457.93 ms | -100.0% bf16 MFU | 61975 tok/s +step 12044/19560 | loss 3.352823 (-0.22z)| norm 0.2729 (-0.10z)| lr 2.06e-04 | 8455.10 ms | -100.0% bf16 MFU | 61977 tok/s +step 12045/19560 | loss 3.308382 (-1.28z)| norm 0.3057 (+1.77z)| lr 2.06e-04 | 8461.75 ms | -100.0% bf16 MFU | 61976 tok/s +step 12046/19560 | loss 3.344913 (-0.39z)| norm 0.3039 (+1.64z)| lr 2.06e-04 | 8461.68 ms | -100.0% bf16 MFU | 61975 tok/s +step 12047/19560 | loss 3.346855 (-0.34z)| norm 0.2722 (-0.17z)| lr 2.06e-04 | 8457.35 ms | -100.0% bf16 MFU | 61976 tok/s +step 12048/19560 | loss 3.351724 (-0.22z)| norm 0.2896 (+0.82z)| lr 2.06e-04 | 8454.26 ms | -100.0% bf16 MFU | 61978 tok/s +step 12049/19560 | loss 3.371011 (+0.24z)| norm 0.2665 (-0.49z)| lr 2.06e-04 | 8456.73 ms | -100.0% bf16 MFU | 61979 tok/s +step 12050/19560 | loss 3.337557 (-0.56z)| norm 0.2710 (-0.23z)| lr 2.06e-04 | 8457.74 ms | -100.0% bf16 MFU | 61979 tok/s +step 12051/19560 | loss 3.352734 (-0.18z)| norm 0.2682 (-0.40z)| lr 2.06e-04 | 8457.86 ms | -100.0% bf16 MFU | 61980 tok/s +step 12052/19560 | loss 3.357081 (-0.08z)| norm 0.2857 (+0.59z)| lr 2.06e-04 | 8455.81 ms | -100.0% bf16 MFU | 61981 tok/s +step 12053/19560 | loss 3.363894 (+0.07z)| norm 0.2695 (-0.35z)| lr 2.06e-04 | 8462.68 ms | -100.0% bf16 MFU | 61980 tok/s +step 12054/19560 | loss 3.362974 (+0.04z)| norm 0.3139 (+2.16z)| lr 2.06e-04 | 8457.98 ms | -100.0% bf16 MFU | 61980 tok/s +step 12055/19560 | loss 3.340425 (-0.51z)| norm 0.2688 (-0.41z)| lr 2.05e-04 | 8456.31 ms | -100.0% bf16 MFU | 61981 tok/s +step 12056/19560 | loss 3.384243 (+0.56z)| norm 0.3146 (+2.15z)| lr 2.05e-04 | 8457.98 ms | -100.0% bf16 MFU | 61981 tok/s +step 12057/19560 | loss 3.326756 (-0.85z)| norm 0.2633 (-0.74z)| lr 2.05e-04 | 8454.62 ms | -100.0% bf16 MFU | 61983 tok/s +step 12058/19560 | loss 3.383601 (+0.54z)| norm 0.2881 (+0.65z)| lr 2.05e-04 | 8455.58 ms | -100.0% bf16 MFU | 61984 tok/s +step 12059/19560 | loss 3.444066 (+1.98z)| norm 0.2861 (+0.52z)| lr 2.05e-04 | 8458.25 ms | -100.0% bf16 MFU | 61984 tok/s +step 12060/19560 | loss 3.474046 (+2.61z)| norm 0.2826 (+0.32z)| lr 2.05e-04 | 8454.91 ms | -100.0% bf16 MFU | 61985 tok/s +step 12061/19560 | loss 3.389205 (+0.60z)| norm 0.2876 (+0.59z)| lr 2.05e-04 | 8456.16 ms | -100.0% bf16 MFU | 61986 tok/s +step 12062/19560 | loss 3.361542 (-0.06z)| norm 0.2763 (-0.05z)| lr 2.05e-04 | 8459.38 ms | -100.0% bf16 MFU | 61986 tok/s +step 12063/19560 | loss 3.359531 (-0.10z)| norm 0.2972 (+1.13z)| lr 2.05e-04 | 8457.52 ms | -100.0% bf16 MFU | 61986 tok/s +step 12064/19560 | loss 3.354521 (-0.22z)| norm 0.2654 (-0.65z)| lr 2.05e-04 | 8454.42 ms | -100.0% bf16 MFU | 61987 tok/s +step 12065/19560 | loss 3.335524 (-0.67z)| norm 0.2800 (+0.16z)| lr 2.05e-04 | 8452.77 ms | -100.0% bf16 MFU | 61989 tok/s +step 12066/19560 | loss 3.332727 (-0.73z)| norm 0.2616 (-0.86z)| lr 2.05e-04 | 8455.99 ms | -100.0% bf16 MFU | 61990 tok/s +step 12067/19560 | loss 3.330150 (-0.78z)| norm 0.2622 (-0.82z)| lr 2.05e-04 | 8456.94 ms | -100.0% bf16 MFU | 61990 tok/s +step 12068/19560 | loss 3.372112 (+0.22z)| norm 0.2790 (+0.12z)| lr 2.05e-04 | 8457.38 ms | -100.0% bf16 MFU | 61990 tok/s +step 12069/19560 | loss 3.287421 (-1.75z)| norm 0.2757 (-0.07z)| lr 2.05e-04 | 8459.04 ms | -100.0% bf16 MFU | 61990 tok/s +step 12070/19560 | loss 3.339784 (-0.52z)| norm 0.2724 (-0.25z)| lr 2.05e-04 | 8455.94 ms | -100.0% bf16 MFU | 61990 tok/s +step 12071/19560 | loss 3.396129 (+0.79z)| norm 0.2766 (-0.02z)| lr 2.05e-04 | 8456.04 ms | -100.0% bf16 MFU | 61991 tok/s +step 12072/19560 | loss 3.291212 (-1.63z)| norm 0.2921 (+0.84z)| lr 2.05e-04 | 8455.54 ms | -100.0% bf16 MFU | 61992 tok/s +step 12073/19560 | loss 3.405052 (+0.99z)| norm 0.2963 (+1.06z)| lr 2.05e-04 | 8456.05 ms | -100.0% bf16 MFU | 61992 tok/s +step 12074/19560 | loss 3.367110 (+0.11z)| norm 0.2670 (-0.58z)| lr 2.05e-04 | 8459.00 ms | -100.0% bf16 MFU | 61991 tok/s +step 12075/19560 | loss 3.332668 (-0.70z)| norm 0.2759 (-0.07z)| lr 2.05e-04 | 8453.43 ms | -100.0% bf16 MFU | 61993 tok/s +step 12076/19560 | loss 3.330952 (-0.72z)| norm 0.2690 (-0.45z)| lr 2.04e-04 | 8453.15 ms | -100.0% bf16 MFU | 61994 tok/s +step 12077/19560 | loss 3.355487 (-0.15z)| norm 0.2826 (+0.31z)| lr 2.04e-04 | 8459.54 ms | -100.0% bf16 MFU | 61993 tok/s +step 12078/19560 | loss 3.345831 (-0.36z)| norm 0.2897 (+0.69z)| lr 2.04e-04 | 8459.41 ms | -100.0% bf16 MFU | 61993 tok/s +step 12079/19560 | loss 3.378374 (+0.40z)| norm 0.2945 (+0.95z)| lr 2.04e-04 | 8456.79 ms | -100.0% bf16 MFU | 61993 tok/s +step 12080/19560 | loss 3.379178 (+0.41z)| norm 0.2733 (-0.24z)| lr 2.04e-04 | 8455.22 ms | -100.0% bf16 MFU | 61994 tok/s +step 12081/19560 | loss 3.382087 (+0.47z)| norm 0.2897 (+0.68z)| lr 2.04e-04 | 8455.63 ms | -100.0% bf16 MFU | 61994 tok/s +step 12082/19560 | loss 3.342798 (-0.44z)| norm 0.2796 (+0.11z)| lr 2.04e-04 | 8457.44 ms | -100.0% bf16 MFU | 61994 tok/s +step 12083/19560 | loss 3.353086 (-0.21z)| norm 0.2986 (+1.17z)| lr 2.04e-04 | 8455.33 ms | -100.0% bf16 MFU | 61995 tok/s +step 12084/19560 | loss 3.391282 (+0.68z)| norm 0.2885 (+0.59z)| lr 2.04e-04 | 8454.63 ms | -100.0% bf16 MFU | 61995 tok/s +step 12085/19560 | loss 3.382150 (+0.46z)| norm 0.2722 (-0.32z)| lr 2.04e-04 | 8460.82 ms | -100.0% bf16 MFU | 61994 tok/s +step 12086/19560 | loss 3.308859 (-1.24z)| norm 0.2856 (+0.43z)| lr 2.04e-04 | 8456.09 ms | -100.0% bf16 MFU | 61994 tok/s +step 12087/19560 | loss 3.360166 (-0.05z)| norm 0.3337 (+2.99z)| lr 2.04e-04 | 8457.13 ms | -100.0% bf16 MFU | 61994 tok/s +step 12088/19560 | loss 3.349477 (-0.29z)| norm 0.3044 (+1.39z)| lr 2.04e-04 | 8459.09 ms | -100.0% bf16 MFU | 61994 tok/s +step 12089/19560 | loss 3.364550 (+0.06z)| norm 0.2854 (+0.35z)| lr 2.04e-04 | 8454.17 ms | -100.0% bf16 MFU | 61995 tok/s +step 12090/19560 | loss 3.366886 (+0.12z)| norm 0.2882 (+0.52z)| lr 2.04e-04 | 8455.54 ms | -100.0% bf16 MFU | 61995 tok/s +step 12091/19560 | loss 3.387810 (+0.60z)| norm 0.2818 (+0.16z)| lr 2.04e-04 | 8457.02 ms | -100.0% bf16 MFU | 61995 tok/s +step 12092/19560 | loss 3.378908 (+0.38z)| norm 0.2929 (+0.76z)| lr 2.04e-04 | 8455.61 ms | -100.0% bf16 MFU | 61996 tok/s +step 12093/19560 | loss 3.354997 (-0.18z)| norm 0.2625 (-0.88z)| lr 2.04e-04 | 8456.73 ms | -100.0% bf16 MFU | 61996 tok/s +step 12094/19560 | loss 3.368055 (+0.13z)| norm 0.2684 (-0.56z)| lr 2.04e-04 | 8456.20 ms | -100.0% bf16 MFU | 61996 tok/s +step 12095/19560 | loss 3.303838 (-1.38z)| norm 0.2589 (-1.06z)| lr 2.04e-04 | 8456.04 ms | -100.0% bf16 MFU | 61996 tok/s +step 12096/19560 | loss 3.325924 (-0.85z)| norm 0.2585 (-1.07z)| lr 2.04e-04 | 8456.26 ms | -100.0% bf16 MFU | 61996 tok/s +step 12097/19560 | loss 3.339802 (-0.51z)| norm 0.2527 (-1.37z)| lr 2.04e-04 | 8457.65 ms | -100.0% bf16 MFU | 61996 tok/s +step 12098/19560 | loss 3.337205 (-0.57z)| norm 0.2661 (-0.63z)| lr 2.03e-04 | 8453.72 ms | -100.0% bf16 MFU | 61997 tok/s +step 12099/19560 | loss 3.344280 (-0.42z)| norm 0.2652 (-0.67z)| lr 2.03e-04 | 8454.49 ms | -100.0% bf16 MFU | 61998 tok/s +step 12100/19560 | loss 3.368273 (+0.15z)| norm 0.2794 (+0.10z)| lr 2.03e-04 | 8457.13 ms | -100.0% bf16 MFU | 61998 tok/s +step 12101/19560 | loss 3.340441 (-0.51z)| norm 0.2738 (-0.20z)| lr 2.03e-04 | 8455.80 ms | -100.0% bf16 MFU | 61998 tok/s +step 12102/19560 | loss 3.402661 (+1.11z)| norm 0.2729 (-0.24z)| lr 2.03e-04 | 8454.21 ms | -100.0% bf16 MFU | 61999 tok/s +step 12103/19560 | loss 3.357360 (-0.09z)| norm 0.3540 (+3.89z)| lr 2.03e-04 | 8458.43 ms | -100.0% bf16 MFU | 61998 tok/s +step 12104/19560 | loss 3.319123 (-1.08z)| norm 0.3004 (+1.13z)| lr 2.03e-04 | 8456.84 ms | -100.0% bf16 MFU | 61998 tok/s +step 12105/19560 | loss 3.311325 (-1.27z)| norm 0.2685 (-0.49z)| lr 2.03e-04 | 8457.99 ms | -100.0% bf16 MFU | 61997 tok/s +step 12106/19560 | loss 3.341874 (-0.47z)| norm 0.3062 (+1.40z)| lr 2.03e-04 | 8457.71 ms | -100.0% bf16 MFU | 61997 tok/s +step 12107/19560 | loss 3.420843 (+1.58z)| norm 0.2615 (-0.85z)| lr 2.03e-04 | 8456.46 ms | -100.0% bf16 MFU | 61997 tok/s +step 12108/19560 | loss 3.340017 (-0.52z)| norm 0.2898 (+0.57z)| lr 2.03e-04 | 8458.15 ms | -100.0% bf16 MFU | 61997 tok/s +step 12109/19560 | loss 3.386909 (+0.69z)| norm 0.2990 (+1.21z)| lr 2.03e-04 | 8456.51 ms | -100.0% bf16 MFU | 61997 tok/s +step 12110/19560 | loss 3.389500 (+0.76z)| norm 0.2741 (-0.22z)| lr 2.03e-04 | 8452.54 ms | -100.0% bf16 MFU | 61998 tok/s +step 12111/19560 | loss 3.366994 (+0.17z)| norm 0.2775 (-0.02z)| lr 2.03e-04 | 8457.36 ms | -100.0% bf16 MFU | 61998 tok/s +step 12112/19560 | loss 3.430920 (+1.81z)| norm 0.2663 (-0.66z)| lr 2.03e-04 | 8454.98 ms | -100.0% bf16 MFU | 61998 tok/s +step 12113/19560 | loss 3.341480 (-0.50z)| norm 0.2956 (+1.02z)| lr 2.03e-04 | 8458.21 ms | -100.0% bf16 MFU | 61998 tok/s +step 12114/19560 | loss 3.353166 (-0.19z)| norm 0.2842 (+0.36z)| lr 2.03e-04 | 8454.14 ms | -100.0% bf16 MFU | 61999 tok/s +step 12115/19560 | loss 3.359462 (-0.03z)| norm 0.3133 (+1.99z)| lr 2.03e-04 | 8456.08 ms | -100.0% bf16 MFU | 61999 tok/s +step 12116/19560 | loss 3.360730 (+0.01z)| norm 0.2740 (-0.24z)| lr 2.03e-04 | 8454.95 ms | -100.0% bf16 MFU | 61999 tok/s +step 12117/19560 | loss 3.408818 (+1.29z)| norm 0.2941 (+0.89z)| lr 2.03e-04 | 8458.36 ms | -100.0% bf16 MFU | 61999 tok/s +step 12118/19560 | loss 3.345515 (-0.40z)| norm 0.2760 (-0.13z)| lr 2.03e-04 | 8454.94 ms | -100.0% bf16 MFU | 61999 tok/s +step 12119/19560 | loss 3.338430 (-0.57z)| norm 0.2659 (-0.69z)| lr 2.02e-04 | 8459.93 ms | -100.0% bf16 MFU | 61998 tok/s +step 12120/19560 | loss 3.315763 (-1.16z)| norm 0.2786 (+0.02z)| lr 2.02e-04 | 8484.36 ms | -100.0% bf16 MFU | 61988 tok/s +step 12121/19560 | loss 3.385706 (+0.69z)| norm 0.2707 (-0.42z)| lr 2.02e-04 | 8485.59 ms | -100.0% bf16 MFU | 61978 tok/s +step 12122/19560 | loss 3.389264 (+0.77z)| norm 0.3331 (+3.05z)| lr 2.02e-04 | 8483.54 ms | -100.0% bf16 MFU | 61969 tok/s +step 12123/19560 | loss 3.373343 (+0.33z)| norm 0.2862 (+0.42z)| lr 2.02e-04 | 8485.92 ms | -100.0% bf16 MFU | 61959 tok/s +step 12124/19560 | loss 3.331806 (-0.79z)| norm 0.2926 (+0.77z)| lr 2.02e-04 | 8485.73 ms | -100.0% bf16 MFU | 61951 tok/s +step 12125/19560 | loss 3.320560 (-1.08z)| norm 0.2687 (-0.57z)| lr 2.02e-04 | 8478.13 ms | -100.0% bf16 MFU | 61945 tok/s +step 12126/19560 | loss 3.364065 (+0.09z)| norm 0.2793 (+0.01z)| lr 2.02e-04 | 8481.39 ms | -100.0% bf16 MFU | 61939 tok/s +step 12127/19560 | loss 3.319880 (-1.10z)| norm 0.2663 (-0.72z)| lr 2.02e-04 | 8484.18 ms | -100.0% bf16 MFU | 61932 tok/s +step 12128/19560 | loss 3.423058 (+1.66z)| norm 0.2884 (+0.52z)| lr 2.02e-04 | 8476.65 ms | -100.0% bf16 MFU | 61928 tok/s +step 12129/19560 | loss 3.365260 (+0.10z)| norm 0.2669 (-0.68z)| lr 2.02e-04 | 8476.99 ms | -100.0% bf16 MFU | 61924 tok/s +step 12130/19560 | loss 3.309803 (-1.37z)| norm 0.2912 (+0.68z)| lr 2.02e-04 | 8481.12 ms | -100.0% bf16 MFU | 61918 tok/s +step 12131/19560 | loss 3.336002 (-0.66z)| norm 0.2647 (-0.80z)| lr 2.02e-04 | 8478.90 ms | -100.0% bf16 MFU | 61914 tok/s +step 12132/19560 | loss 3.339719 (-0.56z)| norm 0.2882 (+0.51z)| lr 2.02e-04 | 8484.12 ms | -100.0% bf16 MFU | 61908 tok/s +step 12133/19560 | loss 3.343278 (-0.45z)| norm 0.2798 (+0.04z)| lr 2.02e-04 | 8472.32 ms | -100.0% bf16 MFU | 61907 tok/s +step 12134/19560 | loss 3.279056 (-2.15z)| norm 0.2684 (-0.62z)| lr 2.02e-04 | 8474.28 ms | -100.0% bf16 MFU | 61905 tok/s +step 12135/19560 | loss 3.353242 (-0.16z)| norm 0.2741 (-0.31z)| lr 2.02e-04 | 8470.05 ms | -100.0% bf16 MFU | 61905 tok/s +step 12136/19560 | loss 3.366757 (+0.19z)| norm 0.2537 (-1.48z)| lr 2.02e-04 | 8476.46 ms | -100.0% bf16 MFU | 61902 tok/s +step 12137/19560 | loss 3.296930 (-1.66z)| norm 0.2641 (-0.86z)| lr 2.02e-04 | 8471.24 ms | -100.0% bf16 MFU | 61902 tok/s +step 12138/19560 | loss 3.367163 (+0.23z)| norm 0.2656 (-0.77z)| lr 2.02e-04 | 8468.73 ms | -100.0% bf16 MFU | 61902 tok/s +step 12139/19560 | loss 3.304799 (-1.45z)| norm 0.2592 (-1.14z)| lr 2.02e-04 | 8475.92 ms | -100.0% bf16 MFU | 61900 tok/s +step 12140/19560 | loss 3.354388 (-0.11z)| norm 0.2733 (-0.33z)| lr 2.01e-04 | 8475.19 ms | -100.0% bf16 MFU | 61898 tok/s +step 12141/19560 | loss 3.332716 (-0.69z)| norm 0.2626 (-0.95z)| lr 2.01e-04 | 8475.02 ms | -100.0% bf16 MFU | 61896 tok/s +step 12142/19560 | loss 3.375342 (+0.45z)| norm 0.2619 (-1.01z)| lr 2.01e-04 | 8474.11 ms | -100.0% bf16 MFU | 61895 tok/s +step 12143/19560 | loss 3.272820 (-2.28z)| norm 0.2513 (-1.61z)| lr 2.01e-04 | 8474.56 ms | -100.0% bf16 MFU | 61893 tok/s +step 12144/19560 | loss 3.316423 (-1.11z)| norm 0.2747 (-0.26z)| lr 2.01e-04 | 8472.20 ms | -100.0% bf16 MFU | 61893 tok/s +step 12145/19560 | loss 3.335460 (-0.60z)| norm 0.2672 (-0.70z)| lr 2.01e-04 | 8476.41 ms | -100.0% bf16 MFU | 61891 tok/s +step 12146/19560 | loss 3.356175 (-0.05z)| norm 0.2780 (-0.07z)| lr 2.01e-04 | 8470.45 ms | -100.0% bf16 MFU | 61891 tok/s +step 12147/19560 | loss 3.362239 (+0.11z)| norm 0.2721 (-0.42z)| lr 2.01e-04 | 8471.41 ms | -100.0% bf16 MFU | 61891 tok/s +step 12148/19560 | loss 3.345416 (-0.33z)| norm 0.2655 (-0.82z)| lr 2.01e-04 | 8475.99 ms | -100.0% bf16 MFU | 61889 tok/s +step 12149/19560 | loss 3.322171 (-0.93z)| norm 0.2645 (-0.88z)| lr 2.01e-04 | 8475.28 ms | -100.0% bf16 MFU | 61888 tok/s +step 12150/19560 | loss 3.406286 (+1.29z)| norm 0.2640 (-0.91z)| lr 2.01e-04 | 8477.06 ms | -100.0% bf16 MFU | 61886 tok/s +step 12151/19560 | loss 3.280510 (-1.99z)| norm 0.2930 (+0.80z)| lr 2.01e-04 | 8466.45 ms | -100.0% bf16 MFU | 61888 tok/s +step 12152/19560 | loss 3.357560 (+0.03z)| norm 0.2658 (-0.82z)| lr 2.01e-04 | 8467.78 ms | -100.0% bf16 MFU | 61889 tok/s +step 12153/19560 | loss 3.342601 (-0.36z)| norm 0.2731 (-0.39z)| lr 2.01e-04 | 8475.87 ms | -100.0% bf16 MFU | 61887 tok/s +step 12154/19560 | loss 3.337219 (-0.49z)| norm 0.2796 (+0.00z)| lr 2.01e-04 | 8472.44 ms | -100.0% bf16 MFU | 61887 tok/s +step 12155/19560 | loss 3.346445 (-0.26z)| norm 0.2705 (-0.55z)| lr 2.01e-04 | 8470.55 ms | -100.0% bf16 MFU | 61888 tok/s +step 12156/19560 | loss 3.273572 (-2.18z)| norm 0.2608 (-1.11z)| lr 2.01e-04 | 8473.17 ms | -100.0% bf16 MFU | 61887 tok/s +step 12157/19560 | loss 3.338861 (-0.44z)| norm 0.2646 (-0.88z)| lr 2.01e-04 | 8468.88 ms | -100.0% bf16 MFU | 61888 tok/s +step 12158/19560 | loss 3.360057 (+0.13z)| norm 0.2814 (+0.11z)| lr 2.01e-04 | 8470.96 ms | -100.0% bf16 MFU | 61888 tok/s +step 12159/19560 | loss 3.288341 (-1.77z)| norm 0.2754 (-0.25z)| lr 2.01e-04 | 8466.02 ms | -100.0% bf16 MFU | 61890 tok/s +step 12160/19560 | loss 3.394469 (+1.06z)| norm 0.2803 (+0.03z)| lr 2.01e-04 | 8464.73 ms | -100.0% bf16 MFU | 61893 tok/s +step 12161/19560 | loss 3.352264 (-0.08z)| norm 0.2638 (-0.94z)| lr 2.00e-04 | 8470.86 ms | -100.0% bf16 MFU | 61893 tok/s +step 12162/19560 | loss 3.316985 (-1.04z)| norm 0.2752 (-0.26z)| lr 2.00e-04 | 8470.08 ms | -100.0% bf16 MFU | 61893 tok/s +step 12163/19560 | loss 3.343784 (-0.28z)| norm 0.2761 (-0.22z)| lr 2.00e-04 | 8467.43 ms | -100.0% bf16 MFU | 61894 tok/s +step 12164/19560 | loss 3.390464 (+1.04z)| norm 0.2731 (-0.39z)| lr 2.00e-04 | 8464.99 ms | -100.0% bf16 MFU | 61896 tok/s +step 12165/19560 | loss 3.277809 (-2.12z)| norm 0.2984 (+1.12z)| lr 2.00e-04 | 8468.99 ms | -100.0% bf16 MFU | 61897 tok/s +step 12166/19560 | loss 3.360346 (+0.22z)| norm 0.2723 (-0.43z)| lr 2.00e-04 | 8464.93 ms | -100.0% bf16 MFU | 61899 tok/s +step 12167/19560 | loss 3.338627 (-0.41z)| norm 0.2735 (-0.35z)| lr 2.00e-04 | 8464.86 ms | -100.0% bf16 MFU | 61901 tok/s +step 12168/19560 | loss 3.325530 (-0.77z)| norm 0.2828 (+0.23z)| lr 2.00e-04 | 8469.75 ms | -100.0% bf16 MFU | 61901 tok/s +step 12169/19560 | loss 3.386768 (+1.00z)| norm 0.2934 (+0.87z)| lr 2.00e-04 | 8469.55 ms | -100.0% bf16 MFU | 61901 tok/s +step 12170/19560 | loss 3.419319 (+1.93z)| norm 0.2640 (-0.92z)| lr 2.00e-04 | 8460.81 ms | -100.0% bf16 MFU | 61904 tok/s +step 12171/19560 | loss 3.261035 (-2.56z)| norm 0.2843 (+0.31z)| lr 2.00e-04 | 8467.61 ms | -100.0% bf16 MFU | 61905 tok/s +step 12172/19560 | loss 3.394511 (+1.19z)| norm 0.2729 (-0.38z)| lr 2.00e-04 | 8463.19 ms | -100.0% bf16 MFU | 61907 tok/s +step 12173/19560 | loss 3.352625 (+0.00z)| norm 0.2716 (-0.45z)| lr 2.00e-04 | 8464.80 ms | -100.0% bf16 MFU | 61909 tok/s +step 12174/19560 | loss 3.401559 (+1.36z)| norm 0.2781 (-0.03z)| lr 2.00e-04 | 8469.40 ms | -100.0% bf16 MFU | 61908 tok/s +step 12175/19560 | loss 3.365384 (+0.34z)| norm 0.2664 (-0.76z)| lr 2.00e-04 | 8466.72 ms | -100.0% bf16 MFU | 61909 tok/s +step 12176/19560 | loss 3.360434 (+0.20z)| norm 0.2785 (-0.01z)| lr 2.00e-04 | 8469.12 ms | -100.0% bf16 MFU | 61909 tok/s +step 12177/19560 | loss 3.324131 (-0.80z)| norm 0.2781 (-0.03z)| lr 2.00e-04 | 8466.68 ms | -100.0% bf16 MFU | 61910 tok/s +step 12178/19560 | loss 3.357419 (+0.13z)| norm 0.2758 (-0.18z)| lr 2.00e-04 | 8460.25 ms | -100.0% bf16 MFU | 61913 tok/s +step 12179/19560 | loss 3.344265 (-0.24z)| norm 0.2826 (+0.24z)| lr 2.00e-04 | 8459.09 ms | -100.0% bf16 MFU | 61916 tok/s +step 12180/19560 | loss 3.342191 (-0.29z)| norm 0.2702 (-0.53z)| lr 2.00e-04 | 8446.65 ms | -100.0% bf16 MFU | 61924 tok/s +step 12181/19560 | loss 3.354876 (+0.06z)| norm 0.2748 (-0.25z)| lr 2.00e-04 | 8444.12 ms | -100.0% bf16 MFU | 61932 tok/s +step 12182/19560 | loss 3.308467 (-1.22z)| norm 0.2769 (-0.10z)| lr 1.99e-04 | 8444.60 ms | -100.0% bf16 MFU | 61940 tok/s +step 12183/19560 | loss 3.377632 (+0.70z)| norm 0.2842 (+0.36z)| lr 1.99e-04 | 8438.83 ms | -100.0% bf16 MFU | 61949 tok/s +step 12184/19560 | loss 3.358222 (+0.16z)| norm 0.2619 (-1.06z)| lr 1.99e-04 | 8444.71 ms | -100.0% bf16 MFU | 61956 tok/s +step 12185/19560 | loss 3.346209 (-0.17z)| norm 0.2942 (+1.03z)| lr 1.99e-04 | 8443.37 ms | -100.0% bf16 MFU | 61963 tok/s +step 12186/19560 | loss 3.320457 (-0.88z)| norm 0.2749 (-0.22z)| lr 1.99e-04 | 8446.20 ms | -100.0% bf16 MFU | 61968 tok/s +step 12187/19560 | loss 3.387966 (+1.04z)| norm 0.2822 (+0.26z)| lr 1.99e-04 | 8442.87 ms | -100.0% bf16 MFU | 61975 tok/s +step 12188/19560 | loss 3.326185 (-0.72z)| norm 0.2731 (-0.33z)| lr 1.99e-04 | 8447.72 ms | -100.0% bf16 MFU | 61979 tok/s +step 12189/19560 | loss 3.416096 (+1.94z)| norm 0.2687 (-0.61z)| lr 1.99e-04 | 8443.07 ms | -100.0% bf16 MFU | 61985 tok/s +step 12190/19560 | loss 3.295441 (-1.60z)| norm 0.2790 (+0.06z)| lr 1.99e-04 | 8448.71 ms | -100.0% bf16 MFU | 61989 tok/s +step 12191/19560 | loss 3.419645 (+1.99z)| norm 0.2794 (+0.10z)| lr 1.99e-04 | 8446.06 ms | -100.0% bf16 MFU | 61993 tok/s +step 12192/19560 | loss 3.358494 (+0.23z)| norm 0.2754 (-0.17z)| lr 1.99e-04 | 8443.10 ms | -100.0% bf16 MFU | 61998 tok/s +step 12193/19560 | loss 3.328402 (-0.64z)| norm 0.2664 (-0.75z)| lr 1.99e-04 | 8444.56 ms | -100.0% bf16 MFU | 62003 tok/s +step 12194/19560 | loss 3.481972 (+3.57z)| norm 0.2820 (+0.26z)| lr 1.99e-04 | 8442.61 ms | -100.0% bf16 MFU | 62007 tok/s +step 12195/19560 | loss 3.413906 (+1.67z)| norm 0.2765 (-0.11z)| lr 1.99e-04 | 8441.94 ms | -100.0% bf16 MFU | 62012 tok/s +step 12196/19560 | loss 3.359365 (+0.19z)| norm 0.2606 (-1.15z)| lr 1.99e-04 | 8441.35 ms | -100.0% bf16 MFU | 62017 tok/s +step 12197/19560 | loss 3.329563 (-0.63z)| norm 0.2776 (-0.03z)| lr 1.99e-04 | 8448.30 ms | -100.0% bf16 MFU | 62019 tok/s +step 12198/19560 | loss 3.333129 (-0.53z)| norm 0.2698 (-0.54z)| lr 1.99e-04 | 8447.43 ms | -100.0% bf16 MFU | 62021 tok/s +step 12199/19560 | loss 3.328291 (-0.65z)| norm 0.2819 (+0.25z)| lr 1.99e-04 | 8443.21 ms | -100.0% bf16 MFU | 62025 tok/s +step 12200/19560 | loss 3.363353 (+0.30z)| norm 0.2754 (-0.16z)| lr 1.99e-04 | 8444.75 ms | -100.0% bf16 MFU | 62028 tok/s +step 12201/19560 | loss 3.294156 (-1.60z)| norm 0.2711 (-0.44z)| lr 1.99e-04 | 8450.66 ms | -100.0% bf16 MFU | 62029 tok/s +step 12202/19560 | loss 3.306844 (-1.23z)| norm 0.2835 (+0.38z)| lr 1.99e-04 | 8444.94 ms | -100.0% bf16 MFU | 62032 tok/s +step 12203/19560 | loss 3.314740 (-1.00z)| norm 0.2672 (-0.70z)| lr 1.99e-04 | 8448.04 ms | -100.0% bf16 MFU | 62033 tok/s +step 12204/19560 | loss 3.355033 (+0.10z)| norm 0.2827 (+0.32z)| lr 1.98e-04 | 8451.34 ms | -100.0% bf16 MFU | 62033 tok/s +step 12205/19560 | loss 3.325650 (-0.70z)| norm 0.2705 (-0.49z)| lr 1.98e-04 | 8455.96 ms | -100.0% bf16 MFU | 62032 tok/s +step 12206/19560 | loss 3.357986 (+0.19z)| norm 0.2668 (-0.72z)| lr 1.98e-04 | 8451.93 ms | -100.0% bf16 MFU | 62032 tok/s +step 12207/19560 | loss 3.281354 (-1.88z)| norm 0.2725 (-0.33z)| lr 1.98e-04 | 8458.85 ms | -100.0% bf16 MFU | 62029 tok/s +step 12208/19560 | loss 3.337103 (-0.35z)| norm 0.2752 (-0.15z)| lr 1.98e-04 | 8460.88 ms | -100.0% bf16 MFU | 62026 tok/s +step 12209/19560 | loss 3.322814 (-0.73z)| norm 0.2913 (+0.92z)| lr 1.98e-04 | 8456.24 ms | -100.0% bf16 MFU | 62025 tok/s +step 12210/19560 | loss 3.416482 (+1.79z)| norm 0.2778 (+0.02z)| lr 1.98e-04 | 8459.45 ms | -100.0% bf16 MFU | 62022 tok/s +step 12211/19560 | loss 3.404012 (+1.43z)| norm 0.2816 (+0.28z)| lr 1.98e-04 | 8453.14 ms | -100.0% bf16 MFU | 62022 tok/s +step 12212/19560 | loss 3.275488 (-1.96z)| norm 0.2780 (+0.05z)| lr 1.98e-04 | 8457.97 ms | -100.0% bf16 MFU | 62021 tok/s +step 12213/19560 | loss 3.377851 (+0.75z)| norm 0.2722 (-0.34z)| lr 1.98e-04 | 8458.80 ms | -100.0% bf16 MFU | 62019 tok/s +step 12214/19560 | loss 3.369736 (+0.52z)| norm 0.2730 (-0.28z)| lr 1.98e-04 | 8455.54 ms | -100.0% bf16 MFU | 62018 tok/s +step 12215/19560 | loss 3.313505 (-0.96z)| norm 0.2749 (-0.13z)| lr 1.98e-04 | 8459.29 ms | -100.0% bf16 MFU | 62016 tok/s +step 12216/19560 | loss 3.334016 (-0.41z)| norm 0.2755 (-0.07z)| lr 1.98e-04 | 8457.07 ms | -100.0% bf16 MFU | 62015 tok/s +step 12217/19560 | loss 3.412298 (+1.63z)| norm 0.2722 (-0.30z)| lr 1.98e-04 | 8456.42 ms | -100.0% bf16 MFU | 62014 tok/s +step 12218/19560 | loss 3.355367 (+0.14z)| norm 0.2955 (+1.38z)| lr 1.98e-04 | 8456.16 ms | -100.0% bf16 MFU | 62013 tok/s +step 12219/19560 | loss 3.334762 (-0.39z)| norm 0.2888 (+0.89z)| lr 1.98e-04 | 8457.47 ms | -100.0% bf16 MFU | 62012 tok/s +step 12220/19560 | loss 3.390852 (+1.08z)| norm 0.2945 (+1.29z)| lr 1.98e-04 | 8460.92 ms | -100.0% bf16 MFU | 62010 tok/s +step 12221/19560 | loss 3.300933 (-1.26z)| norm 0.2760 (-0.04z)| lr 1.98e-04 | 8457.43 ms | -100.0% bf16 MFU | 62009 tok/s +step 12222/19560 | loss 3.343006 (-0.16z)| norm 0.2649 (-0.84z)| lr 1.98e-04 | 8457.52 ms | -100.0% bf16 MFU | 62008 tok/s +step 12223/19560 | loss 3.368786 (+0.51z)| norm 0.2664 (-0.74z)| lr 1.98e-04 | 8458.17 ms | -100.0% bf16 MFU | 62007 tok/s +step 12224/19560 | loss 3.359944 (+0.27z)| norm 0.2751 (-0.12z)| lr 1.98e-04 | 8459.54 ms | -100.0% bf16 MFU | 62005 tok/s +step 12225/19560 | loss 3.303524 (-1.20z)| norm 0.2551 (-1.58z)| lr 1.97e-04 | 8457.57 ms | -100.0% bf16 MFU | 62005 tok/s +step 12226/19560 | loss 3.364452 (+0.39z)| norm 0.2901 (+0.96z)| lr 1.97e-04 | 8457.88 ms | -100.0% bf16 MFU | 62004 tok/s +step 12227/19560 | loss 3.377991 (+0.73z)| norm 0.2747 (-0.17z)| lr 1.97e-04 | 8459.50 ms | -100.0% bf16 MFU | 62002 tok/s +step 12228/19560 | loss 3.303773 (-1.18z)| norm 0.2843 (+0.53z)| lr 1.97e-04 | 8459.42 ms | -100.0% bf16 MFU | 62001 tok/s +step 12229/19560 | loss 3.346218 (-0.08z)| norm 0.2734 (-0.27z)| lr 1.97e-04 | 8457.39 ms | -100.0% bf16 MFU | 62001 tok/s +step 12230/19560 | loss 3.361909 (+0.33z)| norm 0.2652 (-0.86z)| lr 1.97e-04 | 8460.73 ms | -100.0% bf16 MFU | 61999 tok/s +step 12231/19560 | loss 3.293744 (-1.42z)| norm 0.2816 (+0.44z)| lr 1.97e-04 | 8462.61 ms | -100.0% bf16 MFU | 61997 tok/s +step 12232/19560 | loss 3.328537 (-0.52z)| norm 0.2891 (+1.08z)| lr 1.97e-04 | 8460.36 ms | -100.0% bf16 MFU | 61995 tok/s +step 12233/19560 | loss 3.307798 (-1.06z)| norm 0.2728 (-0.30z)| lr 1.97e-04 | 8457.89 ms | -100.0% bf16 MFU | 61995 tok/s +step 12234/19560 | loss 3.320085 (-0.74z)| norm 0.2666 (-0.82z)| lr 1.97e-04 | 8458.72 ms | -100.0% bf16 MFU | 61994 tok/s +step 12235/19560 | loss 3.349452 (+0.04z)| norm 0.2792 (+0.26z)| lr 1.97e-04 | 8459.91 ms | -100.0% bf16 MFU | 61993 tok/s +step 12236/19560 | loss 3.333217 (-0.39z)| norm 0.2860 (+0.86z)| lr 1.97e-04 | 8455.22 ms | -100.0% bf16 MFU | 61994 tok/s +step 12237/19560 | loss 3.359021 (+0.30z)| norm 0.2855 (+0.84z)| lr 1.97e-04 | 8456.72 ms | -100.0% bf16 MFU | 61994 tok/s +step 12238/19560 | loss 3.283263 (-1.66z)| norm 0.2808 (+0.41z)| lr 1.97e-04 | 8461.83 ms | -100.0% bf16 MFU | 61992 tok/s +step 12239/19560 | loss 3.385590 (+1.01z)| norm 0.2762 (+0.01z)| lr 1.97e-04 | 8457.94 ms | -100.0% bf16 MFU | 61992 tok/s +step 12240/19560 | loss 3.375947 (+0.78z)| norm 0.2701 (-0.54z)| lr 1.97e-04 | 8460.76 ms | -100.0% bf16 MFU | 61991 tok/s +step 12241/19560 | loss 3.298100 (-1.27z)| norm 0.2824 (+0.57z)| lr 1.97e-04 | 8457.86 ms | -100.0% bf16 MFU | 61991 tok/s +step 12242/19560 | loss 3.315924 (-0.79z)| norm 0.2950 (+1.69z)| lr 1.97e-04 | 8460.57 ms | -100.0% bf16 MFU | 61990 tok/s +step 12243/19560 | loss 3.319489 (-0.69z)| norm 0.2565 (-1.78z)| lr 1.97e-04 | 8451.89 ms | -100.0% bf16 MFU | 61992 tok/s +step 12244/19560 | loss 3.333034 (-0.33z)| norm 0.3129 (+3.27z)| lr 1.97e-04 | 8457.42 ms | -100.0% bf16 MFU | 61992 tok/s +step 12245/19560 | loss 3.298774 (-1.21z)| norm 0.2679 (-0.70z)| lr 1.97e-04 | 8458.03 ms | -100.0% bf16 MFU | 61992 tok/s +step 12246/19560 | loss 3.282388 (-1.61z)| norm 0.2850 (+0.81z)| lr 1.96e-04 | 8459.99 ms | -100.0% bf16 MFU | 61991 tok/s +step 12247/19560 | loss 3.337682 (-0.17z)| norm 0.2881 (+1.07z)| lr 1.96e-04 | 8456.14 ms | -100.0% bf16 MFU | 61991 tok/s +step 12248/19560 | loss 3.336805 (-0.19z)| norm 0.2795 (+0.31z)| lr 1.96e-04 | 8454.03 ms | -100.0% bf16 MFU | 61992 tok/s +step 12249/19560 | loss 3.309915 (-0.88z)| norm 0.2743 (-0.15z)| lr 1.96e-04 | 8455.94 ms | -100.0% bf16 MFU | 61993 tok/s +step 12250/19560 | loss 3.395720 (+1.36z)| norm 0.2732 (-0.24z)| lr 1.96e-04 | 8457.59 ms | -100.0% bf16 MFU | 61993 tok/s +val loss 3.337786 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2933/10042 = 0.292073 +step 12251/19560 | loss 3.325501 (-0.47z)| norm 0.2754 (-0.01z)| lr 1.96e-04 | 8452.89 ms | -100.0% bf16 MFU | 61994 tok/s +step 12252/19560 | loss 3.317536 (-0.67z)| norm 0.2808 (+0.54z)| lr 1.96e-04 | 8456.18 ms | -100.0% bf16 MFU | 61995 tok/s +step 12253/19560 | loss 3.341821 (-0.04z)| norm 0.2592 (-1.62z)| lr 1.96e-04 | 8455.79 ms | -100.0% bf16 MFU | 61995 tok/s +step 12254/19560 | loss 3.330413 (-0.33z)| norm 0.2996 (+2.36z)| lr 1.96e-04 | 8454.44 ms | -100.0% bf16 MFU | 61996 tok/s +step 12255/19560 | loss 3.332596 (-0.28z)| norm 0.2573 (-1.76z)| lr 1.96e-04 | 8452.57 ms | -100.0% bf16 MFU | 61998 tok/s +step 12256/19560 | loss 3.378131 (+0.94z)| norm 0.2773 (+0.19z)| lr 1.96e-04 | 8457.33 ms | -100.0% bf16 MFU | 61997 tok/s +step 12257/19560 | loss 3.375319 (+0.86z)| norm 0.2747 (-0.07z)| lr 1.96e-04 | 8455.34 ms | -100.0% bf16 MFU | 61998 tok/s +step 12258/19560 | loss 3.324452 (-0.50z)| norm 0.2599 (-1.50z)| lr 1.96e-04 | 8457.71 ms | -100.0% bf16 MFU | 61997 tok/s +step 12259/19560 | loss 3.364996 (+0.58z)| norm 0.2541 (-2.04z)| lr 1.96e-04 | 8456.45 ms | -100.0% bf16 MFU | 61997 tok/s +step 12260/19560 | loss 3.367798 (+0.65z)| norm 0.2695 (-0.53z)| lr 1.96e-04 | 8457.80 ms | -100.0% bf16 MFU | 61997 tok/s +step 12261/19560 | loss 3.385916 (+1.11z)| norm 0.2483 (-2.51z)| lr 1.96e-04 | 8456.03 ms | -100.0% bf16 MFU | 61997 tok/s +step 12262/19560 | loss 3.342500 (-0.05z)| norm 0.2806 (+0.55z)| lr 1.96e-04 | 8454.11 ms | -100.0% bf16 MFU | 61998 tok/s +step 12263/19560 | loss 3.323768 (-0.54z)| norm 0.2581 (-1.56z)| lr 1.96e-04 | 8458.37 ms | -100.0% bf16 MFU | 61997 tok/s +step 12264/19560 | loss 3.316278 (-0.73z)| norm 0.2682 (-0.63z)| lr 1.96e-04 | 8455.63 ms | -100.0% bf16 MFU | 61998 tok/s +step 12265/19560 | loss 3.283453 (-1.60z)| norm 0.2576 (-1.62z)| lr 1.96e-04 | 8455.69 ms | -100.0% bf16 MFU | 61998 tok/s +step 12266/19560 | loss 3.349647 (+0.17z)| norm 0.2594 (-1.44z)| lr 1.96e-04 | 8454.93 ms | -100.0% bf16 MFU | 61999 tok/s +step 12267/19560 | loss 3.409808 (+1.73z)| norm 0.2526 (-2.06z)| lr 1.95e-04 | 8456.62 ms | -100.0% bf16 MFU | 61999 tok/s +step 12268/19560 | loss 3.343073 (-0.03z)| norm 0.2700 (-0.43z)| lr 1.95e-04 | 8456.49 ms | -100.0% bf16 MFU | 61999 tok/s +step 12269/19560 | loss 3.334522 (-0.26z)| norm 0.2483 (-2.40z)| lr 1.95e-04 | 8457.50 ms | -100.0% bf16 MFU | 61998 tok/s +step 12270/19560 | loss 3.350927 (+0.18z)| norm 0.2590 (-1.42z)| lr 1.95e-04 | 8458.02 ms | -100.0% bf16 MFU | 61998 tok/s +step 12271/19560 | loss 3.343307 (-0.03z)| norm 0.2634 (-1.03z)| lr 1.95e-04 | 8456.95 ms | -100.0% bf16 MFU | 61998 tok/s +step 12272/19560 | loss 3.295770 (-1.30z)| norm 0.2763 (+0.16z)| lr 1.95e-04 | 8454.88 ms | -100.0% bf16 MFU | 61998 tok/s +step 12273/19560 | loss 3.382788 (+1.01z)| norm 0.2818 (+0.66z)| lr 1.95e-04 | 8454.87 ms | -100.0% bf16 MFU | 61999 tok/s +step 12274/19560 | loss 3.287022 (-1.51z)| norm 0.2748 (+0.01z)| lr 1.95e-04 | 8457.18 ms | -100.0% bf16 MFU | 61998 tok/s +step 12275/19560 | loss 3.309573 (-0.90z)| norm 0.2780 (+0.31z)| lr 1.95e-04 | 8461.64 ms | -100.0% bf16 MFU | 61997 tok/s +step 12276/19560 | loss 3.298871 (-1.17z)| norm 0.2878 (+1.20z)| lr 1.95e-04 | 8458.20 ms | -100.0% bf16 MFU | 61996 tok/s +step 12277/19560 | loss 3.362867 (+0.50z)| norm 0.2836 (+0.80z)| lr 1.95e-04 | 8459.76 ms | -100.0% bf16 MFU | 61995 tok/s +step 12278/19560 | loss 3.350770 (+0.20z)| norm 0.2618 (-1.22z)| lr 1.95e-04 | 8455.26 ms | -100.0% bf16 MFU | 61996 tok/s +step 12279/19560 | loss 3.354573 (+0.29z)| norm 0.2827 (+0.72z)| lr 1.95e-04 | 8451.09 ms | -100.0% bf16 MFU | 61998 tok/s +step 12280/19560 | loss 3.324475 (-0.51z)| norm 0.2927 (+1.63z)| lr 1.95e-04 | 8453.40 ms | -100.0% bf16 MFU | 61999 tok/s +step 12281/19560 | loss 3.332233 (-0.30z)| norm 0.2516 (-2.12z)| lr 1.95e-04 | 8454.49 ms | -100.0% bf16 MFU | 62000 tok/s +step 12282/19560 | loss 3.322098 (-0.57z)| norm 0.3103 (+3.07z)| lr 1.95e-04 | 8454.58 ms | -100.0% bf16 MFU | 62000 tok/s +step 12283/19560 | loss 3.352659 (+0.24z)| norm 0.2784 (+0.28z)| lr 1.95e-04 | 8456.60 ms | -100.0% bf16 MFU | 62000 tok/s +step 12284/19560 | loss 3.367871 (+0.64z)| norm 0.2877 (+1.07z)| lr 1.95e-04 | 8454.05 ms | -100.0% bf16 MFU | 62001 tok/s +step 12285/19560 | loss 3.326930 (-0.47z)| norm 0.2671 (-0.74z)| lr 1.95e-04 | 8453.67 ms | -100.0% bf16 MFU | 62002 tok/s +step 12286/19560 | loss 3.283142 (-1.62z)| norm 0.2943 (+1.62z)| lr 1.95e-04 | 8457.08 ms | -100.0% bf16 MFU | 62001 tok/s +step 12287/19560 | loss 3.355397 (+0.31z)| norm 0.2939 (+1.56z)| lr 1.95e-04 | 8453.92 ms | -100.0% bf16 MFU | 62002 tok/s +step 12288/19560 | loss 3.367975 (+0.66z)| norm 0.2861 (+0.89z)| lr 1.95e-04 | 8454.87 ms | -100.0% bf16 MFU | 62003 tok/s +step 12289/19560 | loss 3.304570 (-1.05z)| norm 0.2600 (-1.35z)| lr 1.94e-04 | 8454.55 ms | -100.0% bf16 MFU | 62003 tok/s +step 12290/19560 | loss 3.310981 (-0.88z)| norm 0.2849 (+0.77z)| lr 1.94e-04 | 8454.04 ms | -100.0% bf16 MFU | 62004 tok/s +step 12291/19560 | loss 3.332293 (-0.30z)| norm 0.2678 (-0.68z)| lr 1.94e-04 | 8455.90 ms | -100.0% bf16 MFU | 62004 tok/s +step 12292/19560 | loss 3.340475 (-0.07z)| norm 0.2634 (-1.05z)| lr 1.94e-04 | 8452.91 ms | -100.0% bf16 MFU | 62005 tok/s +step 12293/19560 | loss 3.367925 (+0.67z)| norm 0.2726 (-0.25z)| lr 1.94e-04 | 8450.85 ms | -100.0% bf16 MFU | 62006 tok/s +step 12294/19560 | loss 3.381310 (+1.03z)| norm 0.2680 (-0.64z)| lr 1.94e-04 | 8455.30 ms | -100.0% bf16 MFU | 62006 tok/s +step 12295/19560 | loss 3.368186 (+0.66z)| norm 0.2767 (+0.11z)| lr 1.94e-04 | 8453.39 ms | -100.0% bf16 MFU | 62007 tok/s +step 12296/19560 | loss 3.296678 (-1.28z)| norm 0.2848 (+0.80z)| lr 1.94e-04 | 8455.05 ms | -100.0% bf16 MFU | 62007 tok/s +step 12297/19560 | loss 3.397940 (+1.47z)| norm 0.2796 (+0.37z)| lr 1.94e-04 | 8454.42 ms | -100.0% bf16 MFU | 62008 tok/s +step 12298/19560 | loss 3.312844 (-0.83z)| norm 0.2587 (-1.44z)| lr 1.94e-04 | 8453.98 ms | -100.0% bf16 MFU | 62008 tok/s +step 12299/19560 | loss 3.357132 (+0.37z)| norm 0.2709 (-0.38z)| lr 1.94e-04 | 8456.86 ms | -100.0% bf16 MFU | 62007 tok/s +step 12300/19560 | loss 3.393250 (+1.39z)| norm 0.2712 (-0.35z)| lr 1.94e-04 | 8455.66 ms | -100.0% bf16 MFU | 62007 tok/s +step 12301/19560 | loss 3.355846 (+0.34z)| norm 0.2672 (-0.69z)| lr 1.94e-04 | 8453.25 ms | -100.0% bf16 MFU | 62008 tok/s +step 12302/19560 | loss 3.301569 (-1.17z)| norm 0.2610 (-1.21z)| lr 1.94e-04 | 8456.46 ms | -100.0% bf16 MFU | 62008 tok/s +step 12303/19560 | loss 3.283547 (-1.65z)| norm 0.2585 (-1.41z)| lr 1.94e-04 | 8451.21 ms | -100.0% bf16 MFU | 62009 tok/s +step 12304/19560 | loss 3.329685 (-0.35z)| norm 0.2506 (-2.04z)| lr 1.94e-04 | 8454.99 ms | -100.0% bf16 MFU | 62009 tok/s +step 12305/19560 | loss 3.335510 (-0.19z)| norm 0.2649 (-0.82z)| lr 1.94e-04 | 8453.35 ms | -100.0% bf16 MFU | 62010 tok/s +step 12306/19560 | loss 3.264438 (-2.12z)| norm 0.2713 (-0.28z)| lr 1.94e-04 | 8456.48 ms | -100.0% bf16 MFU | 62009 tok/s +step 12307/19560 | loss 3.367893 (+0.72z)| norm 0.2550 (-1.61z)| lr 1.94e-04 | 8452.46 ms | -100.0% bf16 MFU | 62010 tok/s +step 12308/19560 | loss 3.355278 (+0.37z)| norm 0.3282 (+4.13z)| lr 1.94e-04 | 8454.44 ms | -100.0% bf16 MFU | 62010 tok/s +step 12309/19560 | loss 3.309345 (-0.88z)| norm 0.2807 (+0.45z)| lr 1.94e-04 | 8454.75 ms | -100.0% bf16 MFU | 62010 tok/s +step 12310/19560 | loss 3.292111 (-1.34z)| norm 0.2748 (-0.01z)| lr 1.93e-04 | 8465.74 ms | -100.0% bf16 MFU | 62006 tok/s +step 12311/19560 | loss 3.318174 (-0.62z)| norm 0.2783 (+0.27z)| lr 1.93e-04 | 8480.01 ms | -100.0% bf16 MFU | 61997 tok/s +step 12312/19560 | loss 3.394945 (+1.46z)| norm 0.2572 (-1.36z)| lr 1.93e-04 | 8480.42 ms | -100.0% bf16 MFU | 61989 tok/s +step 12313/19560 | loss 3.341517 (+0.01z)| norm 0.2626 (-0.93z)| lr 1.93e-04 | 8479.08 ms | -100.0% bf16 MFU | 61981 tok/s +step 12314/19560 | loss 3.381808 (+1.09z)| norm 0.2596 (-1.15z)| lr 1.93e-04 | 8480.76 ms | -100.0% bf16 MFU | 61973 tok/s +step 12315/19560 | loss 3.355605 (+0.39z)| norm 0.2630 (-0.87z)| lr 1.93e-04 | 8484.74 ms | -100.0% bf16 MFU | 61964 tok/s +step 12316/19560 | loss 3.323381 (-0.49z)| norm 0.2690 (-0.41z)| lr 1.93e-04 | 8475.02 ms | -100.0% bf16 MFU | 61959 tok/s +step 12317/19560 | loss 3.367168 (+0.72z)| norm 0.3188 (+3.26z)| lr 1.93e-04 | 8481.49 ms | -100.0% bf16 MFU | 61952 tok/s +step 12318/19560 | loss 3.339575 (-0.05z)| norm 0.2695 (-0.38z)| lr 1.93e-04 | 8479.05 ms | -100.0% bf16 MFU | 61946 tok/s +step 12319/19560 | loss 3.338952 (-0.05z)| norm 0.2977 (+1.68z)| lr 1.93e-04 | 8477.22 ms | -100.0% bf16 MFU | 61941 tok/s +step 12320/19560 | loss 3.405834 (+1.81z)| norm 0.2737 (-0.07z)| lr 1.93e-04 | 8473.68 ms | -100.0% bf16 MFU | 61937 tok/s +step 12321/19560 | loss 3.393532 (+1.44z)| norm 0.2687 (-0.44z)| lr 1.93e-04 | 8477.73 ms | -100.0% bf16 MFU | 61933 tok/s +step 12322/19560 | loss 3.331079 (-0.28z)| norm 0.2808 (+0.45z)| lr 1.93e-04 | 8473.76 ms | -100.0% bf16 MFU | 61930 tok/s +step 12323/19560 | loss 3.363210 (+0.70z)| norm 0.2670 (-0.56z)| lr 1.93e-04 | 8477.85 ms | -100.0% bf16 MFU | 61925 tok/s +step 12324/19560 | loss 3.334424 (-0.16z)| norm 0.2830 (+0.60z)| lr 1.93e-04 | 8476.02 ms | -100.0% bf16 MFU | 61922 tok/s +step 12325/19560 | loss 3.372012 (+0.96z)| norm 0.2769 (+0.15z)| lr 1.93e-04 | 8474.67 ms | -100.0% bf16 MFU | 61919 tok/s +step 12326/19560 | loss 3.411002 (+2.07z)| norm 0.2840 (+0.66z)| lr 1.93e-04 | 8471.02 ms | -100.0% bf16 MFU | 61918 tok/s +step 12327/19560 | loss 3.301100 (-1.16z)| norm 0.2812 (+0.46z)| lr 1.93e-04 | 8472.06 ms | -100.0% bf16 MFU | 61916 tok/s +step 12328/19560 | loss 3.361827 (+0.63z)| norm 0.3047 (+2.13z)| lr 1.93e-04 | 8476.57 ms | -100.0% bf16 MFU | 61913 tok/s +step 12329/19560 | loss 3.397460 (+1.64z)| norm 0.2957 (+1.45z)| lr 1.93e-04 | 8474.45 ms | -100.0% bf16 MFU | 61910 tok/s +step 12330/19560 | loss 3.409680 (+1.95z)| norm 0.2930 (+1.25z)| lr 1.93e-04 | 8474.23 ms | -100.0% bf16 MFU | 61908 tok/s +step 12331/19560 | loss 3.308207 (-0.98z)| norm 0.2977 (+1.55z)| lr 1.93e-04 | 8469.25 ms | -100.0% bf16 MFU | 61908 tok/s +step 12332/19560 | loss 3.445533 (+2.87z)| norm 0.2807 (+0.36z)| lr 1.92e-04 | 8476.45 ms | -100.0% bf16 MFU | 61905 tok/s +step 12333/19560 | loss 3.351963 (+0.25z)| norm 0.2930 (+1.21z)| lr 1.92e-04 | 8476.21 ms | -100.0% bf16 MFU | 61903 tok/s +step 12334/19560 | loss 3.361179 (+0.51z)| norm 0.2821 (+0.43z)| lr 1.92e-04 | 8467.38 ms | -100.0% bf16 MFU | 61904 tok/s +step 12335/19560 | loss 3.348129 (+0.13z)| norm 0.2863 (+0.72z)| lr 1.92e-04 | 8472.47 ms | -100.0% bf16 MFU | 61902 tok/s +step 12336/19560 | loss 3.367691 (+0.68z)| norm 0.2706 (-0.38z)| lr 1.92e-04 | 8472.89 ms | -100.0% bf16 MFU | 61901 tok/s +step 12337/19560 | loss 3.369903 (+0.73z)| norm 0.2656 (-0.72z)| lr 1.92e-04 | 8479.34 ms | -100.0% bf16 MFU | 61898 tok/s +step 12338/19560 | loss 3.347096 (+0.10z)| norm 0.2698 (-0.42z)| lr 1.92e-04 | 8467.46 ms | -100.0% bf16 MFU | 61899 tok/s +step 12339/19560 | loss 3.383187 (+1.15z)| norm 0.2783 (+0.18z)| lr 1.92e-04 | 8471.37 ms | -100.0% bf16 MFU | 61898 tok/s +step 12340/19560 | loss 3.351043 (+0.21z)| norm 0.2684 (-0.51z)| lr 1.92e-04 | 8468.21 ms | -100.0% bf16 MFU | 61899 tok/s +step 12341/19560 | loss 3.316141 (-0.80z)| norm 0.2681 (-0.53z)| lr 1.92e-04 | 8471.18 ms | -100.0% bf16 MFU | 61899 tok/s +step 12342/19560 | loss 3.329364 (-0.41z)| norm 0.2809 (+0.36z)| lr 1.92e-04 | 8461.92 ms | -100.0% bf16 MFU | 61902 tok/s +step 12343/19560 | loss 3.475852 (+3.67z)| norm 0.2861 (+0.72z)| lr 1.92e-04 | 8467.59 ms | -100.0% bf16 MFU | 61902 tok/s +step 12344/19560 | loss 3.337423 (-0.20z)| norm 0.2638 (-0.83z)| lr 1.92e-04 | 8467.05 ms | -100.0% bf16 MFU | 61903 tok/s +step 12345/19560 | loss 3.350312 (+0.18z)| norm 0.2722 (-0.24z)| lr 1.92e-04 | 8470.39 ms | -100.0% bf16 MFU | 61903 tok/s +step 12346/19560 | loss 3.324325 (-0.55z)| norm 0.2821 (+0.46z)| lr 1.92e-04 | 8470.36 ms | -100.0% bf16 MFU | 61903 tok/s +step 12347/19560 | loss 3.319610 (-0.68z)| norm 0.2750 (-0.03z)| lr 1.92e-04 | 8472.17 ms | -100.0% bf16 MFU | 61902 tok/s +step 12348/19560 | loss 3.499575 (+4.11z)| norm 0.2895 (+0.99z)| lr 1.92e-04 | 8464.75 ms | -100.0% bf16 MFU | 61903 tok/s +step 12349/19560 | loss 3.312058 (-0.86z)| norm 0.2725 (-0.21z)| lr 1.92e-04 | 8471.77 ms | -100.0% bf16 MFU | 61903 tok/s +step 12350/19560 | loss 3.369263 (+0.65z)| norm 0.2762 (+0.05z)| lr 1.92e-04 | 8450.81 ms | -100.0% bf16 MFU | 61909 tok/s +step 12351/19560 | loss 3.412136 (+1.76z)| norm 0.2684 (-0.50z)| lr 1.92e-04 | 8454.17 ms | -100.0% bf16 MFU | 61915 tok/s +step 12352/19560 | loss 3.415517 (+1.81z)| norm 0.2686 (-0.49z)| lr 1.92e-04 | 8448.63 ms | -100.0% bf16 MFU | 61922 tok/s +step 12353/19560 | loss 3.335227 (-0.28z)| norm 0.2730 (-0.19z)| lr 1.91e-04 | 8442.46 ms | -100.0% bf16 MFU | 61931 tok/s +step 12354/19560 | loss 3.469487 (+3.08z)| norm 0.2960 (+1.44z)| lr 1.91e-04 | 8441.88 ms | -100.0% bf16 MFU | 61940 tok/s +step 12355/19560 | loss 3.357558 (+0.28z)| norm 0.2712 (-0.31z)| lr 1.91e-04 | 8445.27 ms | -100.0% bf16 MFU | 61947 tok/s +step 12356/19560 | loss 3.340786 (-0.15z)| norm 0.2892 (+0.96z)| lr 1.91e-04 | 8446.48 ms | -100.0% bf16 MFU | 61953 tok/s +step 12357/19560 | loss 3.435129 (+2.17z)| norm 0.2771 (+0.10z)| lr 1.91e-04 | 8447.93 ms | -100.0% bf16 MFU | 61958 tok/s +step 12358/19560 | loss 3.379758 (+0.79z)| norm 0.2816 (+0.41z)| lr 1.91e-04 | 8449.00 ms | -100.0% bf16 MFU | 61963 tok/s +step 12359/19560 | loss 3.353020 (+0.12z)| norm 0.2705 (-0.37z)| lr 1.91e-04 | 8443.31 ms | -100.0% bf16 MFU | 61970 tok/s +step 12360/19560 | loss 3.556387 (+4.68z)| norm 1.5499 (+11.18z)| lr 1.91e-04 | 8451.47 ms | -100.0% bf16 MFU | 61973 tok/s +step 12361/19560 | loss 3.384258 (+0.77z)| norm 0.4353 (+1.30z)| lr 1.91e-04 | 8446.48 ms | -100.0% bf16 MFU | 61978 tok/s +step 12362/19560 | loss 3.290983 (-1.34z)| norm 0.3756 (+0.77z)| lr 1.91e-04 | 8446.93 ms | -100.0% bf16 MFU | 61982 tok/s +step 12363/19560 | loss 3.343586 (-0.15z)| norm 0.3416 (+0.47z)| lr 1.91e-04 | 8442.80 ms | -100.0% bf16 MFU | 61988 tok/s +step 12364/19560 | loss 3.406032 (+1.24z)| norm 0.3226 (+0.30z)| lr 1.91e-04 | 8443.47 ms | -100.0% bf16 MFU | 61993 tok/s +step 12365/19560 | loss 3.339606 (-0.25z)| norm 0.2890 (+0.00z)| lr 1.91e-04 | 8451.53 ms | -100.0% bf16 MFU | 61996 tok/s +step 12366/19560 | loss 3.342129 (-0.20z)| norm 0.3140 (+0.22z)| lr 1.91e-04 | 8446.60 ms | -100.0% bf16 MFU | 61999 tok/s +step 12367/19560 | loss 3.338892 (-0.27z)| norm 0.2933 (+0.04z)| lr 1.91e-04 | 8446.41 ms | -100.0% bf16 MFU | 62003 tok/s +step 12368/19560 | loss 3.353137 (+0.06z)| norm 0.3082 (+0.17z)| lr 1.91e-04 | 8438.67 ms | -100.0% bf16 MFU | 62009 tok/s +step 12369/19560 | loss 3.399592 (+1.10z)| norm 0.2757 (-0.12z)| lr 1.91e-04 | 8446.36 ms | -100.0% bf16 MFU | 62012 tok/s +step 12370/19560 | loss 3.340987 (-0.24z)| norm 0.2850 (-0.04z)| lr 1.91e-04 | 8444.07 ms | -100.0% bf16 MFU | 62016 tok/s +step 12371/19560 | loss 3.395499 (+0.99z)| norm 0.2856 (-0.03z)| lr 1.91e-04 | 8448.78 ms | -100.0% bf16 MFU | 62018 tok/s +step 12372/19560 | loss 3.347430 (-0.11z)| norm 0.2794 (-0.08z)| lr 1.91e-04 | 8449.48 ms | -100.0% bf16 MFU | 62020 tok/s +step 12373/19560 | loss 3.386165 (+0.76z)| norm 0.2717 (-0.15z)| lr 1.91e-04 | 8447.31 ms | -100.0% bf16 MFU | 62022 tok/s +step 12374/19560 | loss 3.354089 (+0.01z)| norm 0.2713 (-0.15z)| lr 1.91e-04 | 8447.77 ms | -100.0% bf16 MFU | 62024 tok/s +step 12375/19560 | loss 3.345849 (-0.18z)| norm 0.2816 (-0.06z)| lr 1.90e-04 | 8448.67 ms | -100.0% bf16 MFU | 62026 tok/s +step 12376/19560 | loss 3.333292 (-0.47z)| norm 0.2520 (-0.32z)| lr 1.90e-04 | 8451.55 ms | -100.0% bf16 MFU | 62026 tok/s +step 12377/19560 | loss 3.369370 (+0.36z)| norm 0.2687 (-0.17z)| lr 1.90e-04 | 8450.66 ms | -100.0% bf16 MFU | 62027 tok/s +step 12378/19560 | loss 3.335046 (-0.43z)| norm 0.2688 (-0.17z)| lr 1.90e-04 | 8450.07 ms | -100.0% bf16 MFU | 62028 tok/s +step 12379/19560 | loss 3.355057 (+0.03z)| norm 0.2560 (-0.28z)| lr 1.90e-04 | 8449.46 ms | -100.0% bf16 MFU | 62029 tok/s +step 12380/19560 | loss 3.415455 (+1.41z)| norm 0.2515 (-0.32z)| lr 1.90e-04 | 8456.51 ms | -100.0% bf16 MFU | 62027 tok/s +step 12381/19560 | loss 3.338619 (-0.37z)| norm 0.2861 (-0.02z)| lr 1.90e-04 | 8457.33 ms | -100.0% bf16 MFU | 62026 tok/s +step 12382/19560 | loss 3.298725 (-1.28z)| norm 0.2514 (-0.32z)| lr 1.90e-04 | 8460.01 ms | -100.0% bf16 MFU | 62023 tok/s +step 12383/19560 | loss 3.342501 (-0.27z)| norm 0.2674 (-0.18z)| lr 1.90e-04 | 8457.93 ms | -100.0% bf16 MFU | 62021 tok/s +step 12384/19560 | loss 3.433144 (+1.79z)| norm 0.2736 (-0.13z)| lr 1.90e-04 | 8459.02 ms | -100.0% bf16 MFU | 62019 tok/s +step 12385/19560 | loss 3.379529 (+0.56z)| norm 0.2791 (-0.08z)| lr 1.90e-04 | 8461.00 ms | -100.0% bf16 MFU | 62016 tok/s +step 12386/19560 | loss 3.352375 (-0.06z)| norm 0.2894 (+0.01z)| lr 1.90e-04 | 8451.18 ms | -100.0% bf16 MFU | 62017 tok/s +step 12387/19560 | loss 3.408871 (+1.21z)| norm 0.2678 (-0.18z)| lr 1.90e-04 | 8463.47 ms | -100.0% bf16 MFU | 62014 tok/s +step 12388/19560 | loss 3.356987 (+0.04z)| norm 0.2878 (-0.01z)| lr 1.90e-04 | 8462.73 ms | -100.0% bf16 MFU | 62011 tok/s +step 12389/19560 | loss 3.413213 (+1.30z)| norm 0.2752 (-0.12z)| lr 1.90e-04 | 8451.34 ms | -100.0% bf16 MFU | 62012 tok/s +step 12390/19560 | loss 3.384105 (+0.64z)| norm 0.2827 (-0.05z)| lr 1.90e-04 | 8461.52 ms | -100.0% bf16 MFU | 62010 tok/s +step 12391/19560 | loss 3.370988 (+0.34z)| norm 0.2679 (-0.18z)| lr 1.90e-04 | 8455.81 ms | -100.0% bf16 MFU | 62009 tok/s +step 12392/19560 | loss 3.365134 (+0.20z)| norm 0.2719 (-0.15z)| lr 1.90e-04 | 8460.35 ms | -100.0% bf16 MFU | 62007 tok/s +step 12393/19560 | loss 3.375773 (+0.42z)| norm 0.2696 (-0.17z)| lr 1.90e-04 | 8459.35 ms | -100.0% bf16 MFU | 62006 tok/s +step 12394/19560 | loss 3.357655 (+0.01z)| norm 0.2619 (-0.24z)| lr 1.90e-04 | 8461.74 ms | -100.0% bf16 MFU | 62004 tok/s +step 12395/19560 | loss 3.412080 (+1.25z)| norm 0.2668 (-0.20z)| lr 1.90e-04 | 8459.96 ms | -100.0% bf16 MFU | 62002 tok/s +step 12396/19560 | loss 3.411515 (+1.22z)| norm 0.2613 (-0.24z)| lr 1.89e-04 | 8461.56 ms | -100.0% bf16 MFU | 62000 tok/s +step 12397/19560 | loss 3.389952 (+0.72z)| norm 0.2812 (-0.07z)| lr 1.89e-04 | 8456.77 ms | -100.0% bf16 MFU | 62000 tok/s +step 12398/19560 | loss 3.377304 (+0.43z)| norm 0.2473 (-0.37z)| lr 1.89e-04 | 8456.27 ms | -100.0% bf16 MFU | 62000 tok/s +step 12399/19560 | loss 3.350131 (-0.19z)| norm 0.2903 (+0.01z)| lr 1.89e-04 | 8459.00 ms | -100.0% bf16 MFU | 61999 tok/s +step 12400/19560 | loss 3.322257 (-0.83z)| norm 0.2614 (-0.25z)| lr 1.89e-04 | 8461.87 ms | -100.0% bf16 MFU | 61997 tok/s +step 12401/19560 | loss 3.372267 (+0.31z)| norm 0.2835 (-0.05z)| lr 1.89e-04 | 8460.59 ms | -100.0% bf16 MFU | 61995 tok/s +step 12402/19560 | loss 3.338372 (-0.48z)| norm 0.2800 (-0.08z)| lr 1.89e-04 | 8456.24 ms | -100.0% bf16 MFU | 61996 tok/s +step 12403/19560 | loss 3.410594 (+1.17z)| norm 0.2736 (-0.14z)| lr 1.89e-04 | 8456.50 ms | -100.0% bf16 MFU | 61996 tok/s +step 12404/19560 | loss 3.374510 (+0.33z)| norm 0.2746 (-0.13z)| lr 1.89e-04 | 8455.69 ms | -100.0% bf16 MFU | 61996 tok/s +step 12405/19560 | loss 3.382817 (+0.52z)| norm 0.3325 (+0.37z)| lr 1.89e-04 | 8459.55 ms | -100.0% bf16 MFU | 61995 tok/s +step 12406/19560 | loss 3.374781 (+0.32z)| norm 0.2674 (-0.20z)| lr 1.89e-04 | 8454.76 ms | -100.0% bf16 MFU | 61996 tok/s +step 12407/19560 | loss 3.360509 (-0.01z)| norm 0.3230 (+0.29z)| lr 1.89e-04 | 8463.00 ms | -100.0% bf16 MFU | 61994 tok/s +step 12408/19560 | loss 3.370458 (+0.22z)| norm 0.2555 (-0.30z)| lr 1.89e-04 | 8460.92 ms | -100.0% bf16 MFU | 61992 tok/s +step 12409/19560 | loss 3.357408 (-0.09z)| norm 0.2830 (-0.06z)| lr 1.89e-04 | 8465.48 ms | -100.0% bf16 MFU | 61989 tok/s +step 12410/19560 | loss 3.371606 (+0.23z)| norm 0.2765 (-0.12z)| lr 1.89e-04 | 8456.41 ms | -100.0% bf16 MFU | 61990 tok/s +step 12411/19560 | loss 3.453835 (+2.10z)| norm 0.2820 (-0.07z)| lr 1.89e-04 | 8457.00 ms | -100.0% bf16 MFU | 61990 tok/s +step 12412/19560 | loss 3.304571 (-1.31z)| norm 0.2653 (-0.21z)| lr 1.89e-04 | 8458.20 ms | -100.0% bf16 MFU | 61990 tok/s +step 12413/19560 | loss 3.333635 (-0.65z)| norm 0.2776 (-0.11z)| lr 1.89e-04 | 8459.60 ms | -100.0% bf16 MFU | 61989 tok/s +step 12414/19560 | loss 3.356713 (-0.14z)| norm 0.2642 (-0.22z)| lr 1.89e-04 | 8461.60 ms | -100.0% bf16 MFU | 61988 tok/s +step 12415/19560 | loss 3.339197 (-0.54z)| norm 0.2833 (-0.05z)| lr 1.89e-04 | 8457.85 ms | -100.0% bf16 MFU | 61988 tok/s +step 12416/19560 | loss 3.312424 (-1.14z)| norm 0.2510 (-0.33z)| lr 1.89e-04 | 8460.49 ms | -100.0% bf16 MFU | 61987 tok/s +step 12417/19560 | loss 3.345727 (-0.39z)| norm 0.2734 (-0.14z)| lr 1.89e-04 | 8454.28 ms | -100.0% bf16 MFU | 61988 tok/s +step 12418/19560 | loss 3.347057 (-0.37z)| norm 0.2533 (-0.31z)| lr 1.88e-04 | 8459.08 ms | -100.0% bf16 MFU | 61988 tok/s +step 12419/19560 | loss 3.345013 (-0.42z)| norm 0.2591 (-0.26z)| lr 1.88e-04 | 8455.97 ms | -100.0% bf16 MFU | 61988 tok/s +step 12420/19560 | loss 3.429959 (+1.54z)| norm 0.2659 (-0.20z)| lr 1.88e-04 | 8460.76 ms | -100.0% bf16 MFU | 61987 tok/s +step 12421/19560 | loss 3.350072 (-0.31z)| norm 0.2677 (-0.18z)| lr 1.88e-04 | 8454.64 ms | -100.0% bf16 MFU | 61989 tok/s +step 12422/19560 | loss 3.363533 (+0.01z)| norm 0.2780 (-0.10z)| lr 1.88e-04 | 8457.53 ms | -100.0% bf16 MFU | 61989 tok/s +step 12423/19560 | loss 3.314481 (-1.11z)| norm 0.2593 (-0.26z)| lr 1.88e-04 | 8453.51 ms | -100.0% bf16 MFU | 61990 tok/s +step 12424/19560 | loss 3.379026 (+0.36z)| norm 0.2915 (+0.02z)| lr 1.88e-04 | 8456.84 ms | -100.0% bf16 MFU | 61991 tok/s +step 12425/19560 | loss 3.401982 (+0.89z)| norm 0.2463 (-0.37z)| lr 1.88e-04 | 8456.81 ms | -100.0% bf16 MFU | 61991 tok/s +step 12426/19560 | loss 3.387745 (+0.55z)| norm 0.2767 (-0.11z)| lr 1.88e-04 | 8456.01 ms | -100.0% bf16 MFU | 61991 tok/s +step 12427/19560 | loss 3.362118 (-0.05z)| norm 0.2590 (-0.26z)| lr 1.88e-04 | 8457.79 ms | -100.0% bf16 MFU | 61991 tok/s +step 12428/19560 | loss 3.343904 (-0.46z)| norm 0.2701 (-0.16z)| lr 1.88e-04 | 8456.20 ms | -100.0% bf16 MFU | 61992 tok/s +step 12429/19560 | loss 3.341148 (-0.52z)| norm 0.2641 (-0.21z)| lr 1.88e-04 | 8458.28 ms | -100.0% bf16 MFU | 61991 tok/s +step 12430/19560 | loss 3.334008 (-0.70z)| norm 0.2595 (-0.25z)| lr 1.88e-04 | 8458.99 ms | -100.0% bf16 MFU | 61991 tok/s +step 12431/19560 | loss 3.336833 (-0.65z)| norm 0.2793 (-0.08z)| lr 1.88e-04 | 8459.69 ms | -100.0% bf16 MFU | 61990 tok/s +step 12432/19560 | loss 3.404679 (+0.94z)| norm 0.2675 (-0.19z)| lr 1.88e-04 | 8455.69 ms | -100.0% bf16 MFU | 61991 tok/s +step 12433/19560 | loss 3.321770 (-1.02z)| norm 0.2701 (-0.16z)| lr 1.88e-04 | 8462.13 ms | -100.0% bf16 MFU | 61989 tok/s +step 12434/19560 | loss 3.367656 (+0.05z)| norm 0.2716 (-0.15z)| lr 1.88e-04 | 8456.46 ms | -100.0% bf16 MFU | 61989 tok/s +step 12435/19560 | loss 3.431258 (+1.56z)| norm 0.2847 (-0.04z)| lr 1.88e-04 | 8457.78 ms | -100.0% bf16 MFU | 61989 tok/s +step 12436/19560 | loss 3.372687 (+0.15z)| norm 0.2661 (-0.20z)| lr 1.88e-04 | 8457.74 ms | -100.0% bf16 MFU | 61989 tok/s +step 12437/19560 | loss 3.343583 (-0.55z)| norm 0.2631 (-0.22z)| lr 1.88e-04 | 8457.04 ms | -100.0% bf16 MFU | 61990 tok/s +step 12438/19560 | loss 3.405694 (+0.93z)| norm 0.2621 (-0.23z)| lr 1.88e-04 | 8458.11 ms | -100.0% bf16 MFU | 61990 tok/s +step 12439/19560 | loss 3.340138 (-0.67z)| norm 0.2569 (-0.27z)| lr 1.87e-04 | 8456.85 ms | -100.0% bf16 MFU | 61990 tok/s +step 12440/19560 | loss 3.383428 (+0.39z)| norm 0.2637 (-0.22z)| lr 1.87e-04 | 8454.67 ms | -100.0% bf16 MFU | 61991 tok/s +step 12441/19560 | loss 3.344737 (-0.56z)| norm 0.2613 (-0.24z)| lr 1.87e-04 | 8453.18 ms | -100.0% bf16 MFU | 61992 tok/s +step 12442/19560 | loss 3.391964 (+0.60z)| norm 0.2958 (+0.06z)| lr 1.87e-04 | 8447.00 ms | -100.0% bf16 MFU | 61996 tok/s +step 12443/19560 | loss 3.413665 (+1.11z)| norm 0.2640 (-0.22z)| lr 1.87e-04 | 8443.29 ms | -100.0% bf16 MFU | 62001 tok/s +step 12444/19560 | loss 3.354288 (-0.34z)| norm 0.2974 (+0.08z)| lr 1.87e-04 | 8436.08 ms | -100.0% bf16 MFU | 62009 tok/s +step 12445/19560 | loss 3.301267 (-1.61z)| norm 0.2469 (-0.36z)| lr 1.87e-04 | 8436.67 ms | -100.0% bf16 MFU | 62015 tok/s +step 12446/19560 | loss 3.377511 (+0.23z)| norm 0.2780 (-0.09z)| lr 1.87e-04 | 8438.96 ms | -100.0% bf16 MFU | 62021 tok/s +step 12447/19560 | loss 3.348197 (-0.48z)| norm 0.2490 (-0.34z)| lr 1.87e-04 | 8437.47 ms | -100.0% bf16 MFU | 62027 tok/s +step 12448/19560 | loss 3.449779 (+1.95z)| norm 0.2873 (-0.01z)| lr 1.87e-04 | 8435.94 ms | -100.0% bf16 MFU | 62033 tok/s +step 12449/19560 | loss 3.351149 (-0.41z)| norm 0.2943 (+0.05z)| lr 1.87e-04 | 8437.20 ms | -100.0% bf16 MFU | 62038 tok/s +step 12450/19560 | loss 3.351429 (-0.41z)| norm 0.2799 (-0.07z)| lr 1.87e-04 | 8438.36 ms | -100.0% bf16 MFU | 62043 tok/s +step 12451/19560 | loss 3.384120 (+0.38z)| norm 0.2702 (-0.16z)| lr 1.87e-04 | 8434.23 ms | -100.0% bf16 MFU | 62049 tok/s +step 12452/19560 | loss 3.360823 (-0.19z)| norm 0.2872 (-0.01z)| lr 1.87e-04 | 8434.38 ms | -100.0% bf16 MFU | 62054 tok/s +step 12453/19560 | loss 3.369097 (+0.01z)| norm 0.2929 (+0.04z)| lr 1.87e-04 | 8436.06 ms | -100.0% bf16 MFU | 62059 tok/s +step 12454/19560 | loss 3.308818 (-1.42z)| norm 0.2645 (-0.21z)| lr 1.87e-04 | 8438.83 ms | -100.0% bf16 MFU | 62063 tok/s +step 12455/19560 | loss 3.487506 (+2.78z)| norm 0.2880 (-0.00z)| lr 1.87e-04 | 8437.67 ms | -100.0% bf16 MFU | 62066 tok/s +step 12456/19560 | loss 3.355052 (-0.33z)| norm 0.2825 (-0.05z)| lr 1.87e-04 | 8437.31 ms | -100.0% bf16 MFU | 62070 tok/s +step 12457/19560 | loss 3.334320 (-0.81z)| norm 0.2759 (-0.11z)| lr 1.87e-04 | 8434.28 ms | -100.0% bf16 MFU | 62075 tok/s +step 12458/19560 | loss 3.342662 (-0.60z)| norm 0.3064 (+0.16z)| lr 1.87e-04 | 8440.67 ms | -100.0% bf16 MFU | 62077 tok/s +step 12459/19560 | loss 3.344138 (-0.58z)| norm 0.2603 (-0.24z)| lr 1.87e-04 | 8437.53 ms | -100.0% bf16 MFU | 62080 tok/s +step 12460/19560 | loss 3.394234 (+0.63z)| norm 0.2996 (+0.10z)| lr 1.87e-04 | 8437.22 ms | -100.0% bf16 MFU | 62083 tok/s +step 12461/19560 | loss 3.328402 (-0.95z)| norm 0.2547 (-0.29z)| lr 1.86e-04 | 8436.54 ms | -100.0% bf16 MFU | 62086 tok/s +step 12462/19560 | loss 3.390204 (+0.53z)| norm 0.2732 (-0.13z)| lr 1.86e-04 | 8437.54 ms | -100.0% bf16 MFU | 62088 tok/s +step 12463/19560 | loss 3.333616 (-0.82z)| norm 0.2533 (-0.30z)| lr 1.86e-04 | 8436.00 ms | -100.0% bf16 MFU | 62091 tok/s +step 12464/19560 | loss 3.355383 (-0.30z)| norm 0.2991 (+0.10z)| lr 1.86e-04 | 8440.98 ms | -100.0% bf16 MFU | 62092 tok/s +step 12465/19560 | loss 3.382543 (+0.35z)| norm 0.2960 (+0.07z)| lr 1.86e-04 | 8442.24 ms | -100.0% bf16 MFU | 62093 tok/s +step 12466/19560 | loss 3.344459 (-0.56z)| norm 0.2931 (+0.04z)| lr 1.86e-04 | 8442.17 ms | -100.0% bf16 MFU | 62093 tok/s +step 12467/19560 | loss 3.338407 (-0.70z)| norm 0.2875 (-0.00z)| lr 1.86e-04 | 8443.45 ms | -100.0% bf16 MFU | 62093 tok/s +step 12468/19560 | loss 3.375429 (+0.18z)| norm 0.2686 (-0.17z)| lr 1.86e-04 | 8442.90 ms | -100.0% bf16 MFU | 62094 tok/s +step 12469/19560 | loss 3.391494 (+0.55z)| norm 0.2869 (-0.01z)| lr 1.86e-04 | 8445.08 ms | -100.0% bf16 MFU | 62093 tok/s +step 12470/19560 | loss 3.358748 (-0.24z)| norm 0.2867 (-0.01z)| lr 1.86e-04 | 8448.32 ms | -100.0% bf16 MFU | 62091 tok/s +step 12471/19560 | loss 3.324456 (-1.06z)| norm 0.2766 (-0.10z)| lr 1.86e-04 | 8447.30 ms | -100.0% bf16 MFU | 62090 tok/s +step 12472/19560 | loss 3.329383 (-0.93z)| norm 0.3110 (+0.20z)| lr 1.86e-04 | 8452.08 ms | -100.0% bf16 MFU | 62087 tok/s +step 12473/19560 | loss 3.348916 (-0.45z)| norm 0.2955 (+0.06z)| lr 1.86e-04 | 8453.62 ms | -100.0% bf16 MFU | 62084 tok/s +step 12474/19560 | loss 3.284609 (-2.00z)| norm 0.3086 (+0.17z)| lr 1.86e-04 | 8451.02 ms | -100.0% bf16 MFU | 62081 tok/s +step 12475/19560 | loss 3.369359 (+0.05z)| norm 0.2621 (-0.23z)| lr 1.86e-04 | 8452.61 ms | -100.0% bf16 MFU | 62079 tok/s +step 12476/19560 | loss 3.340899 (-0.64z)| norm 0.2878 (-0.01z)| lr 1.86e-04 | 8452.89 ms | -100.0% bf16 MFU | 62076 tok/s +step 12477/19560 | loss 3.334111 (-0.82z)| norm 0.2799 (-0.08z)| lr 1.86e-04 | 8455.56 ms | -100.0% bf16 MFU | 62072 tok/s +step 12478/19560 | loss 3.363114 (-0.08z)| norm 0.2727 (-0.14z)| lr 1.86e-04 | 8453.36 ms | -100.0% bf16 MFU | 62070 tok/s +step 12479/19560 | loss 3.344046 (-0.56z)| norm 0.2795 (-0.08z)| lr 1.86e-04 | 8455.39 ms | -100.0% bf16 MFU | 62067 tok/s +step 12480/19560 | loss 3.355100 (-0.26z)| norm 0.2807 (-0.07z)| lr 1.86e-04 | 8455.03 ms | -100.0% bf16 MFU | 62064 tok/s +step 12481/19560 | loss 3.313424 (-1.33z)| norm 0.2523 (-0.32z)| lr 1.86e-04 | 8455.09 ms | -100.0% bf16 MFU | 62061 tok/s +step 12482/19560 | loss 3.353448 (-0.29z)| norm 0.2836 (-0.04z)| lr 1.85e-04 | 8454.48 ms | -100.0% bf16 MFU | 62059 tok/s +step 12483/19560 | loss 3.343011 (-0.56z)| norm 0.2603 (-0.25z)| lr 1.85e-04 | 8456.13 ms | -100.0% bf16 MFU | 62056 tok/s +step 12484/19560 | loss 3.359988 (-0.11z)| norm 0.2571 (-0.27z)| lr 1.85e-04 | 8456.78 ms | -100.0% bf16 MFU | 62053 tok/s +step 12485/19560 | loss 3.327878 (-0.95z)| norm 0.2518 (-0.32z)| lr 1.85e-04 | 8454.28 ms | -100.0% bf16 MFU | 62051 tok/s +step 12486/19560 | loss 3.381415 (+0.48z)| norm 0.2506 (-0.33z)| lr 1.85e-04 | 8454.54 ms | -100.0% bf16 MFU | 62049 tok/s +step 12487/19560 | loss 3.347163 (-0.43z)| norm 0.2714 (-0.14z)| lr 1.85e-04 | 8452.39 ms | -100.0% bf16 MFU | 62048 tok/s +step 12488/19560 | loss 3.435185 (+2.14z)| norm 0.2558 (-0.91z)| lr 1.85e-04 | 8454.26 ms | -100.0% bf16 MFU | 62046 tok/s +step 12489/19560 | loss 3.388543 (+0.76z)| norm 0.2710 (-0.28z)| lr 1.85e-04 | 8453.98 ms | -100.0% bf16 MFU | 62045 tok/s +step 12490/19560 | loss 3.389920 (+0.79z)| norm 0.2521 (-1.32z)| lr 1.85e-04 | 8452.88 ms | -100.0% bf16 MFU | 62044 tok/s +step 12491/19560 | loss 3.390295 (+0.79z)| norm 0.2736 (-0.09z)| lr 1.85e-04 | 8454.92 ms | -100.0% bf16 MFU | 62042 tok/s +step 12492/19560 | loss 3.309306 (-1.59z)| norm 0.2751 (+0.03z)| lr 1.85e-04 | 8455.71 ms | -100.0% bf16 MFU | 62040 tok/s +step 12493/19560 | loss 3.326810 (-1.07z)| norm 0.2637 (-0.67z)| lr 1.85e-04 | 8454.22 ms | -100.0% bf16 MFU | 62039 tok/s +step 12494/19560 | loss 3.353138 (-0.29z)| norm 0.2870 (+0.80z)| lr 1.85e-04 | 8453.43 ms | -100.0% bf16 MFU | 62038 tok/s +step 12495/19560 | loss 3.360736 (-0.07z)| norm 0.2590 (-0.95z)| lr 1.85e-04 | 8455.52 ms | -100.0% bf16 MFU | 62036 tok/s +step 12496/19560 | loss 3.394585 (+0.92z)| norm 0.2860 (+0.78z)| lr 1.85e-04 | 8453.16 ms | -100.0% bf16 MFU | 62036 tok/s +step 12497/19560 | loss 3.345493 (-0.52z)| norm 0.2737 (-0.01z)| lr 1.85e-04 | 8454.81 ms | -100.0% bf16 MFU | 62034 tok/s +step 12498/19560 | loss 3.348925 (-0.42z)| norm 0.2693 (-0.28z)| lr 1.85e-04 | 8453.79 ms | -100.0% bf16 MFU | 62034 tok/s +step 12499/19560 | loss 3.321168 (-1.23z)| norm 0.2660 (-0.48z)| lr 1.85e-04 | 8457.06 ms | -100.0% bf16 MFU | 62032 tok/s +step 12500/19560 | loss 3.522169 (+4.34z)| norm 0.2619 (-0.74z)| lr 1.85e-04 | 8455.75 ms | -100.0% bf16 MFU | 62030 tok/s +val loss 3.333107 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2942/10042 = 0.292970 +step 12501/19560 | loss 3.297000 (-1.79z)| norm 0.2643 (-0.58z)| lr 1.85e-04 | 8481.95 ms | -100.0% bf16 MFU | 62019 tok/s +step 12502/19560 | loss 3.346978 (-0.44z)| norm 0.2589 (-0.92z)| lr 1.85e-04 | 8485.67 ms | -100.0% bf16 MFU | 62008 tok/s +step 12503/19560 | loss 3.383218 (+0.54z)| norm 0.2577 (-0.97z)| lr 1.85e-04 | 8482.44 ms | -100.0% bf16 MFU | 61998 tok/s +step 12504/19560 | loss 3.314808 (-1.31z)| norm 0.2866 (+0.84z)| lr 1.84e-04 | 8482.72 ms | -100.0% bf16 MFU | 61988 tok/s +step 12505/19560 | loss 3.327637 (-0.95z)| norm 0.2788 (+0.34z)| lr 1.84e-04 | 8483.00 ms | -100.0% bf16 MFU | 61979 tok/s +step 12506/19560 | loss 3.304628 (-1.55z)| norm 0.3098 (+2.25z)| lr 1.84e-04 | 8473.17 ms | -100.0% bf16 MFU | 61974 tok/s +step 12507/19560 | loss 3.361020 (-0.05z)| norm 0.2823 (+0.52z)| lr 1.84e-04 | 8485.77 ms | -100.0% bf16 MFU | 61964 tok/s +step 12508/19560 | loss 3.442300 (+2.10z)| norm 0.2775 (+0.21z)| lr 1.84e-04 | 8481.41 ms | -100.0% bf16 MFU | 61957 tok/s +step 12509/19560 | loss 3.352135 (-0.29z)| norm 0.2855 (+0.72z)| lr 1.84e-04 | 8486.04 ms | -100.0% bf16 MFU | 61948 tok/s +step 12510/19560 | loss 3.361340 (-0.06z)| norm 0.2781 (+0.24z)| lr 1.84e-04 | 8482.83 ms | -100.0% bf16 MFU | 61941 tok/s +step 12511/19560 | loss 3.368739 (+0.13z)| norm 0.2866 (+0.77z)| lr 1.84e-04 | 8477.59 ms | -100.0% bf16 MFU | 61936 tok/s +step 12512/19560 | loss 3.361404 (-0.05z)| norm 0.2820 (+0.47z)| lr 1.84e-04 | 8473.85 ms | -100.0% bf16 MFU | 61933 tok/s +step 12513/19560 | loss 3.356596 (-0.17z)| norm 0.2822 (+0.48z)| lr 1.84e-04 | 8478.15 ms | -100.0% bf16 MFU | 61928 tok/s +step 12514/19560 | loss 3.327182 (-0.96z)| norm 0.2794 (+0.31z)| lr 1.84e-04 | 8476.82 ms | -100.0% bf16 MFU | 61924 tok/s +step 12515/19560 | loss 3.461019 (+2.59z)| norm 0.2743 (-0.02z)| lr 1.84e-04 | 8471.68 ms | -100.0% bf16 MFU | 61923 tok/s +step 12516/19560 | loss 3.357895 (-0.14z)| norm 0.2713 (-0.20z)| lr 1.84e-04 | 8467.33 ms | -100.0% bf16 MFU | 61922 tok/s +step 12517/19560 | loss 3.334045 (-0.76z)| norm 0.2587 (-1.00z)| lr 1.84e-04 | 8478.10 ms | -100.0% bf16 MFU | 61918 tok/s +step 12518/19560 | loss 3.339336 (-0.61z)| norm 0.2695 (-0.30z)| lr 1.84e-04 | 8471.35 ms | -100.0% bf16 MFU | 61917 tok/s +step 12519/19560 | loss 3.332025 (-0.80z)| norm 0.2761 (+0.11z)| lr 1.84e-04 | 8476.06 ms | -100.0% bf16 MFU | 61914 tok/s +step 12520/19560 | loss 3.371372 (+0.25z)| norm 0.2880 (+0.86z)| lr 1.84e-04 | 8474.14 ms | -100.0% bf16 MFU | 61912 tok/s +step 12521/19560 | loss 3.361845 (-0.00z)| norm 0.2761 (+0.10z)| lr 1.84e-04 | 8466.96 ms | -100.0% bf16 MFU | 61912 tok/s +step 12522/19560 | loss 3.392542 (+0.81z)| norm 0.2884 (+0.87z)| lr 1.84e-04 | 8461.98 ms | -100.0% bf16 MFU | 61914 tok/s +step 12523/19560 | loss 3.358649 (-0.08z)| norm 0.2798 (+0.32z)| lr 1.84e-04 | 8470.61 ms | -100.0% bf16 MFU | 61913 tok/s +step 12524/19560 | loss 3.365257 (+0.10z)| norm 0.2911 (+1.03z)| lr 1.84e-04 | 8477.91 ms | -100.0% bf16 MFU | 61910 tok/s +step 12525/19560 | loss 3.365221 (+0.11z)| norm 0.2629 (-0.76z)| lr 1.84e-04 | 8474.94 ms | -100.0% bf16 MFU | 61907 tok/s +step 12526/19560 | loss 3.304362 (-1.50z)| norm 0.2676 (-0.48z)| lr 1.83e-04 | 8471.80 ms | -100.0% bf16 MFU | 61906 tok/s +step 12527/19560 | loss 3.347898 (-0.34z)| norm 0.2777 (+0.18z)| lr 1.83e-04 | 8473.98 ms | -100.0% bf16 MFU | 61905 tok/s +step 12528/19560 | loss 3.386644 (+0.68z)| norm 0.2530 (-1.40z)| lr 1.83e-04 | 8472.14 ms | -100.0% bf16 MFU | 61904 tok/s +step 12529/19560 | loss 3.342120 (-0.50z)| norm 0.2668 (-0.51z)| lr 1.83e-04 | 8474.92 ms | -100.0% bf16 MFU | 61902 tok/s +step 12530/19560 | loss 3.342947 (-0.48z)| norm 0.2663 (-0.53z)| lr 1.83e-04 | 8468.70 ms | -100.0% bf16 MFU | 61902 tok/s +step 12531/19560 | loss 3.333885 (-0.71z)| norm 0.2634 (-0.71z)| lr 1.83e-04 | 8474.61 ms | -100.0% bf16 MFU | 61900 tok/s +step 12532/19560 | loss 3.328213 (-0.85z)| norm 0.2503 (-1.52z)| lr 1.83e-04 | 8465.81 ms | -100.0% bf16 MFU | 61902 tok/s +step 12533/19560 | loss 3.351259 (-0.23z)| norm 0.2593 (-0.97z)| lr 1.83e-04 | 8473.26 ms | -100.0% bf16 MFU | 61900 tok/s +step 12534/19560 | loss 3.350403 (-0.24z)| norm 0.2706 (-0.21z)| lr 1.83e-04 | 8463.50 ms | -100.0% bf16 MFU | 61903 tok/s +step 12535/19560 | loss 3.273139 (-2.26z)| norm 0.2723 (-0.07z)| lr 1.83e-04 | 8468.81 ms | -100.0% bf16 MFU | 61903 tok/s +step 12536/19560 | loss 3.338441 (-0.53z)| norm 0.2931 (+1.35z)| lr 1.83e-04 | 8466.61 ms | -100.0% bf16 MFU | 61904 tok/s +step 12537/19560 | loss 3.356265 (-0.06z)| norm 0.3027 (+1.98z)| lr 1.83e-04 | 8465.71 ms | -100.0% bf16 MFU | 61905 tok/s +step 12538/19560 | loss 3.387340 (+0.75z)| norm 0.2702 (-0.25z)| lr 1.83e-04 | 8473.38 ms | -100.0% bf16 MFU | 61904 tok/s +step 12539/19560 | loss 3.373209 (+0.41z)| norm 0.2858 (+0.82z)| lr 1.83e-04 | 8471.53 ms | -100.0% bf16 MFU | 61903 tok/s +step 12540/19560 | loss 3.397036 (+1.04z)| norm 0.2837 (+0.66z)| lr 1.83e-04 | 8467.96 ms | -100.0% bf16 MFU | 61904 tok/s +step 12541/19560 | loss 3.435164 (+2.02z)| norm 0.2631 (-0.74z)| lr 1.83e-04 | 8466.64 ms | -100.0% bf16 MFU | 61905 tok/s +step 12542/19560 | loss 3.360602 (+0.03z)| norm 0.3119 (+2.52z)| lr 1.83e-04 | 8468.91 ms | -100.0% bf16 MFU | 61905 tok/s +step 12543/19560 | loss 3.439360 (+2.07z)| norm 0.2577 (-1.09z)| lr 1.83e-04 | 8459.13 ms | -100.0% bf16 MFU | 61908 tok/s +step 12544/19560 | loss 3.368534 (+0.20z)| norm 0.2860 (+0.79z)| lr 1.83e-04 | 8448.06 ms | -100.0% bf16 MFU | 61916 tok/s +step 12545/19560 | loss 3.316587 (-1.16z)| norm 0.2622 (-0.80z)| lr 1.83e-04 | 8446.07 ms | -100.0% bf16 MFU | 61924 tok/s +step 12546/19560 | loss 3.364858 (+0.11z)| norm 0.2717 (-0.18z)| lr 1.83e-04 | 8441.24 ms | -100.0% bf16 MFU | 61933 tok/s +step 12547/19560 | loss 3.343826 (-0.44z)| norm 0.2723 (-0.15z)| lr 1.82e-04 | 8444.43 ms | -100.0% bf16 MFU | 61941 tok/s +step 12548/19560 | loss 3.341089 (-0.50z)| norm 0.2727 (-0.13z)| lr 1.82e-04 | 8447.19 ms | -100.0% bf16 MFU | 61947 tok/s +step 12549/19560 | loss 3.366030 (+0.16z)| norm 0.3043 (+1.97z)| lr 1.82e-04 | 8450.95 ms | -100.0% bf16 MFU | 61952 tok/s +step 12550/19560 | loss 3.359435 (-0.02z)| norm 0.2601 (-0.97z)| lr 1.82e-04 | 8451.45 ms | -100.0% bf16 MFU | 61956 tok/s +step 12551/19560 | loss 3.478228 (+3.01z)| norm 0.2813 (+0.43z)| lr 1.82e-04 | 8445.36 ms | -100.0% bf16 MFU | 61962 tok/s +step 12552/19560 | loss 3.278111 (-2.09z)| norm 0.2701 (-0.31z)| lr 1.82e-04 | 8444.68 ms | -100.0% bf16 MFU | 61968 tok/s +step 12553/19560 | loss 3.376726 (+0.42z)| norm 0.2725 (-0.16z)| lr 1.82e-04 | 8444.19 ms | -100.0% bf16 MFU | 61974 tok/s +step 12554/19560 | loss 3.391741 (+0.80z)| norm 0.2864 (+0.78z)| lr 1.82e-04 | 8442.38 ms | -100.0% bf16 MFU | 61981 tok/s +step 12555/19560 | loss 3.398107 (+0.95z)| norm 0.2998 (+1.66z)| lr 1.82e-04 | 8441.52 ms | -100.0% bf16 MFU | 61987 tok/s +step 12556/19560 | loss 3.346431 (-0.36z)| norm 0.2676 (-0.52z)| lr 1.82e-04 | 8446.02 ms | -100.0% bf16 MFU | 61991 tok/s +step 12557/19560 | loss 3.361708 (+0.02z)| norm 0.2498 (-1.70z)| lr 1.82e-04 | 8442.34 ms | -100.0% bf16 MFU | 61997 tok/s +step 12558/19560 | loss 3.340900 (-0.51z)| norm 0.2793 (+0.27z)| lr 1.82e-04 | 8445.44 ms | -100.0% bf16 MFU | 62001 tok/s +step 12559/19560 | loss 3.338664 (-0.57z)| norm 0.2512 (-1.59z)| lr 1.82e-04 | 8442.40 ms | -100.0% bf16 MFU | 62006 tok/s +step 12560/19560 | loss 3.400885 (+1.02z)| norm 0.2628 (-0.82z)| lr 1.82e-04 | 8439.17 ms | -100.0% bf16 MFU | 62012 tok/s +step 12561/19560 | loss 3.394883 (+0.85z)| norm 0.2639 (-0.74z)| lr 1.82e-04 | 8443.38 ms | -100.0% bf16 MFU | 62016 tok/s +step 12562/19560 | loss 3.357042 (-0.11z)| norm 0.2660 (-0.59z)| lr 1.82e-04 | 8439.54 ms | -100.0% bf16 MFU | 62022 tok/s +step 12563/19560 | loss 3.365627 (+0.12z)| norm 0.2744 (-0.03z)| lr 1.82e-04 | 8440.61 ms | -100.0% bf16 MFU | 62026 tok/s +step 12564/19560 | loss 3.363612 (+0.07z)| norm 0.2411 (-2.19z)| lr 1.82e-04 | 8449.48 ms | -100.0% bf16 MFU | 62027 tok/s +step 12565/19560 | loss 3.497818 (+3.35z)| norm 0.3001 (+1.62z)| lr 1.82e-04 | 8447.63 ms | -100.0% bf16 MFU | 62029 tok/s +step 12566/19560 | loss 3.420314 (+1.43z)| norm 0.2650 (-0.65z)| lr 1.82e-04 | 8444.88 ms | -100.0% bf16 MFU | 62032 tok/s +step 12567/19560 | loss 3.416232 (+1.31z)| norm 0.2615 (-0.88z)| lr 1.82e-04 | 8445.30 ms | -100.0% bf16 MFU | 62034 tok/s +step 12568/19560 | loss 3.376081 (+0.33z)| norm 0.2790 (+0.25z)| lr 1.82e-04 | 8448.93 ms | -100.0% bf16 MFU | 62035 tok/s +step 12569/19560 | loss 3.348096 (-0.36z)| norm 0.2773 (+0.13z)| lr 1.81e-04 | 8450.50 ms | -100.0% bf16 MFU | 62036 tok/s +step 12570/19560 | loss 3.345820 (-0.41z)| norm 0.2700 (-0.33z)| lr 1.81e-04 | 8452.98 ms | -100.0% bf16 MFU | 62035 tok/s +step 12571/19560 | loss 3.387124 (+0.61z)| norm 0.2799 (+0.31z)| lr 1.81e-04 | 8453.74 ms | -100.0% bf16 MFU | 62034 tok/s +step 12572/19560 | loss 3.416341 (+1.31z)| norm 0.2653 (-0.64z)| lr 1.81e-04 | 8449.45 ms | -100.0% bf16 MFU | 62035 tok/s +step 12573/19560 | loss 3.368445 (+0.13z)| norm 0.2801 (+0.33z)| lr 1.81e-04 | 8454.29 ms | -100.0% bf16 MFU | 62034 tok/s +step 12574/19560 | loss 3.346834 (-0.40z)| norm 0.2806 (+0.36z)| lr 1.81e-04 | 8457.75 ms | -100.0% bf16 MFU | 62032 tok/s +step 12575/19560 | loss 3.284911 (-1.89z)| norm 0.2779 (+0.16z)| lr 1.81e-04 | 8455.73 ms | -100.0% bf16 MFU | 62030 tok/s +step 12576/19560 | loss 3.323645 (-0.93z)| norm 0.2712 (-0.28z)| lr 1.81e-04 | 8457.09 ms | -100.0% bf16 MFU | 62029 tok/s +step 12577/19560 | loss 3.327357 (-0.83z)| norm 0.2685 (-0.45z)| lr 1.81e-04 | 8458.39 ms | -100.0% bf16 MFU | 62026 tok/s +step 12578/19560 | loss 3.370641 (+0.23z)| norm 0.2585 (-1.12z)| lr 1.81e-04 | 8458.18 ms | -100.0% bf16 MFU | 62024 tok/s +step 12579/19560 | loss 3.395408 (+0.83z)| norm 0.2529 (-1.48z)| lr 1.81e-04 | 8458.95 ms | -100.0% bf16 MFU | 62022 tok/s +step 12580/19560 | loss 3.393860 (+0.79z)| norm 0.2702 (-0.30z)| lr 1.81e-04 | 8457.18 ms | -100.0% bf16 MFU | 62021 tok/s +step 12581/19560 | loss 3.389084 (+0.66z)| norm 0.2573 (-1.16z)| lr 1.81e-04 | 8460.79 ms | -100.0% bf16 MFU | 62018 tok/s +step 12582/19560 | loss 3.360594 (-0.04z)| norm 0.2670 (-0.50z)| lr 1.81e-04 | 8460.68 ms | -100.0% bf16 MFU | 62015 tok/s +step 12583/19560 | loss 3.399233 (+0.96z)| norm 0.2823 (+0.54z)| lr 1.81e-04 | 8457.49 ms | -100.0% bf16 MFU | 62014 tok/s +step 12584/19560 | loss 3.363211 (+0.04z)| norm 0.2506 (-1.58z)| lr 1.81e-04 | 8458.37 ms | -100.0% bf16 MFU | 62013 tok/s +step 12585/19560 | loss 3.384900 (+0.58z)| norm 0.2631 (-0.74z)| lr 1.81e-04 | 8460.12 ms | -100.0% bf16 MFU | 62011 tok/s +step 12586/19560 | loss 3.403253 (+1.04z)| norm 0.2711 (-0.18z)| lr 1.81e-04 | 8457.86 ms | -100.0% bf16 MFU | 62010 tok/s +step 12587/19560 | loss 3.406646 (+1.11z)| norm 0.2625 (-0.77z)| lr 1.81e-04 | 8460.50 ms | -100.0% bf16 MFU | 62008 tok/s +step 12588/19560 | loss 3.400303 (+0.94z)| norm 0.2567 (-1.16z)| lr 1.81e-04 | 8463.80 ms | -100.0% bf16 MFU | 62004 tok/s +step 12589/19560 | loss 3.310199 (-1.33z)| norm 0.2914 (+1.22z)| lr 1.81e-04 | 8460.92 ms | -100.0% bf16 MFU | 62002 tok/s +step 12590/19560 | loss 3.394115 (+0.79z)| norm 0.2578 (-1.09z)| lr 1.81e-04 | 8461.90 ms | -100.0% bf16 MFU | 62000 tok/s +step 12591/19560 | loss 3.355385 (-0.20z)| norm 0.2741 (+0.03z)| lr 1.80e-04 | 8459.67 ms | -100.0% bf16 MFU | 61999 tok/s +step 12592/19560 | loss 3.327465 (-0.89z)| norm 0.2590 (-1.01z)| lr 1.80e-04 | 8453.15 ms | -100.0% bf16 MFU | 62000 tok/s +step 12593/19560 | loss 3.345733 (-0.43z)| norm 0.2797 (+0.45z)| lr 1.80e-04 | 8458.96 ms | -100.0% bf16 MFU | 61999 tok/s +step 12594/19560 | loss 3.385675 (+0.57z)| norm 0.2719 (-0.09z)| lr 1.80e-04 | 8456.65 ms | -100.0% bf16 MFU | 61999 tok/s +step 12595/19560 | loss 3.349736 (-0.34z)| norm 0.2691 (-0.28z)| lr 1.80e-04 | 8458.41 ms | -100.0% bf16 MFU | 61998 tok/s +step 12596/19560 | loss 3.312587 (-1.25z)| norm 0.2580 (-1.06z)| lr 1.80e-04 | 8461.47 ms | -100.0% bf16 MFU | 61997 tok/s +step 12597/19560 | loss 3.381041 (+0.47z)| norm 0.2697 (-0.22z)| lr 1.80e-04 | 8459.00 ms | -100.0% bf16 MFU | 61996 tok/s +step 12598/19560 | loss 3.317920 (-1.11z)| norm 0.2655 (-0.51z)| lr 1.80e-04 | 8465.52 ms | -100.0% bf16 MFU | 61993 tok/s +step 12599/19560 | loss 3.372512 (+0.25z)| norm 0.2568 (-1.12z)| lr 1.80e-04 | 8457.62 ms | -100.0% bf16 MFU | 61992 tok/s +step 12600/19560 | loss 3.389513 (+0.66z)| norm 0.2798 (+0.55z)| lr 1.80e-04 | 8453.31 ms | -100.0% bf16 MFU | 61994 tok/s +step 12601/19560 | loss 3.350303 (-0.32z)| norm 0.2583 (-1.01z)| lr 1.80e-04 | 8456.47 ms | -100.0% bf16 MFU | 61994 tok/s +step 12602/19560 | loss 3.398476 (+0.88z)| norm 0.2724 (+0.05z)| lr 1.80e-04 | 8454.35 ms | -100.0% bf16 MFU | 61995 tok/s +step 12603/19560 | loss 3.362502 (-0.03z)| norm 0.2941 (+1.67z)| lr 1.80e-04 | 8457.64 ms | -100.0% bf16 MFU | 61995 tok/s +step 12604/19560 | loss 3.371829 (+0.20z)| norm 0.2603 (-0.87z)| lr 1.80e-04 | 8456.81 ms | -100.0% bf16 MFU | 61995 tok/s +step 12605/19560 | loss 3.343597 (-0.52z)| norm 0.2991 (+2.03z)| lr 1.80e-04 | 8464.40 ms | -100.0% bf16 MFU | 61992 tok/s +step 12606/19560 | loss 3.335614 (-0.72z)| norm 0.2868 (+1.10z)| lr 1.80e-04 | 8462.04 ms | -100.0% bf16 MFU | 61990 tok/s +step 12607/19560 | loss 3.368910 (+0.12z)| norm 0.2794 (+0.55z)| lr 1.80e-04 | 8451.83 ms | -100.0% bf16 MFU | 61993 tok/s +step 12608/19560 | loss 3.362978 (-0.03z)| norm 0.2856 (+1.01z)| lr 1.80e-04 | 8456.73 ms | -100.0% bf16 MFU | 61993 tok/s +step 12609/19560 | loss 3.432563 (+1.71z)| norm 0.2777 (+0.41z)| lr 1.80e-04 | 8458.17 ms | -100.0% bf16 MFU | 61992 tok/s +step 12610/19560 | loss 3.332617 (-0.82z)| norm 0.2962 (+1.77z)| lr 1.80e-04 | 8453.55 ms | -100.0% bf16 MFU | 61994 tok/s +step 12611/19560 | loss 3.331575 (-0.84z)| norm 0.3778 (+6.39z)| lr 1.80e-04 | 8458.14 ms | -100.0% bf16 MFU | 61993 tok/s +step 12612/19560 | loss 3.332596 (-0.81z)| norm 0.2835 (+0.62z)| lr 1.80e-04 | 8465.27 ms | -100.0% bf16 MFU | 61990 tok/s +step 12613/19560 | loss 3.350805 (-0.35z)| norm 0.2816 (+0.49z)| lr 1.79e-04 | 8458.16 ms | -100.0% bf16 MFU | 61990 tok/s +step 12614/19560 | loss 3.340416 (-0.61z)| norm 0.2753 (+0.09z)| lr 1.79e-04 | 8460.56 ms | -100.0% bf16 MFU | 61989 tok/s +step 12615/19560 | loss 3.382321 (+0.44z)| norm 0.2987 (+1.51z)| lr 1.79e-04 | 8461.13 ms | -100.0% bf16 MFU | 61988 tok/s +step 12616/19560 | loss 3.352031 (-0.31z)| norm 0.2766 (+0.15z)| lr 1.79e-04 | 8458.37 ms | -100.0% bf16 MFU | 61988 tok/s +step 12617/19560 | loss 3.411192 (+1.19z)| norm 0.2761 (+0.12z)| lr 1.79e-04 | 8458.19 ms | -100.0% bf16 MFU | 61988 tok/s +step 12618/19560 | loss 3.383531 (+0.49z)| norm 0.2706 (-0.24z)| lr 1.79e-04 | 8453.83 ms | -100.0% bf16 MFU | 61989 tok/s +step 12619/19560 | loss 3.398567 (+0.87z)| norm 0.2840 (+0.59z)| lr 1.79e-04 | 8451.97 ms | -100.0% bf16 MFU | 61991 tok/s +step 12620/19560 | loss 3.444517 (+1.99z)| norm 0.2809 (+0.39z)| lr 1.79e-04 | 8456.32 ms | -100.0% bf16 MFU | 61992 tok/s +step 12621/19560 | loss 3.364443 (-0.03z)| norm 0.2659 (-0.54z)| lr 1.79e-04 | 8458.12 ms | -100.0% bf16 MFU | 61991 tok/s +step 12622/19560 | loss 3.280635 (-2.10z)| norm 0.2756 (+0.07z)| lr 1.79e-04 | 8454.48 ms | -100.0% bf16 MFU | 61992 tok/s +step 12623/19560 | loss 3.369992 (+0.12z)| norm 0.2515 (-1.42z)| lr 1.79e-04 | 8459.33 ms | -100.0% bf16 MFU | 61992 tok/s +step 12624/19560 | loss 3.433934 (+1.68z)| norm 0.2648 (-0.59z)| lr 1.79e-04 | 8459.77 ms | -100.0% bf16 MFU | 61991 tok/s +step 12625/19560 | loss 3.311950 (-1.31z)| norm 0.2828 (+0.53z)| lr 1.79e-04 | 8459.00 ms | -100.0% bf16 MFU | 61990 tok/s +step 12626/19560 | loss 3.360280 (-0.12z)| norm 0.2683 (-0.37z)| lr 1.79e-04 | 8458.62 ms | -100.0% bf16 MFU | 61990 tok/s +step 12627/19560 | loss 3.339991 (-0.63z)| norm 0.2503 (-1.47z)| lr 1.79e-04 | 8456.14 ms | -100.0% bf16 MFU | 61990 tok/s +step 12628/19560 | loss 3.354019 (-0.27z)| norm 0.2695 (-0.29z)| lr 1.79e-04 | 8455.57 ms | -100.0% bf16 MFU | 61991 tok/s +step 12629/19560 | loss 3.336961 (-0.73z)| norm 0.2698 (-0.28z)| lr 1.79e-04 | 8459.60 ms | -100.0% bf16 MFU | 61990 tok/s +step 12630/19560 | loss 3.385540 (+0.55z)| norm 0.2697 (-0.29z)| lr 1.79e-04 | 8462.76 ms | -100.0% bf16 MFU | 61988 tok/s +step 12631/19560 | loss 3.472919 (+2.75z)| norm 0.2774 (+0.18z)| lr 1.79e-04 | 8454.11 ms | -100.0% bf16 MFU | 61990 tok/s +step 12632/19560 | loss 3.386848 (+0.54z)| norm 0.2718 (-0.16z)| lr 1.79e-04 | 8455.44 ms | -100.0% bf16 MFU | 61991 tok/s +step 12633/19560 | loss 3.345158 (-0.54z)| norm 0.2818 (+0.46z)| lr 1.79e-04 | 8458.52 ms | -100.0% bf16 MFU | 61990 tok/s +step 12634/19560 | loss 3.365531 (-0.03z)| norm 0.2721 (-0.13z)| lr 1.79e-04 | 8456.83 ms | -100.0% bf16 MFU | 61991 tok/s +step 12635/19560 | loss 3.324980 (-1.08z)| norm 0.2783 (+0.27z)| lr 1.78e-04 | 8457.03 ms | -100.0% bf16 MFU | 61991 tok/s +step 12636/19560 | loss 3.348715 (-0.45z)| norm 0.2972 (+1.44z)| lr 1.78e-04 | 8454.68 ms | -100.0% bf16 MFU | 61992 tok/s +step 12637/19560 | loss 3.367480 (+0.04z)| norm 0.2715 (-0.17z)| lr 1.78e-04 | 8456.37 ms | -100.0% bf16 MFU | 61992 tok/s +step 12638/19560 | loss 3.409622 (+1.14z)| norm 0.2823 (+0.51z)| lr 1.78e-04 | 8457.80 ms | -100.0% bf16 MFU | 61992 tok/s +step 12639/19560 | loss 3.415841 (+1.29z)| norm 0.2606 (-0.85z)| lr 1.78e-04 | 8455.53 ms | -100.0% bf16 MFU | 61993 tok/s +step 12640/19560 | loss 3.376112 (+0.25z)| norm 0.2767 (+0.17z)| lr 1.78e-04 | 8454.72 ms | -100.0% bf16 MFU | 61994 tok/s +step 12641/19560 | loss 3.382352 (+0.40z)| norm 0.2616 (-0.77z)| lr 1.78e-04 | 8458.70 ms | -100.0% bf16 MFU | 61993 tok/s +step 12642/19560 | loss 3.379740 (+0.33z)| norm 0.2583 (-0.96z)| lr 1.78e-04 | 8457.48 ms | -100.0% bf16 MFU | 61993 tok/s +step 12643/19560 | loss 3.338074 (-0.75z)| norm 0.2652 (-0.52z)| lr 1.78e-04 | 8458.49 ms | -100.0% bf16 MFU | 61992 tok/s +step 12644/19560 | loss 3.345968 (-0.54z)| norm 0.2691 (-0.28z)| lr 1.78e-04 | 8456.79 ms | -100.0% bf16 MFU | 61993 tok/s +step 12645/19560 | loss 3.347628 (-0.50z)| norm 0.2614 (-0.77z)| lr 1.78e-04 | 8459.46 ms | -100.0% bf16 MFU | 61992 tok/s +step 12646/19560 | loss 3.374514 (+0.21z)| norm 0.2647 (-0.55z)| lr 1.78e-04 | 8457.14 ms | -100.0% bf16 MFU | 61992 tok/s +step 12647/19560 | loss 3.332496 (-0.91z)| norm 0.2751 (+0.10z)| lr 1.78e-04 | 8452.77 ms | -100.0% bf16 MFU | 61994 tok/s +step 12648/19560 | loss 3.361907 (-0.12z)| norm 0.2711 (-0.14z)| lr 1.78e-04 | 8458.25 ms | -100.0% bf16 MFU | 61993 tok/s +step 12649/19560 | loss 3.345989 (-0.55z)| norm 0.2638 (-0.59z)| lr 1.78e-04 | 8455.19 ms | -100.0% bf16 MFU | 61994 tok/s +step 12650/19560 | loss 3.381040 (+0.40z)| norm 0.2713 (-0.11z)| lr 1.78e-04 | 8457.55 ms | -100.0% bf16 MFU | 61994 tok/s +step 12651/19560 | loss 3.367512 (+0.03z)| norm 0.2610 (-0.76z)| lr 1.78e-04 | 8457.25 ms | -100.0% bf16 MFU | 61994 tok/s +step 12652/19560 | loss 3.375479 (+0.24z)| norm 0.2530 (-1.24z)| lr 1.78e-04 | 8457.06 ms | -100.0% bf16 MFU | 61994 tok/s +step 12653/19560 | loss 3.363482 (-0.08z)| norm 0.2659 (-0.43z)| lr 1.78e-04 | 8458.44 ms | -100.0% bf16 MFU | 61993 tok/s +step 12654/19560 | loss 3.384676 (+0.48z)| norm 0.2597 (-0.82z)| lr 1.78e-04 | 8456.49 ms | -100.0% bf16 MFU | 61993 tok/s +step 12655/19560 | loss 3.358661 (-0.23z)| norm 0.2808 (+0.51z)| lr 1.78e-04 | 8455.41 ms | -100.0% bf16 MFU | 61994 tok/s +step 12656/19560 | loss 3.357230 (-0.26z)| norm 0.2409 (-1.97z)| lr 1.78e-04 | 8455.18 ms | -100.0% bf16 MFU | 61995 tok/s +step 12657/19560 | loss 3.359193 (-0.21z)| norm 0.2555 (-1.06z)| lr 1.77e-04 | 8454.92 ms | -100.0% bf16 MFU | 61996 tok/s +step 12658/19560 | loss 3.289800 (-2.06z)| norm 0.2673 (-0.32z)| lr 1.77e-04 | 8454.36 ms | -100.0% bf16 MFU | 61996 tok/s +step 12659/19560 | loss 3.356033 (-0.29z)| norm 0.2704 (-0.13z)| lr 1.77e-04 | 8457.21 ms | -100.0% bf16 MFU | 61996 tok/s +step 12660/19560 | loss 3.374759 (+0.20z)| norm 0.2765 (+0.23z)| lr 1.77e-04 | 8454.40 ms | -100.0% bf16 MFU | 61997 tok/s +step 12661/19560 | loss 3.354603 (-0.34z)| norm 0.2621 (-0.67z)| lr 1.77e-04 | 8456.04 ms | -100.0% bf16 MFU | 61997 tok/s +step 12662/19560 | loss 3.343274 (-0.64z)| norm 0.2717 (-0.07z)| lr 1.77e-04 | 8457.07 ms | -100.0% bf16 MFU | 61997 tok/s +step 12663/19560 | loss 3.368278 (+0.01z)| norm 0.2868 (+0.87z)| lr 1.77e-04 | 8456.14 ms | -100.0% bf16 MFU | 61997 tok/s +step 12664/19560 | loss 3.344843 (-0.64z)| norm 0.2583 (-0.90z)| lr 1.77e-04 | 8454.91 ms | -100.0% bf16 MFU | 61998 tok/s +step 12665/19560 | loss 3.434837 (+1.81z)| norm 0.2860 (+0.86z)| lr 1.77e-04 | 8455.83 ms | -100.0% bf16 MFU | 61998 tok/s +step 12666/19560 | loss 3.362024 (-0.17z)| norm 0.2504 (-1.38z)| lr 1.77e-04 | 8455.61 ms | -100.0% bf16 MFU | 61999 tok/s +step 12667/19560 | loss 3.363472 (-0.13z)| norm 0.2749 (+0.17z)| lr 1.77e-04 | 8456.65 ms | -100.0% bf16 MFU | 61999 tok/s +step 12668/19560 | loss 3.341631 (-0.72z)| norm 0.2571 (-0.94z)| lr 1.77e-04 | 8455.36 ms | -100.0% bf16 MFU | 61999 tok/s +step 12669/19560 | loss 3.363435 (-0.11z)| norm 0.2689 (-0.20z)| lr 1.77e-04 | 8453.08 ms | -100.0% bf16 MFU | 62000 tok/s +step 12670/19560 | loss 3.403917 (+1.00z)| norm 0.2502 (-1.37z)| lr 1.77e-04 | 8453.67 ms | -100.0% bf16 MFU | 62001 tok/s +step 12671/19560 | loss 3.356562 (-0.29z)| norm 0.2637 (-0.51z)| lr 1.77e-04 | 8455.12 ms | -100.0% bf16 MFU | 62001 tok/s +step 12672/19560 | loss 3.378561 (+0.32z)| norm 0.2546 (-1.08z)| lr 1.77e-04 | 8455.17 ms | -100.0% bf16 MFU | 62002 tok/s +step 12673/19560 | loss 3.384818 (+0.49z)| norm 0.2661 (-0.34z)| lr 1.77e-04 | 8454.26 ms | -100.0% bf16 MFU | 62002 tok/s +step 12674/19560 | loss 3.410098 (+1.18z)| norm 0.2687 (-0.18z)| lr 1.77e-04 | 8455.39 ms | -100.0% bf16 MFU | 62003 tok/s +step 12675/19560 | loss 3.394707 (+0.74z)| norm 0.2798 (+0.54z)| lr 1.77e-04 | 8457.15 ms | -100.0% bf16 MFU | 62002 tok/s +step 12676/19560 | loss 3.325211 (-1.20z)| norm 0.2664 (-0.33z)| lr 1.77e-04 | 8459.18 ms | -100.0% bf16 MFU | 62001 tok/s +step 12677/19560 | loss 3.322824 (-1.25z)| norm 0.2816 (+0.67z)| lr 1.77e-04 | 8454.97 ms | -100.0% bf16 MFU | 62001 tok/s +step 12678/19560 | loss 3.367341 (-0.02z)| norm 0.2645 (-0.44z)| lr 1.77e-04 | 8454.83 ms | -100.0% bf16 MFU | 62002 tok/s +step 12679/19560 | loss 3.353183 (-0.40z)| norm 0.2796 (+0.54z)| lr 1.76e-04 | 8455.94 ms | -100.0% bf16 MFU | 62002 tok/s +step 12680/19560 | loss 3.374947 (+0.21z)| norm 0.2599 (-0.74z)| lr 1.76e-04 | 8456.69 ms | -100.0% bf16 MFU | 62002 tok/s +step 12681/19560 | loss 3.413066 (+1.33z)| norm 0.3102 (+2.46z)| lr 1.76e-04 | 8457.29 ms | -100.0% bf16 MFU | 62001 tok/s +step 12682/19560 | loss 3.350729 (-0.50z)| norm 0.2721 (+0.05z)| lr 1.76e-04 | 8456.22 ms | -100.0% bf16 MFU | 62001 tok/s +step 12683/19560 | loss 3.377475 (+0.29z)| norm 0.2663 (-0.31z)| lr 1.76e-04 | 8455.31 ms | -100.0% bf16 MFU | 62001 tok/s +step 12684/19560 | loss 3.383381 (+0.46z)| norm 0.2716 (+0.03z)| lr 1.76e-04 | 8455.74 ms | -100.0% bf16 MFU | 62002 tok/s +step 12685/19560 | loss 3.398164 (+0.89z)| norm 0.2581 (-0.85z)| lr 1.76e-04 | 8454.18 ms | -100.0% bf16 MFU | 62002 tok/s +step 12686/19560 | loss 3.362650 (-0.17z)| norm 0.2766 (+0.35z)| lr 1.76e-04 | 8452.58 ms | -100.0% bf16 MFU | 62003 tok/s +step 12687/19560 | loss 3.328224 (-1.18z)| norm 0.2580 (-0.87z)| lr 1.76e-04 | 8455.48 ms | -100.0% bf16 MFU | 62004 tok/s +step 12688/19560 | loss 3.412027 (+1.29z)| norm 0.2821 (+0.70z)| lr 1.76e-04 | 8453.95 ms | -100.0% bf16 MFU | 62004 tok/s +step 12689/19560 | loss 3.400541 (+0.95z)| norm 0.3142 (+2.69z)| lr 1.76e-04 | 8454.21 ms | -100.0% bf16 MFU | 62005 tok/s +step 12690/19560 | loss 3.366717 (-0.05z)| norm 0.2614 (-0.66z)| lr 1.76e-04 | 8456.37 ms | -100.0% bf16 MFU | 62004 tok/s +step 12691/19560 | loss 3.514780 (+3.99z)| norm 0.2842 (+0.78z)| lr 1.76e-04 | 8457.82 ms | -100.0% bf16 MFU | 62004 tok/s +step 12692/19560 | loss 3.311580 (-1.57z)| norm 0.2650 (-0.45z)| lr 1.76e-04 | 8483.64 ms | -100.0% bf16 MFU | 61993 tok/s +step 12693/19560 | loss 3.401246 (+0.94z)| norm 0.2869 (+0.97z)| lr 1.76e-04 | 8485.96 ms | -100.0% bf16 MFU | 61983 tok/s +step 12694/19560 | loss 3.334685 (-0.95z)| norm 0.2372 (-2.19z)| lr 1.76e-04 | 8478.75 ms | -100.0% bf16 MFU | 61976 tok/s +step 12695/19560 | loss 3.409257 (+1.20z)| norm 0.2856 (+0.87z)| lr 1.76e-04 | 8479.53 ms | -100.0% bf16 MFU | 61968 tok/s +step 12696/19560 | loss 3.317870 (-1.41z)| norm 0.2628 (-0.57z)| lr 1.76e-04 | 8477.72 ms | -100.0% bf16 MFU | 61962 tok/s +step 12697/19560 | loss 3.322912 (-1.25z)| norm 0.2661 (-0.35z)| lr 1.76e-04 | 8478.77 ms | -100.0% bf16 MFU | 61956 tok/s +step 12698/19560 | loss 3.351123 (-0.45z)| norm 0.2521 (-1.22z)| lr 1.76e-04 | 8478.58 ms | -100.0% bf16 MFU | 61950 tok/s +step 12699/19560 | loss 3.355483 (-0.32z)| norm 0.2548 (-1.04z)| lr 1.76e-04 | 8476.07 ms | -100.0% bf16 MFU | 61945 tok/s +step 12700/19560 | loss 3.328645 (-1.07z)| norm 0.2687 (-0.17z)| lr 1.76e-04 | 8476.75 ms | -100.0% bf16 MFU | 61940 tok/s +step 12701/19560 | loss 3.347703 (-0.52z)| norm 0.2575 (-0.86z)| lr 1.75e-04 | 8474.10 ms | -100.0% bf16 MFU | 61937 tok/s +step 12702/19560 | loss 3.342539 (-0.67z)| norm 0.2884 (+1.07z)| lr 1.75e-04 | 8475.44 ms | -100.0% bf16 MFU | 61933 tok/s +step 12703/19560 | loss 3.371862 (+0.15z)| norm 0.2722 (+0.06z)| lr 1.75e-04 | 8476.84 ms | -100.0% bf16 MFU | 61929 tok/s +step 12704/19560 | loss 3.359682 (-0.21z)| norm 0.2877 (+1.02z)| lr 1.75e-04 | 8475.03 ms | -100.0% bf16 MFU | 61925 tok/s +step 12705/19560 | loss 3.316801 (-1.46z)| norm 0.2604 (-0.67z)| lr 1.75e-04 | 8482.50 ms | -100.0% bf16 MFU | 61920 tok/s +step 12706/19560 | loss 3.376368 (+0.28z)| norm 0.2565 (-0.92z)| lr 1.75e-04 | 8476.08 ms | -100.0% bf16 MFU | 61916 tok/s +step 12707/19560 | loss 3.373991 (+0.21z)| norm 0.2722 (+0.05z)| lr 1.75e-04 | 8473.13 ms | -100.0% bf16 MFU | 61914 tok/s +step 12708/19560 | loss 3.359142 (-0.21z)| norm 0.2709 (-0.03z)| lr 1.75e-04 | 8475.61 ms | -100.0% bf16 MFU | 61912 tok/s +step 12709/19560 | loss 3.389600 (+0.68z)| norm 0.2759 (+0.27z)| lr 1.75e-04 | 8473.80 ms | -100.0% bf16 MFU | 61910 tok/s +step 12710/19560 | loss 3.306642 (-1.72z)| norm 0.2867 (+0.94z)| lr 1.75e-04 | 8469.93 ms | -100.0% bf16 MFU | 61909 tok/s +step 12711/19560 | loss 3.329212 (-1.05z)| norm 0.2609 (-0.67z)| lr 1.75e-04 | 8472.21 ms | -100.0% bf16 MFU | 61908 tok/s +step 12712/19560 | loss 3.332079 (-0.96z)| norm 0.2576 (-0.88z)| lr 1.75e-04 | 8468.01 ms | -100.0% bf16 MFU | 61908 tok/s +step 12713/19560 | loss 3.381518 (+0.47z)| norm 0.3013 (+1.82z)| lr 1.75e-04 | 8476.17 ms | -100.0% bf16 MFU | 61905 tok/s +step 12714/19560 | loss 3.371264 (+0.18z)| norm 0.2558 (-0.99z)| lr 1.75e-04 | 8468.15 ms | -100.0% bf16 MFU | 61906 tok/s +step 12715/19560 | loss 3.303332 (-1.75z)| norm 0.2844 (+0.77z)| lr 1.75e-04 | 8476.03 ms | -100.0% bf16 MFU | 61903 tok/s +step 12716/19560 | loss 3.274971 (-2.49z)| norm 0.2788 (+0.41z)| lr 1.75e-04 | 8469.39 ms | -100.0% bf16 MFU | 61903 tok/s +step 12717/19560 | loss 3.377261 (+0.39z)| norm 0.2609 (-0.68z)| lr 1.75e-04 | 8472.96 ms | -100.0% bf16 MFU | 61902 tok/s +step 12718/19560 | loss 3.336161 (-0.77z)| norm 0.2788 (+0.42z)| lr 1.75e-04 | 8468.63 ms | -100.0% bf16 MFU | 61902 tok/s +step 12719/19560 | loss 3.407624 (+1.25z)| norm 0.2811 (+0.56z)| lr 1.75e-04 | 8467.83 ms | -100.0% bf16 MFU | 61903 tok/s +step 12720/19560 | loss 3.358728 (-0.15z)| norm 0.2808 (+0.53z)| lr 1.75e-04 | 8472.67 ms | -100.0% bf16 MFU | 61902 tok/s +step 12721/19560 | loss 3.326789 (-1.05z)| norm 0.3036 (+1.91z)| lr 1.75e-04 | 8463.33 ms | -100.0% bf16 MFU | 61904 tok/s +step 12722/19560 | loss 3.321934 (-1.17z)| norm 0.2787 (+0.38z)| lr 1.75e-04 | 8468.07 ms | -100.0% bf16 MFU | 61905 tok/s +step 12723/19560 | loss 3.339323 (-0.67z)| norm 0.2972 (+1.49z)| lr 1.74e-04 | 8469.11 ms | -100.0% bf16 MFU | 61905 tok/s +step 12724/19560 | loss 3.412245 (+1.36z)| norm 0.2829 (+0.61z)| lr 1.74e-04 | 8466.24 ms | -100.0% bf16 MFU | 61906 tok/s +step 12725/19560 | loss 3.383459 (+0.55z)| norm 0.3188 (+2.69z)| lr 1.74e-04 | 8462.49 ms | -100.0% bf16 MFU | 61908 tok/s +step 12726/19560 | loss 3.322508 (-1.17z)| norm 0.2682 (-0.30z)| lr 1.74e-04 | 8471.39 ms | -100.0% bf16 MFU | 61907 tok/s +step 12727/19560 | loss 3.383286 (+0.54z)| norm 0.3032 (+1.73z)| lr 1.74e-04 | 8462.72 ms | -100.0% bf16 MFU | 61910 tok/s +step 12728/19560 | loss 3.334914 (-0.81z)| norm 0.2757 (+0.12z)| lr 1.74e-04 | 8468.38 ms | -100.0% bf16 MFU | 61910 tok/s +step 12729/19560 | loss 3.386768 (+0.64z)| norm 0.2930 (+1.12z)| lr 1.74e-04 | 8462.40 ms | -100.0% bf16 MFU | 61912 tok/s +step 12730/19560 | loss 3.301451 (-1.72z)| norm 0.2841 (+0.59z)| lr 1.74e-04 | 8466.88 ms | -100.0% bf16 MFU | 61912 tok/s +step 12731/19560 | loss 3.348271 (-0.41z)| norm 0.3181 (+2.52z)| lr 1.74e-04 | 8462.53 ms | -100.0% bf16 MFU | 61914 tok/s +step 12732/19560 | loss 3.341774 (-0.59z)| norm 0.2863 (+0.69z)| lr 1.74e-04 | 8464.33 ms | -100.0% bf16 MFU | 61916 tok/s +step 12733/19560 | loss 3.326802 (-1.00z)| norm 0.3047 (+1.73z)| lr 1.74e-04 | 8462.92 ms | -100.0% bf16 MFU | 61918 tok/s +step 12734/19560 | loss 3.392204 (+0.81z)| norm 0.2770 (+0.15z)| lr 1.74e-04 | 8464.29 ms | -100.0% bf16 MFU | 61919 tok/s +step 12735/19560 | loss 3.500975 (+3.60z)| norm 0.2863 (+0.68z)| lr 1.74e-04 | 8468.19 ms | -100.0% bf16 MFU | 61918 tok/s +step 12736/19560 | loss 3.322968 (-1.07z)| norm 0.2915 (+0.97z)| lr 1.74e-04 | 8469.24 ms | -100.0% bf16 MFU | 61918 tok/s +step 12737/19560 | loss 3.348345 (-0.39z)| norm 0.3105 (+2.01z)| lr 1.74e-04 | 8462.33 ms | -100.0% bf16 MFU | 61920 tok/s +step 12738/19560 | loss 3.341272 (-0.58z)| norm 0.2827 (+0.46z)| lr 1.74e-04 | 8462.84 ms | -100.0% bf16 MFU | 61921 tok/s +step 12739/19560 | loss 3.347923 (-0.41z)| norm 0.2780 (+0.27z)| lr 1.74e-04 | 8467.94 ms | -100.0% bf16 MFU | 61921 tok/s +step 12740/19560 | loss 3.415251 (+1.36z)| norm 0.2740 (+0.01z)| lr 1.74e-04 | 8463.65 ms | -100.0% bf16 MFU | 61922 tok/s +step 12741/19560 | loss 3.347883 (-0.43z)| norm 0.3104 (+2.35z)| lr 1.74e-04 | 8469.75 ms | -100.0% bf16 MFU | 61921 tok/s +step 12742/19560 | loss 3.375074 (+0.29z)| norm 0.2896 (+1.00z)| lr 1.74e-04 | 8465.60 ms | -100.0% bf16 MFU | 61922 tok/s +step 12743/19560 | loss 3.380512 (+0.43z)| norm 0.2670 (-0.45z)| lr 1.74e-04 | 8470.40 ms | -100.0% bf16 MFU | 61920 tok/s +step 12744/19560 | loss 3.468455 (+2.67z)| norm 0.3245 (+3.14z)| lr 1.74e-04 | 8464.36 ms | -100.0% bf16 MFU | 61921 tok/s +step 12745/19560 | loss 3.353483 (-0.29z)| norm 0.2804 (+0.38z)| lr 1.73e-04 | 8459.42 ms | -100.0% bf16 MFU | 61924 tok/s +step 12746/19560 | loss 3.311385 (-1.36z)| norm 0.2778 (+0.22z)| lr 1.73e-04 | 8461.65 ms | -100.0% bf16 MFU | 61926 tok/s +step 12747/19560 | loss 3.379991 (+0.41z)| norm 0.2832 (+0.56z)| lr 1.73e-04 | 8464.95 ms | -100.0% bf16 MFU | 61926 tok/s +step 12748/19560 | loss 3.315265 (-1.25z)| norm 0.2778 (+0.22z)| lr 1.73e-04 | 8464.19 ms | -100.0% bf16 MFU | 61927 tok/s +step 12749/19560 | loss 3.277825 (-2.17z)| norm 0.2633 (-0.69z)| lr 1.73e-04 | 8465.08 ms | -100.0% bf16 MFU | 61928 tok/s +step 12750/19560 | loss 3.339976 (-0.59z)| norm 0.2845 (+0.63z)| lr 1.73e-04 | 8461.66 ms | -100.0% bf16 MFU | 61929 tok/s +val loss 3.328467 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2931/10042 = 0.291874 +step 12751/19560 | loss 3.333256 (-0.76z)| norm 0.2707 (-0.24z)| lr 1.73e-04 | 8461.59 ms | -100.0% bf16 MFU | 61931 tok/s +step 12752/19560 | loss 3.393108 (+0.81z)| norm 0.2763 (+0.11z)| lr 1.73e-04 | 8463.53 ms | -100.0% bf16 MFU | 61932 tok/s +step 12753/19560 | loss 3.309976 (-1.37z)| norm 0.2698 (-0.29z)| lr 1.73e-04 | 8460.52 ms | -100.0% bf16 MFU | 61934 tok/s +step 12754/19560 | loss 3.320429 (-1.08z)| norm 0.2691 (-0.34z)| lr 1.73e-04 | 8465.82 ms | -100.0% bf16 MFU | 61933 tok/s +step 12755/19560 | loss 3.373370 (+0.29z)| norm 0.2821 (+0.47z)| lr 1.73e-04 | 8463.43 ms | -100.0% bf16 MFU | 61934 tok/s +step 12756/19560 | loss 3.383983 (+0.57z)| norm 0.2893 (+0.91z)| lr 1.73e-04 | 8456.68 ms | -100.0% bf16 MFU | 61937 tok/s +step 12757/19560 | loss 3.358423 (-0.11z)| norm 0.2893 (+0.90z)| lr 1.73e-04 | 8465.97 ms | -100.0% bf16 MFU | 61937 tok/s +step 12758/19560 | loss 3.375249 (+0.34z)| norm 0.2646 (-0.66z)| lr 1.73e-04 | 8465.55 ms | -100.0% bf16 MFU | 61937 tok/s +step 12759/19560 | loss 3.307141 (-1.45z)| norm 0.2859 (+0.68z)| lr 1.73e-04 | 8460.03 ms | -100.0% bf16 MFU | 61938 tok/s +step 12760/19560 | loss 3.306369 (-1.45z)| norm 0.2743 (-0.05z)| lr 1.73e-04 | 8459.83 ms | -100.0% bf16 MFU | 61940 tok/s +step 12761/19560 | loss 3.319924 (-1.08z)| norm 0.2680 (-0.44z)| lr 1.73e-04 | 8462.96 ms | -100.0% bf16 MFU | 61941 tok/s +step 12762/19560 | loss 3.331655 (-0.75z)| norm 0.2777 (+0.17z)| lr 1.73e-04 | 8458.55 ms | -100.0% bf16 MFU | 61943 tok/s +step 12763/19560 | loss 3.370784 (+0.28z)| norm 0.2706 (-0.28z)| lr 1.73e-04 | 8462.46 ms | -100.0% bf16 MFU | 61943 tok/s +step 12764/19560 | loss 3.362086 (+0.04z)| norm 0.2901 (+0.96z)| lr 1.73e-04 | 8459.46 ms | -100.0% bf16 MFU | 61945 tok/s +step 12765/19560 | loss 3.279149 (-2.11z)| norm 0.2810 (+0.38z)| lr 1.73e-04 | 8457.67 ms | -100.0% bf16 MFU | 61947 tok/s +step 12766/19560 | loss 3.333309 (-0.68z)| norm 0.2697 (-0.33z)| lr 1.73e-04 | 8456.05 ms | -100.0% bf16 MFU | 61950 tok/s +step 12767/19560 | loss 3.317829 (-1.07z)| norm 0.2832 (+0.52z)| lr 1.72e-04 | 8457.38 ms | -100.0% bf16 MFU | 61952 tok/s +step 12768/19560 | loss 3.299399 (-1.53z)| norm 0.2867 (+0.73z)| lr 1.72e-04 | 8461.73 ms | -100.0% bf16 MFU | 61952 tok/s +step 12769/19560 | loss 3.322067 (-0.92z)| norm 0.2560 (-1.20z)| lr 1.72e-04 | 8454.40 ms | -100.0% bf16 MFU | 61956 tok/s +step 12770/19560 | loss 3.338910 (-0.48z)| norm 0.2878 (+0.79z)| lr 1.72e-04 | 8461.91 ms | -100.0% bf16 MFU | 61956 tok/s +step 12771/19560 | loss 3.340882 (-0.42z)| norm 0.2565 (-1.18z)| lr 1.72e-04 | 8458.60 ms | -100.0% bf16 MFU | 61957 tok/s +step 12772/19560 | loss 3.365784 (+0.22z)| norm 0.2590 (-1.01z)| lr 1.72e-04 | 8460.57 ms | -100.0% bf16 MFU | 61958 tok/s +step 12773/19560 | loss 3.411748 (+1.40z)| norm 0.2887 (+0.83z)| lr 1.72e-04 | 8460.00 ms | -100.0% bf16 MFU | 61958 tok/s +step 12774/19560 | loss 3.330369 (-0.70z)| norm 0.2754 (-0.01z)| lr 1.72e-04 | 8459.71 ms | -100.0% bf16 MFU | 61959 tok/s +step 12775/19560 | loss 3.376559 (+0.49z)| norm 0.2766 (+0.07z)| lr 1.72e-04 | 8460.06 ms | -100.0% bf16 MFU | 61960 tok/s +step 12776/19560 | loss 3.336442 (-0.55z)| norm 0.2728 (-0.17z)| lr 1.72e-04 | 8457.60 ms | -100.0% bf16 MFU | 61961 tok/s +step 12777/19560 | loss 3.333031 (-0.63z)| norm 0.2932 (+1.09z)| lr 1.72e-04 | 8461.07 ms | -100.0% bf16 MFU | 61961 tok/s +step 12778/19560 | loss 3.474733 (+2.92z)| norm 0.2811 (+0.33z)| lr 1.72e-04 | 8460.83 ms | -100.0% bf16 MFU | 61962 tok/s +step 12779/19560 | loss 3.329822 (-0.70z)| norm 0.2688 (-0.45z)| lr 1.72e-04 | 8456.52 ms | -100.0% bf16 MFU | 61964 tok/s +step 12780/19560 | loss 3.373223 (+0.38z)| norm 0.2636 (-0.78z)| lr 1.72e-04 | 8460.73 ms | -100.0% bf16 MFU | 61964 tok/s +step 12781/19560 | loss 3.358193 (+0.01z)| norm 0.2674 (-0.54z)| lr 1.72e-04 | 8466.80 ms | -100.0% bf16 MFU | 61962 tok/s +step 12782/19560 | loss 3.322120 (-0.88z)| norm 0.2733 (-0.18z)| lr 1.72e-04 | 8460.00 ms | -100.0% bf16 MFU | 61962 tok/s +step 12783/19560 | loss 3.331321 (-0.64z)| norm 0.2580 (-1.13z)| lr 1.72e-04 | 8457.55 ms | -100.0% bf16 MFU | 61964 tok/s +step 12784/19560 | loss 3.323244 (-0.84z)| norm 0.2654 (-0.69z)| lr 1.72e-04 | 8456.62 ms | -100.0% bf16 MFU | 61965 tok/s +step 12785/19560 | loss 3.294329 (-1.53z)| norm 0.2857 (+0.61z)| lr 1.72e-04 | 8459.92 ms | -100.0% bf16 MFU | 61966 tok/s +step 12786/19560 | loss 3.410258 (+1.31z)| norm 0.2639 (-0.80z)| lr 1.72e-04 | 8454.98 ms | -100.0% bf16 MFU | 61968 tok/s +step 12787/19560 | loss 3.306732 (-1.23z)| norm 0.2857 (+0.60z)| lr 1.72e-04 | 8445.86 ms | -100.0% bf16 MFU | 61973 tok/s +step 12788/19560 | loss 3.397353 (+0.99z)| norm 0.2772 (+0.05z)| lr 1.72e-04 | 8441.78 ms | -100.0% bf16 MFU | 61980 tok/s +step 12789/19560 | loss 3.270259 (-2.08z)| norm 0.2746 (-0.12z)| lr 1.71e-04 | 8442.79 ms | -100.0% bf16 MFU | 61986 tok/s +step 12790/19560 | loss 3.345416 (-0.27z)| norm 0.2747 (-0.12z)| lr 1.71e-04 | 8434.15 ms | -100.0% bf16 MFU | 61995 tok/s +step 12791/19560 | loss 3.368781 (+0.30z)| norm 0.2680 (-0.54z)| lr 1.71e-04 | 8439.87 ms | -100.0% bf16 MFU | 62001 tok/s +step 12792/19560 | loss 3.324970 (-0.75z)| norm 0.2651 (-0.74z)| lr 1.71e-04 | 8436.50 ms | -100.0% bf16 MFU | 62008 tok/s +step 12793/19560 | loss 3.427912 (+1.73z)| norm 0.2676 (-0.57z)| lr 1.71e-04 | 8435.82 ms | -100.0% bf16 MFU | 62015 tok/s +step 12794/19560 | loss 3.370588 (+0.34z)| norm 0.2775 (+0.07z)| lr 1.71e-04 | 8437.06 ms | -100.0% bf16 MFU | 62022 tok/s +step 12795/19560 | loss 3.331670 (-0.59z)| norm 0.2679 (-0.56z)| lr 1.71e-04 | 8432.75 ms | -100.0% bf16 MFU | 62029 tok/s +step 12796/19560 | loss 3.342055 (-0.34z)| norm 0.2825 (+0.39z)| lr 1.71e-04 | 8435.69 ms | -100.0% bf16 MFU | 62035 tok/s +step 12797/19560 | loss 3.340486 (-0.37z)| norm 0.2654 (-0.74z)| lr 1.71e-04 | 8438.80 ms | -100.0% bf16 MFU | 62040 tok/s +step 12798/19560 | loss 3.272747 (-1.96z)| norm 0.2744 (-0.16z)| lr 1.71e-04 | 8436.17 ms | -100.0% bf16 MFU | 62045 tok/s +step 12799/19560 | loss 3.297706 (-1.34z)| norm 0.2594 (-1.16z)| lr 1.71e-04 | 8437.72 ms | -100.0% bf16 MFU | 62050 tok/s +step 12800/19560 | loss 3.323426 (-0.72z)| norm 0.2702 (-0.45z)| lr 1.71e-04 | 8437.37 ms | -100.0% bf16 MFU | 62054 tok/s +step 12801/19560 | loss 3.347689 (-0.14z)| norm 0.2567 (-1.34z)| lr 1.71e-04 | 8438.12 ms | -100.0% bf16 MFU | 62058 tok/s +step 12802/19560 | loss 3.288827 (-1.51z)| norm 0.2619 (-0.99z)| lr 1.71e-04 | 8437.71 ms | -100.0% bf16 MFU | 62062 tok/s +step 12803/19560 | loss 3.315708 (-0.86z)| norm 0.2636 (-0.87z)| lr 1.71e-04 | 8439.84 ms | -100.0% bf16 MFU | 62065 tok/s +step 12804/19560 | loss 3.258929 (-2.16z)| norm 0.2693 (-0.49z)| lr 1.71e-04 | 8435.62 ms | -100.0% bf16 MFU | 62069 tok/s +step 12805/19560 | loss 3.317227 (-0.80z)| norm 0.2650 (-0.77z)| lr 1.71e-04 | 8438.75 ms | -100.0% bf16 MFU | 62072 tok/s +step 12806/19560 | loss 3.385142 (+0.78z)| norm 0.2773 (+0.04z)| lr 1.71e-04 | 8438.13 ms | -100.0% bf16 MFU | 62075 tok/s +step 12807/19560 | loss 3.313003 (-0.89z)| norm 0.2659 (-0.71z)| lr 1.71e-04 | 8437.78 ms | -100.0% bf16 MFU | 62078 tok/s +step 12808/19560 | loss 3.334266 (-0.39z)| norm 0.2586 (-1.19z)| lr 1.71e-04 | 8441.04 ms | -100.0% bf16 MFU | 62080 tok/s +step 12809/19560 | loss 3.405524 (+1.27z)| norm 0.2673 (-0.60z)| lr 1.71e-04 | 8437.98 ms | -100.0% bf16 MFU | 62083 tok/s +step 12810/19560 | loss 3.304315 (-1.07z)| norm 0.2767 (+0.03z)| lr 1.71e-04 | 8440.77 ms | -100.0% bf16 MFU | 62084 tok/s +step 12811/19560 | loss 3.343872 (-0.15z)| norm 0.2587 (-1.18z)| lr 1.70e-04 | 8444.61 ms | -100.0% bf16 MFU | 62084 tok/s +step 12812/19560 | loss 3.297418 (-1.20z)| norm 0.2792 (+0.20z)| lr 1.70e-04 | 8440.31 ms | -100.0% bf16 MFU | 62086 tok/s +step 12813/19560 | loss 3.352404 (+0.07z)| norm 0.2653 (-0.74z)| lr 1.70e-04 | 8444.08 ms | -100.0% bf16 MFU | 62086 tok/s +step 12814/19560 | loss 3.330904 (-0.42z)| norm 0.2598 (-1.10z)| lr 1.70e-04 | 8444.35 ms | -100.0% bf16 MFU | 62086 tok/s +step 12815/19560 | loss 3.306736 (-0.97z)| norm 0.2479 (-1.89z)| lr 1.70e-04 | 8446.52 ms | -100.0% bf16 MFU | 62086 tok/s +step 12816/19560 | loss 3.360211 (+0.27z)| norm 0.2927 (+1.10z)| lr 1.70e-04 | 8446.33 ms | -100.0% bf16 MFU | 62085 tok/s +step 12817/19560 | loss 3.308136 (-0.93z)| norm 0.2606 (-1.03z)| lr 1.70e-04 | 8448.52 ms | -100.0% bf16 MFU | 62083 tok/s +step 12818/19560 | loss 3.392359 (+1.03z)| norm 0.2648 (-0.75z)| lr 1.70e-04 | 8446.20 ms | -100.0% bf16 MFU | 62083 tok/s +step 12819/19560 | loss 3.339528 (-0.18z)| norm 0.2946 (+1.27z)| lr 1.70e-04 | 8449.00 ms | -100.0% bf16 MFU | 62082 tok/s +step 12820/19560 | loss 3.337793 (-0.22z)| norm 0.2691 (-0.46z)| lr 1.70e-04 | 8447.51 ms | -100.0% bf16 MFU | 62081 tok/s +step 12821/19560 | loss 3.313694 (-0.81z)| norm 0.3048 (+1.93z)| lr 1.70e-04 | 8451.66 ms | -100.0% bf16 MFU | 62078 tok/s +step 12822/19560 | loss 3.318390 (-0.69z)| norm 0.2511 (-1.71z)| lr 1.70e-04 | 8452.77 ms | -100.0% bf16 MFU | 62076 tok/s +step 12823/19560 | loss 3.349101 (+0.09z)| norm 0.2899 (+0.93z)| lr 1.70e-04 | 8453.78 ms | -100.0% bf16 MFU | 62073 tok/s +step 12824/19560 | loss 3.339947 (-0.15z)| norm 0.2755 (-0.05z)| lr 1.70e-04 | 8456.10 ms | -100.0% bf16 MFU | 62069 tok/s +step 12825/19560 | loss 3.320713 (-0.63z)| norm 0.2988 (+1.50z)| lr 1.70e-04 | 8451.58 ms | -100.0% bf16 MFU | 62067 tok/s +step 12826/19560 | loss 3.354696 (+0.23z)| norm 0.2752 (-0.10z)| lr 1.70e-04 | 8452.72 ms | -100.0% bf16 MFU | 62065 tok/s +step 12827/19560 | loss 3.330900 (-0.37z)| norm 0.2662 (-0.73z)| lr 1.70e-04 | 8451.99 ms | -100.0% bf16 MFU | 62064 tok/s +step 12828/19560 | loss 3.356994 (+0.28z)| norm 0.2681 (-0.60z)| lr 1.70e-04 | 8454.90 ms | -100.0% bf16 MFU | 62061 tok/s +step 12829/19560 | loss 3.352836 (+0.18z)| norm 0.2748 (-0.15z)| lr 1.70e-04 | 8457.94 ms | -100.0% bf16 MFU | 62057 tok/s +step 12830/19560 | loss 3.334532 (-0.28z)| norm 0.2803 (+0.24z)| lr 1.70e-04 | 8451.18 ms | -100.0% bf16 MFU | 62056 tok/s +step 12831/19560 | loss 3.295121 (-1.25z)| norm 0.2756 (-0.09z)| lr 1.70e-04 | 8454.34 ms | -100.0% bf16 MFU | 62054 tok/s +step 12832/19560 | loss 3.316241 (-0.71z)| norm 0.2571 (-1.35z)| lr 1.70e-04 | 8455.54 ms | -100.0% bf16 MFU | 62052 tok/s +step 12833/19560 | loss 3.389050 (+1.09z)| norm 0.3008 (+1.64z)| lr 1.69e-04 | 8453.17 ms | -100.0% bf16 MFU | 62050 tok/s +step 12834/19560 | loss 3.359652 (+0.36z)| norm 0.2627 (-0.99z)| lr 1.69e-04 | 8454.26 ms | -100.0% bf16 MFU | 62049 tok/s +step 12835/19560 | loss 3.374676 (+0.74z)| norm 0.2685 (-0.59z)| lr 1.69e-04 | 8453.76 ms | -100.0% bf16 MFU | 62047 tok/s +step 12836/19560 | loss 3.342868 (-0.06z)| norm 0.2823 (+0.36z)| lr 1.69e-04 | 8453.28 ms | -100.0% bf16 MFU | 62046 tok/s +step 12837/19560 | loss 3.313222 (-0.78z)| norm 0.2868 (+0.66z)| lr 1.69e-04 | 8454.73 ms | -100.0% bf16 MFU | 62044 tok/s +step 12838/19560 | loss 3.344175 (-0.02z)| norm 0.2838 (+0.46z)| lr 1.69e-04 | 8454.47 ms | -100.0% bf16 MFU | 62042 tok/s +step 12839/19560 | loss 3.336603 (-0.21z)| norm 0.2757 (-0.11z)| lr 1.69e-04 | 8453.87 ms | -100.0% bf16 MFU | 62041 tok/s +step 12840/19560 | loss 3.340291 (-0.12z)| norm 0.2885 (+0.76z)| lr 1.69e-04 | 8455.95 ms | -100.0% bf16 MFU | 62039 tok/s +step 12841/19560 | loss 3.325143 (-0.49z)| norm 0.2793 (+0.14z)| lr 1.69e-04 | 8456.29 ms | -100.0% bf16 MFU | 62037 tok/s +step 12842/19560 | loss 3.336362 (-0.20z)| norm 0.2688 (-0.61z)| lr 1.69e-04 | 8452.80 ms | -100.0% bf16 MFU | 62037 tok/s +step 12843/19560 | loss 3.317310 (-0.69z)| norm 0.2942 (+1.18z)| lr 1.69e-04 | 8454.80 ms | -100.0% bf16 MFU | 62035 tok/s +step 12844/19560 | loss 3.308562 (-0.92z)| norm 0.2604 (-1.19z)| lr 1.69e-04 | 8453.08 ms | -100.0% bf16 MFU | 62035 tok/s +step 12845/19560 | loss 3.270913 (-1.84z)| norm 0.2812 (+0.26z)| lr 1.69e-04 | 8454.07 ms | -100.0% bf16 MFU | 62034 tok/s +step 12846/19560 | loss 3.339139 (-0.12z)| norm 0.2900 (+0.87z)| lr 1.69e-04 | 8454.25 ms | -100.0% bf16 MFU | 62033 tok/s +step 12847/19560 | loss 3.280334 (-1.58z)| norm 0.2704 (-0.50z)| lr 1.69e-04 | 8454.53 ms | -100.0% bf16 MFU | 62032 tok/s +step 12848/19560 | loss 3.341144 (-0.04z)| norm 0.2803 (+0.19z)| lr 1.69e-04 | 8451.94 ms | -100.0% bf16 MFU | 62032 tok/s +step 12849/19560 | loss 3.307488 (-0.88z)| norm 0.2483 (-2.02z)| lr 1.69e-04 | 8451.11 ms | -100.0% bf16 MFU | 62032 tok/s +step 12850/19560 | loss 3.327606 (-0.38z)| norm 0.2676 (-0.66z)| lr 1.69e-04 | 8453.14 ms | -100.0% bf16 MFU | 62032 tok/s +step 12851/19560 | loss 3.372074 (+0.74z)| norm 0.2936 (+1.17z)| lr 1.69e-04 | 8449.87 ms | -100.0% bf16 MFU | 62032 tok/s +step 12852/19560 | loss 3.305795 (-0.92z)| norm 0.2516 (-1.74z)| lr 1.69e-04 | 8453.48 ms | -100.0% bf16 MFU | 62032 tok/s +step 12853/19560 | loss 3.288771 (-1.33z)| norm 0.2740 (-0.17z)| lr 1.69e-04 | 8450.77 ms | -100.0% bf16 MFU | 62032 tok/s +step 12854/19560 | loss 3.351248 (+0.25z)| norm 0.2838 (+0.52z)| lr 1.69e-04 | 8453.14 ms | -100.0% bf16 MFU | 62032 tok/s +step 12855/19560 | loss 3.378619 (+0.95z)| norm 0.2771 (+0.06z)| lr 1.68e-04 | 8453.38 ms | -100.0% bf16 MFU | 62031 tok/s +step 12856/19560 | loss 3.355950 (+0.37z)| norm 0.3066 (+2.15z)| lr 1.68e-04 | 8454.59 ms | -100.0% bf16 MFU | 62030 tok/s +step 12857/19560 | loss 3.330309 (-0.28z)| norm 0.2611 (-1.08z)| lr 1.68e-04 | 8453.93 ms | -100.0% bf16 MFU | 62030 tok/s +step 12858/19560 | loss 3.339715 (-0.04z)| norm 0.2746 (-0.12z)| lr 1.68e-04 | 8452.49 ms | -100.0% bf16 MFU | 62030 tok/s +step 12859/19560 | loss 3.345863 (+0.11z)| norm 0.2541 (-1.59z)| lr 1.68e-04 | 8451.27 ms | -100.0% bf16 MFU | 62030 tok/s +step 12860/19560 | loss 3.348534 (+0.18z)| norm 0.2691 (-0.48z)| lr 1.68e-04 | 8452.13 ms | -100.0% bf16 MFU | 62030 tok/s +step 12861/19560 | loss 3.309301 (-0.82z)| norm 0.2623 (-0.97z)| lr 1.68e-04 | 8454.32 ms | -100.0% bf16 MFU | 62029 tok/s +step 12862/19560 | loss 3.339208 (-0.04z)| norm 0.2952 (+1.47z)| lr 1.68e-04 | 8453.48 ms | -100.0% bf16 MFU | 62029 tok/s +step 12863/19560 | loss 3.387871 (+1.32z)| norm 0.2982 (+1.67z)| lr 1.68e-04 | 8456.18 ms | -100.0% bf16 MFU | 62027 tok/s +step 12864/19560 | loss 3.353468 (+0.36z)| norm 0.2751 (-0.02z)| lr 1.68e-04 | 8456.88 ms | -100.0% bf16 MFU | 62026 tok/s +step 12865/19560 | loss 3.318698 (-0.59z)| norm 0.2680 (-0.54z)| lr 1.68e-04 | 8454.66 ms | -100.0% bf16 MFU | 62025 tok/s +step 12866/19560 | loss 3.337783 (-0.06z)| norm 0.2801 (+0.39z)| lr 1.68e-04 | 8451.70 ms | -100.0% bf16 MFU | 62025 tok/s +step 12867/19560 | loss 3.301636 (-1.04z)| norm 0.2843 (+0.71z)| lr 1.68e-04 | 8455.35 ms | -100.0% bf16 MFU | 62024 tok/s +step 12868/19560 | loss 3.294094 (-1.23z)| norm 0.2707 (-0.33z)| lr 1.68e-04 | 8453.14 ms | -100.0% bf16 MFU | 62024 tok/s +step 12869/19560 | loss 3.374027 (+0.97z)| norm 0.2829 (+0.63z)| lr 1.68e-04 | 8452.05 ms | -100.0% bf16 MFU | 62025 tok/s +step 12870/19560 | loss 3.363746 (+0.69z)| norm 0.2807 (+0.47z)| lr 1.68e-04 | 8453.99 ms | -100.0% bf16 MFU | 62024 tok/s +step 12871/19560 | loss 3.303389 (-0.96z)| norm 0.2736 (-0.09z)| lr 1.68e-04 | 8453.89 ms | -100.0% bf16 MFU | 62024 tok/s +step 12872/19560 | loss 3.325759 (-0.33z)| norm 0.3127 (+3.06z)| lr 1.68e-04 | 8454.73 ms | -100.0% bf16 MFU | 62023 tok/s +step 12873/19560 | loss 3.296828 (-1.16z)| norm 0.2833 (+0.69z)| lr 1.68e-04 | 8452.56 ms | -100.0% bf16 MFU | 62024 tok/s +step 12874/19560 | loss 3.337829 (+0.03z)| norm 0.2781 (+0.27z)| lr 1.68e-04 | 8451.34 ms | -100.0% bf16 MFU | 62024 tok/s +step 12875/19560 | loss 3.308300 (-0.82z)| norm 0.2754 (+0.06z)| lr 1.68e-04 | 8453.78 ms | -100.0% bf16 MFU | 62024 tok/s +step 12876/19560 | loss 3.320187 (-0.47z)| norm 0.2737 (-0.07z)| lr 1.68e-04 | 8453.75 ms | -100.0% bf16 MFU | 62024 tok/s +step 12877/19560 | loss 3.340525 (+0.11z)| norm 0.2660 (-0.69z)| lr 1.68e-04 | 8453.45 ms | -100.0% bf16 MFU | 62023 tok/s +step 12878/19560 | loss 3.351621 (+0.43z)| norm 0.2788 (+0.34z)| lr 1.67e-04 | 8452.66 ms | -100.0% bf16 MFU | 62024 tok/s +step 12879/19560 | loss 3.309352 (-0.81z)| norm 0.2662 (-0.68z)| lr 1.67e-04 | 8451.67 ms | -100.0% bf16 MFU | 62024 tok/s +step 12880/19560 | loss 3.297787 (-1.13z)| norm 0.3032 (+2.25z)| lr 1.67e-04 | 8454.43 ms | -100.0% bf16 MFU | 62024 tok/s +step 12881/19560 | loss 3.300483 (-1.05z)| norm 0.2869 (+0.95z)| lr 1.67e-04 | 8451.38 ms | -100.0% bf16 MFU | 62024 tok/s +step 12882/19560 | loss 3.320106 (-0.47z)| norm 0.2804 (+0.43z)| lr 1.67e-04 | 8465.39 ms | -100.0% bf16 MFU | 62020 tok/s +step 12883/19560 | loss 3.363424 (+0.82z)| norm 0.2860 (+0.86z)| lr 1.67e-04 | 8479.64 ms | -100.0% bf16 MFU | 62010 tok/s +step 12884/19560 | loss 3.331631 (-0.11z)| norm 0.2760 (+0.09z)| lr 1.67e-04 | 8479.89 ms | -100.0% bf16 MFU | 62001 tok/s +step 12885/19560 | loss 3.333667 (-0.05z)| norm 0.2827 (+0.62z)| lr 1.67e-04 | 8475.66 ms | -100.0% bf16 MFU | 61994 tok/s +step 12886/19560 | loss 3.361070 (+0.78z)| norm 0.2663 (-0.69z)| lr 1.67e-04 | 8475.35 ms | -100.0% bf16 MFU | 61987 tok/s +step 12887/19560 | loss 3.372042 (+1.09z)| norm 0.2897 (+1.17z)| lr 1.67e-04 | 8477.88 ms | -100.0% bf16 MFU | 61980 tok/s +step 12888/19560 | loss 3.331520 (-0.13z)| norm 0.2764 (+0.11z)| lr 1.67e-04 | 8478.14 ms | -100.0% bf16 MFU | 61973 tok/s +step 12889/19560 | loss 3.325991 (-0.30z)| norm 0.2579 (-1.33z)| lr 1.67e-04 | 8476.19 ms | -100.0% bf16 MFU | 61967 tok/s +step 12890/19560 | loss 3.368420 (+0.97z)| norm 0.2737 (-0.09z)| lr 1.67e-04 | 8476.88 ms | -100.0% bf16 MFU | 61961 tok/s +step 12891/19560 | loss 3.378443 (+1.26z)| norm 0.2705 (-0.34z)| lr 1.67e-04 | 8471.79 ms | -100.0% bf16 MFU | 61957 tok/s +step 12892/19560 | loss 3.359634 (+0.70z)| norm 0.2971 (+1.74z)| lr 1.67e-04 | 8474.40 ms | -100.0% bf16 MFU | 61953 tok/s +step 12893/19560 | loss 3.338051 (+0.04z)| norm 0.2706 (-0.33z)| lr 1.67e-04 | 8471.57 ms | -100.0% bf16 MFU | 61950 tok/s +step 12894/19560 | loss 3.323512 (-0.40z)| norm 0.2886 (+1.06z)| lr 1.67e-04 | 8474.07 ms | -100.0% bf16 MFU | 61946 tok/s +step 12895/19560 | loss 3.348921 (+0.37z)| norm 0.2751 (+0.02z)| lr 1.67e-04 | 8471.39 ms | -100.0% bf16 MFU | 61943 tok/s +step 12896/19560 | loss 3.345007 (+0.24z)| norm 0.2847 (+0.77z)| lr 1.67e-04 | 8472.52 ms | -100.0% bf16 MFU | 61940 tok/s +step 12897/19560 | loss 3.368730 (+0.95z)| norm 0.2705 (-0.35z)| lr 1.67e-04 | 8475.54 ms | -100.0% bf16 MFU | 61936 tok/s +step 12898/19560 | loss 3.258811 (-2.32z)| norm 0.2825 (+0.60z)| lr 1.67e-04 | 8466.57 ms | -100.0% bf16 MFU | 61935 tok/s +step 12899/19560 | loss 3.347688 (+0.32z)| norm 0.2625 (-0.99z)| lr 1.67e-04 | 8473.25 ms | -100.0% bf16 MFU | 61932 tok/s +step 12900/19560 | loss 3.283895 (-1.55z)| norm 0.2768 (+0.13z)| lr 1.66e-04 | 8476.52 ms | -100.0% bf16 MFU | 61928 tok/s +step 12901/19560 | loss 3.391242 (+1.64z)| norm 0.2577 (-1.37z)| lr 1.66e-04 | 8472.83 ms | -100.0% bf16 MFU | 61926 tok/s +step 12902/19560 | loss 3.325306 (-0.32z)| norm 0.2909 (+1.26z)| lr 1.66e-04 | 8473.85 ms | -100.0% bf16 MFU | 61923 tok/s +step 12903/19560 | loss 3.332159 (-0.11z)| norm 0.2698 (-0.40z)| lr 1.66e-04 | 8472.94 ms | -100.0% bf16 MFU | 61921 tok/s +step 12904/19560 | loss 3.299227 (-1.08z)| norm 0.2667 (-0.65z)| lr 1.66e-04 | 8471.26 ms | -100.0% bf16 MFU | 61919 tok/s +step 12905/19560 | loss 3.342555 (+0.21z)| norm 0.2630 (-0.93z)| lr 1.66e-04 | 8469.80 ms | -100.0% bf16 MFU | 61918 tok/s +step 12906/19560 | loss 3.378421 (+1.39z)| norm 0.2725 (-0.17z)| lr 1.66e-04 | 8475.66 ms | -100.0% bf16 MFU | 61915 tok/s +step 12907/19560 | loss 3.323294 (-0.37z)| norm 0.2700 (-0.37z)| lr 1.66e-04 | 8469.17 ms | -100.0% bf16 MFU | 61915 tok/s +step 12908/19560 | loss 3.349456 (+0.48z)| norm 0.2922 (+1.38z)| lr 1.66e-04 | 8465.87 ms | -100.0% bf16 MFU | 61915 tok/s +step 12909/19560 | loss 3.356523 (+0.70z)| norm 0.2543 (-1.61z)| lr 1.66e-04 | 8467.19 ms | -100.0% bf16 MFU | 61916 tok/s +step 12910/19560 | loss 3.385249 (+1.59z)| norm 0.2781 (+0.26z)| lr 1.66e-04 | 8467.18 ms | -100.0% bf16 MFU | 61916 tok/s +step 12911/19560 | loss 3.345696 (+0.33z)| norm 0.2666 (-0.65z)| lr 1.66e-04 | 8471.80 ms | -100.0% bf16 MFU | 61914 tok/s +step 12912/19560 | loss 3.303557 (-0.99z)| norm 0.2630 (-0.93z)| lr 1.66e-04 | 8459.01 ms | -100.0% bf16 MFU | 61918 tok/s +step 12913/19560 | loss 3.291688 (-1.37z)| norm 0.2514 (-1.81z)| lr 1.66e-04 | 8467.80 ms | -100.0% bf16 MFU | 61918 tok/s +step 12914/19560 | loss 3.332252 (-0.07z)| norm 0.2542 (-1.57z)| lr 1.66e-04 | 8470.05 ms | -100.0% bf16 MFU | 61917 tok/s +step 12915/19560 | loss 3.326450 (-0.26z)| norm 0.2719 (-0.19z)| lr 1.66e-04 | 8472.22 ms | -100.0% bf16 MFU | 61915 tok/s +step 12916/19560 | loss 3.217072 (-3.62z)| norm 0.2637 (-0.81z)| lr 1.66e-04 | 8469.41 ms | -100.0% bf16 MFU | 61914 tok/s +step 12917/19560 | loss 3.351380 (+0.56z)| norm 0.2760 (+0.13z)| lr 1.66e-04 | 8463.00 ms | -100.0% bf16 MFU | 61916 tok/s +step 12918/19560 | loss 3.325871 (-0.25z)| norm 0.2507 (-1.79z)| lr 1.66e-04 | 8466.40 ms | -100.0% bf16 MFU | 61917 tok/s +step 12919/19560 | loss 3.317217 (-0.51z)| norm 0.2783 (+0.32z)| lr 1.66e-04 | 8471.24 ms | -100.0% bf16 MFU | 61915 tok/s +step 12920/19560 | loss 3.387963 (+1.70z)| norm 0.2611 (-1.00z)| lr 1.66e-04 | 8467.50 ms | -100.0% bf16 MFU | 61915 tok/s +step 12921/19560 | loss 3.301520 (-1.01z)| norm 0.2565 (-1.33z)| lr 1.66e-04 | 8462.92 ms | -100.0% bf16 MFU | 61917 tok/s +step 12922/19560 | loss 3.320746 (-0.38z)| norm 0.2624 (-0.87z)| lr 1.65e-04 | 8465.45 ms | -100.0% bf16 MFU | 61918 tok/s +step 12923/19560 | loss 3.237632 (-2.95z)| norm 0.2795 (+0.42z)| lr 1.65e-04 | 8472.42 ms | -100.0% bf16 MFU | 61916 tok/s +step 12924/19560 | loss 3.317478 (-0.44z)| norm 0.2529 (-1.57z)| lr 1.65e-04 | 8462.10 ms | -100.0% bf16 MFU | 61918 tok/s +step 12925/19560 | loss 3.337504 (+0.19z)| norm 0.2650 (-0.66z)| lr 1.65e-04 | 8468.21 ms | -100.0% bf16 MFU | 61918 tok/s +step 12926/19560 | loss 3.322564 (-0.29z)| norm 0.2800 (+0.47z)| lr 1.65e-04 | 8467.63 ms | -100.0% bf16 MFU | 61918 tok/s +step 12927/19560 | loss 3.336154 (+0.13z)| norm 0.2764 (+0.18z)| lr 1.65e-04 | 8470.17 ms | -100.0% bf16 MFU | 61917 tok/s +step 12928/19560 | loss 3.359127 (+0.85z)| norm 0.2815 (+0.56z)| lr 1.65e-04 | 8460.47 ms | -100.0% bf16 MFU | 61920 tok/s +step 12929/19560 | loss 3.324270 (-0.25z)| norm 0.2683 (-0.44z)| lr 1.65e-04 | 8459.78 ms | -100.0% bf16 MFU | 61922 tok/s +step 12930/19560 | loss 3.335881 (+0.11z)| norm 0.2683 (-0.45z)| lr 1.65e-04 | 8469.00 ms | -100.0% bf16 MFU | 61922 tok/s +step 12931/19560 | loss 3.332531 (-0.00z)| norm 0.2697 (-0.34z)| lr 1.65e-04 | 8460.97 ms | -100.0% bf16 MFU | 61924 tok/s +step 12932/19560 | loss 3.351496 (+0.60z)| norm 0.2697 (-0.35z)| lr 1.65e-04 | 8467.00 ms | -100.0% bf16 MFU | 61924 tok/s +step 12933/19560 | loss 3.359252 (+0.84z)| norm 0.2526 (-1.63z)| lr 1.65e-04 | 8460.76 ms | -100.0% bf16 MFU | 61926 tok/s +step 12934/19560 | loss 3.327205 (-0.20z)| norm 0.2855 (+0.85z)| lr 1.65e-04 | 8460.27 ms | -100.0% bf16 MFU | 61928 tok/s +step 12935/19560 | loss 3.366874 (+1.10z)| norm 0.2454 (-2.12z)| lr 1.65e-04 | 8461.02 ms | -100.0% bf16 MFU | 61930 tok/s +step 12936/19560 | loss 3.362454 (+0.94z)| norm 0.2666 (-0.56z)| lr 1.65e-04 | 8458.89 ms | -100.0% bf16 MFU | 61932 tok/s +step 12937/19560 | loss 3.399507 (+2.17z)| norm 0.2684 (-0.42z)| lr 1.65e-04 | 8457.32 ms | -100.0% bf16 MFU | 61935 tok/s +step 12938/19560 | loss 3.301021 (-1.09z)| norm 0.2489 (-1.83z)| lr 1.65e-04 | 8464.45 ms | -100.0% bf16 MFU | 61936 tok/s +step 12939/19560 | loss 3.404562 (+2.28z)| norm 0.2928 (+1.36z)| lr 1.65e-04 | 8464.51 ms | -100.0% bf16 MFU | 61936 tok/s +step 12940/19560 | loss 3.340872 (+0.20z)| norm 0.2676 (-0.47z)| lr 1.65e-04 | 8466.60 ms | -100.0% bf16 MFU | 61935 tok/s +step 12941/19560 | loss 3.283789 (-1.63z)| norm 0.2728 (-0.10z)| lr 1.65e-04 | 8462.89 ms | -100.0% bf16 MFU | 61936 tok/s +step 12942/19560 | loss 3.356286 (+0.71z)| norm 0.2904 (+1.17z)| lr 1.65e-04 | 8463.74 ms | -100.0% bf16 MFU | 61936 tok/s +step 12943/19560 | loss 3.340701 (+0.20z)| norm 0.2541 (-1.50z)| lr 1.65e-04 | 8464.64 ms | -100.0% bf16 MFU | 61937 tok/s +step 12944/19560 | loss 3.302208 (-1.03z)| norm 0.2821 (+0.58z)| lr 1.65e-04 | 8461.71 ms | -100.0% bf16 MFU | 61938 tok/s +step 12945/19560 | loss 3.362250 (+0.89z)| norm 0.2621 (-0.91z)| lr 1.64e-04 | 8459.10 ms | -100.0% bf16 MFU | 61940 tok/s +step 12946/19560 | loss 3.226595 (-3.33z)| norm 0.2730 (-0.11z)| lr 1.64e-04 | 8462.28 ms | -100.0% bf16 MFU | 61941 tok/s +step 12947/19560 | loss 3.316381 (-0.52z)| norm 0.2686 (-0.42z)| lr 1.64e-04 | 8458.58 ms | -100.0% bf16 MFU | 61943 tok/s +step 12948/19560 | loss 3.301312 (-0.98z)| norm 0.2481 (-1.92z)| lr 1.64e-04 | 8459.57 ms | -100.0% bf16 MFU | 61944 tok/s +step 12949/19560 | loss 3.356061 (+0.71z)| norm 0.2619 (-0.89z)| lr 1.64e-04 | 8464.54 ms | -100.0% bf16 MFU | 61944 tok/s +step 12950/19560 | loss 3.392919 (+1.82z)| norm 0.2528 (-1.58z)| lr 1.64e-04 | 8457.03 ms | -100.0% bf16 MFU | 61947 tok/s +step 12951/19560 | loss 3.407959 (+2.22z)| norm 0.2634 (-0.77z)| lr 1.64e-04 | 8453.85 ms | -100.0% bf16 MFU | 61950 tok/s +step 12952/19560 | loss 3.322784 (-0.34z)| norm 0.2630 (-0.79z)| lr 1.64e-04 | 8456.11 ms | -100.0% bf16 MFU | 61953 tok/s +step 12953/19560 | loss 3.350438 (+0.49z)| norm 0.2662 (-0.53z)| lr 1.64e-04 | 8462.05 ms | -100.0% bf16 MFU | 61953 tok/s +step 12954/19560 | loss 3.317271 (-0.50z)| norm 0.2728 (-0.03z)| lr 1.64e-04 | 8459.34 ms | -100.0% bf16 MFU | 61954 tok/s +step 12955/19560 | loss 3.332222 (-0.05z)| norm 0.2536 (-1.47z)| lr 1.64e-04 | 8462.96 ms | -100.0% bf16 MFU | 61954 tok/s +step 12956/19560 | loss 3.310447 (-0.70z)| norm 0.2998 (+1.98z)| lr 1.64e-04 | 8459.13 ms | -100.0% bf16 MFU | 61955 tok/s +step 12957/19560 | loss 3.351398 (+0.54z)| norm 0.2651 (-0.61z)| lr 1.64e-04 | 8458.54 ms | -100.0% bf16 MFU | 61957 tok/s +step 12958/19560 | loss 3.348493 (+0.44z)| norm 0.2730 (-0.02z)| lr 1.64e-04 | 8460.33 ms | -100.0% bf16 MFU | 61957 tok/s +step 12959/19560 | loss 3.297460 (-1.09z)| norm 0.2659 (-0.54z)| lr 1.64e-04 | 8455.78 ms | -100.0% bf16 MFU | 61960 tok/s +step 12960/19560 | loss 3.334783 (+0.03z)| norm 0.2750 (+0.13z)| lr 1.64e-04 | 8462.44 ms | -100.0% bf16 MFU | 61959 tok/s +step 12961/19560 | loss 3.360776 (+0.82z)| norm 0.2809 (+0.60z)| lr 1.64e-04 | 8460.67 ms | -100.0% bf16 MFU | 61960 tok/s +step 12962/19560 | loss 3.310645 (-0.69z)| norm 0.2712 (-0.15z)| lr 1.64e-04 | 8458.38 ms | -100.0% bf16 MFU | 61961 tok/s +step 12963/19560 | loss 3.433995 (+2.96z)| norm 0.3084 (+2.60z)| lr 1.64e-04 | 8463.38 ms | -100.0% bf16 MFU | 61960 tok/s +step 12964/19560 | loss 3.332038 (-0.05z)| norm 0.2703 (-0.22z)| lr 1.64e-04 | 8459.73 ms | -100.0% bf16 MFU | 61961 tok/s +step 12965/19560 | loss 3.329997 (-0.11z)| norm 0.2767 (+0.25z)| lr 1.64e-04 | 8464.68 ms | -100.0% bf16 MFU | 61960 tok/s +step 12966/19560 | loss 3.308825 (-0.73z)| norm 0.2615 (-0.87z)| lr 1.64e-04 | 8457.08 ms | -100.0% bf16 MFU | 61962 tok/s +step 12967/19560 | loss 3.389713 (+1.63z)| norm 0.2709 (-0.16z)| lr 1.63e-04 | 8457.06 ms | -100.0% bf16 MFU | 61963 tok/s +step 12968/19560 | loss 3.328782 (-0.15z)| norm 0.2753 (+0.18z)| lr 1.63e-04 | 8458.05 ms | -100.0% bf16 MFU | 61964 tok/s +step 12969/19560 | loss 3.406469 (+2.07z)| norm 0.2628 (-0.75z)| lr 1.63e-04 | 8456.93 ms | -100.0% bf16 MFU | 61966 tok/s +step 12970/19560 | loss 3.285603 (-1.38z)| norm 0.2803 (+0.55z)| lr 1.63e-04 | 8451.24 ms | -100.0% bf16 MFU | 61970 tok/s +step 12971/19560 | loss 3.276198 (-1.63z)| norm 0.2673 (-0.41z)| lr 1.63e-04 | 8461.30 ms | -100.0% bf16 MFU | 61969 tok/s +step 12972/19560 | loss 3.324646 (-0.26z)| norm 0.2668 (-0.45z)| lr 1.63e-04 | 8459.24 ms | -100.0% bf16 MFU | 61970 tok/s +step 12973/19560 | loss 3.323967 (-0.30z)| norm 0.2764 (+0.28z)| lr 1.63e-04 | 8455.13 ms | -100.0% bf16 MFU | 61972 tok/s +step 12974/19560 | loss 3.227612 (-2.93z)| norm 0.2729 (+0.03z)| lr 1.63e-04 | 8452.39 ms | -100.0% bf16 MFU | 61974 tok/s +step 12975/19560 | loss 3.385782 (+1.43z)| norm 0.2677 (-0.37z)| lr 1.63e-04 | 8458.58 ms | -100.0% bf16 MFU | 61975 tok/s +step 12976/19560 | loss 3.383059 (+1.33z)| norm 0.2661 (-0.48z)| lr 1.63e-04 | 8456.65 ms | -100.0% bf16 MFU | 61976 tok/s +step 12977/19560 | loss 3.388202 (+1.45z)| norm 0.2799 (+0.56z)| lr 1.63e-04 | 8454.15 ms | -100.0% bf16 MFU | 61978 tok/s +step 12978/19560 | loss 3.321713 (-0.37z)| norm 0.2838 (+0.85z)| lr 1.63e-04 | 8455.37 ms | -100.0% bf16 MFU | 61979 tok/s +step 12979/19560 | loss 3.316309 (-0.50z)| norm 0.2718 (-0.07z)| lr 1.63e-04 | 8456.23 ms | -100.0% bf16 MFU | 61980 tok/s +step 12980/19560 | loss 3.393653 (+1.58z)| norm 0.3219 (+3.64z)| lr 1.63e-04 | 8452.67 ms | -100.0% bf16 MFU | 61983 tok/s +step 12981/19560 | loss 3.344597 (+0.24z)| norm 0.2677 (-0.41z)| lr 1.63e-04 | 8450.39 ms | -100.0% bf16 MFU | 61986 tok/s +step 12982/19560 | loss 3.342541 (+0.19z)| norm 0.2868 (+1.01z)| lr 1.63e-04 | 8457.28 ms | -100.0% bf16 MFU | 61986 tok/s +step 12983/19560 | loss 3.314395 (-0.57z)| norm 0.2730 (-0.01z)| lr 1.63e-04 | 8454.76 ms | -100.0% bf16 MFU | 61987 tok/s +step 12984/19560 | loss 3.347931 (+0.35z)| norm 0.3046 (+2.35z)| lr 1.63e-04 | 8456.79 ms | -100.0% bf16 MFU | 61988 tok/s +step 12985/19560 | loss 3.346910 (+0.32z)| norm 0.2935 (+1.49z)| lr 1.63e-04 | 8452.16 ms | -100.0% bf16 MFU | 61990 tok/s +step 12986/19560 | loss 3.349428 (+0.39z)| norm 0.3124 (+2.79z)| lr 1.63e-04 | 8456.70 ms | -100.0% bf16 MFU | 61990 tok/s +step 12987/19560 | loss 3.360677 (+0.69z)| norm 0.2824 (+0.61z)| lr 1.63e-04 | 8455.00 ms | -100.0% bf16 MFU | 61991 tok/s +step 12988/19560 | loss 3.404272 (+1.85z)| norm 0.3038 (+2.11z)| lr 1.63e-04 | 8458.22 ms | -100.0% bf16 MFU | 61991 tok/s +step 12989/19560 | loss 3.296138 (-1.07z)| norm 0.2666 (-0.54z)| lr 1.63e-04 | 8457.88 ms | -100.0% bf16 MFU | 61991 tok/s +step 12990/19560 | loss 3.368069 (+0.86z)| norm 0.2641 (-0.71z)| lr 1.62e-04 | 8456.94 ms | -100.0% bf16 MFU | 61991 tok/s +step 12991/19560 | loss 3.368227 (+0.87z)| norm 0.2539 (-1.43z)| lr 1.62e-04 | 8460.47 ms | -100.0% bf16 MFU | 61990 tok/s +step 12992/19560 | loss 3.355643 (+0.53z)| norm 0.2739 (+0.02z)| lr 1.62e-04 | 8456.39 ms | -100.0% bf16 MFU | 61990 tok/s +step 12993/19560 | loss 3.320551 (-0.42z)| norm 0.2807 (+0.51z)| lr 1.62e-04 | 8450.31 ms | -100.0% bf16 MFU | 61993 tok/s +step 12994/19560 | loss 3.311409 (-0.66z)| norm 0.2632 (-0.75z)| lr 1.62e-04 | 8452.12 ms | -100.0% bf16 MFU | 61995 tok/s +step 12995/19560 | loss 3.327625 (-0.23z)| norm 0.2699 (-0.26z)| lr 1.62e-04 | 8458.87 ms | -100.0% bf16 MFU | 61994 tok/s +step 12996/19560 | loss 3.311385 (-0.67z)| norm 0.2946 (+1.50z)| lr 1.62e-04 | 8457.67 ms | -100.0% bf16 MFU | 61994 tok/s +step 12997/19560 | loss 3.322345 (-0.36z)| norm 0.2597 (-0.98z)| lr 1.62e-04 | 8452.87 ms | -100.0% bf16 MFU | 61995 tok/s +step 12998/19560 | loss 3.265908 (-1.86z)| norm 0.2942 (+1.46z)| lr 1.62e-04 | 8452.95 ms | -100.0% bf16 MFU | 61997 tok/s +step 12999/19560 | loss 3.373828 (+1.03z)| norm 0.2687 (-0.34z)| lr 1.62e-04 | 8456.39 ms | -100.0% bf16 MFU | 61997 tok/s +step 13000/19560 | loss 3.312492 (-0.62z)| norm 0.2852 (+0.87z)| lr 1.62e-04 | 8452.59 ms | -100.0% bf16 MFU | 61999 tok/s +val loss 3.323775 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2935/10042 = 0.292272 +step 13001/19560 | loss 3.310769 (-0.67z)| norm 0.2722 (-0.07z)| lr 1.62e-04 | 8447.82 ms | -100.0% bf16 MFU | 62002 tok/s +step 13002/19560 | loss 3.332067 (-0.09z)| norm 0.2739 (+0.05z)| lr 1.62e-04 | 8453.63 ms | -100.0% bf16 MFU | 62003 tok/s +step 13003/19560 | loss 3.315927 (-0.53z)| norm 0.2841 (+0.79z)| lr 1.62e-04 | 8450.78 ms | -100.0% bf16 MFU | 62004 tok/s +step 13004/19560 | loss 3.342295 (+0.18z)| norm 0.2952 (+1.57z)| lr 1.62e-04 | 8457.03 ms | -100.0% bf16 MFU | 62004 tok/s +step 13005/19560 | loss 3.411399 (+2.00z)| norm 0.2862 (+0.91z)| lr 1.62e-04 | 8454.23 ms | -100.0% bf16 MFU | 62004 tok/s +step 13006/19560 | loss 3.384100 (+1.26z)| norm 0.2733 (-0.02z)| lr 1.62e-04 | 8455.67 ms | -100.0% bf16 MFU | 62004 tok/s +step 13007/19560 | loss 3.331463 (-0.14z)| norm 0.2851 (+0.82z)| lr 1.62e-04 | 8454.01 ms | -100.0% bf16 MFU | 62005 tok/s +step 13008/19560 | loss 3.292966 (-1.16z)| norm 0.2697 (-0.28z)| lr 1.62e-04 | 8453.71 ms | -100.0% bf16 MFU | 62006 tok/s +step 13009/19560 | loss 3.336624 (-0.01z)| norm 0.2764 (+0.22z)| lr 1.62e-04 | 8452.22 ms | -100.0% bf16 MFU | 62007 tok/s +step 13010/19560 | loss 3.314812 (-0.59z)| norm 0.2871 (+1.01z)| lr 1.62e-04 | 8449.99 ms | -100.0% bf16 MFU | 62009 tok/s +step 13011/19560 | loss 3.307144 (-0.78z)| norm 0.2710 (-0.17z)| lr 1.62e-04 | 8456.17 ms | -100.0% bf16 MFU | 62008 tok/s +step 13012/19560 | loss 3.296285 (-1.05z)| norm 0.2738 (+0.04z)| lr 1.61e-04 | 8453.96 ms | -100.0% bf16 MFU | 62009 tok/s +step 13013/19560 | loss 3.275503 (-1.57z)| norm 0.2894 (+1.17z)| lr 1.61e-04 | 8437.32 ms | -100.0% bf16 MFU | 62015 tok/s +step 13014/19560 | loss 3.320249 (-0.40z)| norm 0.2773 (+0.29z)| lr 1.61e-04 | 8440.12 ms | -100.0% bf16 MFU | 62021 tok/s +step 13015/19560 | loss 3.325773 (-0.24z)| norm 0.2711 (-0.16z)| lr 1.61e-04 | 8437.49 ms | -100.0% bf16 MFU | 62026 tok/s +step 13016/19560 | loss 3.325776 (-0.24z)| norm 0.2625 (-0.78z)| lr 1.61e-04 | 8429.81 ms | -100.0% bf16 MFU | 62035 tok/s +step 13017/19560 | loss 3.313601 (-0.56z)| norm 0.2739 (+0.05z)| lr 1.61e-04 | 8435.80 ms | -100.0% bf16 MFU | 62041 tok/s +step 13018/19560 | loss 3.346344 (+0.31z)| norm 0.2784 (+0.37z)| lr 1.61e-04 | 8439.15 ms | -100.0% bf16 MFU | 62045 tok/s +step 13019/19560 | loss 3.341239 (+0.18z)| norm 0.2949 (+1.56z)| lr 1.61e-04 | 8433.85 ms | -100.0% bf16 MFU | 62051 tok/s +step 13020/19560 | loss 3.346260 (+0.32z)| norm 0.2698 (-0.26z)| lr 1.61e-04 | 8433.06 ms | -100.0% bf16 MFU | 62057 tok/s +step 13021/19560 | loss 3.395140 (+1.58z)| norm 0.2652 (-0.59z)| lr 1.61e-04 | 8434.47 ms | -100.0% bf16 MFU | 62062 tok/s +step 13022/19560 | loss 3.333543 (-0.03z)| norm 0.2946 (+1.56z)| lr 1.61e-04 | 8431.70 ms | -100.0% bf16 MFU | 62068 tok/s +step 13023/19560 | loss 3.335395 (+0.02z)| norm 0.2884 (+1.10z)| lr 1.61e-04 | 8438.32 ms | -100.0% bf16 MFU | 62071 tok/s +step 13024/19560 | loss 3.296161 (-1.00z)| norm 0.2749 (+0.11z)| lr 1.61e-04 | 8435.75 ms | -100.0% bf16 MFU | 62075 tok/s +step 13025/19560 | loss 3.342996 (+0.23z)| norm 0.2774 (+0.29z)| lr 1.61e-04 | 8439.84 ms | -100.0% bf16 MFU | 62077 tok/s +step 13026/19560 | loss 3.373967 (+1.03z)| norm 0.2920 (+1.35z)| lr 1.61e-04 | 8433.45 ms | -100.0% bf16 MFU | 62082 tok/s +step 13027/19560 | loss 3.291293 (-1.14z)| norm 0.2873 (+1.00z)| lr 1.61e-04 | 8436.01 ms | -100.0% bf16 MFU | 62085 tok/s +step 13028/19560 | loss 3.349179 (+0.37z)| norm 0.2811 (+0.54z)| lr 1.61e-04 | 8438.09 ms | -100.0% bf16 MFU | 62088 tok/s +step 13029/19560 | loss 3.274210 (-1.59z)| norm 0.2555 (-1.32z)| lr 1.61e-04 | 8434.51 ms | -100.0% bf16 MFU | 62091 tok/s +step 13030/19560 | loss 3.322573 (-0.31z)| norm 0.2628 (-0.78z)| lr 1.61e-04 | 8438.01 ms | -100.0% bf16 MFU | 62093 tok/s +step 13031/19560 | loss 3.373532 (+1.03z)| norm 0.2601 (-0.96z)| lr 1.61e-04 | 8436.17 ms | -100.0% bf16 MFU | 62096 tok/s +step 13032/19560 | loss 3.320473 (-0.38z)| norm 0.2725 (-0.07z)| lr 1.61e-04 | 8434.71 ms | -100.0% bf16 MFU | 62099 tok/s +step 13033/19560 | loss 3.385866 (+1.34z)| norm 0.2659 (-0.55z)| lr 1.61e-04 | 8437.79 ms | -100.0% bf16 MFU | 62101 tok/s +step 13034/19560 | loss 3.280923 (-1.40z)| norm 0.2621 (-0.82z)| lr 1.61e-04 | 8436.11 ms | -100.0% bf16 MFU | 62103 tok/s +step 13035/19560 | loss 3.294296 (-1.04z)| norm 0.2487 (-1.76z)| lr 1.60e-04 | 8435.93 ms | -100.0% bf16 MFU | 62106 tok/s +step 13036/19560 | loss 3.334843 (+0.02z)| norm 0.2639 (-0.66z)| lr 1.60e-04 | 8440.95 ms | -100.0% bf16 MFU | 62106 tok/s +step 13037/19560 | loss 3.314491 (-0.50z)| norm 0.2770 (+0.28z)| lr 1.60e-04 | 8438.73 ms | -100.0% bf16 MFU | 62107 tok/s +step 13038/19560 | loss 3.360269 (+0.71z)| norm 0.2538 (-1.38z)| lr 1.60e-04 | 8442.25 ms | -100.0% bf16 MFU | 62107 tok/s +step 13039/19560 | loss 3.322871 (-0.27z)| norm 0.2929 (+1.42z)| lr 1.60e-04 | 8437.60 ms | -100.0% bf16 MFU | 62108 tok/s +step 13040/19560 | loss 3.352664 (+0.50z)| norm 0.2478 (-1.79z)| lr 1.60e-04 | 8437.97 ms | -100.0% bf16 MFU | 62110 tok/s +step 13041/19560 | loss 3.295985 (-0.99z)| norm 0.2654 (-0.56z)| lr 1.60e-04 | 8440.59 ms | -100.0% bf16 MFU | 62110 tok/s +step 13042/19560 | loss 3.309315 (-0.64z)| norm 0.2718 (-0.10z)| lr 1.60e-04 | 8442.75 ms | -100.0% bf16 MFU | 62109 tok/s +step 13043/19560 | loss 3.295447 (-0.99z)| norm 0.2575 (-1.13z)| lr 1.60e-04 | 8443.11 ms | -100.0% bf16 MFU | 62109 tok/s +step 13044/19560 | loss 3.289537 (-1.20z)| norm 0.2519 (-1.51z)| lr 1.60e-04 | 8440.75 ms | -100.0% bf16 MFU | 62109 tok/s +step 13045/19560 | loss 3.342211 (+0.23z)| norm 0.2734 (+0.02z)| lr 1.60e-04 | 8445.42 ms | -100.0% bf16 MFU | 62108 tok/s +step 13046/19560 | loss 3.307062 (-0.72z)| norm 0.2514 (-1.55z)| lr 1.60e-04 | 8441.74 ms | -100.0% bf16 MFU | 62108 tok/s +step 13047/19560 | loss 3.427006 (+2.45z)| norm 0.2735 (+0.04z)| lr 1.60e-04 | 8447.01 ms | -100.0% bf16 MFU | 62106 tok/s +step 13048/19560 | loss 3.284783 (-1.29z)| norm 0.2652 (-0.56z)| lr 1.60e-04 | 8451.03 ms | -100.0% bf16 MFU | 62102 tok/s +step 13049/19560 | loss 3.315661 (-0.48z)| norm 0.2720 (-0.08z)| lr 1.60e-04 | 8445.08 ms | -100.0% bf16 MFU | 62101 tok/s +step 13050/19560 | loss 3.335263 (+0.04z)| norm 0.2742 (+0.07z)| lr 1.60e-04 | 8443.26 ms | -100.0% bf16 MFU | 62101 tok/s +step 13051/19560 | loss 3.284789 (-1.34z)| norm 0.2583 (-1.06z)| lr 1.60e-04 | 8445.29 ms | -100.0% bf16 MFU | 62100 tok/s +step 13052/19560 | loss 3.321343 (-0.35z)| norm 0.2694 (-0.28z)| lr 1.60e-04 | 8446.43 ms | -100.0% bf16 MFU | 62099 tok/s +step 13053/19560 | loss 3.291861 (-1.13z)| norm 0.2721 (-0.09z)| lr 1.60e-04 | 8444.12 ms | -100.0% bf16 MFU | 62098 tok/s +step 13054/19560 | loss 3.309695 (-0.65z)| norm 0.2470 (-1.86z)| lr 1.60e-04 | 8443.58 ms | -100.0% bf16 MFU | 62098 tok/s +step 13055/19560 | loss 3.286414 (-1.26z)| norm 0.2682 (-0.34z)| lr 1.60e-04 | 8446.29 ms | -100.0% bf16 MFU | 62097 tok/s +step 13056/19560 | loss 3.324860 (-0.22z)| norm 0.2557 (-1.21z)| lr 1.60e-04 | 8446.56 ms | -100.0% bf16 MFU | 62095 tok/s +step 13057/19560 | loss 3.297921 (-0.93z)| norm 0.2689 (-0.28z)| lr 1.60e-04 | 8444.46 ms | -100.0% bf16 MFU | 62095 tok/s +step 13058/19560 | loss 3.358022 (+0.67z)| norm 0.2672 (-0.40z)| lr 1.59e-04 | 8444.05 ms | -100.0% bf16 MFU | 62095 tok/s +step 13059/19560 | loss 3.370749 (+0.99z)| norm 0.2798 (+0.49z)| lr 1.59e-04 | 8442.57 ms | -100.0% bf16 MFU | 62095 tok/s +step 13060/19560 | loss 3.326000 (-0.19z)| norm 0.2675 (-0.38z)| lr 1.59e-04 | 8446.92 ms | -100.0% bf16 MFU | 62094 tok/s +step 13061/19560 | loss 3.327138 (-0.15z)| norm 0.2565 (-1.17z)| lr 1.59e-04 | 8443.65 ms | -100.0% bf16 MFU | 62094 tok/s +step 13062/19560 | loss 3.398136 (+1.70z)| norm 0.2882 (+1.09z)| lr 1.59e-04 | 8446.50 ms | -100.0% bf16 MFU | 62092 tok/s +step 13063/19560 | loss 3.388573 (+1.44z)| norm 0.2792 (+0.43z)| lr 1.59e-04 | 8448.40 ms | -100.0% bf16 MFU | 62091 tok/s +step 13064/19560 | loss 3.349505 (+0.42z)| norm 0.2739 (+0.05z)| lr 1.59e-04 | 8446.64 ms | -100.0% bf16 MFU | 62090 tok/s +step 13065/19560 | loss 3.379529 (+1.22z)| norm 0.2882 (+1.07z)| lr 1.59e-04 | 8445.03 ms | -100.0% bf16 MFU | 62089 tok/s +step 13066/19560 | loss 3.381644 (+1.25z)| norm 0.2863 (+0.92z)| lr 1.59e-04 | 8445.56 ms | -100.0% bf16 MFU | 62089 tok/s +step 13067/19560 | loss 3.275053 (-1.53z)| norm 0.2794 (+0.43z)| lr 1.59e-04 | 8441.33 ms | -100.0% bf16 MFU | 62090 tok/s +step 13068/19560 | loss 3.363660 (+0.80z)| norm 0.2788 (+0.38z)| lr 1.59e-04 | 8445.06 ms | -100.0% bf16 MFU | 62089 tok/s +step 13069/19560 | loss 3.333283 (-0.01z)| norm 0.2632 (-0.75z)| lr 1.59e-04 | 8444.87 ms | -100.0% bf16 MFU | 62089 tok/s +step 13070/19560 | loss 3.336192 (+0.07z)| norm 0.2832 (+0.71z)| lr 1.59e-04 | 8447.10 ms | -100.0% bf16 MFU | 62088 tok/s +step 13071/19560 | loss 3.321347 (-0.32z)| norm 0.2867 (+0.96z)| lr 1.59e-04 | 8444.41 ms | -100.0% bf16 MFU | 62088 tok/s +step 13072/19560 | loss 3.324920 (-0.23z)| norm 0.2712 (-0.18z)| lr 1.59e-04 | 8446.09 ms | -100.0% bf16 MFU | 62087 tok/s +step 13073/19560 | loss 3.374768 (+1.10z)| norm 0.2740 (+0.02z)| lr 1.59e-04 | 8465.35 ms | -100.0% bf16 MFU | 62080 tok/s +step 13074/19560 | loss 3.388000 (+1.45z)| norm 0.2744 (+0.05z)| lr 1.59e-04 | 8469.68 ms | -100.0% bf16 MFU | 62071 tok/s +step 13075/19560 | loss 3.309022 (-0.70z)| norm 0.2567 (-1.25z)| lr 1.59e-04 | 8471.26 ms | -100.0% bf16 MFU | 62062 tok/s +step 13076/19560 | loss 3.400676 (+1.76z)| norm 0.2781 (+0.31z)| lr 1.59e-04 | 8473.13 ms | -100.0% bf16 MFU | 62052 tok/s +step 13077/19560 | loss 3.371375 (+0.96z)| norm 0.2647 (-0.69z)| lr 1.59e-04 | 8469.26 ms | -100.0% bf16 MFU | 62045 tok/s +step 13078/19560 | loss 3.298604 (-0.98z)| norm 0.2806 (+0.48z)| lr 1.59e-04 | 8471.58 ms | -100.0% bf16 MFU | 62037 tok/s +step 13079/19560 | loss 3.367978 (+0.91z)| norm 0.2704 (-0.28z)| lr 1.59e-04 | 8471.28 ms | -100.0% bf16 MFU | 62030 tok/s +step 13080/19560 | loss 3.312061 (-0.61z)| norm 0.2688 (-0.41z)| lr 1.58e-04 | 8466.70 ms | -100.0% bf16 MFU | 62025 tok/s +step 13081/19560 | loss 3.329235 (-0.14z)| norm 0.2781 (+0.28z)| lr 1.58e-04 | 8467.48 ms | -100.0% bf16 MFU | 62019 tok/s +step 13082/19560 | loss 3.326906 (-0.21z)| norm 0.2676 (-0.51z)| lr 1.58e-04 | 8465.06 ms | -100.0% bf16 MFU | 62015 tok/s +step 13083/19560 | loss 3.356773 (+0.61z)| norm 0.2728 (-0.12z)| lr 1.58e-04 | 8466.03 ms | -100.0% bf16 MFU | 62011 tok/s +step 13084/19560 | loss 3.287316 (-1.28z)| norm 0.2612 (-1.00z)| lr 1.58e-04 | 8464.39 ms | -100.0% bf16 MFU | 62007 tok/s +step 13085/19560 | loss 3.299867 (-0.93z)| norm 0.2920 (+1.35z)| lr 1.58e-04 | 8464.42 ms | -100.0% bf16 MFU | 62004 tok/s +step 13086/19560 | loss 3.398915 (+1.73z)| norm 0.2825 (+0.62z)| lr 1.58e-04 | 8464.26 ms | -100.0% bf16 MFU | 62001 tok/s +step 13087/19560 | loss 3.404810 (+1.85z)| norm 0.2851 (+0.80z)| lr 1.58e-04 | 8461.86 ms | -100.0% bf16 MFU | 61999 tok/s +step 13088/19560 | loss 3.322921 (-0.32z)| norm 0.2882 (+1.03z)| lr 1.58e-04 | 8461.78 ms | -100.0% bf16 MFU | 61997 tok/s +step 13089/19560 | loss 3.350352 (+0.41z)| norm 0.2860 (+0.86z)| lr 1.58e-04 | 8466.59 ms | -100.0% bf16 MFU | 61993 tok/s +step 13090/19560 | loss 3.328195 (-0.19z)| norm 0.2652 (-0.72z)| lr 1.58e-04 | 8469.85 ms | -100.0% bf16 MFU | 61988 tok/s +step 13091/19560 | loss 3.316255 (-0.49z)| norm 0.2670 (-0.57z)| lr 1.58e-04 | 8464.57 ms | -100.0% bf16 MFU | 61986 tok/s +step 13092/19560 | loss 3.397607 (+1.70z)| norm 0.2921 (+1.36z)| lr 1.58e-04 | 8464.80 ms | -100.0% bf16 MFU | 61984 tok/s +step 13093/19560 | loss 3.312446 (-0.60z)| norm 0.2855 (+0.84z)| lr 1.58e-04 | 8466.80 ms | -100.0% bf16 MFU | 61980 tok/s +step 13094/19560 | loss 3.394382 (+1.58z)| norm 0.2757 (+0.07z)| lr 1.58e-04 | 8461.63 ms | -100.0% bf16 MFU | 61980 tok/s +step 13095/19560 | loss 3.358638 (+0.64z)| norm 0.2790 (+0.33z)| lr 1.58e-04 | 8464.05 ms | -100.0% bf16 MFU | 61978 tok/s +step 13096/19560 | loss 3.353983 (+0.50z)| norm 0.2655 (-0.71z)| lr 1.58e-04 | 8462.85 ms | -100.0% bf16 MFU | 61976 tok/s +step 13097/19560 | loss 3.306056 (-0.78z)| norm 0.2645 (-0.79z)| lr 1.58e-04 | 8463.79 ms | -100.0% bf16 MFU | 61975 tok/s +step 13098/19560 | loss 3.249598 (-2.28z)| norm 0.2762 (+0.11z)| lr 1.58e-04 | 8464.24 ms | -100.0% bf16 MFU | 61973 tok/s +step 13099/19560 | loss 3.406837 (+1.92z)| norm 0.2821 (+0.56z)| lr 1.58e-04 | 8462.41 ms | -100.0% bf16 MFU | 61972 tok/s +step 13100/19560 | loss 3.307280 (-0.74z)| norm 0.2557 (-1.46z)| lr 1.58e-04 | 8461.60 ms | -100.0% bf16 MFU | 61972 tok/s +step 13101/19560 | loss 3.393027 (+1.52z)| norm 0.2695 (-0.40z)| lr 1.58e-04 | 8455.57 ms | -100.0% bf16 MFU | 61973 tok/s +step 13102/19560 | loss 3.274788 (-1.66z)| norm 0.2572 (-1.32z)| lr 1.58e-04 | 8461.43 ms | -100.0% bf16 MFU | 61973 tok/s +step 13103/19560 | loss 3.339467 (+0.10z)| norm 0.2631 (-0.87z)| lr 1.57e-04 | 8460.28 ms | -100.0% bf16 MFU | 61973 tok/s +step 13104/19560 | loss 3.305470 (-0.81z)| norm 0.2817 (+0.54z)| lr 1.57e-04 | 8460.52 ms | -100.0% bf16 MFU | 61972 tok/s +step 13105/19560 | loss 3.362279 (+0.76z)| norm 0.2616 (-0.98z)| lr 1.57e-04 | 8457.05 ms | -100.0% bf16 MFU | 61974 tok/s +step 13106/19560 | loss 3.318657 (-0.45z)| norm 0.2891 (+1.11z)| lr 1.57e-04 | 8465.93 ms | -100.0% bf16 MFU | 61971 tok/s +step 13107/19560 | loss 3.372061 (+1.01z)| norm 0.2636 (-0.82z)| lr 1.57e-04 | 8459.97 ms | -100.0% bf16 MFU | 61971 tok/s +step 13108/19560 | loss 3.339858 (+0.14z)| norm 0.2707 (-0.27z)| lr 1.57e-04 | 8450.30 ms | -100.0% bf16 MFU | 61975 tok/s +step 13109/19560 | loss 3.388222 (+1.46z)| norm 0.2698 (-0.34z)| lr 1.57e-04 | 8450.46 ms | -100.0% bf16 MFU | 61978 tok/s +step 13110/19560 | loss 3.349918 (+0.40z)| norm 0.2618 (-0.97z)| lr 1.57e-04 | 8459.12 ms | -100.0% bf16 MFU | 61978 tok/s +step 13111/19560 | loss 3.367371 (+0.87z)| norm 0.2853 (+0.90z)| lr 1.57e-04 | 8460.25 ms | -100.0% bf16 MFU | 61978 tok/s +step 13112/19560 | loss 3.320820 (-0.40z)| norm 0.2692 (-0.36z)| lr 1.57e-04 | 8453.96 ms | -100.0% bf16 MFU | 61980 tok/s +step 13113/19560 | loss 3.352189 (+0.46z)| norm 0.2569 (-1.35z)| lr 1.57e-04 | 8461.66 ms | -100.0% bf16 MFU | 61979 tok/s +step 13114/19560 | loss 3.364160 (+0.78z)| norm 0.2662 (-0.58z)| lr 1.57e-04 | 8456.34 ms | -100.0% bf16 MFU | 61980 tok/s +step 13115/19560 | loss 3.326277 (-0.25z)| norm 0.2528 (-1.69z)| lr 1.57e-04 | 8457.89 ms | -100.0% bf16 MFU | 61980 tok/s +step 13116/19560 | loss 3.359663 (+0.69z)| norm 0.2925 (+1.69z)| lr 1.57e-04 | 8456.74 ms | -100.0% bf16 MFU | 61981 tok/s +step 13117/19560 | loss 3.339482 (+0.12z)| norm 0.2623 (-0.89z)| lr 1.57e-04 | 8465.68 ms | -100.0% bf16 MFU | 61979 tok/s +step 13118/19560 | loss 3.340899 (+0.16z)| norm 0.2632 (-0.81z)| lr 1.57e-04 | 8449.14 ms | -100.0% bf16 MFU | 61982 tok/s +step 13119/19560 | loss 3.330064 (-0.13z)| norm 0.2920 (+1.63z)| lr 1.57e-04 | 8459.85 ms | -100.0% bf16 MFU | 61982 tok/s +step 13120/19560 | loss 3.366635 (+0.89z)| norm 0.2441 (-2.39z)| lr 1.57e-04 | 8456.63 ms | -100.0% bf16 MFU | 61983 tok/s +step 13121/19560 | loss 3.313303 (-0.60z)| norm 0.2668 (-0.49z)| lr 1.57e-04 | 8455.02 ms | -100.0% bf16 MFU | 61984 tok/s +step 13122/19560 | loss 3.335917 (+0.03z)| norm 0.2600 (-1.06z)| lr 1.57e-04 | 8453.88 ms | -100.0% bf16 MFU | 61986 tok/s +step 13123/19560 | loss 3.347446 (+0.34z)| norm 0.2564 (-1.34z)| lr 1.57e-04 | 8455.31 ms | -100.0% bf16 MFU | 61987 tok/s +step 13124/19560 | loss 3.375278 (+1.11z)| norm 0.2742 (+0.16z)| lr 1.57e-04 | 8452.45 ms | -100.0% bf16 MFU | 61989 tok/s +step 13125/19560 | loss 3.306397 (-0.81z)| norm 0.2565 (-1.33z)| lr 1.57e-04 | 8456.93 ms | -100.0% bf16 MFU | 61989 tok/s +step 13126/19560 | loss 3.327333 (-0.25z)| norm 0.2663 (-0.50z)| lr 1.56e-04 | 8458.92 ms | -100.0% bf16 MFU | 61989 tok/s +step 13127/19560 | loss 3.307336 (-0.80z)| norm 0.2468 (-2.09z)| lr 1.56e-04 | 8457.15 ms | -100.0% bf16 MFU | 61989 tok/s +step 13128/19560 | loss 3.326459 (-0.26z)| norm 0.2911 (+1.59z)| lr 1.56e-04 | 8450.00 ms | -100.0% bf16 MFU | 61992 tok/s +step 13129/19560 | loss 3.333873 (-0.06z)| norm 0.2736 (+0.14z)| lr 1.56e-04 | 8450.37 ms | -100.0% bf16 MFU | 61994 tok/s +step 13130/19560 | loss 3.389388 (+1.50z)| norm 0.2803 (+0.68z)| lr 1.56e-04 | 8456.87 ms | -100.0% bf16 MFU | 61994 tok/s +step 13131/19560 | loss 3.326285 (-0.28z)| norm 0.2795 (+0.62z)| lr 1.56e-04 | 8457.04 ms | -100.0% bf16 MFU | 61994 tok/s +step 13132/19560 | loss 3.393088 (+1.58z)| norm 0.2835 (+0.97z)| lr 1.56e-04 | 8452.24 ms | -100.0% bf16 MFU | 61996 tok/s +step 13133/19560 | loss 3.299076 (-1.04z)| norm 0.2642 (-0.63z)| lr 1.56e-04 | 8454.45 ms | -100.0% bf16 MFU | 61997 tok/s +step 13134/19560 | loss 3.329695 (-0.16z)| norm 0.2773 (+0.46z)| lr 1.56e-04 | 8453.49 ms | -100.0% bf16 MFU | 61998 tok/s +step 13135/19560 | loss 3.315890 (-0.55z)| norm 0.2732 (+0.12z)| lr 1.56e-04 | 8447.76 ms | -100.0% bf16 MFU | 62001 tok/s +step 13136/19560 | loss 3.379475 (+1.24z)| norm 0.2543 (-1.45z)| lr 1.56e-04 | 8453.31 ms | -100.0% bf16 MFU | 62002 tok/s +step 13137/19560 | loss 3.377504 (+1.17z)| norm 0.2892 (+1.46z)| lr 1.56e-04 | 8453.54 ms | -100.0% bf16 MFU | 62003 tok/s +step 13138/19560 | loss 3.300850 (-1.00z)| norm 0.2708 (-0.06z)| lr 1.56e-04 | 8457.68 ms | -100.0% bf16 MFU | 62003 tok/s +step 13139/19560 | loss 3.314230 (-0.62z)| norm 0.2741 (+0.21z)| lr 1.56e-04 | 8451.74 ms | -100.0% bf16 MFU | 62004 tok/s +step 13140/19560 | loss 3.281446 (-1.54z)| norm 0.2787 (+0.59z)| lr 1.56e-04 | 8450.16 ms | -100.0% bf16 MFU | 62006 tok/s +step 13141/19560 | loss 3.393292 (+1.59z)| norm 0.2788 (+0.62z)| lr 1.56e-04 | 8444.80 ms | -100.0% bf16 MFU | 62010 tok/s +step 13142/19560 | loss 3.296282 (-1.14z)| norm 0.2908 (+1.60z)| lr 1.56e-04 | 8443.47 ms | -100.0% bf16 MFU | 62014 tok/s +step 13143/19560 | loss 3.354510 (+0.49z)| norm 0.2764 (+0.39z)| lr 1.56e-04 | 8447.99 ms | -100.0% bf16 MFU | 62017 tok/s +step 13144/19560 | loss 3.328703 (-0.24z)| norm 0.2835 (+0.97z)| lr 1.56e-04 | 8443.05 ms | -100.0% bf16 MFU | 62021 tok/s +step 13145/19560 | loss 3.342807 (+0.15z)| norm 0.2512 (-1.68z)| lr 1.56e-04 | 8448.75 ms | -100.0% bf16 MFU | 62022 tok/s +step 13146/19560 | loss 3.277441 (-1.66z)| norm 0.2898 (+1.48z)| lr 1.56e-04 | 8442.83 ms | -100.0% bf16 MFU | 62026 tok/s +step 13147/19560 | loss 3.366414 (+0.82z)| norm 0.2629 (-0.72z)| lr 1.56e-04 | 8447.71 ms | -100.0% bf16 MFU | 62028 tok/s +step 13148/19560 | loss 3.304401 (-0.89z)| norm 0.2918 (+1.65z)| lr 1.56e-04 | 8446.24 ms | -100.0% bf16 MFU | 62030 tok/s +step 13149/19560 | loss 3.373990 (+1.05z)| norm 0.2668 (-0.40z)| lr 1.55e-04 | 8443.41 ms | -100.0% bf16 MFU | 62033 tok/s +step 13150/19560 | loss 3.369987 (+0.92z)| norm 0.2740 (+0.21z)| lr 1.55e-04 | 8449.35 ms | -100.0% bf16 MFU | 62034 tok/s +step 13151/19560 | loss 3.322738 (-0.39z)| norm 0.2865 (+1.25z)| lr 1.55e-04 | 8448.39 ms | -100.0% bf16 MFU | 62035 tok/s +step 13152/19560 | loss 3.332125 (-0.14z)| norm 0.3016 (+2.43z)| lr 1.55e-04 | 8447.93 ms | -100.0% bf16 MFU | 62037 tok/s +step 13153/19560 | loss 3.366522 (+0.82z)| norm 0.2875 (+1.27z)| lr 1.55e-04 | 8445.40 ms | -100.0% bf16 MFU | 62039 tok/s +step 13154/19560 | loss 3.363585 (+0.74z)| norm 0.2922 (+1.65z)| lr 1.55e-04 | 8443.83 ms | -100.0% bf16 MFU | 62042 tok/s +step 13155/19560 | loss 3.364664 (+0.76z)| norm 0.2756 (+0.32z)| lr 1.55e-04 | 8444.31 ms | -100.0% bf16 MFU | 62044 tok/s +step 13156/19560 | loss 3.348234 (+0.30z)| norm 0.2859 (+1.15z)| lr 1.55e-04 | 8451.35 ms | -100.0% bf16 MFU | 62043 tok/s +step 13157/19560 | loss 3.311914 (-0.74z)| norm 0.2653 (-0.53z)| lr 1.55e-04 | 8448.94 ms | -100.0% bf16 MFU | 62044 tok/s +step 13158/19560 | loss 3.331374 (-0.19z)| norm 0.2915 (+1.58z)| lr 1.55e-04 | 8444.28 ms | -100.0% bf16 MFU | 62046 tok/s +step 13159/19560 | loss 3.301763 (-1.01z)| norm 0.2821 (+0.80z)| lr 1.55e-04 | 8451.48 ms | -100.0% bf16 MFU | 62046 tok/s +step 13160/19560 | loss 3.348480 (+0.31z)| norm 0.2919 (+1.57z)| lr 1.55e-04 | 8449.12 ms | -100.0% bf16 MFU | 62046 tok/s +step 13161/19560 | loss 3.319001 (-0.52z)| norm 0.2711 (-0.10z)| lr 1.55e-04 | 8444.84 ms | -100.0% bf16 MFU | 62048 tok/s +step 13162/19560 | loss 3.365541 (+0.80z)| norm 0.2981 (+2.02z)| lr 1.55e-04 | 8447.62 ms | -100.0% bf16 MFU | 62049 tok/s +step 13163/19560 | loss 3.398974 (+1.73z)| norm 0.2754 (+0.21z)| lr 1.55e-04 | 8449.98 ms | -100.0% bf16 MFU | 62048 tok/s +step 13164/19560 | loss 3.351992 (+0.38z)| norm 0.2765 (+0.28z)| lr 1.55e-04 | 8449.94 ms | -100.0% bf16 MFU | 62048 tok/s +step 13165/19560 | loss 3.313963 (-0.71z)| norm 0.2905 (+1.39z)| lr 1.55e-04 | 8451.11 ms | -100.0% bf16 MFU | 62048 tok/s +step 13166/19560 | loss 3.322410 (-0.46z)| norm 0.2922 (+1.51z)| lr 1.55e-04 | 8447.20 ms | -100.0% bf16 MFU | 62049 tok/s +step 13167/19560 | loss 3.364357 (+0.73z)| norm 0.3010 (+2.18z)| lr 1.55e-04 | 8446.38 ms | -100.0% bf16 MFU | 62050 tok/s +step 13168/19560 | loss 3.370116 (+0.89z)| norm 0.2816 (+0.63z)| lr 1.55e-04 | 8448.34 ms | -100.0% bf16 MFU | 62050 tok/s +step 13169/19560 | loss 3.372872 (+0.95z)| norm 0.2912 (+1.38z)| lr 1.55e-04 | 8446.95 ms | -100.0% bf16 MFU | 62051 tok/s +step 13170/19560 | loss 3.374517 (+0.98z)| norm 0.2708 (-0.25z)| lr 1.55e-04 | 8450.55 ms | -100.0% bf16 MFU | 62051 tok/s +step 13171/19560 | loss 3.293208 (-1.33z)| norm 0.2753 (+0.10z)| lr 1.54e-04 | 8449.49 ms | -100.0% bf16 MFU | 62051 tok/s +step 13172/19560 | loss 3.301261 (-1.11z)| norm 0.2807 (+0.52z)| lr 1.54e-04 | 8447.29 ms | -100.0% bf16 MFU | 62051 tok/s +step 13173/19560 | loss 3.335689 (-0.12z)| norm 0.2573 (-1.36z)| lr 1.54e-04 | 8445.40 ms | -100.0% bf16 MFU | 62053 tok/s +step 13174/19560 | loss 3.354653 (+0.41z)| norm 0.2861 (+0.95z)| lr 1.54e-04 | 8450.21 ms | -100.0% bf16 MFU | 62052 tok/s +step 13175/19560 | loss 3.346110 (+0.19z)| norm 0.2608 (-1.10z)| lr 1.54e-04 | 8447.24 ms | -100.0% bf16 MFU | 62053 tok/s +step 13176/19560 | loss 3.344335 (+0.12z)| norm 0.2753 (+0.08z)| lr 1.54e-04 | 8448.84 ms | -100.0% bf16 MFU | 62053 tok/s +step 13177/19560 | loss 3.352446 (+0.35z)| norm 0.2734 (-0.08z)| lr 1.54e-04 | 8451.93 ms | -100.0% bf16 MFU | 62052 tok/s +step 13178/19560 | loss 3.330719 (-0.29z)| norm 0.2884 (+1.12z)| lr 1.54e-04 | 8450.19 ms | -100.0% bf16 MFU | 62052 tok/s +step 13179/19560 | loss 3.302967 (-1.13z)| norm 0.2706 (-0.33z)| lr 1.54e-04 | 8446.50 ms | -100.0% bf16 MFU | 62053 tok/s +step 13180/19560 | loss 3.316870 (-0.71z)| norm 0.2655 (-0.74z)| lr 1.54e-04 | 8450.59 ms | -100.0% bf16 MFU | 62052 tok/s +step 13181/19560 | loss 3.397052 (+1.66z)| norm 0.2725 (-0.17z)| lr 1.54e-04 | 8441.65 ms | -100.0% bf16 MFU | 62055 tok/s +step 13182/19560 | loss 3.363091 (+0.64z)| norm 0.2656 (-0.76z)| lr 1.54e-04 | 8450.63 ms | -100.0% bf16 MFU | 62054 tok/s +step 13183/19560 | loss 3.350702 (+0.25z)| norm 0.2850 (+0.84z)| lr 1.54e-04 | 8449.04 ms | -100.0% bf16 MFU | 62054 tok/s +step 13184/19560 | loss 3.373451 (+0.93z)| norm 0.2723 (-0.23z)| lr 1.54e-04 | 8448.30 ms | -100.0% bf16 MFU | 62054 tok/s +step 13185/19560 | loss 3.247906 (-2.78z)| norm 0.2702 (-0.41z)| lr 1.54e-04 | 8449.23 ms | -100.0% bf16 MFU | 62054 tok/s +step 13186/19560 | loss 3.310369 (-0.93z)| norm 0.2727 (-0.20z)| lr 1.54e-04 | 8449.36 ms | -100.0% bf16 MFU | 62054 tok/s +step 13187/19560 | loss 3.324748 (-0.50z)| norm 0.2600 (-1.24z)| lr 1.54e-04 | 8446.90 ms | -100.0% bf16 MFU | 62055 tok/s +step 13188/19560 | loss 3.426372 (+2.41z)| norm 0.2817 (+0.56z)| lr 1.54e-04 | 8449.09 ms | -100.0% bf16 MFU | 62055 tok/s +step 13189/19560 | loss 3.403625 (+1.72z)| norm 0.2645 (-0.89z)| lr 1.54e-04 | 8444.20 ms | -100.0% bf16 MFU | 62056 tok/s +step 13190/19560 | loss 3.352594 (+0.29z)| norm 0.2654 (-0.80z)| lr 1.54e-04 | 8448.03 ms | -100.0% bf16 MFU | 62057 tok/s +step 13191/19560 | loss 3.280657 (-1.75z)| norm 0.2616 (-1.10z)| lr 1.54e-04 | 8446.04 ms | -100.0% bf16 MFU | 62058 tok/s +step 13192/19560 | loss 3.357037 (+0.43z)| norm 0.2595 (-1.26z)| lr 1.54e-04 | 8448.35 ms | -100.0% bf16 MFU | 62058 tok/s +step 13193/19560 | loss 3.366816 (+0.72z)| norm 0.2642 (-0.86z)| lr 1.54e-04 | 8444.54 ms | -100.0% bf16 MFU | 62059 tok/s +step 13194/19560 | loss 3.308429 (-0.94z)| norm 0.2543 (-1.65z)| lr 1.53e-04 | 8447.00 ms | -100.0% bf16 MFU | 62059 tok/s +step 13195/19560 | loss 3.431515 (+2.52z)| norm 0.2757 (+0.13z)| lr 1.53e-04 | 8445.69 ms | -100.0% bf16 MFU | 62060 tok/s +step 13196/19560 | loss 3.359722 (+0.49z)| norm 0.3125 (+3.04z)| lr 1.53e-04 | 8449.12 ms | -100.0% bf16 MFU | 62060 tok/s +step 13197/19560 | loss 3.315439 (-0.76z)| norm 0.2802 (+0.45z)| lr 1.53e-04 | 8449.99 ms | -100.0% bf16 MFU | 62059 tok/s +step 13198/19560 | loss 3.272530 (-1.93z)| norm 0.2699 (-0.37z)| lr 1.53e-04 | 8444.67 ms | -100.0% bf16 MFU | 62061 tok/s +step 13199/19560 | loss 3.329504 (-0.34z)| norm 0.2730 (-0.11z)| lr 1.53e-04 | 8446.60 ms | -100.0% bf16 MFU | 62061 tok/s +step 13200/19560 | loss 3.335290 (-0.18z)| norm 0.2764 (+0.16z)| lr 1.53e-04 | 8447.29 ms | -100.0% bf16 MFU | 62061 tok/s +step 13201/19560 | loss 3.332829 (-0.24z)| norm 0.2634 (-0.87z)| lr 1.53e-04 | 8445.28 ms | -100.0% bf16 MFU | 62062 tok/s +step 13202/19560 | loss 3.312565 (-0.80z)| norm 0.2656 (-0.70z)| lr 1.53e-04 | 8447.16 ms | -100.0% bf16 MFU | 62062 tok/s +step 13203/19560 | loss 3.365823 (+0.69z)| norm 0.2640 (-0.83z)| lr 1.53e-04 | 8447.17 ms | -100.0% bf16 MFU | 62063 tok/s +step 13204/19560 | loss 3.326584 (-0.41z)| norm 0.2826 (+0.66z)| lr 1.53e-04 | 8449.19 ms | -100.0% bf16 MFU | 62062 tok/s +step 13205/19560 | loss 3.292467 (-1.35z)| norm 0.2720 (-0.20z)| lr 1.53e-04 | 8444.86 ms | -100.0% bf16 MFU | 62063 tok/s +step 13206/19560 | loss 3.331540 (-0.25z)| norm 0.2718 (-0.21z)| lr 1.53e-04 | 8447.05 ms | -100.0% bf16 MFU | 62063 tok/s +step 13207/19560 | loss 3.333880 (-0.18z)| norm 0.2595 (-1.18z)| lr 1.53e-04 | 8451.19 ms | -100.0% bf16 MFU | 62062 tok/s +step 13208/19560 | loss 3.431436 (+2.52z)| norm 0.2840 (+0.77z)| lr 1.53e-04 | 8445.03 ms | -100.0% bf16 MFU | 62063 tok/s +step 13209/19560 | loss 3.309808 (-0.87z)| norm 0.2797 (+0.43z)| lr 1.53e-04 | 8452.54 ms | -100.0% bf16 MFU | 62061 tok/s +step 13210/19560 | loss 3.306679 (-0.95z)| norm 0.2789 (+0.35z)| lr 1.53e-04 | 8444.78 ms | -100.0% bf16 MFU | 62062 tok/s +step 13211/19560 | loss 3.333116 (-0.21z)| norm 0.2603 (-1.12z)| lr 1.53e-04 | 8448.96 ms | -100.0% bf16 MFU | 62062 tok/s +step 13212/19560 | loss 3.284719 (-1.55z)| norm 0.2650 (-0.75z)| lr 1.53e-04 | 8447.80 ms | -100.0% bf16 MFU | 62062 tok/s +step 13213/19560 | loss 3.329062 (-0.33z)| norm 0.2548 (-1.54z)| lr 1.53e-04 | 8445.81 ms | -100.0% bf16 MFU | 62063 tok/s +step 13214/19560 | loss 3.388732 (+1.34z)| norm 0.2708 (-0.26z)| lr 1.53e-04 | 8448.16 ms | -100.0% bf16 MFU | 62063 tok/s +step 13215/19560 | loss 3.368932 (+0.80z)| norm 0.2656 (-0.66z)| lr 1.53e-04 | 8443.24 ms | -100.0% bf16 MFU | 62064 tok/s +step 13216/19560 | loss 3.405367 (+1.79z)| norm 0.2838 (+0.80z)| lr 1.53e-04 | 8446.00 ms | -100.0% bf16 MFU | 62065 tok/s +step 13217/19560 | loss 3.313329 (-0.77z)| norm 0.2669 (-0.54z)| lr 1.52e-04 | 8444.33 ms | -100.0% bf16 MFU | 62066 tok/s +step 13218/19560 | loss 3.344885 (+0.11z)| norm 0.2718 (-0.15z)| lr 1.52e-04 | 8444.23 ms | -100.0% bf16 MFU | 62067 tok/s +step 13219/19560 | loss 3.359333 (+0.50z)| norm 0.2674 (-0.51z)| lr 1.52e-04 | 8446.24 ms | -100.0% bf16 MFU | 62067 tok/s +step 13220/19560 | loss 3.417042 (+2.09z)| norm 0.2757 (+0.17z)| lr 1.52e-04 | 8444.15 ms | -100.0% bf16 MFU | 62068 tok/s +step 13221/19560 | loss 3.307229 (-0.95z)| norm 0.2691 (-0.35z)| lr 1.52e-04 | 8445.83 ms | -100.0% bf16 MFU | 62069 tok/s +step 13222/19560 | loss 3.340522 (-0.01z)| norm 0.2527 (-1.66z)| lr 1.52e-04 | 8444.15 ms | -100.0% bf16 MFU | 62070 tok/s +step 13223/19560 | loss 3.320635 (-0.56z)| norm 0.2843 (+0.88z)| lr 1.52e-04 | 8443.91 ms | -100.0% bf16 MFU | 62071 tok/s +step 13224/19560 | loss 3.365093 (+0.68z)| norm 0.2582 (-1.21z)| lr 1.52e-04 | 8444.38 ms | -100.0% bf16 MFU | 62072 tok/s +step 13225/19560 | loss 3.290862 (-1.38z)| norm 0.2679 (-0.44z)| lr 1.52e-04 | 8445.61 ms | -100.0% bf16 MFU | 62072 tok/s +step 13226/19560 | loss 3.338706 (-0.08z)| norm 0.2544 (-1.49z)| lr 1.52e-04 | 8444.32 ms | -100.0% bf16 MFU | 62073 tok/s +step 13227/19560 | loss 3.320813 (-0.57z)| norm 0.2545 (-1.46z)| lr 1.52e-04 | 8443.74 ms | -100.0% bf16 MFU | 62074 tok/s +step 13228/19560 | loss 3.300426 (-1.16z)| norm 0.2618 (-0.89z)| lr 1.52e-04 | 8445.24 ms | -100.0% bf16 MFU | 62074 tok/s +step 13229/19560 | loss 3.386024 (+1.31z)| norm 0.2658 (-0.56z)| lr 1.52e-04 | 8444.09 ms | -100.0% bf16 MFU | 62075 tok/s +step 13230/19560 | loss 3.425329 (+2.39z)| norm 0.2625 (-0.83z)| lr 1.52e-04 | 8445.96 ms | -100.0% bf16 MFU | 62075 tok/s +step 13231/19560 | loss 3.331251 (-0.30z)| norm 0.2858 (+1.01z)| lr 1.52e-04 | 8444.21 ms | -100.0% bf16 MFU | 62076 tok/s +step 13232/19560 | loss 3.412948 (+1.99z)| norm 0.2753 (+0.18z)| lr 1.52e-04 | 8448.35 ms | -100.0% bf16 MFU | 62075 tok/s +step 13233/19560 | loss 3.396178 (+1.50z)| norm 0.2870 (+1.09z)| lr 1.52e-04 | 8446.17 ms | -100.0% bf16 MFU | 62075 tok/s +step 13234/19560 | loss 3.376802 (+0.94z)| norm 0.2626 (-0.84z)| lr 1.52e-04 | 8444.17 ms | -100.0% bf16 MFU | 62075 tok/s +step 13235/19560 | loss 3.321426 (-0.60z)| norm 0.2874 (+1.12z)| lr 1.52e-04 | 8445.12 ms | -100.0% bf16 MFU | 62076 tok/s +step 13236/19560 | loss 3.427712 (+2.31z)| norm 0.2877 (+1.13z)| lr 1.52e-04 | 8445.11 ms | -100.0% bf16 MFU | 62076 tok/s +step 13237/19560 | loss 3.321359 (-0.60z)| norm 0.2610 (-0.98z)| lr 1.52e-04 | 8444.62 ms | -100.0% bf16 MFU | 62076 tok/s +step 13238/19560 | loss 3.331979 (-0.30z)| norm 0.2859 (+0.97z)| lr 1.52e-04 | 8446.49 ms | -100.0% bf16 MFU | 62076 tok/s +step 13239/19560 | loss 3.332706 (-0.27z)| norm 0.2697 (-0.29z)| lr 1.52e-04 | 8445.52 ms | -100.0% bf16 MFU | 62076 tok/s +step 13240/19560 | loss 3.288398 (-1.48z)| norm 0.2620 (-0.90z)| lr 1.51e-04 | 8445.76 ms | -100.0% bf16 MFU | 62076 tok/s +step 13241/19560 | loss 3.314179 (-0.76z)| norm 0.2935 (+1.56z)| lr 1.51e-04 | 8445.53 ms | -100.0% bf16 MFU | 62077 tok/s +step 13242/19560 | loss 3.287568 (-1.46z)| norm 0.2769 (+0.25z)| lr 1.51e-04 | 8442.79 ms | -100.0% bf16 MFU | 62078 tok/s +step 13243/19560 | loss 3.323150 (-0.50z)| norm 0.2757 (+0.14z)| lr 1.51e-04 | 8443.61 ms | -100.0% bf16 MFU | 62078 tok/s +step 13244/19560 | loss 3.315397 (-0.70z)| norm 0.2831 (+0.74z)| lr 1.51e-04 | 8442.97 ms | -100.0% bf16 MFU | 62079 tok/s +step 13245/19560 | loss 3.332679 (-0.23z)| norm 0.2708 (-0.25z)| lr 1.51e-04 | 8443.79 ms | -100.0% bf16 MFU | 62080 tok/s +step 13246/19560 | loss 3.452015 (+2.88z)| norm 0.2849 (+0.87z)| lr 1.51e-04 | 8445.07 ms | -100.0% bf16 MFU | 62080 tok/s +step 13247/19560 | loss 3.312148 (-0.77z)| norm 0.2740 (+0.00z)| lr 1.51e-04 | 8443.93 ms | -100.0% bf16 MFU | 62081 tok/s +step 13248/19560 | loss 3.311416 (-0.78z)| norm 0.2715 (-0.22z)| lr 1.51e-04 | 8442.20 ms | -100.0% bf16 MFU | 62082 tok/s +step 13249/19560 | loss 3.380838 (+1.02z)| norm 0.2585 (-1.28z)| lr 1.51e-04 | 8443.22 ms | -100.0% bf16 MFU | 62082 tok/s +step 13250/19560 | loss 3.343734 (+0.05z)| norm 0.2930 (+1.53z)| lr 1.51e-04 | 8443.98 ms | -100.0% bf16 MFU | 62083 tok/s +val loss 3.320238 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2957/10042 = 0.294463 +step 13251/19560 | loss 3.344737 (+0.07z)| norm 0.2602 (-1.16z)| lr 1.51e-04 | 8439.40 ms | -100.0% bf16 MFU | 62085 tok/s +step 13252/19560 | loss 3.301201 (-1.05z)| norm 0.2750 (+0.05z)| lr 1.51e-04 | 8437.55 ms | -100.0% bf16 MFU | 62088 tok/s +step 13253/19560 | loss 3.374707 (+0.86z)| norm 0.2811 (+0.54z)| lr 1.51e-04 | 8437.04 ms | -100.0% bf16 MFU | 62090 tok/s +step 13254/19560 | loss 3.403679 (+1.58z)| norm 0.2740 (-0.05z)| lr 1.51e-04 | 8436.83 ms | -100.0% bf16 MFU | 62093 tok/s +step 13255/19560 | loss 3.373125 (+0.78z)| norm 0.2717 (-0.27z)| lr 1.51e-04 | 8436.85 ms | -100.0% bf16 MFU | 62095 tok/s +step 13256/19560 | loss 3.296675 (-1.18z)| norm 0.2842 (+0.80z)| lr 1.51e-04 | 8441.35 ms | -100.0% bf16 MFU | 62096 tok/s +step 13257/19560 | loss 3.320613 (-0.57z)| norm 0.2791 (+0.37z)| lr 1.51e-04 | 8438.62 ms | -100.0% bf16 MFU | 62098 tok/s +step 13258/19560 | loss 3.322079 (-0.52z)| norm 0.2582 (-1.39z)| lr 1.51e-04 | 8438.46 ms | -100.0% bf16 MFU | 62099 tok/s +step 13259/19560 | loss 3.368137 (+0.66z)| norm 0.2753 (+0.06z)| lr 1.51e-04 | 8439.91 ms | -100.0% bf16 MFU | 62100 tok/s +step 13260/19560 | loss 3.350775 (+0.22z)| norm 0.2916 (+1.42z)| lr 1.51e-04 | 8442.18 ms | -100.0% bf16 MFU | 62101 tok/s +step 13261/19560 | loss 3.412766 (+1.80z)| norm 0.2932 (+1.53z)| lr 1.51e-04 | 8446.10 ms | -100.0% bf16 MFU | 62099 tok/s +step 13262/19560 | loss 3.310317 (-0.84z)| norm 0.2796 (+0.39z)| lr 1.51e-04 | 8442.73 ms | -100.0% bf16 MFU | 62099 tok/s +step 13263/19560 | loss 3.333239 (-0.25z)| norm 0.2746 (-0.03z)| lr 1.50e-04 | 8443.53 ms | -100.0% bf16 MFU | 62099 tok/s +step 13264/19560 | loss 3.372594 (+0.76z)| norm 0.2685 (-0.55z)| lr 1.50e-04 | 8470.71 ms | -100.0% bf16 MFU | 62089 tok/s +step 13265/19560 | loss 3.274459 (-1.73z)| norm 0.2881 (+1.11z)| lr 1.50e-04 | 8464.63 ms | -100.0% bf16 MFU | 62081 tok/s +step 13266/19560 | loss 3.316148 (-0.67z)| norm 0.2653 (-0.82z)| lr 1.50e-04 | 8471.93 ms | -100.0% bf16 MFU | 62071 tok/s +step 13267/19560 | loss 3.268481 (-1.86z)| norm 0.2718 (-0.27z)| lr 1.50e-04 | 8471.40 ms | -100.0% bf16 MFU | 62062 tok/s +step 13268/19560 | loss 3.336706 (-0.14z)| norm 0.2898 (+1.24z)| lr 1.50e-04 | 8463.23 ms | -100.0% bf16 MFU | 62057 tok/s +step 13269/19560 | loss 3.341940 (+0.00z)| norm 0.2685 (-0.54z)| lr 1.50e-04 | 8470.01 ms | -100.0% bf16 MFU | 62049 tok/s +step 13270/19560 | loss 3.357597 (+0.39z)| norm 0.2690 (-0.49z)| lr 1.50e-04 | 8464.12 ms | -100.0% bf16 MFU | 62043 tok/s +step 13271/19560 | loss 3.363169 (+0.53z)| norm 0.2624 (-1.03z)| lr 1.50e-04 | 8464.82 ms | -100.0% bf16 MFU | 62038 tok/s +step 13272/19560 | loss 3.295431 (-1.20z)| norm 0.2566 (-1.49z)| lr 1.50e-04 | 8465.35 ms | -100.0% bf16 MFU | 62033 tok/s +step 13273/19560 | loss 3.307056 (-0.89z)| norm 0.2661 (-0.72z)| lr 1.50e-04 | 8466.03 ms | -100.0% bf16 MFU | 62028 tok/s +step 13274/19560 | loss 3.313570 (-0.74z)| norm 0.2692 (-0.45z)| lr 1.50e-04 | 8465.53 ms | -100.0% bf16 MFU | 62023 tok/s +step 13275/19560 | loss 3.360260 (+0.47z)| norm 0.2600 (-1.23z)| lr 1.50e-04 | 8459.65 ms | -100.0% bf16 MFU | 62021 tok/s +step 13276/19560 | loss 3.296806 (-1.17z)| norm 0.2775 (+0.27z)| lr 1.50e-04 | 8469.50 ms | -100.0% bf16 MFU | 62015 tok/s +step 13277/19560 | loss 3.337180 (-0.12z)| norm 0.2553 (-1.60z)| lr 1.50e-04 | 8462.03 ms | -100.0% bf16 MFU | 62012 tok/s +step 13278/19560 | loss 3.382023 (+1.04z)| norm 0.2715 (-0.23z)| lr 1.50e-04 | 8467.55 ms | -100.0% bf16 MFU | 62007 tok/s +step 13279/19560 | loss 3.391044 (+1.25z)| norm 0.2615 (-1.06z)| lr 1.50e-04 | 8461.89 ms | -100.0% bf16 MFU | 62005 tok/s +step 13280/19560 | loss 3.352796 (+0.26z)| norm 0.2848 (+0.94z)| lr 1.50e-04 | 8462.05 ms | -100.0% bf16 MFU | 62002 tok/s +step 13281/19560 | loss 3.312383 (-0.76z)| norm 0.2599 (-1.19z)| lr 1.50e-04 | 8462.63 ms | -100.0% bf16 MFU | 62000 tok/s +step 13282/19560 | loss 3.308408 (-0.85z)| norm 0.2661 (-0.64z)| lr 1.50e-04 | 8463.10 ms | -100.0% bf16 MFU | 61997 tok/s +step 13283/19560 | loss 3.273640 (-1.71z)| norm 0.2715 (-0.17z)| lr 1.50e-04 | 8459.70 ms | -100.0% bf16 MFU | 61996 tok/s +step 13284/19560 | loss 3.307971 (-0.83z)| norm 0.2625 (-0.94z)| lr 1.50e-04 | 8461.70 ms | -100.0% bf16 MFU | 61994 tok/s +step 13285/19560 | loss 3.290959 (-1.25z)| norm 0.2602 (-1.13z)| lr 1.50e-04 | 8456.35 ms | -100.0% bf16 MFU | 61995 tok/s +step 13286/19560 | loss 3.319229 (-0.53z)| norm 0.2747 (+0.14z)| lr 1.49e-04 | 8463.94 ms | -100.0% bf16 MFU | 61992 tok/s +step 13287/19560 | loss 3.315138 (-0.64z)| norm 0.2553 (-1.53z)| lr 1.49e-04 | 8461.31 ms | -100.0% bf16 MFU | 61991 tok/s +step 13288/19560 | loss 3.317099 (-0.58z)| norm 0.2595 (-1.15z)| lr 1.49e-04 | 8459.42 ms | -100.0% bf16 MFU | 61990 tok/s +step 13289/19560 | loss 3.320433 (-0.50z)| norm 0.2633 (-0.81z)| lr 1.49e-04 | 8458.21 ms | -100.0% bf16 MFU | 61990 tok/s +step 13290/19560 | loss 3.300019 (-1.00z)| norm 0.2653 (-0.62z)| lr 1.49e-04 | 8461.10 ms | -100.0% bf16 MFU | 61988 tok/s +step 13291/19560 | loss 3.273184 (-1.65z)| norm 0.2791 (+0.60z)| lr 1.49e-04 | 8460.65 ms | -100.0% bf16 MFU | 61987 tok/s +step 13292/19560 | loss 3.328533 (-0.25z)| norm 0.2687 (-0.31z)| lr 1.49e-04 | 8455.25 ms | -100.0% bf16 MFU | 61988 tok/s +step 13293/19560 | loss 3.272225 (-1.64z)| norm 0.2638 (-0.74z)| lr 1.49e-04 | 8461.53 ms | -100.0% bf16 MFU | 61987 tok/s +step 13294/19560 | loss 3.314733 (-0.58z)| norm 0.2772 (+0.48z)| lr 1.49e-04 | 8454.90 ms | -100.0% bf16 MFU | 61988 tok/s +step 13295/19560 | loss 3.320055 (-0.44z)| norm 0.2681 (-0.33z)| lr 1.49e-04 | 8461.28 ms | -100.0% bf16 MFU | 61987 tok/s +step 13296/19560 | loss 3.270536 (-1.64z)| norm 0.2821 (+0.98z)| lr 1.49e-04 | 8457.02 ms | -100.0% bf16 MFU | 61987 tok/s +step 13297/19560 | loss 3.352592 (+0.39z)| norm 0.2630 (-0.80z)| lr 1.49e-04 | 8453.20 ms | -100.0% bf16 MFU | 61989 tok/s +step 13298/19560 | loss 3.333374 (-0.08z)| norm 0.2834 (+1.11z)| lr 1.49e-04 | 8457.08 ms | -100.0% bf16 MFU | 61989 tok/s +step 13299/19560 | loss 3.339735 (+0.07z)| norm 0.2831 (+1.07z)| lr 1.49e-04 | 8455.45 ms | -100.0% bf16 MFU | 61990 tok/s +step 13300/19560 | loss 3.288703 (-1.20z)| norm 0.2551 (-1.52z)| lr 1.49e-04 | 8455.88 ms | -100.0% bf16 MFU | 61991 tok/s +step 13301/19560 | loss 3.333129 (-0.09z)| norm 0.2806 (+0.84z)| lr 1.49e-04 | 8454.42 ms | -100.0% bf16 MFU | 61992 tok/s +step 13302/19560 | loss 3.322056 (-0.36z)| norm 0.2567 (-1.37z)| lr 1.49e-04 | 8455.36 ms | -100.0% bf16 MFU | 61993 tok/s +step 13303/19560 | loss 3.309641 (-0.66z)| norm 0.2729 (+0.13z)| lr 1.49e-04 | 8454.85 ms | -100.0% bf16 MFU | 61994 tok/s +step 13304/19560 | loss 3.362551 (+0.65z)| norm 0.2710 (-0.04z)| lr 1.49e-04 | 8457.00 ms | -100.0% bf16 MFU | 61994 tok/s +step 13305/19560 | loss 3.275057 (-1.49z)| norm 0.2990 (+2.50z)| lr 1.49e-04 | 8454.59 ms | -100.0% bf16 MFU | 61995 tok/s +step 13306/19560 | loss 3.318240 (-0.43z)| norm 0.2808 (+0.85z)| lr 1.49e-04 | 8454.05 ms | -100.0% bf16 MFU | 61996 tok/s +step 13307/19560 | loss 3.371845 (+0.87z)| norm 0.2730 (+0.13z)| lr 1.49e-04 | 8458.62 ms | -100.0% bf16 MFU | 61995 tok/s +step 13308/19560 | loss 3.322101 (-0.35z)| norm 0.3033 (+2.81z)| lr 1.49e-04 | 8458.92 ms | -100.0% bf16 MFU | 61994 tok/s +step 13309/19560 | loss 3.304406 (-0.77z)| norm 0.2762 (+0.38z)| lr 1.49e-04 | 8451.87 ms | -100.0% bf16 MFU | 61996 tok/s +step 13310/19560 | loss 3.297755 (-0.92z)| norm 0.3027 (+2.65z)| lr 1.48e-04 | 8454.38 ms | -100.0% bf16 MFU | 61997 tok/s +step 13311/19560 | loss 3.326766 (-0.20z)| norm 0.2877 (+1.34z)| lr 1.48e-04 | 8455.99 ms | -100.0% bf16 MFU | 61997 tok/s +step 13312/19560 | loss 3.369536 (+0.85z)| norm 0.2944 (+1.88z)| lr 1.48e-04 | 8460.15 ms | -100.0% bf16 MFU | 61996 tok/s +step 13313/19560 | loss 3.377351 (+1.04z)| norm 0.2975 (+2.09z)| lr 1.48e-04 | 8456.73 ms | -100.0% bf16 MFU | 61996 tok/s +step 13314/19560 | loss 3.303751 (-0.80z)| norm 0.2814 (+0.73z)| lr 1.48e-04 | 8449.06 ms | -100.0% bf16 MFU | 61999 tok/s +step 13315/19560 | loss 3.308641 (-0.68z)| norm 0.2809 (+0.68z)| lr 1.48e-04 | 8447.82 ms | -100.0% bf16 MFU | 62002 tok/s +step 13316/19560 | loss 3.364608 (+0.75z)| norm 0.2932 (+1.69z)| lr 1.48e-04 | 8451.96 ms | -100.0% bf16 MFU | 62003 tok/s +step 13317/19560 | loss 3.371302 (+0.93z)| norm 0.2923 (+1.58z)| lr 1.48e-04 | 8456.85 ms | -100.0% bf16 MFU | 62003 tok/s +step 13318/19560 | loss 3.304656 (-0.77z)| norm 0.2633 (-0.81z)| lr 1.48e-04 | 8452.06 ms | -100.0% bf16 MFU | 62004 tok/s +step 13319/19560 | loss 3.299741 (-0.90z)| norm 0.2929 (+1.60z)| lr 1.48e-04 | 8450.94 ms | -100.0% bf16 MFU | 62006 tok/s +step 13320/19560 | loss 3.335800 (+0.03z)| norm 0.2807 (+0.59z)| lr 1.48e-04 | 8452.58 ms | -100.0% bf16 MFU | 62007 tok/s +step 13321/19560 | loss 3.303168 (-0.80z)| norm 0.2737 (+0.01z)| lr 1.48e-04 | 8448.00 ms | -100.0% bf16 MFU | 62010 tok/s +step 13322/19560 | loss 3.326442 (-0.20z)| norm 0.2735 (-0.02z)| lr 1.48e-04 | 8447.75 ms | -100.0% bf16 MFU | 62013 tok/s +step 13323/19560 | loss 3.316930 (-0.44z)| norm 0.2664 (-0.61z)| lr 1.48e-04 | 8452.79 ms | -100.0% bf16 MFU | 62013 tok/s +step 13324/19560 | loss 3.290833 (-1.11z)| norm 0.2612 (-1.04z)| lr 1.48e-04 | 8447.13 ms | -100.0% bf16 MFU | 62016 tok/s +step 13325/19560 | loss 3.322824 (-0.27z)| norm 0.2894 (+1.38z)| lr 1.48e-04 | 8447.16 ms | -100.0% bf16 MFU | 62018 tok/s +step 13326/19560 | loss 3.356021 (+0.60z)| norm 0.2698 (-0.31z)| lr 1.48e-04 | 8447.08 ms | -100.0% bf16 MFU | 62021 tok/s +step 13327/19560 | loss 3.364967 (+0.83z)| norm 0.2643 (-0.77z)| lr 1.48e-04 | 8453.53 ms | -100.0% bf16 MFU | 62021 tok/s +step 13328/19560 | loss 3.332461 (-0.04z)| norm 0.2853 (+1.02z)| lr 1.48e-04 | 8450.32 ms | -100.0% bf16 MFU | 62022 tok/s +step 13329/19560 | loss 3.264194 (-1.81z)| norm 0.2715 (-0.16z)| lr 1.48e-04 | 8444.51 ms | -100.0% bf16 MFU | 62025 tok/s +step 13330/19560 | loss 3.317164 (-0.42z)| norm 0.2701 (-0.29z)| lr 1.48e-04 | 8448.22 ms | -100.0% bf16 MFU | 62027 tok/s +step 13331/19560 | loss 3.390287 (+1.48z)| norm 0.2744 (+0.07z)| lr 1.48e-04 | 8453.23 ms | -100.0% bf16 MFU | 62027 tok/s +step 13332/19560 | loss 3.306690 (-0.69z)| norm 0.2740 (+0.05z)| lr 1.48e-04 | 8451.35 ms | -100.0% bf16 MFU | 62027 tok/s +step 13333/19560 | loss 3.300514 (-0.86z)| norm 0.2668 (-0.57z)| lr 1.47e-04 | 8447.36 ms | -100.0% bf16 MFU | 62029 tok/s +step 13334/19560 | loss 3.327066 (-0.16z)| norm 0.2705 (-0.25z)| lr 1.47e-04 | 8446.20 ms | -100.0% bf16 MFU | 62031 tok/s +step 13335/19560 | loss 3.360420 (+0.70z)| norm 0.2782 (+0.40z)| lr 1.47e-04 | 8445.77 ms | -100.0% bf16 MFU | 62034 tok/s +step 13336/19560 | loss 3.315219 (-0.47z)| norm 0.2779 (+0.38z)| lr 1.47e-04 | 8450.96 ms | -100.0% bf16 MFU | 62034 tok/s +step 13337/19560 | loss 3.347457 (+0.39z)| norm 0.2875 (+1.20z)| lr 1.47e-04 | 8449.17 ms | -100.0% bf16 MFU | 62035 tok/s +step 13338/19560 | loss 3.374639 (+1.10z)| norm 0.2716 (-0.17z)| lr 1.47e-04 | 8451.05 ms | -100.0% bf16 MFU | 62035 tok/s +step 13339/19560 | loss 3.320030 (-0.36z)| norm 0.2918 (+1.55z)| lr 1.47e-04 | 8443.68 ms | -100.0% bf16 MFU | 62038 tok/s +step 13340/19560 | loss 3.359124 (+0.67z)| norm 0.2796 (+0.49z)| lr 1.47e-04 | 8443.49 ms | -100.0% bf16 MFU | 62041 tok/s +step 13341/19560 | loss 3.335755 (+0.05z)| norm 0.2889 (+1.28z)| lr 1.47e-04 | 8445.51 ms | -100.0% bf16 MFU | 62042 tok/s +step 13342/19560 | loss 3.307980 (-0.69z)| norm 0.2803 (+0.52z)| lr 1.47e-04 | 8447.68 ms | -100.0% bf16 MFU | 62043 tok/s +step 13343/19560 | loss 3.296646 (-0.98z)| norm 0.2996 (+2.13z)| lr 1.47e-04 | 8444.87 ms | -100.0% bf16 MFU | 62046 tok/s +step 13344/19560 | loss 3.315114 (-0.47z)| norm 0.2876 (+1.11z)| lr 1.47e-04 | 8444.53 ms | -100.0% bf16 MFU | 62048 tok/s +step 13345/19560 | loss 3.311519 (-0.56z)| norm 0.2912 (+1.39z)| lr 1.47e-04 | 8445.60 ms | -100.0% bf16 MFU | 62049 tok/s +step 13346/19560 | loss 3.285530 (-1.25z)| norm 0.2789 (+0.35z)| lr 1.47e-04 | 8445.67 ms | -100.0% bf16 MFU | 62050 tok/s +step 13347/19560 | loss 3.349371 (+0.48z)| norm 0.2886 (+1.15z)| lr 1.47e-04 | 8448.85 ms | -100.0% bf16 MFU | 62051 tok/s +step 13348/19560 | loss 3.301032 (-0.82z)| norm 0.2821 (+0.60z)| lr 1.47e-04 | 8446.41 ms | -100.0% bf16 MFU | 62052 tok/s +step 13349/19560 | loss 3.322625 (-0.23z)| norm 0.2904 (+1.28z)| lr 1.47e-04 | 8446.40 ms | -100.0% bf16 MFU | 62053 tok/s +step 13350/19560 | loss 3.364663 (+0.93z)| norm 0.3065 (+2.55z)| lr 1.47e-04 | 8446.98 ms | -100.0% bf16 MFU | 62054 tok/s +step 13351/19560 | loss 3.321542 (-0.26z)| norm 0.3073 (+2.54z)| lr 1.47e-04 | 8444.88 ms | -100.0% bf16 MFU | 62055 tok/s +step 13352/19560 | loss 3.333758 (+0.08z)| norm 0.2825 (+0.53z)| lr 1.47e-04 | 8449.98 ms | -100.0% bf16 MFU | 62055 tok/s +step 13353/19560 | loss 3.313945 (-0.47z)| norm 0.3044 (+2.24z)| lr 1.47e-04 | 8446.39 ms | -100.0% bf16 MFU | 62055 tok/s +step 13354/19560 | loss 3.294393 (-1.01z)| norm 0.2830 (+0.52z)| lr 1.47e-04 | 8446.92 ms | -100.0% bf16 MFU | 62056 tok/s +step 13355/19560 | loss 3.377106 (+1.27z)| norm 0.2917 (+1.21z)| lr 1.47e-04 | 8449.85 ms | -100.0% bf16 MFU | 62056 tok/s +step 13356/19560 | loss 3.282981 (-1.32z)| norm 0.2918 (+1.20z)| lr 1.46e-04 | 8449.79 ms | -100.0% bf16 MFU | 62055 tok/s +step 13357/19560 | loss 3.315528 (-0.41z)| norm 0.2685 (-0.69z)| lr 1.46e-04 | 8443.27 ms | -100.0% bf16 MFU | 62057 tok/s +step 13358/19560 | loss 3.376250 (+1.31z)| norm 0.2721 (-0.40z)| lr 1.46e-04 | 8445.95 ms | -100.0% bf16 MFU | 62058 tok/s +step 13359/19560 | loss 3.335456 (+0.16z)| norm 0.2803 (+0.27z)| lr 1.46e-04 | 8446.78 ms | -100.0% bf16 MFU | 62059 tok/s +step 13360/19560 | loss 3.336883 (+0.22z)| norm 0.2797 (+0.21z)| lr 1.46e-04 | 8439.47 ms | -100.0% bf16 MFU | 62062 tok/s +step 13361/19560 | loss 3.339016 (+0.30z)| norm 0.2967 (+1.58z)| lr 1.46e-04 | 8434.46 ms | -100.0% bf16 MFU | 62067 tok/s +step 13362/19560 | loss 3.327161 (-0.04z)| norm 0.2876 (+0.83z)| lr 1.46e-04 | 8439.20 ms | -100.0% bf16 MFU | 62070 tok/s +step 13363/19560 | loss 3.381270 (+1.53z)| norm 0.2833 (+0.49z)| lr 1.46e-04 | 8436.75 ms | -100.0% bf16 MFU | 62074 tok/s +step 13364/19560 | loss 3.407346 (+2.33z)| norm 0.2784 (+0.09z)| lr 1.46e-04 | 8432.14 ms | -100.0% bf16 MFU | 62079 tok/s +step 13365/19560 | loss 3.357509 (+0.84z)| norm 0.3014 (+1.93z)| lr 1.46e-04 | 8436.40 ms | -100.0% bf16 MFU | 62082 tok/s +step 13366/19560 | loss 3.317885 (-0.33z)| norm 0.2517 (-2.04z)| lr 1.46e-04 | 8436.15 ms | -100.0% bf16 MFU | 62085 tok/s +step 13367/19560 | loss 3.355606 (+0.78z)| norm 0.3077 (+2.35z)| lr 1.46e-04 | 8435.18 ms | -100.0% bf16 MFU | 62089 tok/s +step 13368/19560 | loss 3.284083 (-1.33z)| norm 0.2621 (-1.21z)| lr 1.46e-04 | 8436.47 ms | -100.0% bf16 MFU | 62092 tok/s +step 13369/19560 | loss 3.357599 (+0.83z)| norm 0.2831 (+0.44z)| lr 1.46e-04 | 8434.61 ms | -100.0% bf16 MFU | 62095 tok/s +step 13370/19560 | loss 3.375731 (+1.34z)| norm 0.2763 (-0.10z)| lr 1.46e-04 | 8436.23 ms | -100.0% bf16 MFU | 62098 tok/s +step 13371/19560 | loss 3.316920 (-0.39z)| norm 0.2898 (+0.96z)| lr 1.46e-04 | 8440.00 ms | -100.0% bf16 MFU | 62099 tok/s +step 13372/19560 | loss 3.323554 (-0.20z)| norm 0.2712 (-0.49z)| lr 1.46e-04 | 8438.03 ms | -100.0% bf16 MFU | 62100 tok/s +step 13373/19560 | loss 3.287510 (-1.24z)| norm 0.3187 (+3.08z)| lr 1.46e-04 | 8436.41 ms | -100.0% bf16 MFU | 62103 tok/s +step 13374/19560 | loss 3.278631 (-1.52z)| norm 0.2646 (-0.99z)| lr 1.46e-04 | 8437.64 ms | -100.0% bf16 MFU | 62104 tok/s +step 13375/19560 | loss 3.350351 (+0.66z)| norm 0.2836 (+0.43z)| lr 1.46e-04 | 8434.85 ms | -100.0% bf16 MFU | 62107 tok/s +step 13376/19560 | loss 3.283976 (-1.35z)| norm 0.2680 (-0.73z)| lr 1.46e-04 | 8438.54 ms | -100.0% bf16 MFU | 62108 tok/s +step 13377/19560 | loss 3.311271 (-0.51z)| norm 0.2740 (-0.29z)| lr 1.46e-04 | 8437.79 ms | -100.0% bf16 MFU | 62110 tok/s +step 13378/19560 | loss 3.324512 (-0.10z)| norm 0.2843 (+0.49z)| lr 1.46e-04 | 8438.67 ms | -100.0% bf16 MFU | 62111 tok/s +step 13379/19560 | loss 3.290248 (-1.14z)| norm 0.2892 (+0.85z)| lr 1.45e-04 | 8440.20 ms | -100.0% bf16 MFU | 62111 tok/s +step 13380/19560 | loss 3.346087 (+0.56z)| norm 0.2761 (-0.15z)| lr 1.45e-04 | 8433.22 ms | -100.0% bf16 MFU | 62114 tok/s +step 13381/19560 | loss 3.304478 (-0.70z)| norm 0.2953 (+1.30z)| lr 1.45e-04 | 8436.50 ms | -100.0% bf16 MFU | 62115 tok/s +step 13382/19560 | loss 3.288442 (-1.19z)| norm 0.2664 (-0.88z)| lr 1.45e-04 | 8438.83 ms | -100.0% bf16 MFU | 62116 tok/s +step 13383/19560 | loss 3.318269 (-0.24z)| norm 0.2801 (+0.15z)| lr 1.45e-04 | 8437.77 ms | -100.0% bf16 MFU | 62117 tok/s +step 13384/19560 | loss 3.326009 (-0.01z)| norm 0.2627 (-1.15z)| lr 1.45e-04 | 8442.11 ms | -100.0% bf16 MFU | 62116 tok/s +step 13385/19560 | loss 3.382557 (+1.75z)| norm 0.2683 (-0.72z)| lr 1.45e-04 | 8436.42 ms | -100.0% bf16 MFU | 62118 tok/s +step 13386/19560 | loss 3.261346 (-2.00z)| norm 0.2674 (-0.80z)| lr 1.45e-04 | 8440.25 ms | -100.0% bf16 MFU | 62118 tok/s +step 13387/19560 | loss 3.308318 (-0.54z)| norm 0.2724 (-0.42z)| lr 1.45e-04 | 8439.16 ms | -100.0% bf16 MFU | 62118 tok/s +step 13388/19560 | loss 3.382019 (+1.72z)| norm 0.2750 (-0.22z)| lr 1.45e-04 | 8439.17 ms | -100.0% bf16 MFU | 62119 tok/s +step 13389/19560 | loss 3.296604 (-0.89z)| norm 0.2569 (-1.56z)| lr 1.45e-04 | 8440.02 ms | -100.0% bf16 MFU | 62119 tok/s +step 13390/19560 | loss 3.311420 (-0.43z)| norm 0.2644 (-0.98z)| lr 1.45e-04 | 8446.03 ms | -100.0% bf16 MFU | 62116 tok/s +step 13391/19560 | loss 3.345616 (+0.64z)| norm 0.2540 (-1.74z)| lr 1.45e-04 | 8440.39 ms | -100.0% bf16 MFU | 62116 tok/s +step 13392/19560 | loss 3.342466 (+0.55z)| norm 0.2736 (-0.28z)| lr 1.45e-04 | 8444.86 ms | -100.0% bf16 MFU | 62115 tok/s +step 13393/19560 | loss 3.277909 (-1.49z)| norm 0.2734 (-0.29z)| lr 1.45e-04 | 8441.80 ms | -100.0% bf16 MFU | 62114 tok/s +step 13394/19560 | loss 3.357932 (+1.03z)| norm 0.2696 (-0.57z)| lr 1.45e-04 | 8440.27 ms | -100.0% bf16 MFU | 62115 tok/s +step 13395/19560 | loss 3.365248 (+1.25z)| norm 0.2624 (-1.10z)| lr 1.45e-04 | 8441.99 ms | -100.0% bf16 MFU | 62114 tok/s +step 13396/19560 | loss 3.351079 (+0.79z)| norm 0.2731 (-0.29z)| lr 1.45e-04 | 8446.11 ms | -100.0% bf16 MFU | 62112 tok/s +step 13397/19560 | loss 3.338152 (+0.38z)| norm 0.2750 (-0.16z)| lr 1.45e-04 | 8441.81 ms | -100.0% bf16 MFU | 62112 tok/s +step 13398/19560 | loss 3.330401 (+0.14z)| norm 0.2674 (-0.72z)| lr 1.45e-04 | 8442.57 ms | -100.0% bf16 MFU | 62111 tok/s +step 13399/19560 | loss 3.309289 (-0.52z)| norm 0.2911 (+1.03z)| lr 1.45e-04 | 8445.00 ms | -100.0% bf16 MFU | 62110 tok/s +step 13400/19560 | loss 3.286409 (-1.24z)| norm 0.2549 (-1.67z)| lr 1.45e-04 | 8444.85 ms | -100.0% bf16 MFU | 62108 tok/s +step 13401/19560 | loss 3.327081 (+0.05z)| norm 0.2839 (+0.48z)| lr 1.45e-04 | 8441.16 ms | -100.0% bf16 MFU | 62109 tok/s +step 13402/19560 | loss 3.286108 (-1.25z)| norm 0.2883 (+0.80z)| lr 1.45e-04 | 8442.97 ms | -100.0% bf16 MFU | 62108 tok/s +step 13403/19560 | loss 3.343099 (+0.57z)| norm 0.2702 (-0.56z)| lr 1.44e-04 | 8440.62 ms | -100.0% bf16 MFU | 62108 tok/s +step 13404/19560 | loss 3.367419 (+1.32z)| norm 0.2846 (+0.52z)| lr 1.44e-04 | 8440.21 ms | -100.0% bf16 MFU | 62109 tok/s +step 13405/19560 | loss 3.423095 (+2.96z)| norm 0.2953 (+1.31z)| lr 1.44e-04 | 8442.60 ms | -100.0% bf16 MFU | 62108 tok/s +step 13406/19560 | loss 3.340129 (+0.43z)| norm 0.2690 (-0.68z)| lr 1.44e-04 | 8443.67 ms | -100.0% bf16 MFU | 62108 tok/s +step 13407/19560 | loss 3.291652 (-1.06z)| norm 0.2666 (-0.86z)| lr 1.44e-04 | 8442.45 ms | -100.0% bf16 MFU | 62107 tok/s +step 13408/19560 | loss 3.271239 (-1.66z)| norm 0.2699 (-0.61z)| lr 1.44e-04 | 8443.56 ms | -100.0% bf16 MFU | 62107 tok/s +step 13409/19560 | loss 3.316357 (-0.26z)| norm 0.2677 (-0.78z)| lr 1.44e-04 | 8444.18 ms | -100.0% bf16 MFU | 62106 tok/s +step 13410/19560 | loss 3.373576 (+1.49z)| norm 0.2788 (+0.06z)| lr 1.44e-04 | 8440.25 ms | -100.0% bf16 MFU | 62106 tok/s +step 13411/19560 | loss 3.321401 (-0.13z)| norm 0.2661 (-0.91z)| lr 1.44e-04 | 8443.09 ms | -100.0% bf16 MFU | 62106 tok/s +step 13412/19560 | loss 3.310560 (-0.47z)| norm 0.2738 (-0.33z)| lr 1.44e-04 | 8443.73 ms | -100.0% bf16 MFU | 62105 tok/s +step 13413/19560 | loss 3.389119 (+1.94z)| norm 0.2874 (+0.70z)| lr 1.44e-04 | 8440.78 ms | -100.0% bf16 MFU | 62106 tok/s +step 13414/19560 | loss 3.316255 (-0.31z)| norm 0.2742 (-0.32z)| lr 1.44e-04 | 8442.28 ms | -100.0% bf16 MFU | 62105 tok/s +step 13415/19560 | loss 3.287921 (-1.18z)| norm 0.2833 (+0.37z)| lr 1.44e-04 | 8442.37 ms | -100.0% bf16 MFU | 62105 tok/s +step 13416/19560 | loss 3.355492 (+0.89z)| norm 0.2794 (+0.06z)| lr 1.44e-04 | 8440.81 ms | -100.0% bf16 MFU | 62106 tok/s +step 13417/19560 | loss 3.436356 (+3.20z)| norm 0.2680 (-0.85z)| lr 1.44e-04 | 8443.23 ms | -100.0% bf16 MFU | 62105 tok/s +step 13418/19560 | loss 3.346032 (+0.54z)| norm 0.2636 (-1.19z)| lr 1.44e-04 | 8443.48 ms | -100.0% bf16 MFU | 62105 tok/s +step 13419/19560 | loss 3.337763 (+0.28z)| norm 0.2883 (+0.74z)| lr 1.44e-04 | 8445.35 ms | -100.0% bf16 MFU | 62103 tok/s +step 13420/19560 | loss 3.299489 (-0.85z)| norm 0.2689 (-0.78z)| lr 1.44e-04 | 8439.86 ms | -100.0% bf16 MFU | 62104 tok/s +step 13421/19560 | loss 3.281558 (-1.38z)| norm 0.2798 (+0.07z)| lr 1.44e-04 | 8440.35 ms | -100.0% bf16 MFU | 62105 tok/s +step 13422/19560 | loss 3.304378 (-0.70z)| norm 0.2755 (-0.27z)| lr 1.44e-04 | 8444.54 ms | -100.0% bf16 MFU | 62104 tok/s +step 13423/19560 | loss 3.364571 (+1.07z)| norm 0.2757 (-0.26z)| lr 1.44e-04 | 8441.64 ms | -100.0% bf16 MFU | 62104 tok/s +step 13424/19560 | loss 3.339972 (+0.33z)| norm 0.2614 (-1.37z)| lr 1.44e-04 | 8442.78 ms | -100.0% bf16 MFU | 62104 tok/s +step 13425/19560 | loss 3.280616 (-1.42z)| norm 0.2654 (-1.05z)| lr 1.44e-04 | 8444.63 ms | -100.0% bf16 MFU | 62103 tok/s +step 13426/19560 | loss 3.377171 (+1.43z)| norm 0.2928 (+1.09z)| lr 1.43e-04 | 8442.19 ms | -100.0% bf16 MFU | 62103 tok/s +step 13427/19560 | loss 3.352295 (+0.69z)| norm 0.2639 (-1.16z)| lr 1.43e-04 | 8438.72 ms | -100.0% bf16 MFU | 62104 tok/s +step 13428/19560 | loss 3.372013 (+1.25z)| norm 0.2734 (-0.43z)| lr 1.43e-04 | 8443.07 ms | -100.0% bf16 MFU | 62104 tok/s +step 13429/19560 | loss 3.325915 (-0.10z)| norm 0.2539 (-1.93z)| lr 1.43e-04 | 8441.97 ms | -100.0% bf16 MFU | 62104 tok/s +step 13430/19560 | loss 3.270529 (-1.70z)| norm 0.2804 (+0.12z)| lr 1.43e-04 | 8442.79 ms | -100.0% bf16 MFU | 62104 tok/s +step 13431/19560 | loss 3.295518 (-0.97z)| norm 0.2695 (-0.74z)| lr 1.43e-04 | 8441.87 ms | -100.0% bf16 MFU | 62104 tok/s +step 13432/19560 | loss 3.291490 (-1.07z)| norm 0.2915 (+0.98z)| lr 1.43e-04 | 8442.88 ms | -100.0% bf16 MFU | 62103 tok/s +step 13433/19560 | loss 3.332726 (+0.12z)| norm 0.2551 (-1.84z)| lr 1.43e-04 | 8441.24 ms | -100.0% bf16 MFU | 62104 tok/s +step 13434/19560 | loss 3.357872 (+0.84z)| norm 0.2662 (-0.96z)| lr 1.43e-04 | 8439.74 ms | -100.0% bf16 MFU | 62105 tok/s +step 13435/19560 | loss 3.264309 (-1.85z)| norm 0.2559 (-1.73z)| lr 1.43e-04 | 8440.75 ms | -100.0% bf16 MFU | 62105 tok/s +step 13436/19560 | loss 3.311475 (-0.48z)| norm 0.2721 (-0.47z)| lr 1.43e-04 | 8443.55 ms | -100.0% bf16 MFU | 62105 tok/s +step 13437/19560 | loss 3.311265 (-0.49z)| norm 0.2521 (-1.99z)| lr 1.43e-04 | 8441.57 ms | -100.0% bf16 MFU | 62105 tok/s +step 13438/19560 | loss 3.364642 (+1.04z)| norm 0.2604 (-1.33z)| lr 1.43e-04 | 8440.78 ms | -100.0% bf16 MFU | 62105 tok/s +step 13439/19560 | loss 3.327514 (-0.03z)| norm 0.2590 (-1.42z)| lr 1.43e-04 | 8443.39 ms | -100.0% bf16 MFU | 62105 tok/s +step 13440/19560 | loss 3.303310 (-0.72z)| norm 0.2644 (-0.99z)| lr 1.43e-04 | 8441.04 ms | -100.0% bf16 MFU | 62105 tok/s +step 13441/19560 | loss 3.319871 (-0.23z)| norm 0.2756 (-0.11z)| lr 1.43e-04 | 8441.44 ms | -100.0% bf16 MFU | 62105 tok/s +step 13442/19560 | loss 3.278650 (-1.42z)| norm 0.2805 (+0.27z)| lr 1.43e-04 | 8442.48 ms | -100.0% bf16 MFU | 62105 tok/s +step 13443/19560 | loss 3.308624 (-0.55z)| norm 0.2679 (-0.70z)| lr 1.43e-04 | 8442.10 ms | -100.0% bf16 MFU | 62105 tok/s +step 13444/19560 | loss 3.319643 (-0.22z)| norm 0.2648 (-0.93z)| lr 1.43e-04 | 8442.22 ms | -100.0% bf16 MFU | 62105 tok/s +step 13445/19560 | loss 3.317087 (-0.29z)| norm 0.2673 (-0.72z)| lr 1.43e-04 | 8442.54 ms | -100.0% bf16 MFU | 62105 tok/s +step 13446/19560 | loss 3.331872 (+0.14z)| norm 0.2617 (-1.16z)| lr 1.43e-04 | 8441.27 ms | -100.0% bf16 MFU | 62105 tok/s +step 13447/19560 | loss 3.352180 (+0.73z)| norm 0.2910 (+1.13z)| lr 1.43e-04 | 8445.02 ms | -100.0% bf16 MFU | 62104 tok/s +step 13448/19560 | loss 3.328981 (+0.05z)| norm 0.2613 (-1.17z)| lr 1.43e-04 | 8442.37 ms | -100.0% bf16 MFU | 62104 tok/s +step 13449/19560 | loss 3.300787 (-0.78z)| norm 0.2646 (-0.91z)| lr 1.43e-04 | 8438.04 ms | -100.0% bf16 MFU | 62105 tok/s +step 13450/19560 | loss 3.276711 (-1.47z)| norm 0.2668 (-0.73z)| lr 1.42e-04 | 8442.33 ms | -100.0% bf16 MFU | 62105 tok/s +step 13451/19560 | loss 3.275828 (-1.47z)| norm 0.2711 (-0.40z)| lr 1.42e-04 | 8440.21 ms | -100.0% bf16 MFU | 62106 tok/s +step 13452/19560 | loss 3.343069 (+0.47z)| norm 0.2737 (-0.21z)| lr 1.42e-04 | 8441.75 ms | -100.0% bf16 MFU | 62106 tok/s +step 13453/19560 | loss 3.312372 (-0.42z)| norm 0.2699 (-0.49z)| lr 1.42e-04 | 8441.56 ms | -100.0% bf16 MFU | 62106 tok/s +step 13454/19560 | loss 3.336947 (+0.30z)| norm 0.2770 (+0.06z)| lr 1.42e-04 | 8450.21 ms | -100.0% bf16 MFU | 62103 tok/s +step 13455/19560 | loss 3.293039 (-0.97z)| norm 0.2658 (-0.82z)| lr 1.42e-04 | 8467.16 ms | -100.0% bf16 MFU | 62094 tok/s +step 13456/19560 | loss 3.353011 (+0.77z)| norm 0.2690 (-0.56z)| lr 1.42e-04 | 8468.81 ms | -100.0% bf16 MFU | 62084 tok/s +step 13457/19560 | loss 3.288440 (-1.12z)| norm 0.2605 (-1.22z)| lr 1.42e-04 | 8466.97 ms | -100.0% bf16 MFU | 62076 tok/s +step 13458/19560 | loss 3.318813 (-0.23z)| norm 0.2818 (+0.44z)| lr 1.42e-04 | 8468.02 ms | -100.0% bf16 MFU | 62068 tok/s +step 13459/19560 | loss 3.347734 (+0.64z)| norm 0.2500 (-2.00z)| lr 1.42e-04 | 8468.13 ms | -100.0% bf16 MFU | 62060 tok/s +step 13460/19560 | loss 3.372020 (+1.34z)| norm 0.2729 (-0.23z)| lr 1.42e-04 | 8465.73 ms | -100.0% bf16 MFU | 62054 tok/s +step 13461/19560 | loss 3.287471 (-1.15z)| norm 0.2735 (-0.20z)| lr 1.42e-04 | 8461.91 ms | -100.0% bf16 MFU | 62049 tok/s +step 13462/19560 | loss 3.312357 (-0.42z)| norm 0.2640 (-0.91z)| lr 1.42e-04 | 8462.80 ms | -100.0% bf16 MFU | 62044 tok/s +step 13463/19560 | loss 3.357107 (+0.90z)| norm 0.2886 (+0.96z)| lr 1.42e-04 | 8465.27 ms | -100.0% bf16 MFU | 62039 tok/s +step 13464/19560 | loss 3.350210 (+0.69z)| norm 0.2586 (-1.31z)| lr 1.42e-04 | 8460.81 ms | -100.0% bf16 MFU | 62035 tok/s +step 13465/19560 | loss 3.315622 (-0.32z)| norm 0.2900 (+1.07z)| lr 1.42e-04 | 8462.81 ms | -100.0% bf16 MFU | 62031 tok/s +step 13466/19560 | loss 3.325926 (-0.01z)| norm 0.2549 (-1.57z)| lr 1.42e-04 | 8459.99 ms | -100.0% bf16 MFU | 62028 tok/s +step 13467/19560 | loss 3.300549 (-0.75z)| norm 0.2661 (-0.72z)| lr 1.42e-04 | 8459.15 ms | -100.0% bf16 MFU | 62026 tok/s +step 13468/19560 | loss 3.402934 (+2.23z)| norm 0.2828 (+0.54z)| lr 1.42e-04 | 8466.06 ms | -100.0% bf16 MFU | 62021 tok/s +step 13469/19560 | loss 3.280348 (-1.32z)| norm 0.2961 (+1.53z)| lr 1.42e-04 | 8458.18 ms | -100.0% bf16 MFU | 62019 tok/s +step 13470/19560 | loss 3.318466 (-0.22z)| norm 0.2821 (+0.49z)| lr 1.42e-04 | 8458.48 ms | -100.0% bf16 MFU | 62017 tok/s +step 13471/19560 | loss 3.255233 (-2.01z)| norm 0.2677 (-0.58z)| lr 1.42e-04 | 8461.54 ms | -100.0% bf16 MFU | 62014 tok/s +step 13472/19560 | loss 3.369463 (+1.23z)| norm 0.2966 (+1.60z)| lr 1.42e-04 | 8457.20 ms | -100.0% bf16 MFU | 62013 tok/s +step 13473/19560 | loss 3.349353 (+0.65z)| norm 0.2589 (-1.23z)| lr 1.41e-04 | 8457.89 ms | -100.0% bf16 MFU | 62012 tok/s +step 13474/19560 | loss 3.284847 (-1.18z)| norm 0.2778 (+0.19z)| lr 1.41e-04 | 8460.66 ms | -100.0% bf16 MFU | 62010 tok/s +step 13475/19560 | loss 3.308994 (-0.49z)| norm 0.2727 (-0.18z)| lr 1.41e-04 | 8464.23 ms | -100.0% bf16 MFU | 62006 tok/s +step 13476/19560 | loss 3.305712 (-0.58z)| norm 0.2855 (+0.79z)| lr 1.41e-04 | 8461.80 ms | -100.0% bf16 MFU | 62004 tok/s +step 13477/19560 | loss 3.328647 (+0.07z)| norm 0.3069 (+2.35z)| lr 1.41e-04 | 8456.49 ms | -100.0% bf16 MFU | 62004 tok/s +step 13478/19560 | loss 3.362669 (+1.04z)| norm 0.2764 (+0.10z)| lr 1.41e-04 | 8457.78 ms | -100.0% bf16 MFU | 62003 tok/s +step 13479/19560 | loss 3.340427 (+0.40z)| norm 0.3105 (+2.68z)| lr 1.41e-04 | 8457.49 ms | -100.0% bf16 MFU | 62002 tok/s +step 13480/19560 | loss 3.295631 (-0.86z)| norm 0.2617 (-0.99z)| lr 1.41e-04 | 8456.29 ms | -100.0% bf16 MFU | 62002 tok/s +step 13481/19560 | loss 3.346461 (+0.57z)| norm 0.2929 (+1.38z)| lr 1.41e-04 | 8461.49 ms | -100.0% bf16 MFU | 62000 tok/s +step 13482/19560 | loss 3.325277 (-0.03z)| norm 0.2646 (-0.76z)| lr 1.41e-04 | 8460.30 ms | -100.0% bf16 MFU | 61999 tok/s +step 13483/19560 | loss 3.350280 (+0.69z)| norm 0.2531 (-1.61z)| lr 1.41e-04 | 8451.87 ms | -100.0% bf16 MFU | 62000 tok/s +step 13484/19560 | loss 3.414654 (+2.45z)| norm 0.2973 (+1.73z)| lr 1.41e-04 | 8453.75 ms | -100.0% bf16 MFU | 62001 tok/s +step 13485/19560 | loss 3.384778 (+1.58z)| norm 0.2690 (-0.41z)| lr 1.41e-04 | 8460.78 ms | -100.0% bf16 MFU | 62000 tok/s +step 13486/19560 | loss 3.350205 (+0.63z)| norm 0.2682 (-0.46z)| lr 1.41e-04 | 8456.03 ms | -100.0% bf16 MFU | 62000 tok/s +step 13487/19560 | loss 3.346159 (+0.51z)| norm 0.2796 (+0.40z)| lr 1.41e-04 | 8453.25 ms | -100.0% bf16 MFU | 62001 tok/s +step 13488/19560 | loss 3.356233 (+0.79z)| norm 0.2661 (-0.62z)| lr 1.41e-04 | 8452.54 ms | -100.0% bf16 MFU | 62002 tok/s +step 13489/19560 | loss 3.384780 (+1.56z)| norm 0.2920 (+1.35z)| lr 1.41e-04 | 8457.30 ms | -100.0% bf16 MFU | 62002 tok/s +step 13490/19560 | loss 3.332356 (+0.11z)| norm 0.2687 (-0.41z)| lr 1.41e-04 | 8452.27 ms | -100.0% bf16 MFU | 62003 tok/s +step 13491/19560 | loss 3.371296 (+1.19z)| norm 0.2861 (+0.91z)| lr 1.41e-04 | 8450.78 ms | -100.0% bf16 MFU | 62005 tok/s +step 13492/19560 | loss 3.337392 (+0.27z)| norm 0.2730 (-0.08z)| lr 1.41e-04 | 8456.58 ms | -100.0% bf16 MFU | 62005 tok/s +step 13493/19560 | loss 3.308336 (-0.54z)| norm 0.2835 (+0.74z)| lr 1.41e-04 | 8458.42 ms | -100.0% bf16 MFU | 62004 tok/s +step 13494/19560 | loss 3.377385 (+1.39z)| norm 0.2925 (+1.42z)| lr 1.41e-04 | 8450.93 ms | -100.0% bf16 MFU | 62005 tok/s +step 13495/19560 | loss 3.339622 (+0.34z)| norm 0.2873 (+1.05z)| lr 1.41e-04 | 8447.84 ms | -100.0% bf16 MFU | 62008 tok/s +step 13496/19560 | loss 3.312049 (-0.45z)| norm 0.2833 (+0.72z)| lr 1.41e-04 | 8458.70 ms | -100.0% bf16 MFU | 62007 tok/s +step 13497/19560 | loss 3.320295 (-0.21z)| norm 0.2835 (+0.74z)| lr 1.40e-04 | 8448.38 ms | -100.0% bf16 MFU | 62009 tok/s +step 13498/19560 | loss 3.380111 (+1.49z)| norm 0.2979 (+1.84z)| lr 1.40e-04 | 8446.53 ms | -100.0% bf16 MFU | 62012 tok/s +step 13499/19560 | loss 3.368277 (+1.13z)| norm 0.2913 (+1.32z)| lr 1.40e-04 | 8453.06 ms | -100.0% bf16 MFU | 62013 tok/s +step 13500/19560 | loss 3.344849 (+0.47z)| norm 0.3331 (+4.22z)| lr 1.40e-04 | 8454.83 ms | -100.0% bf16 MFU | 62013 tok/s +val loss 3.315934 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2955/10042 = 0.294264 +step 13501/19560 | loss 3.289153 (-1.10z)| norm 0.2871 (+0.94z)| lr 1.40e-04 | 8444.77 ms | -100.0% bf16 MFU | 62016 tok/s +step 13502/19560 | loss 3.353357 (+0.70z)| norm 0.3092 (+2.52z)| lr 1.40e-04 | 8450.22 ms | -100.0% bf16 MFU | 62018 tok/s +step 13503/19560 | loss 3.400204 (+1.98z)| norm 0.2938 (+1.37z)| lr 1.40e-04 | 8457.62 ms | -100.0% bf16 MFU | 62016 tok/s +step 13504/19560 | loss 3.307247 (-0.62z)| norm 0.2768 (+0.13z)| lr 1.40e-04 | 8446.95 ms | -100.0% bf16 MFU | 62019 tok/s +step 13505/19560 | loss 3.290019 (-1.09z)| norm 0.3091 (+2.41z)| lr 1.40e-04 | 8453.79 ms | -100.0% bf16 MFU | 62019 tok/s +step 13506/19560 | loss 3.323685 (-0.15z)| norm 0.2754 (+0.01z)| lr 1.40e-04 | 8457.73 ms | -100.0% bf16 MFU | 62018 tok/s +step 13507/19560 | loss 3.380317 (+1.41z)| norm 0.3145 (+2.72z)| lr 1.40e-04 | 8453.31 ms | -100.0% bf16 MFU | 62018 tok/s +step 13508/19560 | loss 3.327336 (-0.07z)| norm 0.2855 (+0.69z)| lr 1.40e-04 | 8447.26 ms | -100.0% bf16 MFU | 62020 tok/s +step 13509/19560 | loss 3.270896 (-1.62z)| norm 0.3268 (+3.41z)| lr 1.40e-04 | 8454.77 ms | -100.0% bf16 MFU | 62020 tok/s +step 13510/19560 | loss 3.289665 (-1.10z)| norm 0.2546 (-1.41z)| lr 1.40e-04 | 8452.45 ms | -100.0% bf16 MFU | 62020 tok/s +step 13511/19560 | loss 3.357661 (+0.77z)| norm 0.2784 (+0.17z)| lr 1.40e-04 | 8450.13 ms | -100.0% bf16 MFU | 62021 tok/s +step 13512/19560 | loss 3.324343 (-0.15z)| norm 0.2765 (+0.04z)| lr 1.40e-04 | 8453.65 ms | -100.0% bf16 MFU | 62021 tok/s +step 13513/19560 | loss 3.369854 (+1.11z)| norm 0.2669 (-0.59z)| lr 1.40e-04 | 8449.07 ms | -100.0% bf16 MFU | 62023 tok/s +step 13514/19560 | loss 3.300297 (-0.83z)| norm 0.2634 (-0.83z)| lr 1.40e-04 | 8446.82 ms | -100.0% bf16 MFU | 62025 tok/s +step 13515/19560 | loss 3.296817 (-0.92z)| norm 0.2601 (-1.03z)| lr 1.40e-04 | 8451.70 ms | -100.0% bf16 MFU | 62026 tok/s +step 13516/19560 | loss 3.325539 (-0.11z)| norm 0.2664 (-0.61z)| lr 1.40e-04 | 8441.73 ms | -100.0% bf16 MFU | 62030 tok/s +step 13517/19560 | loss 3.365032 (+0.99z)| norm 0.2720 (-0.25z)| lr 1.40e-04 | 8450.85 ms | -100.0% bf16 MFU | 62030 tok/s +step 13518/19560 | loss 3.354206 (+0.67z)| norm 0.2846 (+0.58z)| lr 1.40e-04 | 8452.85 ms | -100.0% bf16 MFU | 62030 tok/s +step 13519/19560 | loss 3.300138 (-0.84z)| norm 0.2925 (+1.09z)| lr 1.40e-04 | 8451.01 ms | -100.0% bf16 MFU | 62030 tok/s +step 13520/19560 | loss 3.357504 (+0.77z)| norm 0.3218 (+2.92z)| lr 1.39e-04 | 8448.01 ms | -100.0% bf16 MFU | 62032 tok/s +step 13521/19560 | loss 3.305364 (-0.70z)| norm 0.2624 (-0.91z)| lr 1.39e-04 | 8447.60 ms | -100.0% bf16 MFU | 62033 tok/s +step 13522/19560 | loss 3.328295 (-0.05z)| norm 0.2854 (+0.57z)| lr 1.39e-04 | 8447.14 ms | -100.0% bf16 MFU | 62035 tok/s +step 13523/19560 | loss 3.306797 (-0.65z)| norm 0.2640 (-0.82z)| lr 1.39e-04 | 8451.59 ms | -100.0% bf16 MFU | 62035 tok/s +step 13524/19560 | loss 3.283169 (-1.30z)| norm 0.2604 (-1.03z)| lr 1.39e-04 | 8448.75 ms | -100.0% bf16 MFU | 62036 tok/s +step 13525/19560 | loss 3.366409 (+1.04z)| norm 0.2833 (+0.43z)| lr 1.39e-04 | 8447.23 ms | -100.0% bf16 MFU | 62038 tok/s +step 13526/19560 | loss 3.307388 (-0.61z)| norm 0.2649 (-0.75z)| lr 1.39e-04 | 8449.32 ms | -100.0% bf16 MFU | 62038 tok/s +step 13527/19560 | loss 3.383678 (+1.50z)| norm 0.2563 (-1.28z)| lr 1.39e-04 | 8449.54 ms | -100.0% bf16 MFU | 62039 tok/s +step 13528/19560 | loss 3.335361 (+0.15z)| norm 0.2827 (+0.39z)| lr 1.39e-04 | 8446.19 ms | -100.0% bf16 MFU | 62041 tok/s +step 13529/19560 | loss 3.348106 (+0.50z)| norm 0.2761 (-0.02z)| lr 1.39e-04 | 8446.65 ms | -100.0% bf16 MFU | 62042 tok/s +step 13530/19560 | loss 3.340909 (+0.29z)| norm 0.2983 (+1.39z)| lr 1.39e-04 | 8445.51 ms | -100.0% bf16 MFU | 62044 tok/s +step 13531/19560 | loss 3.332945 (+0.07z)| norm 0.2755 (-0.07z)| lr 1.39e-04 | 8441.72 ms | -100.0% bf16 MFU | 62047 tok/s +step 13532/19560 | loss 3.362695 (+0.91z)| norm 0.2900 (+0.85z)| lr 1.39e-04 | 8450.02 ms | -100.0% bf16 MFU | 62047 tok/s +step 13533/19560 | loss 3.379997 (+1.43z)| norm 0.2714 (-0.33z)| lr 1.39e-04 | 8447.18 ms | -100.0% bf16 MFU | 62048 tok/s +step 13534/19560 | loss 3.347473 (+0.49z)| norm 0.2998 (+1.48z)| lr 1.39e-04 | 8444.62 ms | -100.0% bf16 MFU | 62050 tok/s +step 13535/19560 | loss 3.292372 (-1.09z)| norm 0.2723 (-0.29z)| lr 1.39e-04 | 8440.48 ms | -100.0% bf16 MFU | 62053 tok/s +step 13536/19560 | loss 3.345356 (+0.42z)| norm 0.2852 (+0.53z)| lr 1.39e-04 | 8443.94 ms | -100.0% bf16 MFU | 62055 tok/s +step 13537/19560 | loss 3.336543 (+0.16z)| norm 0.2726 (-0.28z)| lr 1.39e-04 | 8437.27 ms | -100.0% bf16 MFU | 62059 tok/s +step 13538/19560 | loss 3.328607 (-0.06z)| norm 0.2684 (-0.54z)| lr 1.39e-04 | 8439.17 ms | -100.0% bf16 MFU | 62063 tok/s +step 13539/19560 | loss 3.386820 (+1.61z)| norm 0.2646 (-0.78z)| lr 1.39e-04 | 8435.47 ms | -100.0% bf16 MFU | 62067 tok/s +step 13540/19560 | loss 3.257689 (-2.08z)| norm 0.2676 (-0.59z)| lr 1.39e-04 | 8436.94 ms | -100.0% bf16 MFU | 62071 tok/s +step 13541/19560 | loss 3.359346 (+0.83z)| norm 0.2614 (-0.96z)| lr 1.39e-04 | 8437.41 ms | -100.0% bf16 MFU | 62074 tok/s +step 13542/19560 | loss 3.340397 (+0.28z)| norm 0.2705 (-0.39z)| lr 1.39e-04 | 8441.92 ms | -100.0% bf16 MFU | 62076 tok/s +step 13543/19560 | loss 3.352721 (+0.62z)| norm 0.2598 (-1.05z)| lr 1.39e-04 | 8435.63 ms | -100.0% bf16 MFU | 62080 tok/s +step 13544/19560 | loss 3.307168 (-0.68z)| norm 0.2726 (-0.24z)| lr 1.38e-04 | 8435.34 ms | -100.0% bf16 MFU | 62083 tok/s +step 13545/19560 | loss 3.324086 (-0.18z)| norm 0.2762 (-0.01z)| lr 1.38e-04 | 8439.29 ms | -100.0% bf16 MFU | 62085 tok/s +step 13546/19560 | loss 3.309887 (-0.59z)| norm 0.2634 (-0.82z)| lr 1.38e-04 | 8442.71 ms | -100.0% bf16 MFU | 62086 tok/s +step 13547/19560 | loss 3.291458 (-1.13z)| norm 0.2733 (-0.19z)| lr 1.38e-04 | 8439.84 ms | -100.0% bf16 MFU | 62088 tok/s +step 13548/19560 | loss 3.361813 (+0.95z)| norm 0.2686 (-0.49z)| lr 1.38e-04 | 8435.58 ms | -100.0% bf16 MFU | 62091 tok/s +step 13549/19560 | loss 3.353132 (+0.68z)| norm 0.2723 (-0.25z)| lr 1.38e-04 | 8440.23 ms | -100.0% bf16 MFU | 62092 tok/s +step 13550/19560 | loss 3.370589 (+1.19z)| norm 0.2516 (-1.53z)| lr 1.38e-04 | 8438.62 ms | -100.0% bf16 MFU | 62094 tok/s +step 13551/19560 | loss 3.348092 (+0.52z)| norm 0.2688 (-0.45z)| lr 1.38e-04 | 8433.99 ms | -100.0% bf16 MFU | 62098 tok/s +step 13552/19560 | loss 3.293360 (-1.11z)| norm 0.2708 (-0.33z)| lr 1.38e-04 | 8435.54 ms | -100.0% bf16 MFU | 62100 tok/s +step 13553/19560 | loss 3.297429 (-0.99z)| norm 0.2649 (-0.70z)| lr 1.38e-04 | 8439.94 ms | -100.0% bf16 MFU | 62101 tok/s +step 13554/19560 | loss 3.377991 (+1.42z)| norm 0.2662 (-0.61z)| lr 1.38e-04 | 8435.74 ms | -100.0% bf16 MFU | 62104 tok/s +step 13555/19560 | loss 3.305603 (-0.74z)| norm 0.2522 (-1.48z)| lr 1.38e-04 | 8435.11 ms | -100.0% bf16 MFU | 62106 tok/s +step 13556/19560 | loss 3.324884 (-0.15z)| norm 0.2664 (-0.58z)| lr 1.38e-04 | 8440.91 ms | -100.0% bf16 MFU | 62107 tok/s +step 13557/19560 | loss 3.457175 (+3.61z)| norm 0.3004 (+1.53z)| lr 1.38e-04 | 8435.81 ms | -100.0% bf16 MFU | 62109 tok/s +step 13558/19560 | loss 3.319325 (-0.34z)| norm 0.2964 (+1.26z)| lr 1.38e-04 | 8438.95 ms | -100.0% bf16 MFU | 62110 tok/s +step 13559/19560 | loss 3.319230 (-0.35z)| norm 0.2778 (+0.10z)| lr 1.38e-04 | 8437.12 ms | -100.0% bf16 MFU | 62111 tok/s +step 13560/19560 | loss 3.321544 (-0.29z)| norm 0.2631 (-0.80z)| lr 1.38e-04 | 8440.61 ms | -100.0% bf16 MFU | 62112 tok/s +step 13561/19560 | loss 3.338420 (+0.20z)| norm 0.2830 (+0.42z)| lr 1.38e-04 | 8445.26 ms | -100.0% bf16 MFU | 62110 tok/s +step 13562/19560 | loss 3.363922 (+0.94z)| norm 0.2760 (-0.02z)| lr 1.38e-04 | 8437.54 ms | -100.0% bf16 MFU | 62111 tok/s +step 13563/19560 | loss 3.300918 (-0.92z)| norm 0.2606 (-0.99z)| lr 1.38e-04 | 8444.65 ms | -100.0% bf16 MFU | 62110 tok/s +step 13564/19560 | loss 3.282399 (-1.44z)| norm 0.2667 (-0.60z)| lr 1.38e-04 | 8443.37 ms | -100.0% bf16 MFU | 62109 tok/s +step 13565/19560 | loss 3.345825 (+0.40z)| norm 0.2408 (-2.20z)| lr 1.38e-04 | 8439.97 ms | -100.0% bf16 MFU | 62110 tok/s +step 13566/19560 | loss 3.305604 (-0.76z)| norm 0.2690 (-0.45z)| lr 1.38e-04 | 8437.03 ms | -100.0% bf16 MFU | 62111 tok/s +step 13567/19560 | loss 3.355024 (+0.68z)| norm 0.2821 (+0.36z)| lr 1.38e-04 | 8442.76 ms | -100.0% bf16 MFU | 62111 tok/s +step 13568/19560 | loss 3.313010 (-0.55z)| norm 0.2574 (-1.19z)| lr 1.37e-04 | 8445.26 ms | -100.0% bf16 MFU | 62109 tok/s +step 13569/19560 | loss 3.333758 (+0.05z)| norm 0.2611 (-0.94z)| lr 1.37e-04 | 8443.78 ms | -100.0% bf16 MFU | 62108 tok/s +step 13570/19560 | loss 3.297833 (-1.01z)| norm 0.2678 (-0.52z)| lr 1.37e-04 | 8444.55 ms | -100.0% bf16 MFU | 62107 tok/s +step 13571/19560 | loss 3.332410 (+0.00z)| norm 0.2754 (-0.05z)| lr 1.37e-04 | 8439.11 ms | -100.0% bf16 MFU | 62108 tok/s +step 13572/19560 | loss 3.311100 (-0.62z)| norm 0.2569 (-1.19z)| lr 1.37e-04 | 8441.24 ms | -100.0% bf16 MFU | 62108 tok/s +step 13573/19560 | loss 3.340493 (+0.24z)| norm 0.2722 (-0.25z)| lr 1.37e-04 | 8440.16 ms | -100.0% bf16 MFU | 62109 tok/s +step 13574/19560 | loss 3.329862 (-0.08z)| norm 0.2575 (-1.16z)| lr 1.37e-04 | 8440.23 ms | -100.0% bf16 MFU | 62109 tok/s +step 13575/19560 | loss 3.289874 (-1.24z)| norm 0.2546 (-1.31z)| lr 1.37e-04 | 8439.24 ms | -100.0% bf16 MFU | 62110 tok/s +step 13576/19560 | loss 3.295841 (-1.05z)| norm 0.2821 (+0.38z)| lr 1.37e-04 | 8439.56 ms | -100.0% bf16 MFU | 62111 tok/s +step 13577/19560 | loss 3.327495 (-0.13z)| norm 0.2537 (-1.37z)| lr 1.37e-04 | 8443.57 ms | -100.0% bf16 MFU | 62110 tok/s +step 13578/19560 | loss 3.323292 (-0.27z)| norm 0.2761 (+0.00z)| lr 1.37e-04 | 8441.72 ms | -100.0% bf16 MFU | 62110 tok/s +step 13579/19560 | loss 3.329918 (-0.08z)| norm 0.2698 (-0.38z)| lr 1.37e-04 | 8446.39 ms | -100.0% bf16 MFU | 62108 tok/s +step 13580/19560 | loss 3.322001 (-0.31z)| norm 0.2609 (-0.92z)| lr 1.37e-04 | 8442.79 ms | -100.0% bf16 MFU | 62107 tok/s +step 13581/19560 | loss 3.258388 (-2.17z)| norm 0.2625 (-0.82z)| lr 1.37e-04 | 8444.06 ms | -100.0% bf16 MFU | 62106 tok/s +step 13582/19560 | loss 3.334803 (+0.08z)| norm 0.2801 (+0.26z)| lr 1.37e-04 | 8441.12 ms | -100.0% bf16 MFU | 62107 tok/s +step 13583/19560 | loss 3.343474 (+0.33z)| norm 0.2596 (-0.99z)| lr 1.37e-04 | 8439.10 ms | -100.0% bf16 MFU | 62108 tok/s +step 13584/19560 | loss 3.292651 (-1.16z)| norm 0.2525 (-1.41z)| lr 1.37e-04 | 8442.72 ms | -100.0% bf16 MFU | 62107 tok/s +step 13585/19560 | loss 3.397276 (+1.89z)| norm 0.2728 (-0.18z)| lr 1.37e-04 | 8444.30 ms | -100.0% bf16 MFU | 62106 tok/s +step 13586/19560 | loss 3.377618 (+1.29z)| norm 0.2509 (-1.49z)| lr 1.37e-04 | 8441.16 ms | -100.0% bf16 MFU | 62106 tok/s +step 13587/19560 | loss 3.339821 (+0.19z)| norm 0.2460 (-1.78z)| lr 1.37e-04 | 8438.67 ms | -100.0% bf16 MFU | 62108 tok/s +step 13588/19560 | loss 3.337713 (+0.14z)| norm 0.2828 (+0.43z)| lr 1.37e-04 | 8438.05 ms | -100.0% bf16 MFU | 62109 tok/s +step 13589/19560 | loss 3.314322 (-0.55z)| norm 0.2432 (-1.91z)| lr 1.37e-04 | 8440.42 ms | -100.0% bf16 MFU | 62109 tok/s +step 13590/19560 | loss 3.453053 (+3.35z)| norm 0.2779 (+0.14z)| lr 1.37e-04 | 8438.98 ms | -100.0% bf16 MFU | 62110 tok/s +step 13591/19560 | loss 3.329453 (-0.13z)| norm 0.2648 (-0.63z)| lr 1.37e-04 | 8442.89 ms | -100.0% bf16 MFU | 62110 tok/s +step 13592/19560 | loss 3.327742 (-0.17z)| norm 0.2650 (-0.62z)| lr 1.36e-04 | 8438.92 ms | -100.0% bf16 MFU | 62110 tok/s +step 13593/19560 | loss 3.284350 (-1.38z)| norm 0.2664 (-0.53z)| lr 1.36e-04 | 8445.03 ms | -100.0% bf16 MFU | 62109 tok/s +step 13594/19560 | loss 3.310777 (-0.64z)| norm 0.2716 (-0.22z)| lr 1.36e-04 | 8447.39 ms | -100.0% bf16 MFU | 62107 tok/s +step 13595/19560 | loss 3.400222 (+1.83z)| norm 0.2849 (+0.57z)| lr 1.36e-04 | 8447.31 ms | -100.0% bf16 MFU | 62105 tok/s +step 13596/19560 | loss 3.343951 (+0.28z)| norm 0.2697 (-0.34z)| lr 1.36e-04 | 8439.01 ms | -100.0% bf16 MFU | 62106 tok/s +step 13597/19560 | loss 3.412196 (+2.15z)| norm 0.2980 (+1.36z)| lr 1.36e-04 | 8433.59 ms | -100.0% bf16 MFU | 62109 tok/s +step 13598/19560 | loss 3.332938 (-0.06z)| norm 0.2655 (-0.59z)| lr 1.36e-04 | 8434.22 ms | -100.0% bf16 MFU | 62112 tok/s +step 13599/19560 | loss 3.274566 (-1.70z)| norm 0.2687 (-0.39z)| lr 1.36e-04 | 8435.23 ms | -100.0% bf16 MFU | 62114 tok/s +step 13600/19560 | loss 3.262318 (-2.00z)| norm 0.2598 (-0.91z)| lr 1.36e-04 | 8432.61 ms | -100.0% bf16 MFU | 62117 tok/s +step 13601/19560 | loss 3.258929 (-2.04z)| norm 0.2685 (-0.39z)| lr 1.36e-04 | 8432.02 ms | -100.0% bf16 MFU | 62120 tok/s +step 13602/19560 | loss 3.342325 (+0.23z)| norm 0.2572 (-1.06z)| lr 1.36e-04 | 8434.65 ms | -100.0% bf16 MFU | 62122 tok/s +step 13603/19560 | loss 3.311773 (-0.61z)| norm 0.2489 (-1.54z)| lr 1.36e-04 | 8431.75 ms | -100.0% bf16 MFU | 62125 tok/s +step 13604/19560 | loss 3.338450 (+0.11z)| norm 0.2873 (+0.75z)| lr 1.36e-04 | 8430.53 ms | -100.0% bf16 MFU | 62128 tok/s +step 13605/19560 | loss 3.364757 (+0.83z)| norm 0.2697 (-0.28z)| lr 1.36e-04 | 8431.79 ms | -100.0% bf16 MFU | 62131 tok/s +step 13606/19560 | loss 3.325507 (-0.24z)| norm 0.2847 (+0.62z)| lr 1.36e-04 | 8432.91 ms | -100.0% bf16 MFU | 62133 tok/s +step 13607/19560 | loss 3.323733 (-0.29z)| norm 0.2958 (+1.31z)| lr 1.36e-04 | 8431.12 ms | -100.0% bf16 MFU | 62135 tok/s +step 13608/19560 | loss 3.401376 (+1.81z)| norm 0.2742 (-0.01z)| lr 1.36e-04 | 8432.03 ms | -100.0% bf16 MFU | 62137 tok/s +step 13609/19560 | loss 3.436016 (+2.66z)| norm 0.2873 (+0.79z)| lr 1.36e-04 | 8431.35 ms | -100.0% bf16 MFU | 62140 tok/s +step 13610/19560 | loss 3.401613 (+1.72z)| norm 0.2763 (+0.11z)| lr 1.36e-04 | 8434.32 ms | -100.0% bf16 MFU | 62141 tok/s +step 13611/19560 | loss 3.324971 (-0.29z)| norm 0.2849 (+0.63z)| lr 1.36e-04 | 8433.62 ms | -100.0% bf16 MFU | 62142 tok/s +step 13612/19560 | loss 3.386553 (+1.34z)| norm 0.2929 (+1.13z)| lr 1.36e-04 | 8435.09 ms | -100.0% bf16 MFU | 62143 tok/s +step 13613/19560 | loss 3.338845 (+0.09z)| norm 0.2834 (+0.53z)| lr 1.36e-04 | 8438.24 ms | -100.0% bf16 MFU | 62142 tok/s +step 13614/19560 | loss 3.457486 (+3.11z)| norm 0.3013 (+1.61z)| lr 1.36e-04 | 8433.35 ms | -100.0% bf16 MFU | 62143 tok/s +step 13615/19560 | loss 3.327312 (-0.23z)| norm 0.2936 (+1.12z)| lr 1.36e-04 | 8433.33 ms | -100.0% bf16 MFU | 62145 tok/s +step 13616/19560 | loss 3.341562 (+0.14z)| norm 0.2779 (+0.16z)| lr 1.35e-04 | 8437.58 ms | -100.0% bf16 MFU | 62144 tok/s +step 13617/19560 | loss 3.351357 (+0.40z)| norm 0.2761 (+0.06z)| lr 1.35e-04 | 8435.26 ms | -100.0% bf16 MFU | 62145 tok/s +step 13618/19560 | loss 3.411956 (+1.92z)| norm 0.2895 (+0.87z)| lr 1.35e-04 | 8433.71 ms | -100.0% bf16 MFU | 62146 tok/s +step 13619/19560 | loss 3.308713 (-0.69z)| norm 0.2524 (-1.38z)| lr 1.35e-04 | 8435.98 ms | -100.0% bf16 MFU | 62146 tok/s +step 13620/19560 | loss 3.384001 (+1.21z)| norm 0.2627 (-0.74z)| lr 1.35e-04 | 8433.49 ms | -100.0% bf16 MFU | 62147 tok/s +step 13621/19560 | loss 3.356960 (+0.51z)| norm 0.2798 (+0.30z)| lr 1.35e-04 | 8435.30 ms | -100.0% bf16 MFU | 62147 tok/s +step 13622/19560 | loss 3.280194 (-1.41z)| norm 0.2692 (-0.34z)| lr 1.35e-04 | 8436.13 ms | -100.0% bf16 MFU | 62147 tok/s +step 13623/19560 | loss 3.331541 (-0.11z)| norm 0.2830 (+0.51z)| lr 1.35e-04 | 8438.20 ms | -100.0% bf16 MFU | 62147 tok/s +step 13624/19560 | loss 3.348605 (+0.31z)| norm 0.2860 (+0.69z)| lr 1.35e-04 | 8436.66 ms | -100.0% bf16 MFU | 62147 tok/s +step 13625/19560 | loss 3.387592 (+1.28z)| norm 0.2653 (-0.57z)| lr 1.35e-04 | 8440.81 ms | -100.0% bf16 MFU | 62145 tok/s +step 13626/19560 | loss 3.301526 (-0.87z)| norm 0.2860 (+0.71z)| lr 1.35e-04 | 8437.05 ms | -100.0% bf16 MFU | 62145 tok/s +step 13627/19560 | loss 3.337212 (+0.03z)| norm 0.2668 (-0.46z)| lr 1.35e-04 | 8439.35 ms | -100.0% bf16 MFU | 62144 tok/s +step 13628/19560 | loss 3.348538 (+0.32z)| norm 0.2720 (-0.12z)| lr 1.35e-04 | 8437.13 ms | -100.0% bf16 MFU | 62144 tok/s +step 13629/19560 | loss 3.390925 (+1.37z)| norm 0.2866 (+0.83z)| lr 1.35e-04 | 8438.40 ms | -100.0% bf16 MFU | 62143 tok/s +step 13630/19560 | loss 3.344265 (+0.19z)| norm 0.2836 (+0.66z)| lr 1.35e-04 | 8440.00 ms | -100.0% bf16 MFU | 62142 tok/s +step 13631/19560 | loss 3.394659 (+1.47z)| norm 0.2563 (-1.14z)| lr 1.35e-04 | 8436.17 ms | -100.0% bf16 MFU | 62142 tok/s +step 13632/19560 | loss 3.411666 (+1.86z)| norm 0.2691 (-0.28z)| lr 1.35e-04 | 8441.45 ms | -100.0% bf16 MFU | 62140 tok/s +step 13633/19560 | loss 3.379095 (+1.03z)| norm 0.2945 (+1.44z)| lr 1.35e-04 | 8437.89 ms | -100.0% bf16 MFU | 62140 tok/s +step 13634/19560 | loss 3.269893 (-1.68z)| norm 0.2881 (+1.00z)| lr 1.35e-04 | 8440.85 ms | -100.0% bf16 MFU | 62139 tok/s +step 13635/19560 | loss 3.341243 (+0.10z)| norm 0.2575 (-1.06z)| lr 1.35e-04 | 8436.26 ms | -100.0% bf16 MFU | 62139 tok/s +step 13636/19560 | loss 3.307341 (-0.74z)| norm 0.2666 (-0.42z)| lr 1.35e-04 | 8436.86 ms | -100.0% bf16 MFU | 62139 tok/s +step 13637/19560 | loss 3.286274 (-1.27z)| norm 0.2601 (-0.88z)| lr 1.35e-04 | 8444.75 ms | -100.0% bf16 MFU | 62137 tok/s +step 13638/19560 | loss 3.328798 (-0.22z)| norm 0.2640 (-0.60z)| lr 1.35e-04 | 8437.17 ms | -100.0% bf16 MFU | 62137 tok/s +step 13639/19560 | loss 3.303631 (-0.84z)| norm 0.2534 (-1.36z)| lr 1.35e-04 | 8438.12 ms | -100.0% bf16 MFU | 62137 tok/s +step 13640/19560 | loss 3.370531 (+0.82z)| norm 0.2639 (-0.59z)| lr 1.34e-04 | 8440.14 ms | -100.0% bf16 MFU | 62136 tok/s +step 13641/19560 | loss 3.295959 (-1.02z)| norm 0.2518 (-1.45z)| lr 1.34e-04 | 8441.02 ms | -100.0% bf16 MFU | 62135 tok/s +step 13642/19560 | loss 3.402004 (+1.59z)| norm 0.2768 (+0.35z)| lr 1.34e-04 | 8440.06 ms | -100.0% bf16 MFU | 62134 tok/s +step 13643/19560 | loss 3.377232 (+0.96z)| norm 0.2616 (-0.75z)| lr 1.34e-04 | 8439.80 ms | -100.0% bf16 MFU | 62133 tok/s +step 13644/19560 | loss 3.301415 (-0.91z)| norm 0.2770 (+0.36z)| lr 1.34e-04 | 8441.66 ms | -100.0% bf16 MFU | 62132 tok/s +step 13645/19560 | loss 3.378675 (+0.99z)| norm 0.2576 (-1.03z)| lr 1.34e-04 | 8460.63 ms | -100.0% bf16 MFU | 62124 tok/s +step 13646/19560 | loss 3.351954 (+0.34z)| norm 0.2774 (+0.40z)| lr 1.34e-04 | 8463.28 ms | -100.0% bf16 MFU | 62115 tok/s +step 13647/19560 | loss 3.353741 (+0.37z)| norm 0.2676 (-0.30z)| lr 1.34e-04 | 8464.30 ms | -100.0% bf16 MFU | 62106 tok/s +step 13648/19560 | loss 3.417730 (+1.91z)| norm 0.2792 (+0.61z)| lr 1.34e-04 | 8461.67 ms | -100.0% bf16 MFU | 62099 tok/s +step 13649/19560 | loss 3.328324 (-0.27z)| norm 0.2656 (-0.45z)| lr 1.34e-04 | 8458.32 ms | -100.0% bf16 MFU | 62093 tok/s +step 13650/19560 | loss 3.415866 (+1.83z)| norm 0.2625 (-0.67z)| lr 1.34e-04 | 8461.12 ms | -100.0% bf16 MFU | 62087 tok/s +step 13651/19560 | loss 3.378321 (+0.91z)| norm 0.2615 (-0.75z)| lr 1.34e-04 | 8454.05 ms | -100.0% bf16 MFU | 62083 tok/s +step 13652/19560 | loss 3.316082 (-0.60z)| norm 0.2779 (+0.51z)| lr 1.34e-04 | 8456.67 ms | -100.0% bf16 MFU | 62079 tok/s +step 13653/19560 | loss 3.344438 (+0.09z)| norm 0.2971 (+1.97z)| lr 1.34e-04 | 8454.04 ms | -100.0% bf16 MFU | 62076 tok/s +step 13654/19560 | loss 3.325053 (-0.38z)| norm 0.2530 (-1.39z)| lr 1.34e-04 | 8454.56 ms | -100.0% bf16 MFU | 62073 tok/s +step 13655/19560 | loss 3.424056 (+1.99z)| norm 0.2780 (+0.50z)| lr 1.34e-04 | 8450.05 ms | -100.0% bf16 MFU | 62071 tok/s +step 13656/19560 | loss 3.388196 (+1.12z)| norm 0.2589 (-0.95z)| lr 1.34e-04 | 8452.26 ms | -100.0% bf16 MFU | 62069 tok/s +step 13657/19560 | loss 3.380864 (+0.93z)| norm 0.2760 (+0.36z)| lr 1.34e-04 | 8454.76 ms | -100.0% bf16 MFU | 62066 tok/s +step 13658/19560 | loss 3.373249 (+0.74z)| norm 0.2789 (+0.60z)| lr 1.34e-04 | 8453.58 ms | -100.0% bf16 MFU | 62064 tok/s +step 13659/19560 | loss 3.331218 (-0.26z)| norm 0.2719 (+0.06z)| lr 1.34e-04 | 8446.67 ms | -100.0% bf16 MFU | 62064 tok/s +step 13660/19560 | loss 3.364514 (+0.53z)| norm 0.2700 (-0.07z)| lr 1.34e-04 | 8454.30 ms | -100.0% bf16 MFU | 62062 tok/s +step 13661/19560 | loss 3.501740 (+3.59z)| norm 0.2725 (+0.12z)| lr 1.34e-04 | 8448.95 ms | -100.0% bf16 MFU | 62061 tok/s +step 13662/19560 | loss 3.349339 (+0.14z)| norm 0.2769 (+0.49z)| lr 1.34e-04 | 8454.25 ms | -100.0% bf16 MFU | 62059 tok/s +step 13663/19560 | loss 3.437506 (+2.09z)| norm 0.2724 (+0.13z)| lr 1.34e-04 | 8443.96 ms | -100.0% bf16 MFU | 62061 tok/s +step 13664/19560 | loss 3.365594 (+0.48z)| norm 0.2778 (+0.56z)| lr 1.33e-04 | 8454.13 ms | -100.0% bf16 MFU | 62058 tok/s +step 13665/19560 | loss 3.330460 (-0.31z)| norm 0.2799 (+0.73z)| lr 1.33e-04 | 8449.57 ms | -100.0% bf16 MFU | 62058 tok/s +step 13666/19560 | loss 3.319382 (-0.55z)| norm 0.2724 (+0.13z)| lr 1.33e-04 | 8450.44 ms | -100.0% bf16 MFU | 62057 tok/s +step 13667/19560 | loss 3.350890 (+0.16z)| norm 0.2763 (+0.43z)| lr 1.33e-04 | 8454.90 ms | -100.0% bf16 MFU | 62055 tok/s +step 13668/19560 | loss 3.360533 (+0.36z)| norm 0.2716 (+0.05z)| lr 1.33e-04 | 8443.93 ms | -100.0% bf16 MFU | 62057 tok/s +step 13669/19560 | loss 3.285779 (-1.32z)| norm 0.2555 (-1.22z)| lr 1.33e-04 | 8452.62 ms | -100.0% bf16 MFU | 62055 tok/s +step 13670/19560 | loss 3.320701 (-0.52z)| norm 0.2686 (-0.18z)| lr 1.33e-04 | 8453.96 ms | -100.0% bf16 MFU | 62053 tok/s +step 13671/19560 | loss 3.317105 (-0.60z)| norm 0.2757 (+0.38z)| lr 1.33e-04 | 8458.02 ms | -100.0% bf16 MFU | 62050 tok/s +step 13672/19560 | loss 3.324110 (-0.45z)| norm 0.2609 (-0.79z)| lr 1.33e-04 | 8454.24 ms | -100.0% bf16 MFU | 62048 tok/s +step 13673/19560 | loss 3.338790 (-0.12z)| norm 0.2658 (-0.40z)| lr 1.33e-04 | 8447.87 ms | -100.0% bf16 MFU | 62049 tok/s +step 13674/19560 | loss 3.327187 (-0.38z)| norm 0.2710 (+0.01z)| lr 1.33e-04 | 8447.02 ms | -100.0% bf16 MFU | 62050 tok/s +step 13675/19560 | loss 3.307223 (-0.84z)| norm 0.2554 (-1.22z)| lr 1.33e-04 | 8450.00 ms | -100.0% bf16 MFU | 62050 tok/s +step 13676/19560 | loss 3.367210 (+0.52z)| norm 0.2601 (-0.84z)| lr 1.33e-04 | 8447.65 ms | -100.0% bf16 MFU | 62050 tok/s +step 13677/19560 | loss 3.313640 (-0.69z)| norm 0.2681 (-0.20z)| lr 1.33e-04 | 8450.95 ms | -100.0% bf16 MFU | 62050 tok/s +step 13678/19560 | loss 3.368386 (+0.55z)| norm 0.2575 (-1.05z)| lr 1.33e-04 | 8454.62 ms | -100.0% bf16 MFU | 62048 tok/s +step 13679/19560 | loss 3.311634 (-0.72z)| norm 0.2573 (-1.05z)| lr 1.33e-04 | 8452.04 ms | -100.0% bf16 MFU | 62047 tok/s +step 13680/19560 | loss 3.346374 (+0.05z)| norm 0.2663 (-0.34z)| lr 1.33e-04 | 8453.43 ms | -100.0% bf16 MFU | 62046 tok/s +step 13681/19560 | loss 3.323894 (-0.46z)| norm 0.2681 (-0.20z)| lr 1.33e-04 | 8445.97 ms | -100.0% bf16 MFU | 62047 tok/s +step 13682/19560 | loss 3.323323 (-0.47z)| norm 0.2616 (-0.71z)| lr 1.33e-04 | 8457.32 ms | -100.0% bf16 MFU | 62044 tok/s +step 13683/19560 | loss 3.431523 (+1.95z)| norm 0.2851 (+1.14z)| lr 1.33e-04 | 8454.92 ms | -100.0% bf16 MFU | 62043 tok/s +step 13684/19560 | loss 3.367560 (+0.50z)| norm 0.2642 (-0.52z)| lr 1.33e-04 | 8457.17 ms | -100.0% bf16 MFU | 62040 tok/s +step 13685/19560 | loss 3.323236 (-0.48z)| norm 0.2724 (+0.15z)| lr 1.33e-04 | 8445.84 ms | -100.0% bf16 MFU | 62042 tok/s +step 13686/19560 | loss 3.372891 (+0.65z)| norm 0.2752 (+0.39z)| lr 1.33e-04 | 8452.47 ms | -100.0% bf16 MFU | 62041 tok/s +step 13687/19560 | loss 3.325949 (-0.43z)| norm 0.2709 (+0.05z)| lr 1.33e-04 | 8446.83 ms | -100.0% bf16 MFU | 62043 tok/s +step 13688/19560 | loss 3.369641 (+0.57z)| norm 0.2702 (-0.02z)| lr 1.32e-04 | 8451.26 ms | -100.0% bf16 MFU | 62042 tok/s +step 13689/19560 | loss 3.309085 (-0.82z)| norm 0.2583 (-0.99z)| lr 1.32e-04 | 8450.02 ms | -100.0% bf16 MFU | 62043 tok/s +step 13690/19560 | loss 3.347264 (+0.06z)| norm 0.2734 (+0.27z)| lr 1.32e-04 | 8440.95 ms | -100.0% bf16 MFU | 62046 tok/s +step 13691/19560 | loss 3.329461 (-0.36z)| norm 0.2700 (-0.02z)| lr 1.32e-04 | 8454.77 ms | -100.0% bf16 MFU | 62044 tok/s +step 13692/19560 | loss 3.333572 (-0.27z)| norm 0.2597 (-0.87z)| lr 1.32e-04 | 8453.09 ms | -100.0% bf16 MFU | 62043 tok/s +step 13693/19560 | loss 3.328218 (-0.39z)| norm 0.2709 (+0.04z)| lr 1.32e-04 | 8448.18 ms | -100.0% bf16 MFU | 62044 tok/s +step 13694/19560 | loss 3.398799 (+1.23z)| norm 0.2551 (-1.28z)| lr 1.32e-04 | 8444.11 ms | -100.0% bf16 MFU | 62046 tok/s +step 13695/19560 | loss 3.301488 (-1.02z)| norm 0.2716 (+0.11z)| lr 1.32e-04 | 8452.04 ms | -100.0% bf16 MFU | 62046 tok/s +step 13696/19560 | loss 3.400409 (+1.25z)| norm 0.2681 (-0.19z)| lr 1.32e-04 | 8453.29 ms | -100.0% bf16 MFU | 62044 tok/s +step 13697/19560 | loss 3.326648 (-0.45z)| norm 0.2644 (-0.51z)| lr 1.32e-04 | 8449.14 ms | -100.0% bf16 MFU | 62045 tok/s +step 13698/19560 | loss 3.370453 (+0.55z)| norm 0.2769 (+0.55z)| lr 1.32e-04 | 8448.72 ms | -100.0% bf16 MFU | 62045 tok/s +step 13699/19560 | loss 3.383026 (+0.83z)| norm 0.2817 (+0.95z)| lr 1.32e-04 | 8448.34 ms | -100.0% bf16 MFU | 62046 tok/s +step 13700/19560 | loss 3.306421 (-0.94z)| norm 0.2567 (-1.17z)| lr 1.32e-04 | 8446.73 ms | -100.0% bf16 MFU | 62047 tok/s +step 13701/19560 | loss 3.349177 (+0.05z)| norm 0.2919 (+1.77z)| lr 1.32e-04 | 8449.96 ms | -100.0% bf16 MFU | 62047 tok/s +step 13702/19560 | loss 3.352108 (+0.11z)| norm 0.2670 (-0.31z)| lr 1.32e-04 | 8453.18 ms | -100.0% bf16 MFU | 62046 tok/s +step 13703/19560 | loss 3.341399 (-0.14z)| norm 0.2577 (-1.10z)| lr 1.32e-04 | 8449.73 ms | -100.0% bf16 MFU | 62046 tok/s +step 13704/19560 | loss 3.352065 (+0.09z)| norm 0.2668 (-0.32z)| lr 1.32e-04 | 8450.28 ms | -100.0% bf16 MFU | 62046 tok/s +step 13705/19560 | loss 3.426285 (+1.79z)| norm 0.2803 (+0.80z)| lr 1.32e-04 | 8454.55 ms | -100.0% bf16 MFU | 62044 tok/s +step 13706/19560 | loss 3.359259 (+0.24z)| norm 0.2769 (+0.52z)| lr 1.32e-04 | 8447.19 ms | -100.0% bf16 MFU | 62045 tok/s +step 13707/19560 | loss 3.322135 (-0.62z)| norm 0.2674 (-0.29z)| lr 1.32e-04 | 8452.59 ms | -100.0% bf16 MFU | 62044 tok/s +step 13708/19560 | loss 3.319055 (-0.69z)| norm 0.2905 (+1.64z)| lr 1.32e-04 | 8446.09 ms | -100.0% bf16 MFU | 62046 tok/s +step 13709/19560 | loss 3.316834 (-0.77z)| norm 0.3017 (+2.49z)| lr 1.32e-04 | 8450.16 ms | -100.0% bf16 MFU | 62046 tok/s +step 13710/19560 | loss 3.352226 (+0.06z)| norm 0.2780 (+0.55z)| lr 1.32e-04 | 8445.79 ms | -100.0% bf16 MFU | 62047 tok/s +step 13711/19560 | loss 3.346776 (-0.07z)| norm 0.2789 (+0.61z)| lr 1.32e-04 | 8448.23 ms | -100.0% bf16 MFU | 62048 tok/s +step 13712/19560 | loss 3.381956 (+0.75z)| norm 0.2563 (-1.26z)| lr 1.31e-04 | 8444.95 ms | -100.0% bf16 MFU | 62050 tok/s +step 13713/19560 | loss 3.351075 (+0.03z)| norm 0.2827 (+0.92z)| lr 1.31e-04 | 8443.75 ms | -100.0% bf16 MFU | 62052 tok/s +step 13714/19560 | loss 3.246758 (-2.37z)| norm 0.2582 (-1.12z)| lr 1.31e-04 | 8440.67 ms | -100.0% bf16 MFU | 62055 tok/s +step 13715/19560 | loss 3.378099 (+0.67z)| norm 0.3104 (+3.11z)| lr 1.31e-04 | 8442.17 ms | -100.0% bf16 MFU | 62057 tok/s +step 13716/19560 | loss 3.302409 (-1.07z)| norm 0.2606 (-0.92z)| lr 1.31e-04 | 8436.69 ms | -100.0% bf16 MFU | 62062 tok/s +step 13717/19560 | loss 3.308501 (-0.93z)| norm 0.2691 (-0.26z)| lr 1.31e-04 | 8442.52 ms | -100.0% bf16 MFU | 62064 tok/s +step 13718/19560 | loss 3.322285 (-0.60z)| norm 0.2592 (-1.06z)| lr 1.31e-04 | 8435.67 ms | -100.0% bf16 MFU | 62068 tok/s +step 13719/19560 | loss 3.464358 (+2.65z)| norm 0.2864 (+1.17z)| lr 1.31e-04 | 8433.85 ms | -100.0% bf16 MFU | 62073 tok/s +step 13720/19560 | loss 3.452454 (+2.30z)| norm 0.2764 (+0.34z)| lr 1.31e-04 | 8437.96 ms | -100.0% bf16 MFU | 62076 tok/s +step 13721/19560 | loss 3.313640 (-0.82z)| norm 0.2711 (-0.10z)| lr 1.31e-04 | 8442.92 ms | -100.0% bf16 MFU | 62077 tok/s +step 13722/19560 | loss 3.394293 (+0.98z)| norm 0.3050 (+2.61z)| lr 1.31e-04 | 8437.81 ms | -100.0% bf16 MFU | 62080 tok/s +step 13723/19560 | loss 3.276221 (-1.65z)| norm 0.2827 (+0.82z)| lr 1.31e-04 | 8436.48 ms | -100.0% bf16 MFU | 62083 tok/s +step 13724/19560 | loss 3.381794 (+0.71z)| norm 0.2822 (+0.77z)| lr 1.31e-04 | 8440.03 ms | -100.0% bf16 MFU | 62085 tok/s +step 13725/19560 | loss 3.447100 (+2.14z)| norm 0.2951 (+1.81z)| lr 1.31e-04 | 8444.18 ms | -100.0% bf16 MFU | 62085 tok/s +step 13726/19560 | loss 3.311226 (-0.86z)| norm 0.2760 (+0.26z)| lr 1.31e-04 | 8440.88 ms | -100.0% bf16 MFU | 62087 tok/s +step 13727/19560 | loss 3.340709 (-0.22z)| norm 0.2759 (+0.25z)| lr 1.31e-04 | 8436.44 ms | -100.0% bf16 MFU | 62090 tok/s +step 13728/19560 | loss 3.360639 (+0.21z)| norm 0.2822 (+0.74z)| lr 1.31e-04 | 8439.96 ms | -100.0% bf16 MFU | 62091 tok/s +step 13729/19560 | loss 3.387193 (+0.80z)| norm 0.2777 (+0.38z)| lr 1.31e-04 | 8436.32 ms | -100.0% bf16 MFU | 62094 tok/s +step 13730/19560 | loss 3.407099 (+1.24z)| norm 0.2948 (+1.73z)| lr 1.31e-04 | 8438.96 ms | -100.0% bf16 MFU | 62095 tok/s +step 13731/19560 | loss 3.351469 (-0.04z)| norm 0.2740 (+0.04z)| lr 1.31e-04 | 8438.96 ms | -100.0% bf16 MFU | 62097 tok/s +step 13732/19560 | loss 3.351722 (-0.04z)| norm 0.2826 (+0.75z)| lr 1.31e-04 | 8431.71 ms | -100.0% bf16 MFU | 62101 tok/s +step 13733/19560 | loss 3.333273 (-0.46z)| norm 0.2759 (+0.19z)| lr 1.31e-04 | 8438.38 ms | -100.0% bf16 MFU | 62103 tok/s +step 13734/19560 | loss 3.422868 (+1.57z)| norm 0.2574 (-1.30z)| lr 1.31e-04 | 8437.75 ms | -100.0% bf16 MFU | 62104 tok/s +step 13735/19560 | loss 3.385603 (+0.71z)| norm 0.2570 (-1.32z)| lr 1.31e-04 | 8433.19 ms | -100.0% bf16 MFU | 62108 tok/s +step 13736/19560 | loss 3.271734 (-1.85z)| norm 0.2560 (-1.38z)| lr 1.30e-04 | 8433.53 ms | -100.0% bf16 MFU | 62111 tok/s +step 13737/19560 | loss 3.342303 (-0.24z)| norm 0.2703 (-0.20z)| lr 1.30e-04 | 8434.35 ms | -100.0% bf16 MFU | 62113 tok/s +step 13738/19560 | loss 3.375185 (+0.52z)| norm 0.2642 (-0.69z)| lr 1.30e-04 | 8431.64 ms | -100.0% bf16 MFU | 62117 tok/s +step 13739/19560 | loss 3.335782 (-0.39z)| norm 0.2705 (-0.17z)| lr 1.30e-04 | 8435.66 ms | -100.0% bf16 MFU | 62118 tok/s +step 13740/19560 | loss 3.331125 (-0.48z)| norm 0.2875 (+1.24z)| lr 1.30e-04 | 8433.83 ms | -100.0% bf16 MFU | 62121 tok/s +step 13741/19560 | loss 3.308561 (-1.00z)| norm 0.2751 (+0.22z)| lr 1.30e-04 | 8438.77 ms | -100.0% bf16 MFU | 62121 tok/s +step 13742/19560 | loss 3.358062 (+0.16z)| norm 0.2741 (+0.16z)| lr 1.30e-04 | 8439.83 ms | -100.0% bf16 MFU | 62121 tok/s +step 13743/19560 | loss 3.345348 (-0.14z)| norm 0.2717 (-0.03z)| lr 1.30e-04 | 8435.04 ms | -100.0% bf16 MFU | 62123 tok/s +step 13744/19560 | loss 3.346534 (-0.11z)| norm 0.2711 (-0.08z)| lr 1.30e-04 | 8433.09 ms | -100.0% bf16 MFU | 62125 tok/s +step 13745/19560 | loss 3.321783 (-0.69z)| norm 0.2704 (-0.14z)| lr 1.30e-04 | 8436.86 ms | -100.0% bf16 MFU | 62126 tok/s +step 13746/19560 | loss 3.375493 (+0.58z)| norm 0.2926 (+1.77z)| lr 1.30e-04 | 8437.97 ms | -100.0% bf16 MFU | 62126 tok/s +step 13747/19560 | loss 3.340518 (-0.25z)| norm 0.2821 (+0.85z)| lr 1.30e-04 | 8437.49 ms | -100.0% bf16 MFU | 62127 tok/s +step 13748/19560 | loss 3.342400 (-0.20z)| norm 0.2629 (-0.80z)| lr 1.30e-04 | 8436.86 ms | -100.0% bf16 MFU | 62128 tok/s +step 13749/19560 | loss 3.353253 (+0.06z)| norm 0.2876 (+1.31z)| lr 1.30e-04 | 8440.83 ms | -100.0% bf16 MFU | 62127 tok/s +step 13750/19560 | loss 3.463547 (+2.61z)| norm 0.2722 (-0.01z)| lr 1.30e-04 | 8437.09 ms | -100.0% bf16 MFU | 62128 tok/s +val loss 3.311002 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2977/10042 = 0.296455 +step 13751/19560 | loss 3.294955 (-1.32z)| norm 0.2739 (+0.15z)| lr 1.30e-04 | 8435.95 ms | -100.0% bf16 MFU | 62129 tok/s +step 13752/19560 | loss 3.372338 (+0.47z)| norm 0.2729 (+0.07z)| lr 1.30e-04 | 8431.44 ms | -100.0% bf16 MFU | 62131 tok/s +step 13753/19560 | loss 3.350152 (-0.04z)| norm 0.2782 (+0.52z)| lr 1.30e-04 | 8433.22 ms | -100.0% bf16 MFU | 62133 tok/s +step 13754/19560 | loss 3.380721 (+0.66z)| norm 0.2675 (-0.40z)| lr 1.30e-04 | 8431.89 ms | -100.0% bf16 MFU | 62136 tok/s +step 13755/19560 | loss 3.347781 (-0.11z)| norm 0.2852 (+1.13z)| lr 1.30e-04 | 8436.25 ms | -100.0% bf16 MFU | 62136 tok/s +step 13756/19560 | loss 3.372237 (+0.46z)| norm 0.2556 (-1.43z)| lr 1.30e-04 | 8438.09 ms | -100.0% bf16 MFU | 62136 tok/s +step 13757/19560 | loss 3.375817 (+0.55z)| norm 0.2924 (+1.73z)| lr 1.30e-04 | 8438.57 ms | -100.0% bf16 MFU | 62136 tok/s +step 13758/19560 | loss 3.358608 (+0.14z)| norm 0.2761 (+0.35z)| lr 1.30e-04 | 8435.87 ms | -100.0% bf16 MFU | 62137 tok/s +step 13759/19560 | loss 3.385209 (+0.77z)| norm 0.3008 (+2.40z)| lr 1.30e-04 | 8434.66 ms | -100.0% bf16 MFU | 62138 tok/s +step 13760/19560 | loss 3.401658 (+1.16z)| norm 0.2905 (+1.50z)| lr 1.29e-04 | 8435.88 ms | -100.0% bf16 MFU | 62138 tok/s +step 13761/19560 | loss 3.330325 (-0.51z)| norm 0.2682 (-0.35z)| lr 1.29e-04 | 8441.74 ms | -100.0% bf16 MFU | 62137 tok/s +step 13762/19560 | loss 3.303224 (-1.17z)| norm 0.3051 (+2.70z)| lr 1.29e-04 | 8439.15 ms | -100.0% bf16 MFU | 62136 tok/s +step 13763/19560 | loss 3.342712 (-0.23z)| norm 0.2900 (+1.43z)| lr 1.29e-04 | 8440.79 ms | -100.0% bf16 MFU | 62135 tok/s +step 13764/19560 | loss 3.338436 (-0.34z)| norm 0.3050 (+2.58z)| lr 1.29e-04 | 8440.33 ms | -100.0% bf16 MFU | 62134 tok/s +step 13765/19560 | loss 3.309593 (-1.04z)| norm 0.2843 (+0.89z)| lr 1.29e-04 | 8440.51 ms | -100.0% bf16 MFU | 62133 tok/s +step 13766/19560 | loss 3.313793 (-0.93z)| norm 0.2837 (+0.83z)| lr 1.29e-04 | 8436.10 ms | -100.0% bf16 MFU | 62134 tok/s +step 13767/19560 | loss 3.359906 (+0.16z)| norm 0.2779 (+0.35z)| lr 1.29e-04 | 8442.57 ms | -100.0% bf16 MFU | 62132 tok/s +step 13768/19560 | loss 3.314195 (-0.92z)| norm 0.2904 (+1.34z)| lr 1.29e-04 | 8438.95 ms | -100.0% bf16 MFU | 62132 tok/s +step 13769/19560 | loss 3.327054 (-0.62z)| norm 0.2743 (+0.03z)| lr 1.29e-04 | 8441.84 ms | -100.0% bf16 MFU | 62131 tok/s +step 13770/19560 | loss 3.310308 (-1.01z)| norm 0.3087 (+2.75z)| lr 1.29e-04 | 8440.78 ms | -100.0% bf16 MFU | 62130 tok/s +step 13771/19560 | loss 3.348054 (-0.09z)| norm 0.2873 (+1.02z)| lr 1.29e-04 | 8438.30 ms | -100.0% bf16 MFU | 62130 tok/s +step 13772/19560 | loss 3.246130 (-2.51z)| norm 0.2944 (+1.56z)| lr 1.29e-04 | 8438.15 ms | -100.0% bf16 MFU | 62130 tok/s +step 13773/19560 | loss 3.295917 (-1.30z)| norm 0.2941 (+1.52z)| lr 1.29e-04 | 8441.61 ms | -100.0% bf16 MFU | 62129 tok/s +step 13774/19560 | loss 3.323686 (-0.64z)| norm 0.2781 (+0.26z)| lr 1.29e-04 | 8437.27 ms | -100.0% bf16 MFU | 62129 tok/s +step 13775/19560 | loss 3.473752 (+2.79z)| norm 0.2958 (+1.62z)| lr 1.29e-04 | 8437.53 ms | -100.0% bf16 MFU | 62130 tok/s +step 13776/19560 | loss 3.369226 (+0.42z)| norm 0.2937 (+1.43z)| lr 1.29e-04 | 8439.49 ms | -100.0% bf16 MFU | 62130 tok/s +step 13777/19560 | loss 3.392117 (+0.93z)| norm 0.2769 (+0.13z)| lr 1.29e-04 | 8439.40 ms | -100.0% bf16 MFU | 62129 tok/s +step 13778/19560 | loss 3.321728 (-0.68z)| norm 0.2844 (+0.70z)| lr 1.29e-04 | 8442.18 ms | -100.0% bf16 MFU | 62128 tok/s +step 13779/19560 | loss 3.332057 (-0.43z)| norm 0.2647 (-0.84z)| lr 1.29e-04 | 8441.79 ms | -100.0% bf16 MFU | 62127 tok/s +step 13780/19560 | loss 3.355956 (+0.12z)| norm 0.2738 (-0.13z)| lr 1.29e-04 | 8441.51 ms | -100.0% bf16 MFU | 62126 tok/s +step 13781/19560 | loss 3.376420 (+0.59z)| norm 0.2767 (+0.11z)| lr 1.29e-04 | 8440.35 ms | -100.0% bf16 MFU | 62126 tok/s +step 13782/19560 | loss 3.353199 (+0.04z)| norm 0.2695 (-0.47z)| lr 1.29e-04 | 8442.51 ms | -100.0% bf16 MFU | 62124 tok/s +step 13783/19560 | loss 3.412988 (+1.44z)| norm 0.2686 (-0.54z)| lr 1.29e-04 | 8438.73 ms | -100.0% bf16 MFU | 62124 tok/s +step 13784/19560 | loss 3.335034 (-0.37z)| norm 0.2625 (-1.03z)| lr 1.29e-04 | 8439.88 ms | -100.0% bf16 MFU | 62124 tok/s +step 13785/19560 | loss 3.346910 (-0.09z)| norm 0.2662 (-0.73z)| lr 1.28e-04 | 8444.42 ms | -100.0% bf16 MFU | 62122 tok/s +step 13786/19560 | loss 3.385059 (+0.80z)| norm 0.2623 (-1.03z)| lr 1.28e-04 | 8441.12 ms | -100.0% bf16 MFU | 62122 tok/s +step 13787/19560 | loss 3.354930 (+0.09z)| norm 0.2782 (+0.24z)| lr 1.28e-04 | 8439.00 ms | -100.0% bf16 MFU | 62122 tok/s +step 13788/19560 | loss 3.299835 (-1.18z)| norm 0.2453 (-2.32z)| lr 1.28e-04 | 8437.96 ms | -100.0% bf16 MFU | 62123 tok/s +step 13789/19560 | loss 3.310119 (-0.95z)| norm 0.2638 (-0.86z)| lr 1.28e-04 | 8437.13 ms | -100.0% bf16 MFU | 62124 tok/s +step 13790/19560 | loss 3.328906 (-0.48z)| norm 0.2699 (-0.39z)| lr 1.28e-04 | 8440.65 ms | -100.0% bf16 MFU | 62123 tok/s +step 13791/19560 | loss 3.344583 (-0.09z)| norm 0.2614 (-1.04z)| lr 1.28e-04 | 8441.58 ms | -100.0% bf16 MFU | 62122 tok/s +step 13792/19560 | loss 3.332811 (-0.37z)| norm 0.2649 (-0.76z)| lr 1.28e-04 | 8440.78 ms | -100.0% bf16 MFU | 62122 tok/s +step 13793/19560 | loss 3.289743 (-1.43z)| norm 0.2626 (-0.92z)| lr 1.28e-04 | 8439.52 ms | -100.0% bf16 MFU | 62122 tok/s +step 13794/19560 | loss 3.397043 (+1.21z)| norm 0.2672 (-0.57z)| lr 1.28e-04 | 8437.84 ms | -100.0% bf16 MFU | 62123 tok/s +step 13795/19560 | loss 3.362788 (+0.36z)| norm 0.2606 (-1.06z)| lr 1.28e-04 | 8439.57 ms | -100.0% bf16 MFU | 62123 tok/s +step 13796/19560 | loss 3.391019 (+1.04z)| norm 0.2764 (+0.15z)| lr 1.28e-04 | 8442.04 ms | -100.0% bf16 MFU | 62122 tok/s +step 13797/19560 | loss 3.320178 (-0.70z)| norm 0.2595 (-1.15z)| lr 1.28e-04 | 8438.42 ms | -100.0% bf16 MFU | 62122 tok/s +step 13798/19560 | loss 3.320619 (-0.69z)| norm 0.2681 (-0.49z)| lr 1.28e-04 | 8439.82 ms | -100.0% bf16 MFU | 62122 tok/s +step 13799/19560 | loss 3.322258 (-0.65z)| norm 0.2546 (-1.50z)| lr 1.28e-04 | 8440.19 ms | -100.0% bf16 MFU | 62122 tok/s +step 13800/19560 | loss 3.340606 (-0.20z)| norm 0.2554 (-1.43z)| lr 1.28e-04 | 8441.23 ms | -100.0% bf16 MFU | 62121 tok/s +step 13801/19560 | loss 3.306473 (-1.04z)| norm 0.2782 (+0.29z)| lr 1.28e-04 | 8437.72 ms | -100.0% bf16 MFU | 62122 tok/s +step 13802/19560 | loss 3.331065 (-0.43z)| norm 0.2518 (-1.69z)| lr 1.28e-04 | 8440.57 ms | -100.0% bf16 MFU | 62122 tok/s +step 13803/19560 | loss 3.350967 (+0.05z)| norm 0.2816 (+0.54z)| lr 1.28e-04 | 8439.40 ms | -100.0% bf16 MFU | 62122 tok/s +step 13804/19560 | loss 3.273783 (-1.82z)| norm 0.2667 (-0.59z)| lr 1.28e-04 | 8441.25 ms | -100.0% bf16 MFU | 62121 tok/s +step 13805/19560 | loss 3.413122 (+1.56z)| norm 0.2874 (+0.96z)| lr 1.28e-04 | 8437.16 ms | -100.0% bf16 MFU | 62122 tok/s +step 13806/19560 | loss 3.364001 (+0.37z)| norm 0.2889 (+1.06z)| lr 1.28e-04 | 8438.21 ms | -100.0% bf16 MFU | 62123 tok/s +step 13807/19560 | loss 3.570353 (+4.83z)| norm 0.2941 (+1.43z)| lr 1.28e-04 | 8438.48 ms | -100.0% bf16 MFU | 62123 tok/s +step 13808/19560 | loss 3.276262 (-1.62z)| norm 0.2832 (+0.60z)| lr 1.28e-04 | 8437.42 ms | -100.0% bf16 MFU | 62124 tok/s +step 13809/19560 | loss 3.382844 (+0.70z)| norm 0.2826 (+0.55z)| lr 1.27e-04 | 8438.58 ms | -100.0% bf16 MFU | 62124 tok/s +step 13810/19560 | loss 3.355047 (+0.09z)| norm 0.2863 (+0.81z)| lr 1.27e-04 | 8437.71 ms | -100.0% bf16 MFU | 62125 tok/s +step 13811/19560 | loss 3.317292 (-0.73z)| norm 0.2630 (-0.94z)| lr 1.27e-04 | 8439.64 ms | -100.0% bf16 MFU | 62125 tok/s +step 13812/19560 | loss 3.406994 (+1.24z)| norm 0.3157 (+2.93z)| lr 1.27e-04 | 8441.21 ms | -100.0% bf16 MFU | 62124 tok/s +step 13813/19560 | loss 3.376209 (+0.55z)| norm 0.2808 (+0.36z)| lr 1.27e-04 | 8436.97 ms | -100.0% bf16 MFU | 62125 tok/s +step 13814/19560 | loss 3.353431 (+0.06z)| norm 0.2902 (+1.04z)| lr 1.27e-04 | 8438.46 ms | -100.0% bf16 MFU | 62125 tok/s +step 13815/19560 | loss 3.379401 (+0.62z)| norm 0.2730 (-0.22z)| lr 1.27e-04 | 8436.94 ms | -100.0% bf16 MFU | 62126 tok/s +step 13816/19560 | loss 3.422960 (+1.55z)| norm 0.2892 (+0.95z)| lr 1.27e-04 | 8439.12 ms | -100.0% bf16 MFU | 62126 tok/s +step 13817/19560 | loss 3.444075 (+1.96z)| norm 0.3060 (+2.12z)| lr 1.27e-04 | 8438.92 ms | -100.0% bf16 MFU | 62126 tok/s +step 13818/19560 | loss 3.334842 (-0.38z)| norm 0.2838 (+0.52z)| lr 1.27e-04 | 8438.88 ms | -100.0% bf16 MFU | 62126 tok/s +step 13819/19560 | loss 3.373915 (+0.45z)| norm 0.2736 (-0.22z)| lr 1.27e-04 | 8438.93 ms | -100.0% bf16 MFU | 62126 tok/s +step 13820/19560 | loss 3.357511 (+0.09z)| norm 0.2757 (-0.08z)| lr 1.27e-04 | 8439.19 ms | -100.0% bf16 MFU | 62126 tok/s +step 13821/19560 | loss 3.477108 (+2.57z)| norm 0.3082 (+2.21z)| lr 1.27e-04 | 8438.86 ms | -100.0% bf16 MFU | 62126 tok/s +step 13822/19560 | loss 3.305213 (-1.01z)| norm 0.2750 (-0.16z)| lr 1.27e-04 | 8439.16 ms | -100.0% bf16 MFU | 62126 tok/s +step 13823/19560 | loss 3.376376 (+0.47z)| norm 0.2937 (+1.16z)| lr 1.27e-04 | 8440.28 ms | -100.0% bf16 MFU | 62126 tok/s +step 13824/19560 | loss 3.332418 (-0.45z)| norm 0.2752 (-0.16z)| lr 1.27e-04 | 8439.04 ms | -100.0% bf16 MFU | 62126 tok/s +step 13825/19560 | loss 3.343563 (-0.21z)| norm 0.2790 (+0.10z)| lr 1.27e-04 | 8439.52 ms | -100.0% bf16 MFU | 62126 tok/s +step 13826/19560 | loss 3.369080 (+0.32z)| norm 0.2724 (-0.37z)| lr 1.27e-04 | 8440.65 ms | -100.0% bf16 MFU | 62125 tok/s +step 13827/19560 | loss 3.308183 (-0.95z)| norm 0.2619 (-1.11z)| lr 1.27e-04 | 8435.99 ms | -100.0% bf16 MFU | 62126 tok/s +step 13828/19560 | loss 3.363775 (+0.21z)| norm 0.2889 (+0.81z)| lr 1.27e-04 | 8435.90 ms | -100.0% bf16 MFU | 62127 tok/s +step 13829/19560 | loss 3.342543 (-0.23z)| norm 0.2566 (-1.49z)| lr 1.27e-04 | 8434.71 ms | -100.0% bf16 MFU | 62129 tok/s +step 13830/19560 | loss 3.366169 (+0.26z)| norm 0.2804 (+0.21z)| lr 1.27e-04 | 8433.33 ms | -100.0% bf16 MFU | 62131 tok/s +step 13831/19560 | loss 3.336060 (-0.37z)| norm 0.2928 (+1.08z)| lr 1.27e-04 | 8429.97 ms | -100.0% bf16 MFU | 62134 tok/s +step 13832/19560 | loss 3.324420 (-0.61z)| norm 0.2763 (-0.11z)| lr 1.27e-04 | 8431.79 ms | -100.0% bf16 MFU | 62136 tok/s +step 13833/19560 | loss 3.331052 (-0.46z)| norm 0.2932 (+1.09z)| lr 1.27e-04 | 8429.29 ms | -100.0% bf16 MFU | 62139 tok/s +step 13834/19560 | loss 3.341129 (-0.24z)| norm 0.2795 (+0.11z)| lr 1.26e-04 | 8429.47 ms | -100.0% bf16 MFU | 62142 tok/s +step 13835/19560 | loss 3.389631 (+0.78z)| norm 0.2832 (+0.37z)| lr 1.26e-04 | 8431.85 ms | -100.0% bf16 MFU | 62144 tok/s +step 13836/19560 | loss 3.250951 (-2.12z)| norm 0.2804 (+0.18z)| lr 1.26e-04 | 8456.35 ms | -100.0% bf16 MFU | 62137 tok/s +step 13837/19560 | loss 3.344810 (-0.17z)| norm 0.2608 (-1.22z)| lr 1.26e-04 | 8457.57 ms | -100.0% bf16 MFU | 62130 tok/s +step 13838/19560 | loss 3.248078 (-2.14z)| norm 0.2757 (-0.14z)| lr 1.26e-04 | 8456.95 ms | -100.0% bf16 MFU | 62123 tok/s +step 13839/19560 | loss 3.325451 (-0.54z)| norm 0.2767 (-0.07z)| lr 1.26e-04 | 8457.71 ms | -100.0% bf16 MFU | 62116 tok/s +step 13840/19560 | loss 3.374269 (+0.46z)| norm 0.2801 (+0.17z)| lr 1.26e-04 | 8458.82 ms | -100.0% bf16 MFU | 62109 tok/s +step 13841/19560 | loss 3.269615 (-1.66z)| norm 0.2931 (+1.11z)| lr 1.26e-04 | 8454.77 ms | -100.0% bf16 MFU | 62105 tok/s +step 13842/19560 | loss 3.293051 (-1.20z)| norm 0.2648 (-0.97z)| lr 1.26e-04 | 8456.24 ms | -100.0% bf16 MFU | 62099 tok/s +step 13843/19560 | loss 3.300055 (-1.04z)| norm 0.3000 (+1.64z)| lr 1.26e-04 | 8457.67 ms | -100.0% bf16 MFU | 62094 tok/s +step 13844/19560 | loss 3.323904 (-0.56z)| norm 0.2999 (+1.61z)| lr 1.26e-04 | 8459.82 ms | -100.0% bf16 MFU | 62088 tok/s +step 13845/19560 | loss 3.299378 (-1.06z)| norm 0.3114 (+2.38z)| lr 1.26e-04 | 8458.70 ms | -100.0% bf16 MFU | 62083 tok/s +step 13846/19560 | loss 3.277251 (-1.49z)| norm 0.2897 (+0.79z)| lr 1.26e-04 | 8455.23 ms | -100.0% bf16 MFU | 62079 tok/s +step 13847/19560 | loss 3.365882 (+0.34z)| norm 0.2866 (+0.57z)| lr 1.26e-04 | 8453.05 ms | -100.0% bf16 MFU | 62076 tok/s +step 13848/19560 | loss 3.325072 (-0.50z)| norm 0.2864 (+0.55z)| lr 1.26e-04 | 8454.14 ms | -100.0% bf16 MFU | 62073 tok/s +step 13849/19560 | loss 3.323912 (-0.53z)| norm 0.2893 (+0.75z)| lr 1.26e-04 | 8448.58 ms | -100.0% bf16 MFU | 62072 tok/s +step 13850/19560 | loss 3.302676 (-0.96z)| norm 0.2666 (-0.89z)| lr 1.26e-04 | 8456.31 ms | -100.0% bf16 MFU | 62069 tok/s +step 13851/19560 | loss 3.267761 (-1.70z)| norm 0.2973 (+1.35z)| lr 1.26e-04 | 8455.06 ms | -100.0% bf16 MFU | 62066 tok/s +step 13852/19560 | loss 3.303596 (-0.93z)| norm 0.2708 (-0.57z)| lr 1.26e-04 | 8456.18 ms | -100.0% bf16 MFU | 62062 tok/s +step 13853/19560 | loss 3.349345 (+0.06z)| norm 0.2562 (-1.60z)| lr 1.26e-04 | 8451.06 ms | -100.0% bf16 MFU | 62061 tok/s +step 13854/19560 | loss 3.295394 (-1.10z)| norm 0.2743 (-0.30z)| lr 1.26e-04 | 8458.37 ms | -100.0% bf16 MFU | 62057 tok/s +step 13855/19560 | loss 3.253047 (-1.96z)| norm 0.2631 (-1.09z)| lr 1.26e-04 | 8462.99 ms | -100.0% bf16 MFU | 62052 tok/s +step 13856/19560 | loss 3.396931 (+1.07z)| norm 0.2647 (-0.97z)| lr 1.26e-04 | 8456.11 ms | -100.0% bf16 MFU | 62049 tok/s +step 13857/19560 | loss 3.306733 (-0.82z)| norm 0.2771 (-0.08z)| lr 1.26e-04 | 8460.17 ms | -100.0% bf16 MFU | 62045 tok/s +step 13858/19560 | loss 3.263705 (-1.69z)| norm 0.2764 (-0.12z)| lr 1.25e-04 | 8458.39 ms | -100.0% bf16 MFU | 62042 tok/s +step 13859/19560 | loss 3.298237 (-0.96z)| norm 0.2615 (-1.18z)| lr 1.25e-04 | 8458.46 ms | -100.0% bf16 MFU | 62040 tok/s +step 13860/19560 | loss 3.249119 (-1.94z)| norm 0.2914 (+0.97z)| lr 1.25e-04 | 8458.97 ms | -100.0% bf16 MFU | 62037 tok/s +step 13861/19560 | loss 3.256426 (-1.75z)| norm 0.2651 (-0.91z)| lr 1.25e-04 | 8459.03 ms | -100.0% bf16 MFU | 62034 tok/s +step 13862/19560 | loss 3.279031 (-1.27z)| norm 0.2662 (-0.84z)| lr 1.25e-04 | 8455.57 ms | -100.0% bf16 MFU | 62032 tok/s +step 13863/19560 | loss 3.345062 (+0.08z)| norm 0.2676 (-0.75z)| lr 1.25e-04 | 8457.50 ms | -100.0% bf16 MFU | 62030 tok/s +step 13864/19560 | loss 3.357023 (+0.31z)| norm 0.2855 (+0.53z)| lr 1.25e-04 | 8452.96 ms | -100.0% bf16 MFU | 62030 tok/s +step 13865/19560 | loss 3.266749 (-1.52z)| norm 0.2688 (-0.69z)| lr 1.25e-04 | 8448.10 ms | -100.0% bf16 MFU | 62031 tok/s +step 13866/19560 | loss 3.356250 (+0.31z)| norm 0.2896 (+0.81z)| lr 1.25e-04 | 8451.01 ms | -100.0% bf16 MFU | 62032 tok/s +step 13867/19560 | loss 3.381292 (+0.81z)| norm 0.2736 (-0.36z)| lr 1.25e-04 | 8453.14 ms | -100.0% bf16 MFU | 62031 tok/s +step 13868/19560 | loss 3.344071 (+0.05z)| norm 0.3091 (+2.19z)| lr 1.25e-04 | 8458.04 ms | -100.0% bf16 MFU | 62029 tok/s +step 13869/19560 | loss 3.360043 (+0.37z)| norm 0.2869 (+0.59z)| lr 1.25e-04 | 8452.51 ms | -100.0% bf16 MFU | 62029 tok/s +step 13870/19560 | loss 3.278270 (-1.28z)| norm 0.2608 (-1.27z)| lr 1.25e-04 | 8451.23 ms | -100.0% bf16 MFU | 62029 tok/s +step 13871/19560 | loss 3.304994 (-0.73z)| norm 0.2694 (-0.66z)| lr 1.25e-04 | 8459.17 ms | -100.0% bf16 MFU | 62027 tok/s +step 13872/19560 | loss 3.258558 (-1.64z)| norm 0.2647 (-0.98z)| lr 1.25e-04 | 8452.35 ms | -100.0% bf16 MFU | 62027 tok/s +step 13873/19560 | loss 3.294592 (-0.91z)| norm 0.2696 (-0.64z)| lr 1.25e-04 | 8449.88 ms | -100.0% bf16 MFU | 62028 tok/s +step 13874/19560 | loss 3.333552 (-0.13z)| norm 0.2791 (+0.05z)| lr 1.25e-04 | 8448.46 ms | -100.0% bf16 MFU | 62029 tok/s +step 13875/19560 | loss 3.277817 (-1.23z)| norm 0.2671 (-0.80z)| lr 1.25e-04 | 8453.87 ms | -100.0% bf16 MFU | 62029 tok/s +step 13876/19560 | loss 3.306357 (-0.65z)| norm 0.2797 (+0.09z)| lr 1.25e-04 | 8456.57 ms | -100.0% bf16 MFU | 62027 tok/s +step 13877/19560 | loss 3.268641 (-1.38z)| norm 0.2710 (-0.53z)| lr 1.25e-04 | 8450.19 ms | -100.0% bf16 MFU | 62028 tok/s +step 13878/19560 | loss 3.246199 (-1.81z)| norm 0.2781 (-0.02z)| lr 1.25e-04 | 8448.79 ms | -100.0% bf16 MFU | 62029 tok/s +step 13879/19560 | loss 3.337347 (+0.01z)| norm 0.2741 (-0.31z)| lr 1.25e-04 | 8452.46 ms | -100.0% bf16 MFU | 62029 tok/s +step 13880/19560 | loss 3.326838 (-0.20z)| norm 0.2775 (-0.07z)| lr 1.25e-04 | 8450.84 ms | -100.0% bf16 MFU | 62030 tok/s +step 13881/19560 | loss 3.304562 (-0.64z)| norm 0.2777 (-0.05z)| lr 1.25e-04 | 8454.56 ms | -100.0% bf16 MFU | 62029 tok/s +step 13882/19560 | loss 3.305583 (-0.60z)| norm 0.2772 (-0.09z)| lr 1.25e-04 | 8450.69 ms | -100.0% bf16 MFU | 62030 tok/s +step 13883/19560 | loss 3.322232 (-0.27z)| norm 0.2812 (+0.20z)| lr 1.24e-04 | 8452.83 ms | -100.0% bf16 MFU | 62029 tok/s +step 13884/19560 | loss 3.274894 (-1.20z)| norm 0.2706 (-0.58z)| lr 1.24e-04 | 8450.01 ms | -100.0% bf16 MFU | 62030 tok/s +step 13885/19560 | loss 3.305483 (-0.58z)| norm 0.2737 (-0.34z)| lr 1.24e-04 | 8457.05 ms | -100.0% bf16 MFU | 62028 tok/s +step 13886/19560 | loss 3.328416 (-0.11z)| norm 0.2638 (-1.06z)| lr 1.24e-04 | 8456.35 ms | -100.0% bf16 MFU | 62027 tok/s +step 13887/19560 | loss 3.292859 (-0.81z)| norm 0.2783 (+0.01z)| lr 1.24e-04 | 8451.74 ms | -100.0% bf16 MFU | 62027 tok/s +step 13888/19560 | loss 3.292230 (-0.81z)| norm 0.2607 (-1.26z)| lr 1.24e-04 | 8445.31 ms | -100.0% bf16 MFU | 62030 tok/s +step 13889/19560 | loss 3.332964 (+0.01z)| norm 0.2733 (-0.34z)| lr 1.24e-04 | 8453.04 ms | -100.0% bf16 MFU | 62030 tok/s +step 13890/19560 | loss 3.382812 (+1.00z)| norm 0.2634 (-1.05z)| lr 1.24e-04 | 8449.42 ms | -100.0% bf16 MFU | 62031 tok/s +step 13891/19560 | loss 3.244580 (-1.74z)| norm 0.2802 (+0.19z)| lr 1.24e-04 | 8443.84 ms | -100.0% bf16 MFU | 62034 tok/s +step 13892/19560 | loss 3.265832 (-1.30z)| norm 0.2465 (-2.26z)| lr 1.24e-04 | 8446.27 ms | -100.0% bf16 MFU | 62036 tok/s +step 13893/19560 | loss 3.306393 (-0.50z)| norm 0.2753 (-0.12z)| lr 1.24e-04 | 8442.84 ms | -100.0% bf16 MFU | 62039 tok/s +step 13894/19560 | loss 3.275095 (-1.11z)| norm 0.2487 (-2.05z)| lr 1.24e-04 | 8454.66 ms | -100.0% bf16 MFU | 62037 tok/s +step 13895/19560 | loss 3.275341 (-1.09z)| norm 0.2604 (-1.17z)| lr 1.24e-04 | 8448.94 ms | -100.0% bf16 MFU | 62038 tok/s +step 13896/19560 | loss 3.315683 (-0.29z)| norm 0.2458 (-2.18z)| lr 1.24e-04 | 8452.38 ms | -100.0% bf16 MFU | 62038 tok/s +step 13897/19560 | loss 3.318540 (-0.24z)| norm 0.2503 (-1.82z)| lr 1.24e-04 | 8444.69 ms | -100.0% bf16 MFU | 62040 tok/s +step 13898/19560 | loss 3.290774 (-0.78z)| norm 0.2683 (-0.54z)| lr 1.24e-04 | 8448.00 ms | -100.0% bf16 MFU | 62041 tok/s +step 13899/19560 | loss 3.269447 (-1.18z)| norm 0.2480 (-1.95z)| lr 1.24e-04 | 8450.03 ms | -100.0% bf16 MFU | 62041 tok/s +step 13900/19560 | loss 3.199479 (-2.50z)| norm 0.2925 (+1.22z)| lr 1.24e-04 | 8449.85 ms | -100.0% bf16 MFU | 62042 tok/s +step 13901/19560 | loss 3.288654 (-0.78z)| norm 0.2633 (-0.85z)| lr 1.24e-04 | 8443.72 ms | -100.0% bf16 MFU | 62044 tok/s +step 13902/19560 | loss 3.365934 (+0.69z)| norm 0.2734 (-0.13z)| lr 1.24e-04 | 8446.62 ms | -100.0% bf16 MFU | 62045 tok/s +step 13903/19560 | loss 3.354127 (+0.50z)| norm 0.2626 (-0.88z)| lr 1.24e-04 | 8453.24 ms | -100.0% bf16 MFU | 62044 tok/s +step 13904/19560 | loss 3.317614 (-0.21z)| norm 0.2786 (+0.28z)| lr 1.24e-04 | 8451.01 ms | -100.0% bf16 MFU | 62044 tok/s +step 13905/19560 | loss 3.242857 (-1.66z)| norm 0.2813 (+0.47z)| lr 1.24e-04 | 8444.73 ms | -100.0% bf16 MFU | 62046 tok/s +step 13906/19560 | loss 3.264341 (-1.22z)| norm 0.2805 (+0.41z)| lr 1.24e-04 | 8446.89 ms | -100.0% bf16 MFU | 62047 tok/s +step 13907/19560 | loss 3.293736 (-0.64z)| norm 0.2719 (-0.21z)| lr 1.24e-04 | 8448.12 ms | -100.0% bf16 MFU | 62048 tok/s +step 13908/19560 | loss 3.277763 (-0.94z)| norm 0.2725 (-0.17z)| lr 1.23e-04 | 8447.40 ms | -100.0% bf16 MFU | 62049 tok/s +step 13909/19560 | loss 3.326326 (+0.02z)| norm 0.2776 (+0.20z)| lr 1.23e-04 | 8445.40 ms | -100.0% bf16 MFU | 62050 tok/s +step 13910/19560 | loss 3.338934 (+0.26z)| norm 0.2984 (+1.67z)| lr 1.23e-04 | 8446.36 ms | -100.0% bf16 MFU | 62051 tok/s +step 13911/19560 | loss 3.272073 (-1.03z)| norm 0.2876 (+0.88z)| lr 1.23e-04 | 8440.37 ms | -100.0% bf16 MFU | 62055 tok/s +step 13912/19560 | loss 3.258993 (-1.27z)| norm 0.2886 (+0.94z)| lr 1.23e-04 | 8445.50 ms | -100.0% bf16 MFU | 62056 tok/s +step 13913/19560 | loss 3.297026 (-0.52z)| norm 0.2931 (+1.24z)| lr 1.23e-04 | 8435.99 ms | -100.0% bf16 MFU | 62060 tok/s +step 13914/19560 | loss 3.329181 (+0.12z)| norm 0.2899 (+1.00z)| lr 1.23e-04 | 8440.96 ms | -100.0% bf16 MFU | 62063 tok/s +step 13915/19560 | loss 3.271773 (-0.99z)| norm 0.2870 (+0.79z)| lr 1.23e-04 | 8446.00 ms | -100.0% bf16 MFU | 62064 tok/s +step 13916/19560 | loss 3.356445 (+0.66z)| norm 0.2930 (+1.20z)| lr 1.23e-04 | 8440.76 ms | -100.0% bf16 MFU | 62066 tok/s +step 13917/19560 | loss 3.339693 (+0.33z)| norm 0.3118 (+2.47z)| lr 1.23e-04 | 8438.46 ms | -100.0% bf16 MFU | 62069 tok/s +step 13918/19560 | loss 3.281349 (-0.81z)| norm 0.2573 (-1.34z)| lr 1.23e-04 | 8435.54 ms | -100.0% bf16 MFU | 62074 tok/s +step 13919/19560 | loss 3.294703 (-0.54z)| norm 0.2879 (+0.78z)| lr 1.23e-04 | 8436.20 ms | -100.0% bf16 MFU | 62077 tok/s +step 13920/19560 | loss 3.371728 (+0.96z)| norm 0.2755 (-0.10z)| lr 1.23e-04 | 8437.15 ms | -100.0% bf16 MFU | 62080 tok/s +step 13921/19560 | loss 3.282536 (-0.78z)| norm 0.2795 (+0.17z)| lr 1.23e-04 | 8433.40 ms | -100.0% bf16 MFU | 62085 tok/s +step 13922/19560 | loss 3.234439 (-1.69z)| norm 0.2845 (+0.52z)| lr 1.23e-04 | 8439.02 ms | -100.0% bf16 MFU | 62087 tok/s +step 13923/19560 | loss 3.297402 (-0.45z)| norm 0.2819 (+0.33z)| lr 1.23e-04 | 8432.55 ms | -100.0% bf16 MFU | 62091 tok/s +step 13924/19560 | loss 3.292803 (-0.53z)| norm 0.3090 (+2.19z)| lr 1.23e-04 | 8433.58 ms | -100.0% bf16 MFU | 62095 tok/s +step 13925/19560 | loss 3.384009 (+1.24z)| norm 0.2951 (+1.20z)| lr 1.23e-04 | 8436.65 ms | -100.0% bf16 MFU | 62097 tok/s +step 13926/19560 | loss 3.316361 (-0.08z)| norm 0.2794 (+0.10z)| lr 1.23e-04 | 8435.88 ms | -100.0% bf16 MFU | 62100 tok/s +step 13927/19560 | loss 3.246699 (-1.41z)| norm 0.2870 (+0.62z)| lr 1.23e-04 | 8436.65 ms | -100.0% bf16 MFU | 62102 tok/s +step 13928/19560 | loss 3.368299 (+0.93z)| norm 0.3202 (+2.86z)| lr 1.23e-04 | 8432.52 ms | -100.0% bf16 MFU | 62106 tok/s +step 13929/19560 | loss 3.307645 (-0.24z)| norm 0.2586 (-1.36z)| lr 1.23e-04 | 8433.57 ms | -100.0% bf16 MFU | 62109 tok/s +step 13930/19560 | loss 3.278497 (-0.79z)| norm 0.3130 (+2.30z)| lr 1.23e-04 | 8437.42 ms | -100.0% bf16 MFU | 62110 tok/s +step 13931/19560 | loss 3.307164 (-0.23z)| norm 0.2565 (-1.49z)| lr 1.23e-04 | 8435.80 ms | -100.0% bf16 MFU | 62112 tok/s +step 13932/19560 | loss 3.273221 (-0.89z)| norm 0.2812 (+0.16z)| lr 1.22e-04 | 8438.70 ms | -100.0% bf16 MFU | 62113 tok/s +step 13933/19560 | loss 3.315221 (-0.06z)| norm 0.2735 (-0.36z)| lr 1.22e-04 | 8437.32 ms | -100.0% bf16 MFU | 62115 tok/s +step 13934/19560 | loss 3.390871 (+1.40z)| norm 0.2748 (-0.26z)| lr 1.22e-04 | 8438.58 ms | -100.0% bf16 MFU | 62115 tok/s +step 13935/19560 | loss 3.248249 (-1.45z)| norm 0.2770 (-0.10z)| lr 1.22e-04 | 8437.67 ms | -100.0% bf16 MFU | 62116 tok/s +step 13936/19560 | loss 3.286537 (-0.64z)| norm 0.2684 (-0.68z)| lr 1.22e-04 | 8436.52 ms | -100.0% bf16 MFU | 62118 tok/s +step 13937/19560 | loss 3.258152 (-1.22z)| norm 0.2720 (-0.43z)| lr 1.22e-04 | 8437.96 ms | -100.0% bf16 MFU | 62119 tok/s +step 13938/19560 | loss 3.255582 (-1.26z)| norm 0.2793 (+0.07z)| lr 1.22e-04 | 8440.64 ms | -100.0% bf16 MFU | 62118 tok/s +step 13939/19560 | loss 3.272226 (-0.89z)| norm 0.2602 (-1.22z)| lr 1.22e-04 | 8439.79 ms | -100.0% bf16 MFU | 62119 tok/s +step 13940/19560 | loss 3.260900 (-1.12z)| norm 0.2714 (-0.45z)| lr 1.22e-04 | 8445.33 ms | -100.0% bf16 MFU | 62117 tok/s +step 13941/19560 | loss 3.262347 (-1.07z)| norm 0.2677 (-0.70z)| lr 1.22e-04 | 8439.64 ms | -100.0% bf16 MFU | 62117 tok/s +step 13942/19560 | loss 3.315420 (+0.08z)| norm 0.2627 (-1.03z)| lr 1.22e-04 | 8436.85 ms | -100.0% bf16 MFU | 62118 tok/s +step 13943/19560 | loss 3.326191 (+0.32z)| norm 0.2499 (-1.88z)| lr 1.22e-04 | 8435.92 ms | -100.0% bf16 MFU | 62120 tok/s +step 13944/19560 | loss 3.240762 (-1.53z)| norm 0.2804 (+0.21z)| lr 1.22e-04 | 8438.96 ms | -100.0% bf16 MFU | 62120 tok/s +step 13945/19560 | loss 3.350379 (+0.94z)| norm 0.2539 (-1.59z)| lr 1.22e-04 | 8434.25 ms | -100.0% bf16 MFU | 62122 tok/s +step 13946/19560 | loss 3.324515 (+0.35z)| norm 0.2581 (-1.27z)| lr 1.22e-04 | 8439.64 ms | -100.0% bf16 MFU | 62122 tok/s +step 13947/19560 | loss 3.395968 (+1.97z)| norm 0.2821 (+0.36z)| lr 1.22e-04 | 8444.49 ms | -100.0% bf16 MFU | 62120 tok/s +step 13948/19560 | loss 3.375360 (+1.49z)| norm 0.2765 (-0.02z)| lr 1.22e-04 | 8439.86 ms | -100.0% bf16 MFU | 62120 tok/s +step 13949/19560 | loss 3.409478 (+2.37z)| norm 0.2770 (+0.03z)| lr 1.22e-04 | 8441.50 ms | -100.0% bf16 MFU | 62120 tok/s +step 13950/19560 | loss 3.269349 (-0.92z)| norm 0.2641 (-0.86z)| lr 1.22e-04 | 8439.23 ms | -100.0% bf16 MFU | 62120 tok/s +step 13951/19560 | loss 3.295635 (-0.29z)| norm 0.2750 (-0.09z)| lr 1.22e-04 | 8444.05 ms | -100.0% bf16 MFU | 62119 tok/s +step 13952/19560 | loss 3.304497 (-0.08z)| norm 0.2491 (-1.86z)| lr 1.22e-04 | 8439.84 ms | -100.0% bf16 MFU | 62119 tok/s +step 13953/19560 | loss 3.270020 (-0.88z)| norm 0.2730 (-0.22z)| lr 1.22e-04 | 8440.39 ms | -100.0% bf16 MFU | 62119 tok/s +step 13954/19560 | loss 3.321196 (+0.34z)| norm 0.2724 (-0.25z)| lr 1.22e-04 | 8445.25 ms | -100.0% bf16 MFU | 62117 tok/s +step 13955/19560 | loss 3.274117 (-0.77z)| norm 0.2650 (-0.77z)| lr 1.22e-04 | 8443.84 ms | -100.0% bf16 MFU | 62115 tok/s +step 13956/19560 | loss 3.328825 (+0.54z)| norm 0.2715 (-0.31z)| lr 1.22e-04 | 8444.47 ms | -100.0% bf16 MFU | 62114 tok/s +step 13957/19560 | loss 3.290821 (-0.36z)| norm 0.2687 (-0.51z)| lr 1.21e-04 | 8436.25 ms | -100.0% bf16 MFU | 62116 tok/s +step 13958/19560 | loss 3.308764 (+0.08z)| norm 0.2995 (+1.60z)| lr 1.21e-04 | 8435.48 ms | -100.0% bf16 MFU | 62117 tok/s +step 13959/19560 | loss 3.366636 (+1.47z)| norm 0.2599 (-1.11z)| lr 1.21e-04 | 8441.96 ms | -100.0% bf16 MFU | 62117 tok/s +step 13960/19560 | loss 3.343233 (+0.90z)| norm 0.3145 (+2.56z)| lr 1.21e-04 | 8442.43 ms | -100.0% bf16 MFU | 62116 tok/s +step 13961/19560 | loss 3.339526 (+0.81z)| norm 0.3237 (+3.06z)| lr 1.21e-04 | 8439.08 ms | -100.0% bf16 MFU | 62117 tok/s +step 13962/19560 | loss 3.306250 (+0.02z)| norm 0.2699 (-0.43z)| lr 1.21e-04 | 8443.09 ms | -100.0% bf16 MFU | 62116 tok/s +step 13963/19560 | loss 3.263421 (-1.01z)| norm 0.3136 (+2.34z)| lr 1.21e-04 | 8441.09 ms | -100.0% bf16 MFU | 62115 tok/s +step 13964/19560 | loss 3.208336 (-2.31z)| norm 0.2775 (+0.05z)| lr 1.21e-04 | 8440.49 ms | -100.0% bf16 MFU | 62115 tok/s +step 13965/19560 | loss 3.289783 (-0.34z)| norm 0.2717 (-0.32z)| lr 1.21e-04 | 8444.10 ms | -100.0% bf16 MFU | 62114 tok/s +step 13966/19560 | loss 3.350818 (+1.12z)| norm 0.2830 (+0.39z)| lr 1.21e-04 | 8441.32 ms | -100.0% bf16 MFU | 62114 tok/s +step 13967/19560 | loss 3.292039 (-0.30z)| norm 0.3101 (+2.07z)| lr 1.21e-04 | 8437.16 ms | -100.0% bf16 MFU | 62115 tok/s +step 13968/19560 | loss 3.280977 (-0.55z)| norm 0.2808 (+0.23z)| lr 1.21e-04 | 8442.07 ms | -100.0% bf16 MFU | 62115 tok/s +step 13969/19560 | loss 3.282372 (-0.52z)| norm 0.2723 (-0.29z)| lr 1.21e-04 | 8438.44 ms | -100.0% bf16 MFU | 62115 tok/s +step 13970/19560 | loss 3.327479 (+0.57z)| norm 0.2822 (+0.32z)| lr 1.21e-04 | 8441.22 ms | -100.0% bf16 MFU | 62115 tok/s +step 13971/19560 | loss 3.340273 (+0.88z)| norm 0.2787 (+0.12z)| lr 1.21e-04 | 8443.74 ms | -100.0% bf16 MFU | 62114 tok/s +step 13972/19560 | loss 3.287365 (-0.41z)| norm 0.2770 (+0.02z)| lr 1.21e-04 | 8440.68 ms | -100.0% bf16 MFU | 62114 tok/s +step 13973/19560 | loss 3.372938 (+1.65z)| norm 0.2634 (-0.84z)| lr 1.21e-04 | 8439.30 ms | -100.0% bf16 MFU | 62115 tok/s +step 13974/19560 | loss 3.308295 (+0.08z)| norm 0.2895 (+0.86z)| lr 1.21e-04 | 8439.22 ms | -100.0% bf16 MFU | 62115 tok/s +step 13975/19560 | loss 3.319695 (+0.37z)| norm 0.2662 (-0.65z)| lr 1.21e-04 | 8438.81 ms | -100.0% bf16 MFU | 62116 tok/s +step 13976/19560 | loss 3.335008 (+0.74z)| norm 0.2640 (-0.78z)| lr 1.21e-04 | 8438.91 ms | -100.0% bf16 MFU | 62116 tok/s +step 13977/19560 | loss 3.256968 (-1.14z)| norm 0.2666 (-0.60z)| lr 1.21e-04 | 8441.39 ms | -100.0% bf16 MFU | 62116 tok/s +step 13978/19560 | loss 3.242842 (-1.46z)| norm 0.2524 (-1.51z)| lr 1.21e-04 | 8442.01 ms | -100.0% bf16 MFU | 62115 tok/s +step 13979/19560 | loss 3.276632 (-0.65z)| norm 0.2918 (+1.05z)| lr 1.21e-04 | 8440.40 ms | -100.0% bf16 MFU | 62115 tok/s +step 13980/19560 | loss 3.363368 (+1.41z)| norm 0.2737 (-0.13z)| lr 1.21e-04 | 8437.36 ms | -100.0% bf16 MFU | 62117 tok/s +step 13981/19560 | loss 3.315107 (+0.27z)| norm 0.2692 (-0.43z)| lr 1.21e-04 | 8439.62 ms | -100.0% bf16 MFU | 62117 tok/s +step 13982/19560 | loss 3.299666 (-0.10z)| norm 0.2848 (+0.58z)| lr 1.20e-04 | 8438.75 ms | -100.0% bf16 MFU | 62118 tok/s +step 13983/19560 | loss 3.334091 (+0.71z)| norm 0.2541 (-1.40z)| lr 1.20e-04 | 8435.96 ms | -100.0% bf16 MFU | 62119 tok/s +step 13984/19560 | loss 3.320071 (+0.40z)| norm 0.2704 (-0.36z)| lr 1.20e-04 | 8440.63 ms | -100.0% bf16 MFU | 62119 tok/s +step 13985/19560 | loss 3.328466 (+0.60z)| norm 0.2595 (-1.05z)| lr 1.20e-04 | 8442.95 ms | -100.0% bf16 MFU | 62118 tok/s +step 13986/19560 | loss 3.293938 (-0.26z)| norm 0.2700 (-0.36z)| lr 1.20e-04 | 8439.31 ms | -100.0% bf16 MFU | 62118 tok/s +step 13987/19560 | loss 3.272167 (-0.78z)| norm 0.2622 (-0.87z)| lr 1.20e-04 | 8434.84 ms | -100.0% bf16 MFU | 62120 tok/s +step 13988/19560 | loss 3.386047 (+1.97z)| norm 0.2699 (-0.36z)| lr 1.20e-04 | 8439.71 ms | -100.0% bf16 MFU | 62120 tok/s +step 13989/19560 | loss 3.330839 (+0.61z)| norm 0.2487 (-1.71z)| lr 1.20e-04 | 8437.67 ms | -100.0% bf16 MFU | 62121 tok/s +step 13990/19560 | loss 3.307782 (+0.04z)| norm 0.2521 (-1.48z)| lr 1.20e-04 | 8441.05 ms | -100.0% bf16 MFU | 62121 tok/s +step 13991/19560 | loss 3.322151 (+0.40z)| norm 0.2604 (-0.94z)| lr 1.20e-04 | 8438.29 ms | -100.0% bf16 MFU | 62121 tok/s +step 13992/19560 | loss 3.269474 (-0.88z)| norm 0.3682 (+5.22z)| lr 1.20e-04 | 8438.13 ms | -100.0% bf16 MFU | 62122 tok/s +step 13993/19560 | loss 3.291477 (-0.34z)| norm 0.2539 (-1.23z)| lr 1.20e-04 | 8440.38 ms | -100.0% bf16 MFU | 62121 tok/s +step 13994/19560 | loss 3.250562 (-1.33z)| norm 0.2498 (-1.43z)| lr 1.20e-04 | 8439.38 ms | -100.0% bf16 MFU | 62122 tok/s +step 13995/19560 | loss 3.296516 (-0.18z)| norm 0.2695 (-0.33z)| lr 1.20e-04 | 8438.46 ms | -100.0% bf16 MFU | 62122 tok/s +step 13996/19560 | loss 3.302094 (-0.04z)| norm 0.2630 (-0.68z)| lr 1.20e-04 | 8437.25 ms | -100.0% bf16 MFU | 62123 tok/s +step 13997/19560 | loss 3.311662 (+0.21z)| norm 0.2559 (-1.06z)| lr 1.20e-04 | 8439.54 ms | -100.0% bf16 MFU | 62123 tok/s +step 13998/19560 | loss 3.331351 (+0.70z)| norm 0.2657 (-0.51z)| lr 1.20e-04 | 8437.77 ms | -100.0% bf16 MFU | 62124 tok/s +step 13999/19560 | loss 3.278460 (-0.63z)| norm 0.2478 (-1.50z)| lr 1.20e-04 | 8439.25 ms | -100.0% bf16 MFU | 62124 tok/s +step 14000/19560 | loss 3.429107 (+3.04z)| norm 0.2556 (-1.06z)| lr 1.20e-04 | 8441.71 ms | -100.0% bf16 MFU | 62123 tok/s +val loss 3.308178 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2973/10042 = 0.296057 +step 14001/19560 | loss 3.339525 (+0.84z)| norm 0.2551 (-1.08z)| lr 1.20e-04 | 8437.76 ms | -100.0% bf16 MFU | 62123 tok/s +step 14002/19560 | loss 3.311837 (+0.17z)| norm 0.2729 (-0.08z)| lr 1.20e-04 | 8437.84 ms | -100.0% bf16 MFU | 62124 tok/s +step 14003/19560 | loss 3.324175 (+0.46z)| norm 0.2606 (-0.77z)| lr 1.20e-04 | 8443.20 ms | -100.0% bf16 MFU | 62123 tok/s +step 14004/19560 | loss 3.342780 (+0.91z)| norm 0.2656 (-0.48z)| lr 1.20e-04 | 8438.81 ms | -100.0% bf16 MFU | 62123 tok/s +step 14005/19560 | loss 3.334986 (+0.71z)| norm 0.2528 (-1.17z)| lr 1.20e-04 | 8439.11 ms | -100.0% bf16 MFU | 62123 tok/s +step 14006/19560 | loss 3.373828 (+1.63z)| norm 0.2758 (+0.09z)| lr 1.20e-04 | 8437.10 ms | -100.0% bf16 MFU | 62124 tok/s +step 14007/19560 | loss 3.309667 (+0.07z)| norm 0.2625 (-0.63z)| lr 1.19e-04 | 8437.94 ms | -100.0% bf16 MFU | 62124 tok/s +step 14008/19560 | loss 3.292033 (-0.36z)| norm 0.2573 (-0.91z)| lr 1.19e-04 | 8438.04 ms | -100.0% bf16 MFU | 62125 tok/s +step 14009/19560 | loss 3.335083 (+0.69z)| norm 0.2667 (-0.39z)| lr 1.19e-04 | 8440.18 ms | -100.0% bf16 MFU | 62125 tok/s +step 14010/19560 | loss 3.307025 (+0.00z)| norm 0.2481 (-1.38z)| lr 1.19e-04 | 8439.13 ms | -100.0% bf16 MFU | 62125 tok/s +step 14011/19560 | loss 3.269330 (-0.90z)| norm 0.2619 (-0.62z)| lr 1.19e-04 | 8438.37 ms | -100.0% bf16 MFU | 62125 tok/s +step 14012/19560 | loss 3.310202 (+0.09z)| norm 0.2597 (-0.74z)| lr 1.19e-04 | 8435.95 ms | -100.0% bf16 MFU | 62126 tok/s +step 14013/19560 | loss 3.290605 (-0.39z)| norm 0.2635 (-0.53z)| lr 1.19e-04 | 8437.19 ms | -100.0% bf16 MFU | 62127 tok/s +step 14014/19560 | loss 3.333817 (+0.66z)| norm 0.2770 (+0.20z)| lr 1.19e-04 | 8437.39 ms | -100.0% bf16 MFU | 62127 tok/s +step 14015/19560 | loss 3.324971 (+0.44z)| norm 0.2597 (-0.73z)| lr 1.19e-04 | 8438.47 ms | -100.0% bf16 MFU | 62128 tok/s +step 14016/19560 | loss 3.258354 (-1.17z)| norm 0.2652 (-0.43z)| lr 1.19e-04 | 8438.93 ms | -100.0% bf16 MFU | 62128 tok/s +step 14017/19560 | loss 3.217121 (-2.11z)| norm 0.2735 (+0.02z)| lr 1.19e-04 | 8437.89 ms | -100.0% bf16 MFU | 62128 tok/s +step 14018/19560 | loss 3.329162 (+0.58z)| norm 0.2741 (+0.05z)| lr 1.19e-04 | 8436.94 ms | -100.0% bf16 MFU | 62129 tok/s +step 14019/19560 | loss 3.231231 (-1.78z)| norm 0.2563 (-0.91z)| lr 1.19e-04 | 8437.45 ms | -100.0% bf16 MFU | 62129 tok/s +step 14020/19560 | loss 3.304960 (-0.01z)| norm 0.2578 (-0.84z)| lr 1.19e-04 | 8439.28 ms | -100.0% bf16 MFU | 62129 tok/s +step 14021/19560 | loss 3.336867 (+0.75z)| norm 0.2681 (-0.27z)| lr 1.19e-04 | 8437.01 ms | -100.0% bf16 MFU | 62130 tok/s +step 14022/19560 | loss 3.278824 (-0.65z)| norm 0.2548 (-1.00z)| lr 1.19e-04 | 8438.32 ms | -100.0% bf16 MFU | 62130 tok/s +step 14023/19560 | loss 3.322147 (+0.39z)| norm 0.2608 (-0.68z)| lr 1.19e-04 | 8437.31 ms | -100.0% bf16 MFU | 62130 tok/s +step 14024/19560 | loss 3.273954 (-0.77z)| norm 0.2667 (-0.36z)| lr 1.19e-04 | 8438.97 ms | -100.0% bf16 MFU | 62130 tok/s +step 14025/19560 | loss 3.312838 (+0.17z)| norm 0.2558 (-0.97z)| lr 1.19e-04 | 8440.96 ms | -100.0% bf16 MFU | 62129 tok/s +step 14026/19560 | loss 3.340448 (+0.82z)| norm 0.2647 (-0.48z)| lr 1.19e-04 | 8448.34 ms | -100.0% bf16 MFU | 62126 tok/s +step 14027/19560 | loss 3.263865 (-1.01z)| norm 0.2481 (-1.39z)| lr 1.19e-04 | 8462.59 ms | -100.0% bf16 MFU | 62117 tok/s +step 14028/19560 | loss 3.331116 (+0.59z)| norm 0.2734 (+0.01z)| lr 1.19e-04 | 8462.10 ms | -100.0% bf16 MFU | 62109 tok/s +step 14029/19560 | loss 3.286761 (-0.50z)| norm 0.2425 (-1.68z)| lr 1.19e-04 | 8455.17 ms | -100.0% bf16 MFU | 62104 tok/s +step 14030/19560 | loss 3.308401 (+0.04z)| norm 0.2616 (-0.62z)| lr 1.19e-04 | 8459.17 ms | -100.0% bf16 MFU | 62098 tok/s +step 14031/19560 | loss 3.283873 (-0.55z)| norm 0.2539 (-1.04z)| lr 1.19e-04 | 8455.15 ms | -100.0% bf16 MFU | 62093 tok/s +step 14032/19560 | loss 3.256965 (-1.20z)| norm 0.2777 (+0.26z)| lr 1.18e-04 | 8456.81 ms | -100.0% bf16 MFU | 62088 tok/s +step 14033/19560 | loss 3.314716 (+0.21z)| norm 0.2555 (-0.93z)| lr 1.18e-04 | 8452.30 ms | -100.0% bf16 MFU | 62085 tok/s +step 14034/19560 | loss 3.324349 (+0.44z)| norm 0.2581 (-0.79z)| lr 1.18e-04 | 8450.53 ms | -100.0% bf16 MFU | 62083 tok/s +step 14035/19560 | loss 3.320177 (+0.33z)| norm 0.2703 (-0.12z)| lr 1.18e-04 | 8451.73 ms | -100.0% bf16 MFU | 62081 tok/s +step 14036/19560 | loss 3.284799 (-0.56z)| norm 0.2649 (-0.41z)| lr 1.18e-04 | 8456.45 ms | -100.0% bf16 MFU | 62077 tok/s +step 14037/19560 | loss 3.270549 (-0.90z)| norm 0.2624 (-0.54z)| lr 1.18e-04 | 8455.37 ms | -100.0% bf16 MFU | 62073 tok/s +step 14038/19560 | loss 3.287047 (-0.48z)| norm 0.2493 (-1.23z)| lr 1.18e-04 | 8453.29 ms | -100.0% bf16 MFU | 62071 tok/s +step 14039/19560 | loss 3.281564 (-0.62z)| norm 0.2452 (-1.43z)| lr 1.18e-04 | 8451.43 ms | -100.0% bf16 MFU | 62069 tok/s +step 14040/19560 | loss 3.340296 (+0.84z)| norm 0.2802 (+0.47z)| lr 1.18e-04 | 8450.88 ms | -100.0% bf16 MFU | 62067 tok/s +step 14041/19560 | loss 3.283214 (-0.59z)| norm 0.2562 (-0.82z)| lr 1.18e-04 | 8452.88 ms | -100.0% bf16 MFU | 62065 tok/s +step 14042/19560 | loss 3.297208 (-0.23z)| norm 0.2637 (-0.40z)| lr 1.18e-04 | 8448.21 ms | -100.0% bf16 MFU | 62065 tok/s +step 14043/19560 | loss 3.313458 (+0.17z)| norm 0.2677 (-0.17z)| lr 1.18e-04 | 8450.71 ms | -100.0% bf16 MFU | 62064 tok/s +step 14044/19560 | loss 3.249536 (-1.42z)| norm 0.2675 (-0.18z)| lr 1.18e-04 | 8451.26 ms | -100.0% bf16 MFU | 62062 tok/s +step 14045/19560 | loss 3.332012 (+0.66z)| norm 0.2729 (+0.14z)| lr 1.18e-04 | 8451.79 ms | -100.0% bf16 MFU | 62061 tok/s +step 14046/19560 | loss 3.311611 (+0.14z)| norm 0.2684 (-0.12z)| lr 1.18e-04 | 8449.10 ms | -100.0% bf16 MFU | 62060 tok/s +step 14047/19560 | loss 3.325372 (+0.48z)| norm 0.2665 (-0.22z)| lr 1.18e-04 | 8450.42 ms | -100.0% bf16 MFU | 62060 tok/s +step 14048/19560 | loss 3.333549 (+0.70z)| norm 0.2666 (-0.21z)| lr 1.18e-04 | 8452.86 ms | -100.0% bf16 MFU | 62058 tok/s +step 14049/19560 | loss 3.306934 (+0.02z)| norm 0.2779 (+0.43z)| lr 1.18e-04 | 8453.39 ms | -100.0% bf16 MFU | 62056 tok/s +step 14050/19560 | loss 3.293697 (-0.34z)| norm 0.2782 (+0.45z)| lr 1.18e-04 | 8454.21 ms | -100.0% bf16 MFU | 62054 tok/s +step 14051/19560 | loss 3.280318 (-0.68z)| norm 0.2648 (-0.30z)| lr 1.18e-04 | 8451.46 ms | -100.0% bf16 MFU | 62053 tok/s +step 14052/19560 | loss 3.309085 (+0.06z)| norm 0.2795 (+0.56z)| lr 1.18e-04 | 8440.96 ms | -100.0% bf16 MFU | 62056 tok/s +step 14053/19560 | loss 3.300808 (-0.14z)| norm 0.2636 (-0.35z)| lr 1.18e-04 | 8454.42 ms | -100.0% bf16 MFU | 62054 tok/s +step 14054/19560 | loss 3.344041 (+0.99z)| norm 0.2684 (-0.06z)| lr 1.18e-04 | 8453.10 ms | -100.0% bf16 MFU | 62052 tok/s +step 14055/19560 | loss 3.267650 (-1.02z)| norm 0.2597 (-0.56z)| lr 1.18e-04 | 8455.98 ms | -100.0% bf16 MFU | 62050 tok/s +step 14056/19560 | loss 3.280381 (-0.67z)| norm 0.2604 (-0.51z)| lr 1.18e-04 | 8449.35 ms | -100.0% bf16 MFU | 62050 tok/s +step 14057/19560 | loss 3.302697 (-0.08z)| norm 0.2618 (-0.43z)| lr 1.17e-04 | 8452.63 ms | -100.0% bf16 MFU | 62049 tok/s +step 14058/19560 | loss 3.308929 (+0.08z)| norm 0.2456 (-1.41z)| lr 1.17e-04 | 8446.00 ms | -100.0% bf16 MFU | 62050 tok/s +step 14059/19560 | loss 3.254396 (-1.35z)| norm 0.2687 (+0.02z)| lr 1.17e-04 | 8453.32 ms | -100.0% bf16 MFU | 62049 tok/s +step 14060/19560 | loss 3.293822 (-0.31z)| norm 0.2528 (-0.95z)| lr 1.17e-04 | 8451.82 ms | -100.0% bf16 MFU | 62048 tok/s +step 14061/19560 | loss 3.364415 (+1.53z)| norm 0.2773 (+0.56z)| lr 1.17e-04 | 8447.09 ms | -100.0% bf16 MFU | 62049 tok/s +step 14062/19560 | loss 3.341842 (+0.96z)| norm 0.2722 (+0.25z)| lr 1.17e-04 | 8453.88 ms | -100.0% bf16 MFU | 62047 tok/s +step 14063/19560 | loss 3.252433 (-1.42z)| norm 0.2747 (+0.41z)| lr 1.17e-04 | 8450.19 ms | -100.0% bf16 MFU | 62047 tok/s +step 14064/19560 | loss 3.326861 (+0.56z)| norm 0.2660 (-0.13z)| lr 1.17e-04 | 8447.07 ms | -100.0% bf16 MFU | 62048 tok/s +step 14065/19560 | loss 3.289192 (-0.46z)| norm 0.2690 (+0.05z)| lr 1.17e-04 | 8453.68 ms | -100.0% bf16 MFU | 62047 tok/s +step 14066/19560 | loss 3.269828 (-0.98z)| norm 0.2572 (-0.67z)| lr 1.17e-04 | 8450.98 ms | -100.0% bf16 MFU | 62046 tok/s +step 14067/19560 | loss 3.309145 (+0.07z)| norm 0.2818 (+0.85z)| lr 1.17e-04 | 8451.66 ms | -100.0% bf16 MFU | 62046 tok/s +step 14068/19560 | loss 3.296445 (-0.29z)| norm 0.2600 (-0.50z)| lr 1.17e-04 | 8448.48 ms | -100.0% bf16 MFU | 62046 tok/s +step 14069/19560 | loss 3.285026 (-0.60z)| norm 0.2776 (+0.58z)| lr 1.17e-04 | 8450.33 ms | -100.0% bf16 MFU | 62046 tok/s +step 14070/19560 | loss 3.288819 (-0.49z)| norm 0.2660 (-0.13z)| lr 1.17e-04 | 8451.02 ms | -100.0% bf16 MFU | 62046 tok/s +step 14071/19560 | loss 3.347506 (+1.10z)| norm 0.2617 (-0.41z)| lr 1.17e-04 | 8452.25 ms | -100.0% bf16 MFU | 62045 tok/s +step 14072/19560 | loss 3.284504 (-0.63z)| norm 0.2690 (+0.05z)| lr 1.17e-04 | 8449.93 ms | -100.0% bf16 MFU | 62045 tok/s +step 14073/19560 | loss 3.353454 (+1.26z)| norm 0.2650 (-0.20z)| lr 1.17e-04 | 8445.64 ms | -100.0% bf16 MFU | 62047 tok/s +step 14074/19560 | loss 3.352151 (+1.22z)| norm 0.2579 (-0.64z)| lr 1.17e-04 | 8449.45 ms | -100.0% bf16 MFU | 62047 tok/s +step 14075/19560 | loss 3.268997 (-1.05z)| norm 0.2657 (-0.15z)| lr 1.17e-04 | 8448.79 ms | -100.0% bf16 MFU | 62047 tok/s +step 14076/19560 | loss 3.337531 (+0.88z)| norm 0.2672 (-0.05z)| lr 1.17e-04 | 8445.42 ms | -100.0% bf16 MFU | 62049 tok/s +step 14077/19560 | loss 3.341978 (+1.05z)| norm 0.2739 (+0.37z)| lr 1.17e-04 | 8448.89 ms | -100.0% bf16 MFU | 62049 tok/s +step 14078/19560 | loss 3.371848 (+1.88z)| norm 0.2696 (+0.10z)| lr 1.17e-04 | 8452.64 ms | -100.0% bf16 MFU | 62048 tok/s +step 14079/19560 | loss 3.348094 (+1.17z)| norm 0.2660 (-0.12z)| lr 1.17e-04 | 8444.33 ms | -100.0% bf16 MFU | 62050 tok/s +step 14080/19560 | loss 3.349296 (+1.19z)| norm 0.2627 (-0.34z)| lr 1.17e-04 | 8452.49 ms | -100.0% bf16 MFU | 62049 tok/s +step 14081/19560 | loss 3.265197 (-1.20z)| norm 0.2583 (-0.61z)| lr 1.17e-04 | 8449.30 ms | -100.0% bf16 MFU | 62049 tok/s +step 14082/19560 | loss 3.300228 (-0.20z)| norm 0.2729 (+0.31z)| lr 1.17e-04 | 8447.35 ms | -100.0% bf16 MFU | 62050 tok/s +step 14083/19560 | loss 3.224893 (-2.29z)| norm 0.2544 (-0.85z)| lr 1.16e-04 | 8448.68 ms | -100.0% bf16 MFU | 62050 tok/s +step 14084/19560 | loss 3.283933 (-0.63z)| norm 0.2697 (+0.11z)| lr 1.16e-04 | 8451.97 ms | -100.0% bf16 MFU | 62049 tok/s +step 14085/19560 | loss 3.261052 (-1.26z)| norm 0.2533 (-0.90z)| lr 1.16e-04 | 8444.16 ms | -100.0% bf16 MFU | 62051 tok/s +step 14086/19560 | loss 3.270310 (-0.99z)| norm 0.2746 (+0.45z)| lr 1.16e-04 | 8444.61 ms | -100.0% bf16 MFU | 62053 tok/s +step 14087/19560 | loss 3.296628 (-0.25z)| norm 0.2575 (-0.64z)| lr 1.16e-04 | 8444.64 ms | -100.0% bf16 MFU | 62054 tok/s +step 14088/19560 | loss 3.323885 (+0.52z)| norm 0.2690 (+0.12z)| lr 1.16e-04 | 8448.10 ms | -100.0% bf16 MFU | 62055 tok/s +step 14089/19560 | loss 3.340978 (+1.00z)| norm 0.2706 (+0.27z)| lr 1.16e-04 | 8449.21 ms | -100.0% bf16 MFU | 62055 tok/s +step 14090/19560 | loss 3.274272 (-0.86z)| norm 0.2746 (+0.54z)| lr 1.16e-04 | 8445.10 ms | -100.0% bf16 MFU | 62056 tok/s +step 14091/19560 | loss 3.334695 (+0.82z)| norm 0.2615 (-0.35z)| lr 1.16e-04 | 8436.97 ms | -100.0% bf16 MFU | 62060 tok/s +step 14092/19560 | loss 3.385621 (+2.23z)| norm 0.2735 (+0.52z)| lr 1.16e-04 | 8449.46 ms | -100.0% bf16 MFU | 62060 tok/s +step 14093/19560 | loss 3.346532 (+1.11z)| norm 0.2634 (-0.21z)| lr 1.16e-04 | 8446.64 ms | -100.0% bf16 MFU | 62060 tok/s +step 14094/19560 | loss 3.295166 (-0.33z)| norm 0.2764 (+0.74z)| lr 1.16e-04 | 8451.22 ms | -100.0% bf16 MFU | 62059 tok/s +step 14095/19560 | loss 3.329284 (+0.63z)| norm 0.2737 (+0.59z)| lr 1.16e-04 | 8447.91 ms | -100.0% bf16 MFU | 62059 tok/s +step 14096/19560 | loss 3.303214 (-0.12z)| norm 0.2665 (+0.05z)| lr 1.16e-04 | 8443.66 ms | -100.0% bf16 MFU | 62061 tok/s +step 14097/19560 | loss 3.324319 (+0.47z)| norm 0.2865 (+1.56z)| lr 1.16e-04 | 8450.37 ms | -100.0% bf16 MFU | 62060 tok/s +step 14098/19560 | loss 3.328249 (+0.58z)| norm 0.2558 (-0.76z)| lr 1.16e-04 | 8447.31 ms | -100.0% bf16 MFU | 62060 tok/s +step 14099/19560 | loss 3.365948 (+1.64z)| norm 0.2608 (-0.37z)| lr 1.16e-04 | 8449.76 ms | -100.0% bf16 MFU | 62060 tok/s +step 14100/19560 | loss 3.323775 (+0.44z)| norm 0.2552 (-0.78z)| lr 1.16e-04 | 8442.99 ms | -100.0% bf16 MFU | 62061 tok/s +step 14101/19560 | loss 3.276088 (-0.90z)| norm 0.2539 (-0.88z)| lr 1.16e-04 | 8446.08 ms | -100.0% bf16 MFU | 62062 tok/s +step 14102/19560 | loss 3.321358 (+0.39z)| norm 0.2640 (-0.09z)| lr 1.16e-04 | 8447.35 ms | -100.0% bf16 MFU | 62062 tok/s +step 14103/19560 | loss 3.333830 (+0.75z)| norm 0.2575 (-0.59z)| lr 1.16e-04 | 8444.86 ms | -100.0% bf16 MFU | 62063 tok/s +step 14104/19560 | loss 3.350158 (+1.20z)| norm 0.2492 (-1.22z)| lr 1.16e-04 | 8442.83 ms | -100.0% bf16 MFU | 62065 tok/s +step 14105/19560 | loss 3.308219 (+0.00z)| norm 0.2617 (-0.25z)| lr 1.16e-04 | 8453.20 ms | -100.0% bf16 MFU | 62063 tok/s +step 14106/19560 | loss 3.323051 (+0.41z)| norm 0.2482 (-1.28z)| lr 1.16e-04 | 8441.05 ms | -100.0% bf16 MFU | 62065 tok/s +step 14107/19560 | loss 3.340971 (+0.92z)| norm 0.2460 (-1.44z)| lr 1.16e-04 | 8448.23 ms | -100.0% bf16 MFU | 62065 tok/s +step 14108/19560 | loss 3.307249 (-0.05z)| norm 0.2562 (-0.64z)| lr 1.15e-04 | 8445.83 ms | -100.0% bf16 MFU | 62066 tok/s +step 14109/19560 | loss 3.356672 (+1.38z)| norm 0.2622 (-0.17z)| lr 1.15e-04 | 8448.54 ms | -100.0% bf16 MFU | 62065 tok/s +step 14110/19560 | loss 3.286166 (-0.67z)| norm 0.2591 (-0.40z)| lr 1.15e-04 | 8439.11 ms | -100.0% bf16 MFU | 62068 tok/s +step 14111/19560 | loss 3.366850 (+1.66z)| norm 0.2471 (-1.32z)| lr 1.15e-04 | 8441.14 ms | -100.0% bf16 MFU | 62070 tok/s +step 14112/19560 | loss 3.313746 (+0.13z)| norm 0.2693 (+0.40z)| lr 1.15e-04 | 8443.66 ms | -100.0% bf16 MFU | 62072 tok/s +step 14113/19560 | loss 3.320427 (+0.32z)| norm 0.2687 (+0.35z)| lr 1.15e-04 | 8445.83 ms | -100.0% bf16 MFU | 62072 tok/s +step 14114/19560 | loss 3.281778 (-0.79z)| norm 0.2934 (+2.21z)| lr 1.15e-04 | 8440.67 ms | -100.0% bf16 MFU | 62074 tok/s +step 14115/19560 | loss 3.272530 (-1.05z)| norm 0.2804 (+1.20z)| lr 1.15e-04 | 8448.44 ms | -100.0% bf16 MFU | 62073 tok/s +step 14116/19560 | loss 3.319833 (+0.33z)| norm 0.2677 (+0.24z)| lr 1.15e-04 | 8443.44 ms | -100.0% bf16 MFU | 62074 tok/s +step 14117/19560 | loss 3.300432 (-0.23z)| norm 0.2732 (+0.65z)| lr 1.15e-04 | 8439.23 ms | -100.0% bf16 MFU | 62077 tok/s +step 14118/19560 | loss 3.300105 (-0.24z)| norm 0.2601 (-0.36z)| lr 1.15e-04 | 8435.22 ms | -100.0% bf16 MFU | 62081 tok/s +step 14119/19560 | loss 3.280243 (-0.81z)| norm 0.2865 (+1.63z)| lr 1.15e-04 | 8445.44 ms | -100.0% bf16 MFU | 62081 tok/s +step 14120/19560 | loss 3.339481 (+0.91z)| norm 0.2707 (+0.69z)| lr 1.15e-04 | 8439.82 ms | -100.0% bf16 MFU | 62083 tok/s +step 14121/19560 | loss 3.349327 (+1.18z)| norm 0.2781 (+1.43z)| lr 1.15e-04 | 8441.45 ms | -100.0% bf16 MFU | 62084 tok/s +step 14122/19560 | loss 3.290700 (-0.55z)| norm 0.2877 (+2.37z)| lr 1.15e-04 | 8438.89 ms | -100.0% bf16 MFU | 62086 tok/s +step 14123/19560 | loss 3.339090 (+0.87z)| norm 0.2731 (+0.86z)| lr 1.15e-04 | 8441.31 ms | -100.0% bf16 MFU | 62087 tok/s +step 14124/19560 | loss 3.278049 (-0.92z)| norm 0.2801 (+1.55z)| lr 1.15e-04 | 8442.04 ms | -100.0% bf16 MFU | 62088 tok/s +step 14125/19560 | loss 3.300149 (-0.27z)| norm 0.2735 (+0.87z)| lr 1.15e-04 | 8445.00 ms | -100.0% bf16 MFU | 62088 tok/s +step 14126/19560 | loss 3.315048 (+0.17z)| norm 0.2907 (+2.54z)| lr 1.15e-04 | 8441.58 ms | -100.0% bf16 MFU | 62089 tok/s +step 14127/19560 | loss 3.252928 (-1.64z)| norm 0.2615 (-0.37z)| lr 1.15e-04 | 8447.45 ms | -100.0% bf16 MFU | 62088 tok/s +step 14128/19560 | loss 3.322622 (+0.44z)| norm 0.2814 (+1.59z)| lr 1.15e-04 | 8442.43 ms | -100.0% bf16 MFU | 62088 tok/s +step 14129/19560 | loss 3.364888 (+1.72z)| norm 0.2729 (+0.73z)| lr 1.15e-04 | 8446.73 ms | -100.0% bf16 MFU | 62087 tok/s +step 14130/19560 | loss 3.348020 (+1.19z)| norm 0.2548 (-1.06z)| lr 1.15e-04 | 8440.23 ms | -100.0% bf16 MFU | 62089 tok/s +step 14131/19560 | loss 3.342731 (+1.02z)| norm 0.2653 (-0.02z)| lr 1.15e-04 | 8440.79 ms | -100.0% bf16 MFU | 62090 tok/s +step 14132/19560 | loss 3.288829 (-0.59z)| norm 0.2714 (+0.58z)| lr 1.15e-04 | 8444.85 ms | -100.0% bf16 MFU | 62090 tok/s +step 14133/19560 | loss 3.258245 (-1.49z)| norm 0.2567 (-0.89z)| lr 1.14e-04 | 8441.95 ms | -100.0% bf16 MFU | 62091 tok/s +step 14134/19560 | loss 3.306690 (-0.02z)| norm 0.2734 (+0.79z)| lr 1.14e-04 | 8442.56 ms | -100.0% bf16 MFU | 62091 tok/s +step 14135/19560 | loss 3.313050 (+0.17z)| norm 0.2675 (+0.19z)| lr 1.14e-04 | 8443.00 ms | -100.0% bf16 MFU | 62091 tok/s +step 14136/19560 | loss 3.281001 (-0.80z)| norm 0.2508 (-1.47z)| lr 1.14e-04 | 8441.32 ms | -100.0% bf16 MFU | 62092 tok/s +step 14137/19560 | loss 3.308893 (+0.06z)| norm 0.2727 (+0.71z)| lr 1.14e-04 | 8439.96 ms | -100.0% bf16 MFU | 62094 tok/s +step 14138/19560 | loss 3.241664 (-1.95z)| norm 0.2621 (-0.37z)| lr 1.14e-04 | 8443.79 ms | -100.0% bf16 MFU | 62094 tok/s +step 14139/19560 | loss 3.350525 (+1.30z)| norm 0.2621 (-0.36z)| lr 1.14e-04 | 8443.18 ms | -100.0% bf16 MFU | 62094 tok/s +step 14140/19560 | loss 3.301159 (-0.18z)| norm 0.2663 (+0.05z)| lr 1.14e-04 | 8443.49 ms | -100.0% bf16 MFU | 62094 tok/s +step 14141/19560 | loss 3.324178 (+0.51z)| norm 0.2657 (-0.01z)| lr 1.14e-04 | 8442.29 ms | -100.0% bf16 MFU | 62094 tok/s +step 14142/19560 | loss 3.337349 (+0.90z)| norm 0.2568 (-0.89z)| lr 1.14e-04 | 8441.35 ms | -100.0% bf16 MFU | 62095 tok/s +step 14143/19560 | loss 3.303105 (-0.12z)| norm 0.2718 (+0.61z)| lr 1.14e-04 | 8442.73 ms | -100.0% bf16 MFU | 62095 tok/s +step 14144/19560 | loss 3.330250 (+0.68z)| norm 0.2671 (+0.14z)| lr 1.14e-04 | 8442.55 ms | -100.0% bf16 MFU | 62095 tok/s +step 14145/19560 | loss 3.307506 (-0.03z)| norm 0.2650 (-0.07z)| lr 1.14e-04 | 8438.85 ms | -100.0% bf16 MFU | 62097 tok/s +step 14146/19560 | loss 3.257757 (-1.55z)| norm 0.2728 (+0.72z)| lr 1.14e-04 | 8441.23 ms | -100.0% bf16 MFU | 62098 tok/s +step 14147/19560 | loss 3.295904 (-0.40z)| norm 0.2444 (-2.11z)| lr 1.14e-04 | 8442.84 ms | -100.0% bf16 MFU | 62098 tok/s +step 14148/19560 | loss 3.274711 (-1.06z)| norm 0.2598 (-0.58z)| lr 1.14e-04 | 8439.86 ms | -100.0% bf16 MFU | 62099 tok/s +step 14149/19560 | loss 3.320270 (+0.39z)| norm 0.2654 (-0.01z)| lr 1.14e-04 | 8443.05 ms | -100.0% bf16 MFU | 62099 tok/s +step 14150/19560 | loss 3.306998 (-0.04z)| norm 0.2741 (+0.84z)| lr 1.14e-04 | 8441.93 ms | -100.0% bf16 MFU | 62099 tok/s +step 14151/19560 | loss 3.302453 (-0.18z)| norm 0.2651 (-0.07z)| lr 1.14e-04 | 8440.87 ms | -100.0% bf16 MFU | 62100 tok/s +step 14152/19560 | loss 3.305557 (-0.09z)| norm 0.2721 (+0.63z)| lr 1.14e-04 | 8436.62 ms | -100.0% bf16 MFU | 62102 tok/s +step 14153/19560 | loss 3.345819 (+1.18z)| norm 0.2794 (+1.34z)| lr 1.14e-04 | 8443.38 ms | -100.0% bf16 MFU | 62102 tok/s +step 14154/19560 | loss 3.282890 (-0.80z)| norm 0.2872 (+2.07z)| lr 1.14e-04 | 8439.12 ms | -100.0% bf16 MFU | 62103 tok/s +step 14155/19560 | loss 3.319596 (+0.35z)| norm 0.2645 (-0.18z)| lr 1.14e-04 | 8439.73 ms | -100.0% bf16 MFU | 62104 tok/s +step 14156/19560 | loss 3.323154 (+0.47z)| norm 0.2715 (+0.52z)| lr 1.14e-04 | 8438.71 ms | -100.0% bf16 MFU | 62105 tok/s +step 14157/19560 | loss 3.353278 (+1.41z)| norm 0.2612 (-0.53z)| lr 1.14e-04 | 8438.86 ms | -100.0% bf16 MFU | 62106 tok/s +step 14158/19560 | loss 3.292933 (-0.51z)| norm 0.2710 (+0.46z)| lr 1.14e-04 | 8436.42 ms | -100.0% bf16 MFU | 62108 tok/s +step 14159/19560 | loss 3.293490 (-0.50z)| norm 0.2716 (+0.51z)| lr 1.13e-04 | 8438.24 ms | -100.0% bf16 MFU | 62109 tok/s +step 14160/19560 | loss 3.261378 (-1.52z)| norm 0.2486 (-1.80z)| lr 1.13e-04 | 8438.76 ms | -100.0% bf16 MFU | 62110 tok/s +step 14161/19560 | loss 3.265285 (-1.37z)| norm 0.2600 (-0.65z)| lr 1.13e-04 | 8441.69 ms | -100.0% bf16 MFU | 62110 tok/s +step 14162/19560 | loss 3.317121 (+0.27z)| norm 0.2824 (+1.59z)| lr 1.13e-04 | 8439.98 ms | -100.0% bf16 MFU | 62111 tok/s +step 14163/19560 | loss 3.339982 (+0.99z)| norm 0.2631 (-0.34z)| lr 1.13e-04 | 8440.30 ms | -100.0% bf16 MFU | 62111 tok/s +step 14164/19560 | loss 3.411183 (+3.09z)| norm 0.2693 (+0.27z)| lr 1.13e-04 | 8445.22 ms | -100.0% bf16 MFU | 62109 tok/s +step 14165/19560 | loss 3.302491 (-0.23z)| norm 0.3103 (+4.07z)| lr 1.13e-04 | 8442.85 ms | -100.0% bf16 MFU | 62109 tok/s +step 14166/19560 | loss 3.325611 (+0.47z)| norm 0.2735 (+0.60z)| lr 1.13e-04 | 8442.54 ms | -100.0% bf16 MFU | 62109 tok/s +step 14167/19560 | loss 3.213993 (-2.85z)| norm 0.2779 (+1.01z)| lr 1.13e-04 | 8441.82 ms | -100.0% bf16 MFU | 62108 tok/s +step 14168/19560 | loss 3.300782 (-0.26z)| norm 0.2737 (+0.61z)| lr 1.13e-04 | 8439.02 ms | -100.0% bf16 MFU | 62109 tok/s +step 14169/19560 | loss 3.306344 (-0.10z)| norm 0.2615 (-0.58z)| lr 1.13e-04 | 8437.58 ms | -100.0% bf16 MFU | 62111 tok/s +step 14170/19560 | loss 3.311655 (+0.06z)| norm 0.2623 (-0.50z)| lr 1.13e-04 | 8436.83 ms | -100.0% bf16 MFU | 62112 tok/s +step 14171/19560 | loss 3.310131 (+0.01z)| norm 0.2585 (-0.85z)| lr 1.13e-04 | 8437.80 ms | -100.0% bf16 MFU | 62113 tok/s +step 14172/19560 | loss 3.333329 (+0.69z)| norm 0.2706 (+0.31z)| lr 1.13e-04 | 8441.54 ms | -100.0% bf16 MFU | 62113 tok/s +step 14173/19560 | loss 3.294720 (-0.47z)| norm 0.2775 (+0.97z)| lr 1.13e-04 | 8436.79 ms | -100.0% bf16 MFU | 62115 tok/s +step 14174/19560 | loss 3.278690 (-0.94z)| norm 0.2577 (-0.92z)| lr 1.13e-04 | 8441.71 ms | -100.0% bf16 MFU | 62114 tok/s +step 14175/19560 | loss 3.358910 (+1.46z)| norm 0.2913 (+2.24z)| lr 1.13e-04 | 8436.80 ms | -100.0% bf16 MFU | 62116 tok/s +step 14176/19560 | loss 3.346438 (+1.08z)| norm 0.2854 (+1.66z)| lr 1.13e-04 | 8438.50 ms | -100.0% bf16 MFU | 62116 tok/s +step 14177/19560 | loss 3.308046 (-0.07z)| norm 0.2684 (+0.07z)| lr 1.13e-04 | 8444.35 ms | -100.0% bf16 MFU | 62115 tok/s +step 14178/19560 | loss 3.288409 (-0.65z)| norm 0.2775 (+0.93z)| lr 1.13e-04 | 8438.41 ms | -100.0% bf16 MFU | 62116 tok/s +step 14179/19560 | loss 3.307163 (-0.10z)| norm 0.2795 (+1.10z)| lr 1.13e-04 | 8439.61 ms | -100.0% bf16 MFU | 62116 tok/s +step 14180/19560 | loss 3.277531 (-0.97z)| norm 0.2636 (-0.37z)| lr 1.13e-04 | 8441.75 ms | -100.0% bf16 MFU | 62116 tok/s +step 14181/19560 | loss 3.313631 (+0.10z)| norm 0.2670 (-0.06z)| lr 1.13e-04 | 8439.26 ms | -100.0% bf16 MFU | 62116 tok/s +step 14182/19560 | loss 3.295983 (-0.42z)| norm 0.2931 (+2.32z)| lr 1.13e-04 | 8440.71 ms | -100.0% bf16 MFU | 62116 tok/s +step 14183/19560 | loss 3.373780 (+1.87z)| norm 0.2944 (+2.37z)| lr 1.13e-04 | 8434.00 ms | -100.0% bf16 MFU | 62118 tok/s +step 14184/19560 | loss 3.273973 (-1.09z)| norm 0.2801 (+1.06z)| lr 1.13e-04 | 8439.03 ms | -100.0% bf16 MFU | 62119 tok/s +step 14185/19560 | loss 3.226584 (-2.42z)| norm 0.2757 (+0.66z)| lr 1.12e-04 | 8440.20 ms | -100.0% bf16 MFU | 62119 tok/s +step 14186/19560 | loss 3.310289 (+0.01z)| norm 0.2845 (+1.44z)| lr 1.12e-04 | 8435.75 ms | -100.0% bf16 MFU | 62120 tok/s +step 14187/19560 | loss 3.329136 (+0.54z)| norm 0.2672 (-0.13z)| lr 1.12e-04 | 8438.71 ms | -100.0% bf16 MFU | 62121 tok/s +step 14188/19560 | loss 3.311216 (+0.01z)| norm 0.2809 (+1.10z)| lr 1.12e-04 | 8437.01 ms | -100.0% bf16 MFU | 62122 tok/s +step 14189/19560 | loss 3.308544 (-0.05z)| norm 0.2642 (-0.41z)| lr 1.12e-04 | 8438.78 ms | -100.0% bf16 MFU | 62122 tok/s +step 14190/19560 | loss 3.285970 (-0.71z)| norm 0.2541 (-1.30z)| lr 1.12e-04 | 8435.80 ms | -100.0% bf16 MFU | 62124 tok/s +step 14191/19560 | loss 3.305974 (-0.13z)| norm 0.2596 (-0.80z)| lr 1.12e-04 | 8440.86 ms | -100.0% bf16 MFU | 62123 tok/s +step 14192/19560 | loss 3.249310 (-1.79z)| norm 0.2517 (-1.49z)| lr 1.12e-04 | 8435.55 ms | -100.0% bf16 MFU | 62124 tok/s +step 14193/19560 | loss 3.284365 (-0.75z)| norm 0.2768 (+0.75z)| lr 1.12e-04 | 8436.78 ms | -100.0% bf16 MFU | 62125 tok/s +step 14194/19560 | loss 3.265271 (-1.31z)| norm 0.2556 (-1.14z)| lr 1.12e-04 | 8436.77 ms | -100.0% bf16 MFU | 62126 tok/s +step 14195/19560 | loss 3.263559 (-1.34z)| norm 0.2718 (+0.32z)| lr 1.12e-04 | 8438.58 ms | -100.0% bf16 MFU | 62126 tok/s +step 14196/19560 | loss 3.327330 (+0.52z)| norm 0.2677 (-0.06z)| lr 1.12e-04 | 8443.00 ms | -100.0% bf16 MFU | 62125 tok/s +step 14197/19560 | loss 3.284672 (-0.73z)| norm 0.2620 (-0.56z)| lr 1.12e-04 | 8441.57 ms | -100.0% bf16 MFU | 62124 tok/s +step 14198/19560 | loss 3.349807 (+1.16z)| norm 0.2727 (+0.39z)| lr 1.12e-04 | 8439.93 ms | -100.0% bf16 MFU | 62124 tok/s +step 14199/19560 | loss 3.341607 (+0.92z)| norm 0.2714 (+0.28z)| lr 1.12e-04 | 8439.32 ms | -100.0% bf16 MFU | 62124 tok/s +step 14200/19560 | loss 3.322645 (+0.36z)| norm 0.2740 (+0.50z)| lr 1.12e-04 | 8438.71 ms | -100.0% bf16 MFU | 62124 tok/s +step 14201/19560 | loss 3.336691 (+0.78z)| norm 0.2720 (+0.32z)| lr 1.12e-04 | 8439.99 ms | -100.0% bf16 MFU | 62124 tok/s +step 14202/19560 | loss 3.261848 (-1.40z)| norm 0.2961 (+2.41z)| lr 1.12e-04 | 8438.45 ms | -100.0% bf16 MFU | 62124 tok/s +step 14203/19560 | loss 3.224738 (-2.43z)| norm 0.2681 (-0.06z)| lr 1.12e-04 | 8437.09 ms | -100.0% bf16 MFU | 62125 tok/s +step 14204/19560 | loss 3.363160 (+1.54z)| norm 0.2977 (+2.46z)| lr 1.12e-04 | 8436.67 ms | -100.0% bf16 MFU | 62126 tok/s +step 14205/19560 | loss 3.337325 (+0.80z)| norm 0.2902 (+1.79z)| lr 1.12e-04 | 8436.19 ms | -100.0% bf16 MFU | 62127 tok/s +step 14206/19560 | loss 3.279589 (-0.84z)| norm 0.2829 (+1.15z)| lr 1.12e-04 | 8435.63 ms | -100.0% bf16 MFU | 62128 tok/s +step 14207/19560 | loss 3.396767 (+2.49z)| norm 0.2908 (+1.78z)| lr 1.12e-04 | 8438.42 ms | -100.0% bf16 MFU | 62129 tok/s +step 14208/19560 | loss 3.319350 (+0.30z)| norm 0.2687 (-0.06z)| lr 1.12e-04 | 8437.06 ms | -100.0% bf16 MFU | 62129 tok/s +step 14209/19560 | loss 3.268536 (-1.15z)| norm 0.2857 (+1.33z)| lr 1.12e-04 | 8442.75 ms | -100.0% bf16 MFU | 62128 tok/s +step 14210/19560 | loss 3.361975 (+1.49z)| norm 0.2720 (+0.19z)| lr 1.11e-04 | 8436.37 ms | -100.0% bf16 MFU | 62129 tok/s +step 14211/19560 | loss 3.330238 (+0.58z)| norm 0.2730 (+0.27z)| lr 1.11e-04 | 8441.98 ms | -100.0% bf16 MFU | 62127 tok/s +step 14212/19560 | loss 3.320210 (+0.29z)| norm 0.2609 (-0.74z)| lr 1.11e-04 | 8436.93 ms | -100.0% bf16 MFU | 62128 tok/s +step 14213/19560 | loss 3.321125 (+0.30z)| norm 0.2562 (-1.13z)| lr 1.11e-04 | 8439.02 ms | -100.0% bf16 MFU | 62128 tok/s +step 14214/19560 | loss 3.258724 (-1.51z)| norm 0.2685 (-0.10z)| lr 1.11e-04 | 8439.72 ms | -100.0% bf16 MFU | 62128 tok/s +step 14215/19560 | loss 3.286108 (-0.71z)| norm 0.2631 (-0.56z)| lr 1.11e-04 | 8435.40 ms | -100.0% bf16 MFU | 62129 tok/s +step 14216/19560 | loss 3.286841 (-0.68z)| norm 0.2622 (-0.63z)| lr 1.11e-04 | 8438.95 ms | -100.0% bf16 MFU | 62129 tok/s +step 14217/19560 | loss 3.325266 (+0.44z)| norm 0.2753 (+0.46z)| lr 1.11e-04 | 8455.49 ms | -100.0% bf16 MFU | 62123 tok/s +step 14218/19560 | loss 3.376718 (+1.89z)| norm 0.2761 (+0.52z)| lr 1.11e-04 | 8465.14 ms | -100.0% bf16 MFU | 62113 tok/s +step 14219/19560 | loss 3.308414 (-0.07z)| norm 0.2623 (-0.63z)| lr 1.11e-04 | 8463.81 ms | -100.0% bf16 MFU | 62105 tok/s +step 14220/19560 | loss 3.302528 (-0.22z)| norm 0.2960 (+2.14z)| lr 1.11e-04 | 8461.46 ms | -100.0% bf16 MFU | 62098 tok/s +step 14221/19560 | loss 3.339970 (+0.88z)| norm 0.2573 (-1.04z)| lr 1.11e-04 | 8462.80 ms | -100.0% bf16 MFU | 62090 tok/s +step 14222/19560 | loss 3.309976 (-0.01z)| norm 0.2805 (+0.86z)| lr 1.11e-04 | 8465.23 ms | -100.0% bf16 MFU | 62083 tok/s +step 14223/19560 | loss 3.332557 (+0.66z)| norm 0.2767 (+0.55z)| lr 1.11e-04 | 8463.45 ms | -100.0% bf16 MFU | 62076 tok/s +step 14224/19560 | loss 3.340993 (+0.89z)| norm 0.2619 (-0.66z)| lr 1.11e-04 | 8457.69 ms | -100.0% bf16 MFU | 62072 tok/s +step 14225/19560 | loss 3.271247 (-1.13z)| norm 0.2624 (-0.61z)| lr 1.11e-04 | 8460.04 ms | -100.0% bf16 MFU | 62067 tok/s +step 14226/19560 | loss 3.401647 (+2.59z)| norm 0.2727 (+0.23z)| lr 1.11e-04 | 8455.40 ms | -100.0% bf16 MFU | 62064 tok/s +step 14227/19560 | loss 3.445498 (+3.65z)| norm 0.2930 (+1.86z)| lr 1.11e-04 | 8452.53 ms | -100.0% bf16 MFU | 62062 tok/s +step 14228/19560 | loss 3.322539 (+0.31z)| norm 0.2588 (-0.93z)| lr 1.11e-04 | 8451.98 ms | -100.0% bf16 MFU | 62060 tok/s +step 14229/19560 | loss 3.336989 (+0.69z)| norm 0.2730 (+0.22z)| lr 1.11e-04 | 8451.61 ms | -100.0% bf16 MFU | 62059 tok/s +step 14230/19560 | loss 3.311443 (-0.01z)| norm 0.2520 (-1.49z)| lr 1.11e-04 | 8449.47 ms | -100.0% bf16 MFU | 62058 tok/s +step 14231/19560 | loss 3.293493 (-0.49z)| norm 0.2573 (-1.05z)| lr 1.11e-04 | 8455.71 ms | -100.0% bf16 MFU | 62056 tok/s +step 14232/19560 | loss 3.340727 (+0.80z)| norm 0.2562 (-1.16z)| lr 1.11e-04 | 8448.81 ms | -100.0% bf16 MFU | 62056 tok/s +step 14233/19560 | loss 3.279825 (-0.85z)| norm 0.2599 (-0.85z)| lr 1.11e-04 | 8446.70 ms | -100.0% bf16 MFU | 62056 tok/s +step 14234/19560 | loss 3.322457 (+0.31z)| norm 0.2828 (+1.02z)| lr 1.11e-04 | 8446.62 ms | -100.0% bf16 MFU | 62057 tok/s +step 14235/19560 | loss 3.310727 (-0.00z)| norm 0.2481 (-1.86z)| lr 1.11e-04 | 8447.59 ms | -100.0% bf16 MFU | 62057 tok/s +step 14236/19560 | loss 3.325690 (+0.40z)| norm 0.3037 (+2.66z)| lr 1.10e-04 | 8452.68 ms | -100.0% bf16 MFU | 62056 tok/s +step 14237/19560 | loss 3.321534 (+0.30z)| norm 0.2762 (+0.42z)| lr 1.10e-04 | 8451.12 ms | -100.0% bf16 MFU | 62055 tok/s +step 14238/19560 | loss 3.343838 (+0.90z)| norm 0.3011 (+2.38z)| lr 1.10e-04 | 8454.21 ms | -100.0% bf16 MFU | 62053 tok/s +step 14239/19560 | loss 3.346321 (+0.97z)| norm 0.2726 (+0.09z)| lr 1.10e-04 | 8448.03 ms | -100.0% bf16 MFU | 62053 tok/s +step 14240/19560 | loss 3.305188 (-0.16z)| norm 0.2694 (-0.17z)| lr 1.10e-04 | 8454.58 ms | -100.0% bf16 MFU | 62051 tok/s +step 14241/19560 | loss 3.287042 (-0.65z)| norm 0.2706 (-0.08z)| lr 1.10e-04 | 8453.75 ms | -100.0% bf16 MFU | 62050 tok/s +step 14242/19560 | loss 3.284235 (-0.73z)| norm 0.2578 (-1.11z)| lr 1.10e-04 | 8446.19 ms | -100.0% bf16 MFU | 62051 tok/s +step 14243/19560 | loss 3.357959 (+1.28z)| norm 0.2667 (-0.37z)| lr 1.10e-04 | 8453.80 ms | -100.0% bf16 MFU | 62049 tok/s +step 14244/19560 | loss 3.348634 (+1.01z)| norm 0.2813 (+0.82z)| lr 1.10e-04 | 8450.43 ms | -100.0% bf16 MFU | 62049 tok/s +step 14245/19560 | loss 3.337325 (+0.70z)| norm 0.2848 (+1.09z)| lr 1.10e-04 | 8455.89 ms | -100.0% bf16 MFU | 62047 tok/s +step 14246/19560 | loss 3.318347 (+0.17z)| norm 0.2656 (-0.47z)| lr 1.10e-04 | 8449.71 ms | -100.0% bf16 MFU | 62047 tok/s +step 14247/19560 | loss 3.399624 (+2.33z)| norm 0.2929 (+1.73z)| lr 1.10e-04 | 8451.60 ms | -100.0% bf16 MFU | 62046 tok/s +step 14248/19560 | loss 3.287005 (-0.68z)| norm 0.2940 (+1.78z)| lr 1.10e-04 | 8453.13 ms | -100.0% bf16 MFU | 62045 tok/s +step 14249/19560 | loss 3.299143 (-0.35z)| norm 0.2849 (+1.05z)| lr 1.10e-04 | 8446.62 ms | -100.0% bf16 MFU | 62046 tok/s +step 14250/19560 | loss 3.327184 (+0.40z)| norm 0.2637 (-0.63z)| lr 1.10e-04 | 8455.37 ms | -100.0% bf16 MFU | 62044 tok/s +val loss 3.304654 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 3001/10042 = 0.298845 +step 14251/19560 | loss 3.288724 (-0.63z)| norm 0.2855 (+1.10z)| lr 1.10e-04 | 8457.47 ms | -100.0% bf16 MFU | 62042 tok/s +step 14252/19560 | loss 3.300586 (-0.31z)| norm 0.2765 (+0.39z)| lr 1.10e-04 | 8452.65 ms | -100.0% bf16 MFU | 62041 tok/s +step 14253/19560 | loss 3.277892 (-0.92z)| norm 0.2847 (+1.03z)| lr 1.10e-04 | 8451.93 ms | -100.0% bf16 MFU | 62040 tok/s +step 14254/19560 | loss 3.347183 (+0.94z)| norm 0.2910 (+1.54z)| lr 1.10e-04 | 8453.44 ms | -100.0% bf16 MFU | 62039 tok/s +step 14255/19560 | loss 3.342801 (+0.81z)| norm 0.2739 (+0.17z)| lr 1.10e-04 | 8453.15 ms | -100.0% bf16 MFU | 62039 tok/s +step 14256/19560 | loss 3.365088 (+1.39z)| norm 0.2837 (+0.94z)| lr 1.10e-04 | 8444.01 ms | -100.0% bf16 MFU | 62041 tok/s +step 14257/19560 | loss 3.342074 (+0.78z)| norm 0.2765 (+0.37z)| lr 1.10e-04 | 8451.67 ms | -100.0% bf16 MFU | 62041 tok/s +step 14258/19560 | loss 3.296062 (-0.45z)| norm 0.2750 (+0.24z)| lr 1.10e-04 | 8451.43 ms | -100.0% bf16 MFU | 62040 tok/s +step 14259/19560 | loss 3.280062 (-0.87z)| norm 0.2605 (-0.91z)| lr 1.10e-04 | 8446.04 ms | -100.0% bf16 MFU | 62042 tok/s +step 14260/19560 | loss 3.324777 (+0.33z)| norm 0.2658 (-0.48z)| lr 1.10e-04 | 8452.19 ms | -100.0% bf16 MFU | 62042 tok/s +step 14261/19560 | loss 3.330960 (+0.49z)| norm 0.2643 (-0.61z)| lr 1.10e-04 | 8449.31 ms | -100.0% bf16 MFU | 62042 tok/s +step 14262/19560 | loss 3.272153 (-1.11z)| norm 0.2617 (-0.82z)| lr 1.09e-04 | 8449.30 ms | -100.0% bf16 MFU | 62042 tok/s +step 14263/19560 | loss 3.334036 (+0.57z)| norm 0.2765 (+0.36z)| lr 1.09e-04 | 8449.22 ms | -100.0% bf16 MFU | 62043 tok/s +step 14264/19560 | loss 3.304407 (-0.24z)| norm 0.2738 (+0.13z)| lr 1.09e-04 | 8452.76 ms | -100.0% bf16 MFU | 62042 tok/s +step 14265/19560 | loss 3.291332 (-0.59z)| norm 0.2772 (+0.41z)| lr 1.09e-04 | 8450.37 ms | -100.0% bf16 MFU | 62042 tok/s +step 14266/19560 | loss 3.355844 (+1.15z)| norm 0.2674 (-0.39z)| lr 1.09e-04 | 8455.69 ms | -100.0% bf16 MFU | 62040 tok/s +step 14267/19560 | loss 3.318471 (+0.13z)| norm 0.2655 (-0.54z)| lr 1.09e-04 | 8449.10 ms | -100.0% bf16 MFU | 62041 tok/s +step 14268/19560 | loss 3.355066 (+1.12z)| norm 0.2675 (-0.38z)| lr 1.09e-04 | 8445.53 ms | -100.0% bf16 MFU | 62043 tok/s +step 14269/19560 | loss 3.274445 (-1.07z)| norm 0.2535 (-1.50z)| lr 1.09e-04 | 8450.16 ms | -100.0% bf16 MFU | 62043 tok/s +step 14270/19560 | loss 3.314135 (+0.02z)| norm 0.2547 (-1.40z)| lr 1.09e-04 | 8443.65 ms | -100.0% bf16 MFU | 62045 tok/s +step 14271/19560 | loss 3.357598 (+1.19z)| norm 0.2792 (+0.56z)| lr 1.09e-04 | 8447.44 ms | -100.0% bf16 MFU | 62046 tok/s +step 14272/19560 | loss 3.310919 (-0.08z)| norm 0.2582 (-1.11z)| lr 1.09e-04 | 8444.65 ms | -100.0% bf16 MFU | 62048 tok/s +step 14273/19560 | loss 3.361005 (+1.27z)| norm 0.2700 (-0.17z)| lr 1.09e-04 | 8444.83 ms | -100.0% bf16 MFU | 62050 tok/s +step 14274/19560 | loss 3.293824 (-0.56z)| norm 0.2587 (-1.06z)| lr 1.09e-04 | 8449.55 ms | -100.0% bf16 MFU | 62050 tok/s +step 14275/19560 | loss 3.277814 (-0.99z)| norm 0.2574 (-1.19z)| lr 1.09e-04 | 8446.18 ms | -100.0% bf16 MFU | 62051 tok/s +step 14276/19560 | loss 3.337184 (+0.61z)| norm 0.2531 (-1.52z)| lr 1.09e-04 | 8447.27 ms | -100.0% bf16 MFU | 62052 tok/s +step 14277/19560 | loss 3.268190 (-1.25z)| norm 0.2512 (-1.65z)| lr 1.09e-04 | 8450.54 ms | -100.0% bf16 MFU | 62051 tok/s +step 14278/19560 | loss 3.317091 (+0.07z)| norm 0.2567 (-1.20z)| lr 1.09e-04 | 8446.46 ms | -100.0% bf16 MFU | 62052 tok/s +step 14279/19560 | loss 3.384514 (+1.86z)| norm 0.2584 (-1.06z)| lr 1.09e-04 | 8447.96 ms | -100.0% bf16 MFU | 62053 tok/s +step 14280/19560 | loss 3.351397 (+0.96z)| norm 0.2532 (-1.44z)| lr 1.09e-04 | 8444.72 ms | -100.0% bf16 MFU | 62054 tok/s +step 14281/19560 | loss 3.310090 (-0.14z)| norm 0.2586 (-1.01z)| lr 1.09e-04 | 8445.23 ms | -100.0% bf16 MFU | 62056 tok/s +step 14282/19560 | loss 3.368502 (+1.40z)| norm 0.2596 (-0.91z)| lr 1.09e-04 | 8439.64 ms | -100.0% bf16 MFU | 62059 tok/s +step 14283/19560 | loss 3.313325 (-0.07z)| norm 0.2692 (-0.17z)| lr 1.09e-04 | 8445.88 ms | -100.0% bf16 MFU | 62060 tok/s +step 14284/19560 | loss 3.301968 (-0.36z)| norm 0.2662 (-0.39z)| lr 1.09e-04 | 8443.34 ms | -100.0% bf16 MFU | 62062 tok/s +step 14285/19560 | loss 3.307942 (-0.20z)| norm 0.2719 (+0.04z)| lr 1.09e-04 | 8447.47 ms | -100.0% bf16 MFU | 62062 tok/s +step 14286/19560 | loss 3.332863 (+0.46z)| norm 0.2796 (+0.64z)| lr 1.09e-04 | 8445.85 ms | -100.0% bf16 MFU | 62063 tok/s +step 14287/19560 | loss 3.363755 (+1.27z)| norm 0.2669 (-0.35z)| lr 1.09e-04 | 8445.14 ms | -100.0% bf16 MFU | 62064 tok/s +step 14288/19560 | loss 3.293927 (-0.60z)| norm 0.2694 (-0.17z)| lr 1.08e-04 | 8445.55 ms | -100.0% bf16 MFU | 62064 tok/s +step 14289/19560 | loss 3.359057 (+1.12z)| norm 0.2589 (-1.00z)| lr 1.08e-04 | 8457.05 ms | -100.0% bf16 MFU | 62061 tok/s +step 14290/19560 | loss 3.333132 (+0.42z)| norm 0.2742 (+0.22z)| lr 1.08e-04 | 8448.43 ms | -100.0% bf16 MFU | 62061 tok/s +step 14291/19560 | loss 3.339077 (+0.58z)| norm 0.2653 (-0.49z)| lr 1.08e-04 | 8445.44 ms | -100.0% bf16 MFU | 62062 tok/s +step 14292/19560 | loss 3.349487 (+0.90z)| norm 0.2636 (-0.62z)| lr 1.08e-04 | 8443.90 ms | -100.0% bf16 MFU | 62063 tok/s +step 14293/19560 | loss 3.322771 (+0.16z)| norm 0.2660 (-0.42z)| lr 1.08e-04 | 8440.82 ms | -100.0% bf16 MFU | 62066 tok/s +step 14294/19560 | loss 3.340413 (+0.64z)| norm 0.2771 (+0.50z)| lr 1.08e-04 | 8442.02 ms | -100.0% bf16 MFU | 62067 tok/s +step 14295/19560 | loss 3.247412 (-1.95z)| norm 0.2714 (+0.02z)| lr 1.08e-04 | 8451.29 ms | -100.0% bf16 MFU | 62066 tok/s +step 14296/19560 | loss 3.303001 (-0.40z)| norm 0.2683 (-0.22z)| lr 1.08e-04 | 8441.27 ms | -100.0% bf16 MFU | 62068 tok/s +step 14297/19560 | loss 3.356673 (+1.08z)| norm 0.2648 (-0.51z)| lr 1.08e-04 | 8443.95 ms | -100.0% bf16 MFU | 62069 tok/s +step 14298/19560 | loss 3.328026 (+0.28z)| norm 0.3354 (+4.78z)| lr 1.08e-04 | 8449.26 ms | -100.0% bf16 MFU | 62068 tok/s +step 14299/19560 | loss 3.330233 (+0.34z)| norm 0.2807 (+0.67z)| lr 1.08e-04 | 8442.90 ms | -100.0% bf16 MFU | 62070 tok/s +step 14300/19560 | loss 3.375290 (+1.57z)| norm 0.2683 (-0.26z)| lr 1.08e-04 | 8439.49 ms | -100.0% bf16 MFU | 62072 tok/s +step 14301/19560 | loss 3.290849 (-0.76z)| norm 0.2821 (+0.77z)| lr 1.08e-04 | 8447.15 ms | -100.0% bf16 MFU | 62072 tok/s +step 14302/19560 | loss 3.382556 (+1.73z)| norm 0.2760 (+0.31z)| lr 1.08e-04 | 8440.89 ms | -100.0% bf16 MFU | 62074 tok/s +step 14303/19560 | loss 3.270172 (-1.32z)| norm 0.2524 (-1.45z)| lr 1.08e-04 | 8442.55 ms | -100.0% bf16 MFU | 62076 tok/s +step 14304/19560 | loss 3.308713 (-0.26z)| norm 0.2835 (+0.90z)| lr 1.08e-04 | 8444.66 ms | -100.0% bf16 MFU | 62076 tok/s +step 14305/19560 | loss 3.312750 (-0.15z)| norm 0.2512 (-1.52z)| lr 1.08e-04 | 8443.18 ms | -100.0% bf16 MFU | 62077 tok/s +step 14306/19560 | loss 3.288736 (-0.80z)| norm 0.2757 (+0.31z)| lr 1.08e-04 | 8445.13 ms | -100.0% bf16 MFU | 62077 tok/s +step 14307/19560 | loss 3.257849 (-1.62z)| norm 0.2616 (-0.73z)| lr 1.08e-04 | 8444.86 ms | -100.0% bf16 MFU | 62078 tok/s +step 14308/19560 | loss 3.317669 (-0.01z)| norm 0.2890 (+1.29z)| lr 1.08e-04 | 8441.56 ms | -100.0% bf16 MFU | 62079 tok/s +step 14309/19560 | loss 3.378735 (+1.62z)| norm 0.2688 (-0.21z)| lr 1.08e-04 | 8443.92 ms | -100.0% bf16 MFU | 62080 tok/s +step 14310/19560 | loss 3.321833 (+0.08z)| norm 0.2798 (+0.62z)| lr 1.08e-04 | 8441.94 ms | -100.0% bf16 MFU | 62081 tok/s +step 14311/19560 | loss 3.382832 (+1.72z)| norm 0.2614 (-0.74z)| lr 1.08e-04 | 8443.38 ms | -100.0% bf16 MFU | 62082 tok/s +step 14312/19560 | loss 3.295645 (-0.63z)| norm 0.2896 (+1.38z)| lr 1.08e-04 | 8441.42 ms | -100.0% bf16 MFU | 62083 tok/s +step 14313/19560 | loss 3.265883 (-1.47z)| norm 0.2653 (-0.45z)| lr 1.08e-04 | 8445.91 ms | -100.0% bf16 MFU | 62083 tok/s +step 14314/19560 | loss 3.332334 (+0.35z)| norm 0.2659 (-0.39z)| lr 1.07e-04 | 8444.69 ms | -100.0% bf16 MFU | 62083 tok/s +step 14315/19560 | loss 3.334719 (+0.41z)| norm 0.2718 (+0.05z)| lr 1.07e-04 | 8445.65 ms | -100.0% bf16 MFU | 62083 tok/s +step 14316/19560 | loss 3.352800 (+0.90z)| norm 0.2748 (+0.28z)| lr 1.07e-04 | 8443.61 ms | -100.0% bf16 MFU | 62083 tok/s +step 14317/19560 | loss 3.363434 (+1.17z)| norm 0.2666 (-0.34z)| lr 1.07e-04 | 8438.59 ms | -100.0% bf16 MFU | 62085 tok/s +step 14318/19560 | loss 3.309864 (-0.29z)| norm 0.2553 (-1.20z)| lr 1.07e-04 | 8438.84 ms | -100.0% bf16 MFU | 62088 tok/s +step 14319/19560 | loss 3.318108 (-0.07z)| norm 0.2795 (+0.63z)| lr 1.07e-04 | 8440.60 ms | -100.0% bf16 MFU | 62089 tok/s +step 14320/19560 | loss 3.322328 (+0.03z)| norm 0.2626 (-0.67z)| lr 1.07e-04 | 8440.26 ms | -100.0% bf16 MFU | 62090 tok/s +step 14321/19560 | loss 3.364345 (+1.18z)| norm 0.2584 (-0.97z)| lr 1.07e-04 | 8446.47 ms | -100.0% bf16 MFU | 62089 tok/s +step 14322/19560 | loss 3.258224 (-1.75z)| norm 0.2621 (-0.70z)| lr 1.07e-04 | 8441.10 ms | -100.0% bf16 MFU | 62090 tok/s +step 14323/19560 | loss 3.308959 (-0.37z)| norm 0.2671 (-0.31z)| lr 1.07e-04 | 8443.63 ms | -100.0% bf16 MFU | 62091 tok/s +step 14324/19560 | loss 3.340205 (+0.50z)| norm 0.2552 (-1.21z)| lr 1.07e-04 | 8442.25 ms | -100.0% bf16 MFU | 62091 tok/s +step 14325/19560 | loss 3.332563 (+0.28z)| norm 0.2860 (+1.12z)| lr 1.07e-04 | 8442.80 ms | -100.0% bf16 MFU | 62092 tok/s +step 14326/19560 | loss 3.311691 (-0.30z)| norm 0.2793 (+0.61z)| lr 1.07e-04 | 8440.79 ms | -100.0% bf16 MFU | 62093 tok/s +step 14327/19560 | loss 3.330863 (+0.24z)| norm 0.2687 (-0.19z)| lr 1.07e-04 | 8441.67 ms | -100.0% bf16 MFU | 62093 tok/s +step 14328/19560 | loss 3.300536 (-0.60z)| norm 0.2547 (-1.24z)| lr 1.07e-04 | 8442.71 ms | -100.0% bf16 MFU | 62094 tok/s +step 14329/19560 | loss 3.299895 (-0.61z)| norm 0.2632 (-0.59z)| lr 1.07e-04 | 8444.06 ms | -100.0% bf16 MFU | 62094 tok/s +step 14330/19560 | loss 3.334692 (+0.35z)| norm 0.2722 (+0.10z)| lr 1.07e-04 | 8439.41 ms | -100.0% bf16 MFU | 62095 tok/s +step 14331/19560 | loss 3.294670 (-0.82z)| norm 0.2568 (-1.06z)| lr 1.07e-04 | 8439.91 ms | -100.0% bf16 MFU | 62096 tok/s +step 14332/19560 | loss 3.354832 (+0.93z)| norm 0.3292 (+4.16z)| lr 1.07e-04 | 8442.85 ms | -100.0% bf16 MFU | 62096 tok/s +step 14333/19560 | loss 3.331326 (+0.25z)| norm 0.2738 (+0.21z)| lr 1.07e-04 | 8442.21 ms | -100.0% bf16 MFU | 62097 tok/s +step 14334/19560 | loss 3.265952 (-1.64z)| norm 0.2739 (+0.22z)| lr 1.07e-04 | 8442.66 ms | -100.0% bf16 MFU | 62097 tok/s +step 14335/19560 | loss 3.348071 (+0.76z)| norm 0.2934 (+1.62z)| lr 1.07e-04 | 8441.94 ms | -100.0% bf16 MFU | 62097 tok/s +step 14336/19560 | loss 3.329359 (+0.21z)| norm 0.2518 (-1.35z)| lr 1.07e-04 | 8441.60 ms | -100.0% bf16 MFU | 62098 tok/s +step 14337/19560 | loss 3.337828 (+0.44z)| norm 0.3213 (+3.44z)| lr 1.07e-04 | 8437.80 ms | -100.0% bf16 MFU | 62100 tok/s +step 14338/19560 | loss 3.286727 (-1.06z)| norm 0.2505 (-1.39z)| lr 1.07e-04 | 8440.15 ms | -100.0% bf16 MFU | 62101 tok/s +step 14339/19560 | loss 3.321682 (-0.02z)| norm 0.2728 (+0.13z)| lr 1.07e-04 | 8440.69 ms | -100.0% bf16 MFU | 62101 tok/s +step 14340/19560 | loss 3.274004 (-1.41z)| norm 0.2648 (-0.41z)| lr 1.06e-04 | 8438.50 ms | -100.0% bf16 MFU | 62103 tok/s +step 14341/19560 | loss 3.297261 (-0.72z)| norm 0.2650 (-0.41z)| lr 1.06e-04 | 8444.43 ms | -100.0% bf16 MFU | 62102 tok/s +step 14342/19560 | loss 3.320476 (-0.05z)| norm 0.2739 (+0.20z)| lr 1.06e-04 | 8439.86 ms | -100.0% bf16 MFU | 62103 tok/s +step 14343/19560 | loss 3.262597 (-1.76z)| norm 0.2624 (-0.58z)| lr 1.06e-04 | 8437.93 ms | -100.0% bf16 MFU | 62104 tok/s +step 14344/19560 | loss 3.352341 (+0.89z)| norm 0.2622 (-0.60z)| lr 1.06e-04 | 8442.17 ms | -100.0% bf16 MFU | 62104 tok/s +step 14345/19560 | loss 3.296141 (-0.77z)| norm 0.2966 (+1.72z)| lr 1.06e-04 | 8438.31 ms | -100.0% bf16 MFU | 62106 tok/s +step 14346/19560 | loss 3.479808 (+4.33z)| norm 0.2686 (-0.17z)| lr 1.06e-04 | 8439.26 ms | -100.0% bf16 MFU | 62107 tok/s +step 14347/19560 | loss 3.317157 (-0.17z)| norm 0.2600 (-0.75z)| lr 1.06e-04 | 8443.34 ms | -100.0% bf16 MFU | 62106 tok/s +step 14348/19560 | loss 3.282899 (-1.10z)| norm 0.2760 (+0.35z)| lr 1.06e-04 | 8439.83 ms | -100.0% bf16 MFU | 62107 tok/s +step 14349/19560 | loss 3.303206 (-0.54z)| norm 0.2909 (+1.34z)| lr 1.06e-04 | 8439.39 ms | -100.0% bf16 MFU | 62108 tok/s +step 14350/19560 | loss 3.325984 (+0.09z)| norm 0.2624 (-0.59z)| lr 1.06e-04 | 8438.40 ms | -100.0% bf16 MFU | 62109 tok/s +step 14351/19560 | loss 3.411904 (+2.38z)| norm 0.2726 (+0.11z)| lr 1.06e-04 | 8443.55 ms | -100.0% bf16 MFU | 62108 tok/s +step 14352/19560 | loss 3.343694 (+0.54z)| norm 0.2554 (-1.06z)| lr 1.06e-04 | 8442.51 ms | -100.0% bf16 MFU | 62108 tok/s +step 14353/19560 | loss 3.294537 (-0.79z)| norm 0.2978 (+1.78z)| lr 1.06e-04 | 8440.37 ms | -100.0% bf16 MFU | 62108 tok/s +step 14354/19560 | loss 3.406525 (+2.24z)| norm 0.2788 (+0.50z)| lr 1.06e-04 | 8436.75 ms | -100.0% bf16 MFU | 62110 tok/s +step 14355/19560 | loss 3.376626 (+1.50z)| norm 0.2778 (+0.45z)| lr 1.06e-04 | 8440.74 ms | -100.0% bf16 MFU | 62110 tok/s +step 14356/19560 | loss 3.286814 (-1.01z)| norm 0.2713 (+0.01z)| lr 1.06e-04 | 8440.59 ms | -100.0% bf16 MFU | 62110 tok/s +step 14357/19560 | loss 3.347546 (+0.68z)| norm 0.2671 (-0.28z)| lr 1.06e-04 | 8438.73 ms | -100.0% bf16 MFU | 62111 tok/s +step 14358/19560 | loss 3.283486 (-1.09z)| norm 0.2787 (+0.50z)| lr 1.06e-04 | 8439.32 ms | -100.0% bf16 MFU | 62112 tok/s +step 14359/19560 | loss 3.375008 (+1.42z)| norm 0.2803 (+0.59z)| lr 1.06e-04 | 8439.01 ms | -100.0% bf16 MFU | 62113 tok/s +step 14360/19560 | loss 3.329302 (+0.17z)| norm 0.2579 (-0.94z)| lr 1.06e-04 | 8438.87 ms | -100.0% bf16 MFU | 62113 tok/s +step 14361/19560 | loss 3.288467 (-0.97z)| norm 0.2752 (+0.24z)| lr 1.06e-04 | 8437.07 ms | -100.0% bf16 MFU | 62115 tok/s +step 14362/19560 | loss 3.334976 (+0.32z)| norm 0.2714 (-0.02z)| lr 1.06e-04 | 8438.37 ms | -100.0% bf16 MFU | 62116 tok/s +step 14363/19560 | loss 3.337124 (+0.37z)| norm 0.2642 (-0.53z)| lr 1.06e-04 | 8438.87 ms | -100.0% bf16 MFU | 62116 tok/s +step 14364/19560 | loss 3.355258 (+0.86z)| norm 0.2866 (+1.05z)| lr 1.06e-04 | 8441.42 ms | -100.0% bf16 MFU | 62116 tok/s +step 14365/19560 | loss 3.325867 (+0.05z)| norm 0.2522 (-1.34z)| lr 1.06e-04 | 8443.53 ms | -100.0% bf16 MFU | 62115 tok/s +step 14366/19560 | loss 3.350299 (+0.72z)| norm 0.2760 (+0.33z)| lr 1.05e-04 | 8438.37 ms | -100.0% bf16 MFU | 62116 tok/s +step 14367/19560 | loss 3.346233 (+0.61z)| norm 0.2621 (-0.64z)| lr 1.05e-04 | 8440.35 ms | -100.0% bf16 MFU | 62116 tok/s +step 14368/19560 | loss 3.299988 (-0.66z)| norm 0.2553 (-1.11z)| lr 1.05e-04 | 8437.08 ms | -100.0% bf16 MFU | 62117 tok/s +step 14369/19560 | loss 3.327863 (+0.10z)| norm 0.2456 (-1.76z)| lr 1.05e-04 | 8435.84 ms | -100.0% bf16 MFU | 62119 tok/s +step 14370/19560 | loss 3.367475 (+1.18z)| norm 0.2692 (-0.12z)| lr 1.05e-04 | 8441.44 ms | -100.0% bf16 MFU | 62118 tok/s +step 14371/19560 | loss 3.362636 (+1.04z)| norm 0.2714 (+0.03z)| lr 1.05e-04 | 8439.76 ms | -100.0% bf16 MFU | 62118 tok/s +step 14372/19560 | loss 3.320407 (-0.12z)| norm 0.2434 (-1.88z)| lr 1.05e-04 | 8441.09 ms | -100.0% bf16 MFU | 62118 tok/s +step 14373/19560 | loss 3.386062 (+1.67z)| norm 0.2804 (+0.67z)| lr 1.05e-04 | 8440.99 ms | -100.0% bf16 MFU | 62118 tok/s +step 14374/19560 | loss 3.313683 (-0.31z)| norm 0.2511 (-1.34z)| lr 1.05e-04 | 8438.43 ms | -100.0% bf16 MFU | 62118 tok/s +step 14375/19560 | loss 3.314219 (-0.28z)| norm 0.2808 (+0.72z)| lr 1.05e-04 | 8443.47 ms | -100.0% bf16 MFU | 62117 tok/s +step 14376/19560 | loss 3.303035 (-0.60z)| norm 0.2590 (-0.78z)| lr 1.05e-04 | 8440.07 ms | -100.0% bf16 MFU | 62117 tok/s +step 14377/19560 | loss 3.281816 (-1.18z)| norm 0.2668 (-0.23z)| lr 1.05e-04 | 8439.20 ms | -100.0% bf16 MFU | 62118 tok/s +step 14378/19560 | loss 3.299286 (-0.69z)| norm 0.2595 (-0.74z)| lr 1.05e-04 | 8440.47 ms | -100.0% bf16 MFU | 62118 tok/s +step 14379/19560 | loss 3.367326 (+1.18z)| norm 0.2622 (-0.53z)| lr 1.05e-04 | 8438.55 ms | -100.0% bf16 MFU | 62118 tok/s +step 14380/19560 | loss 3.288757 (-0.99z)| norm 0.2631 (-0.47z)| lr 1.05e-04 | 8436.27 ms | -100.0% bf16 MFU | 62120 tok/s +step 14381/19560 | loss 3.369105 (+1.21z)| norm 0.2755 (+0.41z)| lr 1.05e-04 | 8445.06 ms | -100.0% bf16 MFU | 62118 tok/s +step 14382/19560 | loss 3.333770 (+0.23z)| norm 0.2539 (-1.09z)| lr 1.05e-04 | 8438.40 ms | -100.0% bf16 MFU | 62118 tok/s +step 14383/19560 | loss 3.406669 (+2.20z)| norm 0.2661 (-0.23z)| lr 1.05e-04 | 8441.21 ms | -100.0% bf16 MFU | 62118 tok/s +step 14384/19560 | loss 3.343848 (+0.50z)| norm 0.2645 (-0.33z)| lr 1.05e-04 | 8439.65 ms | -100.0% bf16 MFU | 62118 tok/s +step 14385/19560 | loss 3.321388 (-0.11z)| norm 0.2575 (-0.82z)| lr 1.05e-04 | 8439.90 ms | -100.0% bf16 MFU | 62118 tok/s +step 14386/19560 | loss 3.388115 (+1.67z)| norm 0.2670 (-0.14z)| lr 1.05e-04 | 8437.73 ms | -100.0% bf16 MFU | 62119 tok/s +step 14387/19560 | loss 3.333130 (+0.18z)| norm 0.2777 (+0.61z)| lr 1.05e-04 | 8439.61 ms | -100.0% bf16 MFU | 62119 tok/s +step 14388/19560 | loss 3.301915 (-0.67z)| norm 0.2697 (+0.04z)| lr 1.05e-04 | 8438.38 ms | -100.0% bf16 MFU | 62120 tok/s +step 14389/19560 | loss 3.357893 (+0.85z)| norm 0.2642 (-0.35z)| lr 1.05e-04 | 8438.30 ms | -100.0% bf16 MFU | 62121 tok/s +step 14390/19560 | loss 3.287677 (-1.06z)| norm 0.2859 (+1.18z)| lr 1.05e-04 | 8438.52 ms | -100.0% bf16 MFU | 62121 tok/s +step 14391/19560 | loss 3.308541 (-0.49z)| norm 0.2802 (+0.77z)| lr 1.05e-04 | 8438.22 ms | -100.0% bf16 MFU | 62122 tok/s +step 14392/19560 | loss 3.328009 (+0.03z)| norm 0.2581 (-0.78z)| lr 1.05e-04 | 8440.11 ms | -100.0% bf16 MFU | 62121 tok/s +step 14393/19560 | loss 3.294188 (-0.89z)| norm 0.2935 (+1.68z)| lr 1.04e-04 | 8438.15 ms | -100.0% bf16 MFU | 62122 tok/s +step 14394/19560 | loss 3.447577 (+3.15z)| norm 0.2869 (+1.21z)| lr 1.04e-04 | 8438.89 ms | -100.0% bf16 MFU | 62122 tok/s +step 14395/19560 | loss 3.290872 (-0.95z)| norm 0.2677 (-0.12z)| lr 1.04e-04 | 8439.00 ms | -100.0% bf16 MFU | 62123 tok/s +step 14396/19560 | loss 3.317365 (-0.25z)| norm 0.2752 (+0.39z)| lr 1.04e-04 | 8437.29 ms | -100.0% bf16 MFU | 62123 tok/s +step 14397/19560 | loss 3.270319 (-1.48z)| norm 0.2812 (+0.80z)| lr 1.04e-04 | 8438.96 ms | -100.0% bf16 MFU | 62124 tok/s +step 14398/19560 | loss 3.343203 (+0.42z)| norm 0.2716 (+0.12z)| lr 1.04e-04 | 8439.34 ms | -100.0% bf16 MFU | 62124 tok/s +step 14399/19560 | loss 3.305840 (-0.55z)| norm 0.2820 (+0.84z)| lr 1.04e-04 | 8439.35 ms | -100.0% bf16 MFU | 62124 tok/s +step 14400/19560 | loss 3.369855 (+1.11z)| norm 0.2855 (+1.07z)| lr 1.04e-04 | 8438.58 ms | -100.0% bf16 MFU | 62124 tok/s +step 14401/19560 | loss 3.308666 (-0.47z)| norm 0.2531 (-1.17z)| lr 1.04e-04 | 8435.94 ms | -100.0% bf16 MFU | 62125 tok/s +step 14402/19560 | loss 3.305973 (-0.55z)| norm 0.2828 (+0.87z)| lr 1.04e-04 | 8438.56 ms | -100.0% bf16 MFU | 62125 tok/s +step 14403/19560 | loss 3.352289 (+0.65z)| norm 0.2798 (+0.65z)| lr 1.04e-04 | 8437.83 ms | -100.0% bf16 MFU | 62126 tok/s +step 14404/19560 | loss 3.331281 (+0.10z)| norm 0.2684 (-0.15z)| lr 1.04e-04 | 8438.13 ms | -100.0% bf16 MFU | 62126 tok/s +step 14405/19560 | loss 3.342375 (+0.38z)| norm 0.2826 (+0.83z)| lr 1.04e-04 | 8439.80 ms | -100.0% bf16 MFU | 62126 tok/s +step 14406/19560 | loss 3.316745 (-0.30z)| norm 0.2702 (-0.04z)| lr 1.04e-04 | 8437.74 ms | -100.0% bf16 MFU | 62127 tok/s +step 14407/19560 | loss 3.309147 (-0.49z)| norm 0.2558 (-1.06z)| lr 1.04e-04 | 8438.82 ms | -100.0% bf16 MFU | 62127 tok/s +step 14408/19560 | loss 3.306386 (-0.55z)| norm 0.2575 (-0.94z)| lr 1.04e-04 | 8465.99 ms | -100.0% bf16 MFU | 62117 tok/s +step 14409/19560 | loss 3.275613 (-1.36z)| norm 0.2790 (+0.56z)| lr 1.04e-04 | 8462.43 ms | -100.0% bf16 MFU | 62109 tok/s +step 14410/19560 | loss 3.373965 (+1.25z)| norm 0.2655 (-0.39z)| lr 1.04e-04 | 8467.09 ms | -100.0% bf16 MFU | 62099 tok/s +step 14411/19560 | loss 3.310795 (-0.43z)| norm 0.2634 (-0.53z)| lr 1.04e-04 | 8462.00 ms | -100.0% bf16 MFU | 62092 tok/s +step 14412/19560 | loss 3.370328 (+1.14z)| norm 0.2720 (+0.07z)| lr 1.04e-04 | 8462.71 ms | -100.0% bf16 MFU | 62085 tok/s +step 14413/19560 | loss 3.330884 (+0.09z)| norm 0.2632 (-0.55z)| lr 1.04e-04 | 8462.14 ms | -100.0% bf16 MFU | 62079 tok/s +step 14414/19560 | loss 3.331383 (+0.10z)| norm 0.2583 (-0.88z)| lr 1.04e-04 | 8462.78 ms | -100.0% bf16 MFU | 62072 tok/s +step 14415/19560 | loss 3.306595 (-0.54z)| norm 0.2838 (+0.90z)| lr 1.04e-04 | 8459.03 ms | -100.0% bf16 MFU | 62068 tok/s +step 14416/19560 | loss 3.312040 (-0.40z)| norm 0.2682 (-0.19z)| lr 1.04e-04 | 8458.40 ms | -100.0% bf16 MFU | 62064 tok/s +step 14417/19560 | loss 3.342992 (+0.42z)| norm 0.2733 (+0.16z)| lr 1.04e-04 | 8455.46 ms | -100.0% bf16 MFU | 62061 tok/s +step 14418/19560 | loss 3.322104 (-0.13z)| norm 0.2951 (+1.66z)| lr 1.04e-04 | 8460.89 ms | -100.0% bf16 MFU | 62056 tok/s +step 14419/19560 | loss 3.367190 (+1.06z)| norm 0.2839 (+0.87z)| lr 1.03e-04 | 8454.83 ms | -100.0% bf16 MFU | 62054 tok/s +step 14420/19560 | loss 3.322714 (-0.11z)| norm 0.2638 (-0.52z)| lr 1.03e-04 | 8463.04 ms | -100.0% bf16 MFU | 62049 tok/s +step 14421/19560 | loss 3.340644 (+0.36z)| norm 0.2662 (-0.36z)| lr 1.03e-04 | 8456.04 ms | -100.0% bf16 MFU | 62046 tok/s +step 14422/19560 | loss 3.361971 (+0.92z)| norm 0.2731 (+0.12z)| lr 1.03e-04 | 8458.46 ms | -100.0% bf16 MFU | 62043 tok/s +step 14423/19560 | loss 3.329737 (+0.05z)| norm 0.2593 (-0.83z)| lr 1.03e-04 | 8456.74 ms | -100.0% bf16 MFU | 62041 tok/s +step 14424/19560 | loss 3.406785 (+2.07z)| norm 0.2820 (+0.74z)| lr 1.03e-04 | 8457.23 ms | -100.0% bf16 MFU | 62038 tok/s +step 14425/19560 | loss 3.307348 (-0.56z)| norm 0.2611 (-0.71z)| lr 1.03e-04 | 8449.16 ms | -100.0% bf16 MFU | 62039 tok/s +step 14426/19560 | loss 3.383004 (+1.43z)| norm 0.2783 (+0.56z)| lr 1.03e-04 | 8450.50 ms | -100.0% bf16 MFU | 62039 tok/s +step 14427/19560 | loss 3.328432 (-0.01z)| norm 0.2591 (-0.87z)| lr 1.03e-04 | 8449.37 ms | -100.0% bf16 MFU | 62040 tok/s +step 14428/19560 | loss 3.305423 (-0.61z)| norm 0.2706 (-0.01z)| lr 1.03e-04 | 8453.51 ms | -100.0% bf16 MFU | 62039 tok/s +step 14429/19560 | loss 3.313612 (-0.39z)| norm 0.2537 (-1.25z)| lr 1.03e-04 | 8449.49 ms | -100.0% bf16 MFU | 62039 tok/s +step 14430/19560 | loss 3.368393 (+1.07z)| norm 0.2711 (+0.05z)| lr 1.03e-04 | 8445.24 ms | -100.0% bf16 MFU | 62041 tok/s +step 14431/19560 | loss 3.318474 (-0.28z)| norm 0.2503 (-1.51z)| lr 1.03e-04 | 8443.30 ms | -100.0% bf16 MFU | 62044 tok/s +step 14432/19560 | loss 3.289093 (-1.06z)| norm 0.2555 (-1.10z)| lr 1.03e-04 | 8450.01 ms | -100.0% bf16 MFU | 62044 tok/s +step 14433/19560 | loss 3.313566 (-0.40z)| norm 0.2928 (+1.66z)| lr 1.03e-04 | 8444.08 ms | -100.0% bf16 MFU | 62046 tok/s +step 14434/19560 | loss 3.311583 (-0.46z)| norm 0.2507 (-1.45z)| lr 1.03e-04 | 8443.86 ms | -100.0% bf16 MFU | 62049 tok/s +step 14435/19560 | loss 3.289597 (-1.07z)| norm 0.2748 (+0.32z)| lr 1.03e-04 | 8445.27 ms | -100.0% bf16 MFU | 62050 tok/s +step 14436/19560 | loss 3.344078 (+0.40z)| norm 0.2504 (-1.46z)| lr 1.03e-04 | 8445.57 ms | -100.0% bf16 MFU | 62052 tok/s +step 14437/19560 | loss 3.294148 (-0.94z)| norm 0.2616 (-0.63z)| lr 1.03e-04 | 8444.93 ms | -100.0% bf16 MFU | 62053 tok/s +step 14438/19560 | loss 3.324286 (-0.12z)| norm 0.2728 (+0.21z)| lr 1.03e-04 | 8446.45 ms | -100.0% bf16 MFU | 62054 tok/s +step 14439/19560 | loss 3.312476 (-0.43z)| norm 0.2531 (-1.24z)| lr 1.03e-04 | 8452.05 ms | -100.0% bf16 MFU | 62053 tok/s +step 14440/19560 | loss 3.351124 (+0.63z)| norm 0.2511 (-1.37z)| lr 1.03e-04 | 8446.13 ms | -100.0% bf16 MFU | 62054 tok/s +step 14441/19560 | loss 3.366026 (+1.02z)| norm 0.2583 (-0.83z)| lr 1.03e-04 | 8449.87 ms | -100.0% bf16 MFU | 62054 tok/s +step 14442/19560 | loss 3.254022 (-2.04z)| norm 0.2772 (+0.55z)| lr 1.03e-04 | 8446.41 ms | -100.0% bf16 MFU | 62055 tok/s +step 14443/19560 | loss 3.291336 (-1.01z)| norm 0.2599 (-0.71z)| lr 1.03e-04 | 8447.02 ms | -100.0% bf16 MFU | 62055 tok/s +step 14444/19560 | loss 3.315509 (-0.34z)| norm 0.2603 (-0.67z)| lr 1.03e-04 | 8453.10 ms | -100.0% bf16 MFU | 62054 tok/s +step 14445/19560 | loss 3.286194 (-1.12z)| norm 0.2677 (-0.13z)| lr 1.03e-04 | 8443.64 ms | -100.0% bf16 MFU | 62056 tok/s +step 14446/19560 | loss 3.279154 (-1.30z)| norm 0.2492 (-1.48z)| lr 1.02e-04 | 8454.43 ms | -100.0% bf16 MFU | 62054 tok/s +step 14447/19560 | loss 3.345115 (+0.48z)| norm 0.2684 (-0.07z)| lr 1.02e-04 | 8442.92 ms | -100.0% bf16 MFU | 62056 tok/s +step 14448/19560 | loss 3.391584 (+1.70z)| norm 0.2591 (-0.75z)| lr 1.02e-04 | 8444.31 ms | -100.0% bf16 MFU | 62057 tok/s +step 14449/19560 | loss 3.302764 (-0.66z)| norm 0.2635 (-0.43z)| lr 1.02e-04 | 8442.98 ms | -100.0% bf16 MFU | 62059 tok/s +step 14450/19560 | loss 3.285865 (-1.13z)| norm 0.2475 (-1.57z)| lr 1.02e-04 | 8446.65 ms | -100.0% bf16 MFU | 62060 tok/s +step 14451/19560 | loss 3.270748 (-1.52z)| norm 0.2646 (-0.34z)| lr 1.02e-04 | 8449.57 ms | -100.0% bf16 MFU | 62059 tok/s +step 14452/19560 | loss 3.321878 (-0.14z)| norm 0.2755 (+0.44z)| lr 1.02e-04 | 8452.55 ms | -100.0% bf16 MFU | 62058 tok/s +step 14453/19560 | loss 3.276373 (-1.34z)| norm 0.2687 (-0.04z)| lr 1.02e-04 | 8447.27 ms | -100.0% bf16 MFU | 62058 tok/s +step 14454/19560 | loss 3.334822 (+0.21z)| norm 0.2869 (+1.27z)| lr 1.02e-04 | 8452.35 ms | -100.0% bf16 MFU | 62057 tok/s +step 14455/19560 | loss 3.380142 (+1.40z)| norm 0.2729 (+0.26z)| lr 1.02e-04 | 8443.60 ms | -100.0% bf16 MFU | 62059 tok/s +step 14456/19560 | loss 3.329556 (+0.05z)| norm 0.2917 (+1.59z)| lr 1.02e-04 | 8450.76 ms | -100.0% bf16 MFU | 62058 tok/s +step 14457/19560 | loss 3.328294 (+0.01z)| norm 0.2634 (-0.45z)| lr 1.02e-04 | 8442.85 ms | -100.0% bf16 MFU | 62060 tok/s +step 14458/19560 | loss 3.317210 (-0.28z)| norm 0.2730 (+0.24z)| lr 1.02e-04 | 8453.30 ms | -100.0% bf16 MFU | 62058 tok/s +step 14459/19560 | loss 3.279441 (-1.27z)| norm 0.2608 (-0.65z)| lr 1.02e-04 | 8448.70 ms | -100.0% bf16 MFU | 62058 tok/s +step 14460/19560 | loss 3.289559 (-0.99z)| norm 0.2597 (-0.74z)| lr 1.02e-04 | 8446.47 ms | -100.0% bf16 MFU | 62058 tok/s +step 14461/19560 | loss 3.347802 (+0.55z)| norm 0.2693 (+0.02z)| lr 1.02e-04 | 8446.56 ms | -100.0% bf16 MFU | 62059 tok/s +step 14462/19560 | loss 3.290475 (-0.98z)| norm 0.2666 (-0.20z)| lr 1.02e-04 | 8449.07 ms | -100.0% bf16 MFU | 62059 tok/s +step 14463/19560 | loss 3.299520 (-0.73z)| norm 0.2802 (+0.89z)| lr 1.02e-04 | 8449.89 ms | -100.0% bf16 MFU | 62058 tok/s +step 14464/19560 | loss 3.304212 (-0.60z)| norm 0.2477 (-1.67z)| lr 1.02e-04 | 8449.82 ms | -100.0% bf16 MFU | 62058 tok/s +step 14465/19560 | loss 3.290348 (-0.95z)| norm 0.2941 (+2.11z)| lr 1.02e-04 | 8444.50 ms | -100.0% bf16 MFU | 62059 tok/s +step 14466/19560 | loss 3.306798 (-0.52z)| norm 0.2632 (-0.47z)| lr 1.02e-04 | 8449.22 ms | -100.0% bf16 MFU | 62059 tok/s +step 14467/19560 | loss 3.380247 (+1.40z)| norm 0.2506 (-1.50z)| lr 1.02e-04 | 8452.81 ms | -100.0% bf16 MFU | 62057 tok/s +step 14468/19560 | loss 3.337692 (+0.27z)| norm 0.2599 (-0.72z)| lr 1.02e-04 | 8446.13 ms | -100.0% bf16 MFU | 62058 tok/s +step 14469/19560 | loss 3.312910 (-0.39z)| norm 0.2544 (-1.17z)| lr 1.02e-04 | 8448.56 ms | -100.0% bf16 MFU | 62058 tok/s +step 14470/19560 | loss 3.303180 (-0.64z)| norm 0.2573 (-0.91z)| lr 1.02e-04 | 8440.64 ms | -100.0% bf16 MFU | 62061 tok/s +step 14471/19560 | loss 3.288173 (-1.05z)| norm 0.2628 (-0.46z)| lr 1.02e-04 | 8449.56 ms | -100.0% bf16 MFU | 62060 tok/s +step 14472/19560 | loss 3.303308 (-0.64z)| norm 0.2627 (-0.47z)| lr 1.01e-04 | 8448.59 ms | -100.0% bf16 MFU | 62060 tok/s +step 14473/19560 | loss 3.343012 (+0.41z)| norm 0.2814 (+1.10z)| lr 1.01e-04 | 8441.45 ms | -100.0% bf16 MFU | 62062 tok/s +step 14474/19560 | loss 3.287523 (-1.10z)| norm 0.2533 (-1.24z)| lr 1.01e-04 | 8446.82 ms | -100.0% bf16 MFU | 62063 tok/s +step 14475/19560 | loss 3.313769 (-0.35z)| norm 0.2473 (-1.71z)| lr 1.01e-04 | 8444.27 ms | -100.0% bf16 MFU | 62064 tok/s +step 14476/19560 | loss 3.326286 (-0.01z)| norm 0.2695 (+0.12z)| lr 1.01e-04 | 8439.50 ms | -100.0% bf16 MFU | 62067 tok/s +step 14477/19560 | loss 3.327932 (+0.04z)| norm 0.2595 (-0.69z)| lr 1.01e-04 | 8446.97 ms | -100.0% bf16 MFU | 62067 tok/s +step 14478/19560 | loss 3.312555 (-0.40z)| norm 0.2568 (-0.91z)| lr 1.01e-04 | 8443.83 ms | -100.0% bf16 MFU | 62068 tok/s +step 14479/19560 | loss 3.313883 (-0.35z)| norm 0.2483 (-1.59z)| lr 1.01e-04 | 8439.59 ms | -100.0% bf16 MFU | 62071 tok/s +step 14480/19560 | loss 3.289019 (-1.07z)| norm 0.2681 (+0.04z)| lr 1.01e-04 | 8433.83 ms | -100.0% bf16 MFU | 62076 tok/s +step 14481/19560 | loss 3.315061 (-0.31z)| norm 0.2592 (-0.69z)| lr 1.01e-04 | 8438.32 ms | -100.0% bf16 MFU | 62078 tok/s +step 14482/19560 | loss 3.332809 (+0.24z)| norm 0.2598 (-0.63z)| lr 1.01e-04 | 8435.32 ms | -100.0% bf16 MFU | 62082 tok/s +step 14483/19560 | loss 3.385798 (+1.82z)| norm 0.2827 (+1.31z)| lr 1.01e-04 | 8437.04 ms | -100.0% bf16 MFU | 62085 tok/s +step 14484/19560 | loss 3.371327 (+1.37z)| norm 0.2687 (+0.12z)| lr 1.01e-04 | 8437.24 ms | -100.0% bf16 MFU | 62088 tok/s +step 14485/19560 | loss 3.331635 (+0.18z)| norm 0.2730 (+0.49z)| lr 1.01e-04 | 8438.53 ms | -100.0% bf16 MFU | 62090 tok/s +step 14486/19560 | loss 3.341570 (+0.47z)| norm 0.2674 (+0.02z)| lr 1.01e-04 | 8434.12 ms | -100.0% bf16 MFU | 62094 tok/s +step 14487/19560 | loss 3.326755 (+0.03z)| norm 0.2694 (+0.20z)| lr 1.01e-04 | 8440.65 ms | -100.0% bf16 MFU | 62095 tok/s +step 14488/19560 | loss 3.440054 (+3.30z)| norm 0.2757 (+0.73z)| lr 1.01e-04 | 8434.34 ms | -100.0% bf16 MFU | 62098 tok/s +step 14489/19560 | loss 3.279264 (-1.37z)| norm 0.2631 (-0.34z)| lr 1.01e-04 | 8435.35 ms | -100.0% bf16 MFU | 62101 tok/s +step 14490/19560 | loss 3.302755 (-0.68z)| norm 0.2694 (+0.20z)| lr 1.01e-04 | 8437.63 ms | -100.0% bf16 MFU | 62103 tok/s +step 14491/19560 | loss 3.316995 (-0.26z)| norm 0.2702 (+0.26z)| lr 1.01e-04 | 8439.50 ms | -100.0% bf16 MFU | 62104 tok/s +step 14492/19560 | loss 3.356205 (+0.87z)| norm 0.2728 (+0.50z)| lr 1.01e-04 | 8434.19 ms | -100.0% bf16 MFU | 62107 tok/s +step 14493/19560 | loss 3.349575 (+0.67z)| norm 0.2605 (-0.58z)| lr 1.01e-04 | 8439.99 ms | -100.0% bf16 MFU | 62107 tok/s +step 14494/19560 | loss 3.377636 (+1.47z)| norm 0.2675 (+0.04z)| lr 1.01e-04 | 8440.50 ms | -100.0% bf16 MFU | 62108 tok/s +step 14495/19560 | loss 3.379883 (+1.51z)| norm 0.2674 (+0.03z)| lr 1.01e-04 | 8435.53 ms | -100.0% bf16 MFU | 62110 tok/s +step 14496/19560 | loss 3.318727 (-0.23z)| norm 0.2642 (-0.26z)| lr 1.01e-04 | 8437.85 ms | -100.0% bf16 MFU | 62111 tok/s +step 14497/19560 | loss 3.334465 (+0.22z)| norm 0.2682 (+0.08z)| lr 1.01e-04 | 8442.75 ms | -100.0% bf16 MFU | 62110 tok/s +step 14498/19560 | loss 3.328913 (+0.07z)| norm 0.2612 (-0.54z)| lr 1.01e-04 | 8439.87 ms | -100.0% bf16 MFU | 62111 tok/s +step 14499/19560 | loss 3.300920 (-0.72z)| norm 0.2472 (-1.74z)| lr 1.00e-04 | 8442.07 ms | -100.0% bf16 MFU | 62111 tok/s +step 14500/19560 | loss 3.322417 (-0.11z)| norm 0.2557 (-1.02z)| lr 1.00e-04 | 8438.93 ms | -100.0% bf16 MFU | 62111 tok/s +val loss 3.299815 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2995/10042 = 0.298247 +step 14501/19560 | loss 3.301363 (-0.70z)| norm 0.2702 (+0.28z)| lr 1.00e-04 | 8438.77 ms | -100.0% bf16 MFU | 62112 tok/s +step 14502/19560 | loss 3.327758 (+0.06z)| norm 0.2514 (-1.40z)| lr 1.00e-04 | 8436.30 ms | -100.0% bf16 MFU | 62114 tok/s +step 14503/19560 | loss 3.312041 (-0.39z)| norm 0.2634 (-0.32z)| lr 1.00e-04 | 8435.57 ms | -100.0% bf16 MFU | 62116 tok/s +step 14504/19560 | loss 3.331061 (+0.15z)| norm 0.2763 (+0.82z)| lr 1.00e-04 | 8436.43 ms | -100.0% bf16 MFU | 62117 tok/s +step 14505/19560 | loss 3.350903 (+0.72z)| norm 0.2685 (+0.13z)| lr 1.00e-04 | 8438.37 ms | -100.0% bf16 MFU | 62118 tok/s +step 14506/19560 | loss 3.320004 (-0.19z)| norm 0.2684 (+0.11z)| lr 1.00e-04 | 8441.40 ms | -100.0% bf16 MFU | 62118 tok/s +step 14507/19560 | loss 3.312916 (-0.39z)| norm 0.2613 (-0.53z)| lr 1.00e-04 | 8439.56 ms | -100.0% bf16 MFU | 62118 tok/s +step 14508/19560 | loss 3.304229 (-0.65z)| norm 0.2768 (+0.85z)| lr 1.00e-04 | 8436.57 ms | -100.0% bf16 MFU | 62119 tok/s +step 14509/19560 | loss 3.320777 (-0.15z)| norm 0.2584 (-0.78z)| lr 1.00e-04 | 8436.80 ms | -100.0% bf16 MFU | 62120 tok/s +step 14510/19560 | loss 3.371118 (+1.33z)| norm 0.2603 (-0.62z)| lr 1.00e-04 | 8437.44 ms | -100.0% bf16 MFU | 62121 tok/s +step 14511/19560 | loss 3.299154 (-0.79z)| norm 0.2476 (-1.73z)| lr 1.00e-04 | 8439.80 ms | -100.0% bf16 MFU | 62121 tok/s +step 14512/19560 | loss 3.310794 (-0.43z)| norm 0.2681 (+0.09z)| lr 1.00e-04 | 8436.24 ms | -100.0% bf16 MFU | 62123 tok/s +step 14513/19560 | loss 3.363343 (+1.14z)| norm 0.2631 (-0.36z)| lr 1.00e-04 | 8441.19 ms | -100.0% bf16 MFU | 62122 tok/s +step 14514/19560 | loss 3.237833 (-2.56z)| norm 0.2634 (-0.33z)| lr 9.99e-05 | 8441.58 ms | -100.0% bf16 MFU | 62121 tok/s +step 14515/19560 | loss 3.290266 (-0.99z)| norm 0.2645 (-0.22z)| lr 9.99e-05 | 8443.56 ms | -100.0% bf16 MFU | 62120 tok/s +step 14516/19560 | loss 3.311079 (-0.38z)| norm 0.2484 (-1.64z)| lr 9.98e-05 | 8441.22 ms | -100.0% bf16 MFU | 62119 tok/s +step 14517/19560 | loss 3.297564 (-0.76z)| norm 0.2605 (-0.56z)| lr 9.98e-05 | 8440.08 ms | -100.0% bf16 MFU | 62119 tok/s +step 14518/19560 | loss 3.287247 (-1.07z)| norm 0.2672 (+0.05z)| lr 9.98e-05 | 8438.48 ms | -100.0% bf16 MFU | 62120 tok/s +step 14519/19560 | loss 3.349890 (+0.77z)| norm 0.2515 (-1.34z)| lr 9.97e-05 | 8439.21 ms | -100.0% bf16 MFU | 62120 tok/s +step 14520/19560 | loss 3.295231 (-0.83z)| norm 0.2655 (-0.09z)| lr 9.97e-05 | 8443.17 ms | -100.0% bf16 MFU | 62119 tok/s +step 14521/19560 | loss 3.349379 (+0.75z)| norm 0.2765 (+0.92z)| lr 9.97e-05 | 8443.78 ms | -100.0% bf16 MFU | 62118 tok/s +step 14522/19560 | loss 3.336346 (+0.41z)| norm 0.2532 (-1.19z)| lr 9.96e-05 | 8438.92 ms | -100.0% bf16 MFU | 62118 tok/s +step 14523/19560 | loss 3.419397 (+2.88z)| norm 0.2564 (-0.88z)| lr 9.96e-05 | 8437.73 ms | -100.0% bf16 MFU | 62119 tok/s +step 14524/19560 | loss 3.316851 (-0.22z)| norm 0.2784 (+1.13z)| lr 9.95e-05 | 8441.68 ms | -100.0% bf16 MFU | 62118 tok/s +step 14525/19560 | loss 3.323910 (-0.02z)| norm 0.2626 (-0.30z)| lr 9.95e-05 | 8446.96 ms | -100.0% bf16 MFU | 62116 tok/s +step 14526/19560 | loss 3.357122 (+0.99z)| norm 0.2893 (+2.12z)| lr 9.95e-05 | 8440.95 ms | -100.0% bf16 MFU | 62116 tok/s +step 14527/19560 | loss 3.315104 (-0.29z)| norm 0.2686 (+0.24z)| lr 9.94e-05 | 8441.56 ms | -100.0% bf16 MFU | 62115 tok/s +step 14528/19560 | loss 3.393593 (+2.08z)| norm 0.2799 (+1.30z)| lr 9.94e-05 | 8439.47 ms | -100.0% bf16 MFU | 62116 tok/s +step 14529/19560 | loss 3.341595 (+0.50z)| norm 0.2723 (+0.58z)| lr 9.94e-05 | 8442.96 ms | -100.0% bf16 MFU | 62115 tok/s +step 14530/19560 | loss 3.326931 (+0.05z)| norm 0.2792 (+1.23z)| lr 9.93e-05 | 8441.49 ms | -100.0% bf16 MFU | 62115 tok/s +step 14531/19560 | loss 3.316003 (-0.27z)| norm 0.2916 (+2.33z)| lr 9.93e-05 | 8444.01 ms | -100.0% bf16 MFU | 62113 tok/s +step 14532/19560 | loss 3.364052 (+1.17z)| norm 0.2674 (+0.12z)| lr 9.92e-05 | 8440.29 ms | -100.0% bf16 MFU | 62113 tok/s +step 14533/19560 | loss 3.408604 (+2.44z)| norm 0.2841 (+1.64z)| lr 9.92e-05 | 8444.41 ms | -100.0% bf16 MFU | 62112 tok/s +step 14534/19560 | loss 3.357406 (+0.92z)| norm 0.2723 (+0.56z)| lr 9.92e-05 | 8441.08 ms | -100.0% bf16 MFU | 62112 tok/s +step 14535/19560 | loss 3.381876 (+1.61z)| norm 0.2946 (+2.52z)| lr 9.91e-05 | 8438.46 ms | -100.0% bf16 MFU | 62113 tok/s +step 14536/19560 | loss 3.383294 (+1.62z)| norm 0.2794 (+1.14z)| lr 9.91e-05 | 8440.52 ms | -100.0% bf16 MFU | 62113 tok/s +step 14537/19560 | loss 3.374841 (+1.35z)| norm 0.2671 (+0.06z)| lr 9.91e-05 | 8441.16 ms | -100.0% bf16 MFU | 62113 tok/s +step 14538/19560 | loss 3.326853 (-0.02z)| norm 0.2700 (+0.31z)| lr 9.90e-05 | 8441.99 ms | -100.0% bf16 MFU | 62113 tok/s +step 14539/19560 | loss 3.321615 (-0.18z)| norm 0.2678 (+0.11z)| lr 9.90e-05 | 8443.13 ms | -100.0% bf16 MFU | 62112 tok/s +step 14540/19560 | loss 3.286069 (-1.19z)| norm 0.2868 (+1.78z)| lr 9.90e-05 | 8444.84 ms | -100.0% bf16 MFU | 62110 tok/s +step 14541/19560 | loss 3.285415 (-1.20z)| norm 0.2863 (+1.70z)| lr 9.89e-05 | 8440.75 ms | -100.0% bf16 MFU | 62111 tok/s +step 14542/19560 | loss 3.340246 (+0.39z)| norm 0.2860 (+1.64z)| lr 9.89e-05 | 8441.25 ms | -100.0% bf16 MFU | 62111 tok/s +step 14543/19560 | loss 3.305269 (-0.62z)| norm 0.2802 (+1.15z)| lr 9.88e-05 | 8444.24 ms | -100.0% bf16 MFU | 62109 tok/s +step 14544/19560 | loss 3.281990 (-1.28z)| norm 0.2865 (+1.66z)| lr 9.88e-05 | 8441.25 ms | -100.0% bf16 MFU | 62110 tok/s +step 14545/19560 | loss 3.319261 (-0.20z)| norm 0.2623 (-0.41z)| lr 9.88e-05 | 8441.35 ms | -100.0% bf16 MFU | 62110 tok/s +step 14546/19560 | loss 3.362529 (+1.03z)| norm 0.2908 (+2.06z)| lr 9.87e-05 | 8438.43 ms | -100.0% bf16 MFU | 62111 tok/s +step 14547/19560 | loss 3.312401 (-0.40z)| norm 0.2740 (+0.61z)| lr 9.87e-05 | 8439.15 ms | -100.0% bf16 MFU | 62111 tok/s +step 14548/19560 | loss 3.307569 (-0.53z)| norm 0.2701 (+0.26z)| lr 9.87e-05 | 8439.22 ms | -100.0% bf16 MFU | 62112 tok/s +step 14549/19560 | loss 3.338869 (+0.37z)| norm 0.2762 (+0.79z)| lr 9.86e-05 | 8438.14 ms | -100.0% bf16 MFU | 62113 tok/s +step 14550/19560 | loss 3.317627 (-0.23z)| norm 0.2812 (+1.21z)| lr 9.86e-05 | 8440.12 ms | -100.0% bf16 MFU | 62113 tok/s +step 14551/19560 | loss 3.317997 (-0.22z)| norm 0.2555 (-1.01z)| lr 9.85e-05 | 8439.52 ms | -100.0% bf16 MFU | 62114 tok/s +step 14552/19560 | loss 3.289405 (-1.04z)| norm 0.2749 (+0.68z)| lr 9.85e-05 | 8441.44 ms | -100.0% bf16 MFU | 62114 tok/s +step 14553/19560 | loss 3.309475 (-0.45z)| norm 0.2658 (-0.11z)| lr 9.85e-05 | 8439.59 ms | -100.0% bf16 MFU | 62114 tok/s +step 14554/19560 | loss 3.326621 (+0.07z)| norm 0.2603 (-0.58z)| lr 9.84e-05 | 8439.42 ms | -100.0% bf16 MFU | 62115 tok/s +step 14555/19560 | loss 3.321044 (-0.10z)| norm 0.2568 (-0.88z)| lr 9.84e-05 | 8439.01 ms | -100.0% bf16 MFU | 62115 tok/s +step 14556/19560 | loss 3.308393 (-0.47z)| norm 0.2752 (+0.71z)| lr 9.84e-05 | 8438.52 ms | -100.0% bf16 MFU | 62116 tok/s +step 14557/19560 | loss 3.369454 (+1.32z)| norm 0.2578 (-0.80z)| lr 9.83e-05 | 8437.55 ms | -100.0% bf16 MFU | 62117 tok/s +step 14558/19560 | loss 3.406394 (+2.37z)| norm 0.2791 (+1.04z)| lr 9.83e-05 | 8442.53 ms | -100.0% bf16 MFU | 62116 tok/s +step 14559/19560 | loss 3.327527 (+0.07z)| norm 0.2650 (-0.19z)| lr 9.82e-05 | 8437.60 ms | -100.0% bf16 MFU | 62117 tok/s +step 14560/19560 | loss 3.368141 (+1.23z)| norm 0.2756 (+0.72z)| lr 9.82e-05 | 8439.47 ms | -100.0% bf16 MFU | 62118 tok/s +step 14561/19560 | loss 3.334610 (+0.25z)| norm 0.2659 (-0.11z)| lr 9.82e-05 | 8438.22 ms | -100.0% bf16 MFU | 62118 tok/s +step 14562/19560 | loss 3.332675 (+0.19z)| norm 0.2598 (-0.67z)| lr 9.81e-05 | 8439.29 ms | -100.0% bf16 MFU | 62119 tok/s +step 14563/19560 | loss 3.333332 (+0.20z)| norm 0.2566 (-0.95z)| lr 9.81e-05 | 8442.73 ms | -100.0% bf16 MFU | 62118 tok/s +step 14564/19560 | loss 3.330703 (+0.13z)| norm 0.2815 (+1.28z)| lr 9.81e-05 | 8436.44 ms | -100.0% bf16 MFU | 62119 tok/s +step 14565/19560 | loss 3.320360 (-0.18z)| norm 0.2536 (-1.23z)| lr 9.80e-05 | 8441.39 ms | -100.0% bf16 MFU | 62119 tok/s +step 14566/19560 | loss 3.294557 (-0.93z)| norm 0.2590 (-0.73z)| lr 9.80e-05 | 8440.62 ms | -100.0% bf16 MFU | 62118 tok/s +step 14567/19560 | loss 3.331185 (+0.14z)| norm 0.2659 (-0.12z)| lr 9.80e-05 | 8438.04 ms | -100.0% bf16 MFU | 62119 tok/s +step 14568/19560 | loss 3.238711 (-2.48z)| norm 0.2546 (-1.14z)| lr 9.79e-05 | 8439.62 ms | -100.0% bf16 MFU | 62119 tok/s +step 14569/19560 | loss 3.302961 (-0.63z)| norm 0.2496 (-1.58z)| lr 9.79e-05 | 8439.04 ms | -100.0% bf16 MFU | 62120 tok/s +step 14570/19560 | loss 3.380465 (+1.57z)| norm 0.2631 (-0.36z)| lr 9.78e-05 | 8437.11 ms | -100.0% bf16 MFU | 62121 tok/s +step 14571/19560 | loss 3.284406 (-1.20z)| norm 0.2756 (+0.75z)| lr 9.78e-05 | 8441.15 ms | -100.0% bf16 MFU | 62120 tok/s +step 14572/19560 | loss 3.354402 (+0.81z)| norm 0.2545 (-1.14z)| lr 9.78e-05 | 8442.07 ms | -100.0% bf16 MFU | 62119 tok/s +step 14573/19560 | loss 3.412344 (+2.41z)| norm 0.2910 (+2.08z)| lr 9.77e-05 | 8438.19 ms | -100.0% bf16 MFU | 62120 tok/s +step 14574/19560 | loss 3.289283 (-1.08z)| norm 0.2548 (-1.12z)| lr 9.77e-05 | 8438.42 ms | -100.0% bf16 MFU | 62121 tok/s +step 14575/19560 | loss 3.331749 (+0.13z)| norm 0.2679 (+0.05z)| lr 9.77e-05 | 8438.76 ms | -100.0% bf16 MFU | 62121 tok/s +step 14576/19560 | loss 3.284929 (-1.19z)| norm 0.2652 (-0.20z)| lr 9.76e-05 | 8443.14 ms | -100.0% bf16 MFU | 62120 tok/s +step 14577/19560 | loss 3.263730 (-1.76z)| norm 0.2459 (-1.88z)| lr 9.76e-05 | 8439.11 ms | -100.0% bf16 MFU | 62120 tok/s +step 14578/19560 | loss 3.335749 (+0.26z)| norm 0.2749 (+0.65z)| lr 9.75e-05 | 8438.29 ms | -100.0% bf16 MFU | 62121 tok/s +step 14579/19560 | loss 3.285779 (-1.17z)| norm 0.2475 (-1.74z)| lr 9.75e-05 | 8437.38 ms | -100.0% bf16 MFU | 62122 tok/s +step 14580/19560 | loss 3.305809 (-0.59z)| norm 0.2658 (-0.14z)| lr 9.75e-05 | 8437.02 ms | -100.0% bf16 MFU | 62123 tok/s +step 14581/19560 | loss 3.290710 (-1.03z)| norm 0.2471 (-1.74z)| lr 9.74e-05 | 8439.40 ms | -100.0% bf16 MFU | 62123 tok/s +step 14582/19560 | loss 3.315536 (-0.31z)| norm 0.2642 (-0.24z)| lr 9.74e-05 | 8439.43 ms | -100.0% bf16 MFU | 62123 tok/s +step 14583/19560 | loss 3.441969 (+3.19z)| norm 0.2593 (-0.66z)| lr 9.74e-05 | 8436.96 ms | -100.0% bf16 MFU | 62124 tok/s +step 14584/19560 | loss 3.294947 (-0.88z)| norm 0.2602 (-0.58z)| lr 9.73e-05 | 8440.64 ms | -100.0% bf16 MFU | 62123 tok/s +step 14585/19560 | loss 3.286685 (-1.09z)| norm 0.2838 (+1.50z)| lr 9.73e-05 | 8438.76 ms | -100.0% bf16 MFU | 62123 tok/s +step 14586/19560 | loss 3.349707 (+0.64z)| norm 0.2612 (-0.48z)| lr 9.73e-05 | 8438.71 ms | -100.0% bf16 MFU | 62124 tok/s +step 14587/19560 | loss 3.258443 (-1.85z)| norm 0.3025 (+3.02z)| lr 9.72e-05 | 8436.39 ms | -100.0% bf16 MFU | 62125 tok/s +step 14588/19560 | loss 3.279489 (-1.27z)| norm 0.2611 (-0.50z)| lr 9.72e-05 | 8440.29 ms | -100.0% bf16 MFU | 62124 tok/s +step 14589/19560 | loss 3.316784 (-0.25z)| norm 0.2611 (-0.50z)| lr 9.71e-05 | 8438.79 ms | -100.0% bf16 MFU | 62125 tok/s +step 14590/19560 | loss 3.332466 (+0.17z)| norm 0.2577 (-0.78z)| lr 9.71e-05 | 8439.98 ms | -100.0% bf16 MFU | 62124 tok/s +step 14591/19560 | loss 3.329149 (+0.07z)| norm 0.2436 (-1.93z)| lr 9.71e-05 | 8440.65 ms | -100.0% bf16 MFU | 62124 tok/s +step 14592/19560 | loss 3.337411 (+0.29z)| norm 0.2644 (-0.20z)| lr 9.70e-05 | 8436.13 ms | -100.0% bf16 MFU | 62125 tok/s +step 14593/19560 | loss 3.306917 (-0.55z)| norm 0.2564 (-0.87z)| lr 9.70e-05 | 8440.01 ms | -100.0% bf16 MFU | 62125 tok/s +step 14594/19560 | loss 3.310388 (-0.46z)| norm 0.2638 (-0.23z)| lr 9.70e-05 | 8438.57 ms | -100.0% bf16 MFU | 62125 tok/s +step 14595/19560 | loss 3.229833 (-2.59z)| norm 0.2599 (-0.58z)| lr 9.69e-05 | 8438.21 ms | -100.0% bf16 MFU | 62125 tok/s +step 14596/19560 | loss 3.329069 (+0.09z)| norm 0.2597 (-0.59z)| lr 9.69e-05 | 8439.52 ms | -100.0% bf16 MFU | 62125 tok/s +step 14597/19560 | loss 3.368340 (+1.13z)| norm 0.2582 (-0.73z)| lr 9.68e-05 | 8437.42 ms | -100.0% bf16 MFU | 62126 tok/s +step 14598/19560 | loss 3.300688 (-0.69z)| norm 0.2570 (-0.83z)| lr 9.68e-05 | 8445.14 ms | -100.0% bf16 MFU | 62124 tok/s +step 14599/19560 | loss 3.316942 (-0.26z)| norm 0.2561 (-0.91z)| lr 9.68e-05 | 8460.98 ms | -100.0% bf16 MFU | 62116 tok/s +step 14600/19560 | loss 3.299149 (-0.74z)| norm 0.2475 (-1.63z)| lr 9.67e-05 | 8468.87 ms | -100.0% bf16 MFU | 62105 tok/s +step 14601/19560 | loss 3.299001 (-0.73z)| norm 0.2592 (-0.60z)| lr 9.67e-05 | 8465.36 ms | -100.0% bf16 MFU | 62097 tok/s +step 14602/19560 | loss 3.335113 (+0.23z)| norm 0.2525 (-1.18z)| lr 9.67e-05 | 8463.96 ms | -100.0% bf16 MFU | 62089 tok/s +step 14603/19560 | loss 3.336999 (+0.28z)| norm 0.2699 (+0.31z)| lr 9.66e-05 | 8457.50 ms | -100.0% bf16 MFU | 62084 tok/s +step 14604/19560 | loss 3.347035 (+0.55z)| norm 0.2693 (+0.25z)| lr 9.66e-05 | 8467.07 ms | -100.0% bf16 MFU | 62076 tok/s +step 14605/19560 | loss 3.373817 (+1.25z)| norm 0.2902 (+2.03z)| lr 9.66e-05 | 8463.94 ms | -100.0% bf16 MFU | 62069 tok/s +step 14606/19560 | loss 3.367920 (+1.08z)| norm 0.2825 (+1.34z)| lr 9.65e-05 | 8465.38 ms | -100.0% bf16 MFU | 62063 tok/s +step 14607/19560 | loss 3.330684 (+0.08z)| norm 0.2820 (+1.29z)| lr 9.65e-05 | 8461.90 ms | -100.0% bf16 MFU | 62057 tok/s +step 14608/19560 | loss 3.341750 (+0.37z)| norm 0.2703 (+0.27z)| lr 9.64e-05 | 8455.32 ms | -100.0% bf16 MFU | 62055 tok/s +step 14609/19560 | loss 3.257041 (-1.87z)| norm 0.2643 (-0.25z)| lr 9.64e-05 | 8466.53 ms | -100.0% bf16 MFU | 62048 tok/s +step 14610/19560 | loss 3.348515 (+0.55z)| norm 0.2659 (-0.11z)| lr 9.64e-05 | 8461.75 ms | -100.0% bf16 MFU | 62044 tok/s +step 14611/19560 | loss 3.305580 (-0.58z)| norm 0.2569 (-0.87z)| lr 9.63e-05 | 8459.62 ms | -100.0% bf16 MFU | 62041 tok/s +step 14612/19560 | loss 3.334658 (+0.21z)| norm 0.2623 (-0.40z)| lr 9.63e-05 | 8461.95 ms | -100.0% bf16 MFU | 62036 tok/s +step 14613/19560 | loss 3.294785 (-0.85z)| norm 0.2666 (-0.03z)| lr 9.63e-05 | 8457.84 ms | -100.0% bf16 MFU | 62034 tok/s +step 14614/19560 | loss 3.312321 (-0.38z)| norm 0.2898 (+1.95z)| lr 9.62e-05 | 8457.91 ms | -100.0% bf16 MFU | 62032 tok/s +step 14615/19560 | loss 3.347505 (+0.56z)| norm 0.2517 (-1.29z)| lr 9.62e-05 | 8459.53 ms | -100.0% bf16 MFU | 62029 tok/s +step 14616/19560 | loss 3.419480 (+2.52z)| norm 0.2676 (+0.07z)| lr 9.61e-05 | 8457.94 ms | -100.0% bf16 MFU | 62027 tok/s +step 14617/19560 | loss 3.364279 (+1.01z)| norm 0.2657 (-0.10z)| lr 9.61e-05 | 8456.51 ms | -100.0% bf16 MFU | 62025 tok/s +step 14618/19560 | loss 3.327707 (+0.01z)| norm 0.2651 (-0.15z)| lr 9.61e-05 | 8456.71 ms | -100.0% bf16 MFU | 62024 tok/s +step 14619/19560 | loss 3.382822 (+1.48z)| norm 0.2611 (-0.49z)| lr 9.60e-05 | 8452.38 ms | -100.0% bf16 MFU | 62024 tok/s +step 14620/19560 | loss 3.305934 (-0.58z)| norm 0.2832 (+1.38z)| lr 9.60e-05 | 8466.42 ms | -100.0% bf16 MFU | 62019 tok/s +step 14621/19560 | loss 3.215792 (-2.88z)| norm 0.2628 (-0.34z)| lr 9.60e-05 | 8452.86 ms | -100.0% bf16 MFU | 62020 tok/s +step 14622/19560 | loss 3.311899 (-0.36z)| norm 0.2882 (+1.77z)| lr 9.59e-05 | 8457.26 ms | -100.0% bf16 MFU | 62018 tok/s +step 14623/19560 | loss 3.377545 (+1.36z)| norm 0.2716 (+0.38z)| lr 9.59e-05 | 8459.82 ms | -100.0% bf16 MFU | 62016 tok/s +step 14624/19560 | loss 3.341503 (+0.41z)| norm 0.2694 (+0.19z)| lr 9.59e-05 | 8461.14 ms | -100.0% bf16 MFU | 62013 tok/s +step 14625/19560 | loss 3.299574 (-0.68z)| norm 0.2738 (+0.56z)| lr 9.58e-05 | 8458.08 ms | -100.0% bf16 MFU | 62012 tok/s +step 14626/19560 | loss 3.301664 (-0.62z)| norm 0.2506 (-1.36z)| lr 9.58e-05 | 8455.39 ms | -100.0% bf16 MFU | 62012 tok/s +step 14627/19560 | loss 3.384759 (+1.52z)| norm 0.2938 (+2.17z)| lr 9.57e-05 | 8456.19 ms | -100.0% bf16 MFU | 62011 tok/s +step 14628/19560 | loss 3.308623 (-0.45z)| norm 0.2868 (+1.56z)| lr 9.57e-05 | 8451.52 ms | -100.0% bf16 MFU | 62012 tok/s +step 14629/19560 | loss 3.405607 (+2.02z)| norm 0.2898 (+1.77z)| lr 9.57e-05 | 8455.27 ms | -100.0% bf16 MFU | 62012 tok/s +step 14630/19560 | loss 3.370226 (+1.10z)| norm 0.2728 (+0.39z)| lr 9.56e-05 | 8452.93 ms | -100.0% bf16 MFU | 62013 tok/s +step 14631/19560 | loss 3.324912 (-0.06z)| norm 0.2762 (+0.66z)| lr 9.56e-05 | 8453.75 ms | -100.0% bf16 MFU | 62013 tok/s +step 14632/19560 | loss 3.359373 (+0.81z)| norm 0.2929 (+1.98z)| lr 9.56e-05 | 8458.39 ms | -100.0% bf16 MFU | 62012 tok/s +step 14633/19560 | loss 3.346680 (+0.49z)| norm 0.2739 (+0.45z)| lr 9.55e-05 | 8454.72 ms | -100.0% bf16 MFU | 62012 tok/s +step 14634/19560 | loss 3.313000 (-0.37z)| norm 0.2802 (+0.94z)| lr 9.55e-05 | 8450.18 ms | -100.0% bf16 MFU | 62013 tok/s +step 14635/19560 | loss 3.410448 (+2.06z)| norm 0.2837 (+1.20z)| lr 9.55e-05 | 8448.26 ms | -100.0% bf16 MFU | 62016 tok/s +step 14636/19560 | loss 3.295237 (-0.82z)| norm 0.2666 (-0.15z)| lr 9.54e-05 | 8453.72 ms | -100.0% bf16 MFU | 62016 tok/s +step 14637/19560 | loss 3.249839 (-1.91z)| norm 0.2768 (+0.65z)| lr 9.54e-05 | 8454.71 ms | -100.0% bf16 MFU | 62015 tok/s +step 14638/19560 | loss 3.341403 (+0.35z)| norm 0.2619 (-0.53z)| lr 9.53e-05 | 8455.97 ms | -100.0% bf16 MFU | 62015 tok/s +step 14639/19560 | loss 3.301433 (-0.64z)| norm 0.2750 (+0.50z)| lr 9.53e-05 | 8456.27 ms | -100.0% bf16 MFU | 62014 tok/s +step 14640/19560 | loss 3.329511 (+0.05z)| norm 0.2779 (+0.72z)| lr 9.53e-05 | 8448.33 ms | -100.0% bf16 MFU | 62016 tok/s +step 14641/19560 | loss 3.337245 (+0.25z)| norm 0.2848 (+1.25z)| lr 9.52e-05 | 8452.40 ms | -100.0% bf16 MFU | 62017 tok/s +step 14642/19560 | loss 3.332865 (+0.12z)| norm 0.2619 (-0.57z)| lr 9.52e-05 | 8453.88 ms | -100.0% bf16 MFU | 62017 tok/s +step 14643/19560 | loss 3.278774 (-1.24z)| norm 0.2767 (+0.60z)| lr 9.52e-05 | 8442.97 ms | -100.0% bf16 MFU | 62021 tok/s +step 14644/19560 | loss 3.325479 (-0.06z)| norm 0.2727 (+0.27z)| lr 9.51e-05 | 8445.05 ms | -100.0% bf16 MFU | 62024 tok/s +step 14645/19560 | loss 3.338660 (+0.26z)| norm 0.2745 (+0.41z)| lr 9.51e-05 | 8448.39 ms | -100.0% bf16 MFU | 62026 tok/s +step 14646/19560 | loss 3.308829 (-0.50z)| norm 0.2635 (-0.47z)| lr 9.51e-05 | 8455.57 ms | -100.0% bf16 MFU | 62025 tok/s +step 14647/19560 | loss 3.363722 (+0.89z)| norm 0.8951 (+11.00z)| lr 9.50e-05 | 8452.84 ms | -100.0% bf16 MFU | 62025 tok/s +step 14648/19560 | loss 3.342937 (+0.35z)| norm 0.2926 (+0.32z)| lr 9.50e-05 | 8446.13 ms | -100.0% bf16 MFU | 62027 tok/s +step 14649/19560 | loss 3.267114 (-1.54z)| norm 0.2706 (-0.07z)| lr 9.49e-05 | 8448.02 ms | -100.0% bf16 MFU | 62029 tok/s +step 14650/19560 | loss 3.322482 (-0.14z)| norm 0.2827 (+0.14z)| lr 9.49e-05 | 8448.12 ms | -100.0% bf16 MFU | 62030 tok/s +step 14651/19560 | loss 3.381217 (+1.36z)| norm 0.2884 (+0.24z)| lr 9.49e-05 | 8448.45 ms | -100.0% bf16 MFU | 62032 tok/s +step 14652/19560 | loss 3.310082 (-0.45z)| norm 0.2744 (-0.01z)| lr 9.48e-05 | 8447.94 ms | -100.0% bf16 MFU | 62033 tok/s +step 14653/19560 | loss 3.291362 (-0.92z)| norm 0.2726 (-0.05z)| lr 9.48e-05 | 8448.65 ms | -100.0% bf16 MFU | 62034 tok/s +step 14654/19560 | loss 3.370510 (+1.09z)| norm 0.2794 (+0.08z)| lr 9.48e-05 | 8455.49 ms | -100.0% bf16 MFU | 62033 tok/s +step 14655/19560 | loss 3.309765 (-0.45z)| norm 0.2754 (+0.01z)| lr 9.47e-05 | 8447.97 ms | -100.0% bf16 MFU | 62034 tok/s +step 14656/19560 | loss 3.298660 (-0.72z)| norm 0.2750 (-0.00z)| lr 9.47e-05 | 8447.31 ms | -100.0% bf16 MFU | 62036 tok/s +step 14657/19560 | loss 3.355837 (+0.74z)| norm 0.2674 (-0.14z)| lr 9.47e-05 | 8449.66 ms | -100.0% bf16 MFU | 62036 tok/s +step 14658/19560 | loss 3.296312 (-0.78z)| norm 0.2708 (-0.07z)| lr 9.46e-05 | 8443.23 ms | -100.0% bf16 MFU | 62039 tok/s +step 14659/19560 | loss 3.347914 (+0.53z)| norm 0.2925 (+0.31z)| lr 9.46e-05 | 8447.16 ms | -100.0% bf16 MFU | 62041 tok/s +step 14660/19560 | loss 3.324321 (-0.06z)| norm 0.2636 (-0.20z)| lr 9.45e-05 | 8444.68 ms | -100.0% bf16 MFU | 62043 tok/s +step 14661/19560 | loss 3.360930 (+0.90z)| norm 0.2795 (+0.08z)| lr 9.45e-05 | 8445.55 ms | -100.0% bf16 MFU | 62045 tok/s +step 14662/19560 | loss 3.267985 (-1.49z)| norm 0.2667 (-0.14z)| lr 9.45e-05 | 8447.54 ms | -100.0% bf16 MFU | 62046 tok/s +step 14663/19560 | loss 3.286769 (-0.99z)| norm 0.2610 (-0.24z)| lr 9.44e-05 | 8444.54 ms | -100.0% bf16 MFU | 62048 tok/s +step 14664/19560 | loss 3.308340 (-0.42z)| norm 0.2790 (+0.08z)| lr 9.44e-05 | 8444.92 ms | -100.0% bf16 MFU | 62050 tok/s +step 14665/19560 | loss 3.409702 (+2.19z)| norm 0.2690 (-0.10z)| lr 9.44e-05 | 8445.21 ms | -100.0% bf16 MFU | 62051 tok/s +step 14666/19560 | loss 3.309438 (-0.39z)| norm 0.2632 (-0.20z)| lr 9.43e-05 | 8444.82 ms | -100.0% bf16 MFU | 62053 tok/s +step 14667/19560 | loss 3.331154 (+0.17z)| norm 0.2575 (-0.30z)| lr 9.43e-05 | 8447.33 ms | -100.0% bf16 MFU | 62053 tok/s +step 14668/19560 | loss 3.345953 (+0.54z)| norm 0.2743 (-0.00z)| lr 9.43e-05 | 8447.12 ms | -100.0% bf16 MFU | 62054 tok/s +step 14669/19560 | loss 3.293599 (-0.82z)| norm 0.2573 (-0.30z)| lr 9.42e-05 | 8444.22 ms | -100.0% bf16 MFU | 62056 tok/s +step 14670/19560 | loss 3.372397 (+1.21z)| norm 0.2644 (-0.17z)| lr 9.42e-05 | 8439.06 ms | -100.0% bf16 MFU | 62059 tok/s +step 14671/19560 | loss 3.323955 (-0.04z)| norm 0.2641 (-0.17z)| lr 9.41e-05 | 8446.41 ms | -100.0% bf16 MFU | 62060 tok/s +step 14672/19560 | loss 3.379620 (+1.37z)| norm 0.2643 (-0.17z)| lr 9.41e-05 | 8450.16 ms | -100.0% bf16 MFU | 62059 tok/s +step 14673/19560 | loss 3.319328 (-0.18z)| norm 0.2660 (-0.14z)| lr 9.41e-05 | 8439.15 ms | -100.0% bf16 MFU | 62063 tok/s +step 14674/19560 | loss 3.346092 (+0.51z)| norm 0.2638 (-0.17z)| lr 9.40e-05 | 8446.86 ms | -100.0% bf16 MFU | 62063 tok/s +step 14675/19560 | loss 3.381442 (+1.40z)| norm 0.2684 (-0.09z)| lr 9.40e-05 | 8442.69 ms | -100.0% bf16 MFU | 62065 tok/s +step 14676/19560 | loss 3.360413 (+0.85z)| norm 0.2726 (-0.02z)| lr 9.40e-05 | 8444.06 ms | -100.0% bf16 MFU | 62066 tok/s +step 14677/19560 | loss 3.340970 (+0.35z)| norm 0.2620 (-0.20z)| lr 9.39e-05 | 8443.88 ms | -100.0% bf16 MFU | 62067 tok/s +step 14678/19560 | loss 3.273433 (-1.35z)| norm 0.2562 (-0.30z)| lr 9.39e-05 | 8446.70 ms | -100.0% bf16 MFU | 62067 tok/s +step 14679/19560 | loss 3.323364 (-0.09z)| norm 0.2593 (-0.25z)| lr 9.39e-05 | 8442.94 ms | -100.0% bf16 MFU | 62069 tok/s +step 14680/19560 | loss 3.298336 (-0.72z)| norm 0.2694 (-0.07z)| lr 9.38e-05 | 8440.83 ms | -100.0% bf16 MFU | 62071 tok/s +step 14681/19560 | loss 3.353374 (+0.66z)| norm 0.2468 (-0.46z)| lr 9.38e-05 | 8443.92 ms | -100.0% bf16 MFU | 62072 tok/s +step 14682/19560 | loss 3.305647 (-0.54z)| norm 0.2523 (-0.37z)| lr 9.37e-05 | 8445.40 ms | -100.0% bf16 MFU | 62072 tok/s +step 14683/19560 | loss 3.377659 (+1.26z)| norm 0.2834 (+0.18z)| lr 9.37e-05 | 8443.27 ms | -100.0% bf16 MFU | 62074 tok/s +step 14684/19560 | loss 3.327431 (-0.01z)| norm 0.2647 (-0.15z)| lr 9.37e-05 | 8448.74 ms | -100.0% bf16 MFU | 62073 tok/s +step 14685/19560 | loss 3.349723 (+0.56z)| norm 0.2521 (-0.37z)| lr 9.36e-05 | 8439.32 ms | -100.0% bf16 MFU | 62075 tok/s +step 14686/19560 | loss 3.331933 (+0.13z)| norm 0.2876 (+0.26z)| lr 9.36e-05 | 8441.52 ms | -100.0% bf16 MFU | 62077 tok/s +step 14687/19560 | loss 3.313615 (-0.34z)| norm 0.2570 (-0.28z)| lr 9.36e-05 | 8446.95 ms | -100.0% bf16 MFU | 62076 tok/s +step 14688/19560 | loss 3.308888 (-0.45z)| norm 0.2965 (+0.41z)| lr 9.35e-05 | 8443.41 ms | -100.0% bf16 MFU | 62077 tok/s +step 14689/19560 | loss 3.322589 (-0.09z)| norm 0.2592 (-0.25z)| lr 9.35e-05 | 8443.48 ms | -100.0% bf16 MFU | 62078 tok/s +step 14690/19560 | loss 3.302633 (-0.60z)| norm 0.2551 (-0.32z)| lr 9.35e-05 | 8448.47 ms | -100.0% bf16 MFU | 62077 tok/s +step 14691/19560 | loss 3.399453 (+1.85z)| norm 0.2733 (+0.00z)| lr 9.34e-05 | 8443.50 ms | -100.0% bf16 MFU | 62078 tok/s +step 14692/19560 | loss 3.345386 (+0.48z)| norm 0.2740 (+0.01z)| lr 9.34e-05 | 8443.66 ms | -100.0% bf16 MFU | 62079 tok/s +step 14693/19560 | loss 3.336333 (+0.24z)| norm 0.2495 (-0.42z)| lr 9.33e-05 | 8439.03 ms | -100.0% bf16 MFU | 62081 tok/s +step 14694/19560 | loss 3.376923 (+1.25z)| norm 0.2676 (-0.10z)| lr 9.33e-05 | 8444.02 ms | -100.0% bf16 MFU | 62082 tok/s +step 14695/19560 | loss 3.365274 (+0.95z)| norm 0.2591 (-0.25z)| lr 9.33e-05 | 8441.42 ms | -100.0% bf16 MFU | 62083 tok/s +step 14696/19560 | loss 3.360496 (+0.82z)| norm 0.2608 (-0.22z)| lr 9.32e-05 | 8441.12 ms | -100.0% bf16 MFU | 62084 tok/s +step 14697/19560 | loss 3.381557 (+1.33z)| norm 0.2855 (+0.21z)| lr 9.32e-05 | 8437.38 ms | -100.0% bf16 MFU | 62087 tok/s +step 14698/19560 | loss 3.360945 (+0.82z)| norm 0.2692 (-0.08z)| lr 9.32e-05 | 8439.21 ms | -100.0% bf16 MFU | 62089 tok/s +step 14699/19560 | loss 3.335714 (+0.16z)| norm 0.2729 (-0.01z)| lr 9.31e-05 | 8439.28 ms | -100.0% bf16 MFU | 62091 tok/s +step 14700/19560 | loss 3.344303 (+0.38z)| norm 0.2869 (+0.23z)| lr 9.31e-05 | 8445.65 ms | -100.0% bf16 MFU | 62090 tok/s +step 14701/19560 | loss 3.328141 (-0.02z)| norm 0.2745 (+0.01z)| lr 9.31e-05 | 8443.56 ms | -100.0% bf16 MFU | 62090 tok/s +step 14702/19560 | loss 3.286917 (-1.10z)| norm 0.2729 (-0.02z)| lr 9.30e-05 | 8442.55 ms | -100.0% bf16 MFU | 62091 tok/s +step 14703/19560 | loss 3.232872 (-2.43z)| norm 0.2766 (+0.05z)| lr 9.30e-05 | 8443.87 ms | -100.0% bf16 MFU | 62091 tok/s +step 14704/19560 | loss 3.319665 (-0.22z)| norm 0.2945 (+0.36z)| lr 9.29e-05 | 8437.96 ms | -100.0% bf16 MFU | 62093 tok/s +step 14705/19560 | loss 3.312352 (-0.42z)| norm 0.2713 (-0.05z)| lr 9.29e-05 | 8441.07 ms | -100.0% bf16 MFU | 62094 tok/s +step 14706/19560 | loss 3.340276 (+0.30z)| norm 0.2797 (+0.09z)| lr 9.29e-05 | 8440.13 ms | -100.0% bf16 MFU | 62095 tok/s +step 14707/19560 | loss 3.350479 (+0.56z)| norm 0.2715 (-0.05z)| lr 9.28e-05 | 8440.62 ms | -100.0% bf16 MFU | 62096 tok/s +step 14708/19560 | loss 3.352961 (+0.61z)| norm 0.2825 (+0.14z)| lr 9.28e-05 | 8442.24 ms | -100.0% bf16 MFU | 62096 tok/s +step 14709/19560 | loss 3.270760 (-1.52z)| norm 0.2529 (-0.39z)| lr 9.28e-05 | 8438.88 ms | -100.0% bf16 MFU | 62098 tok/s +step 14710/19560 | loss 3.385709 (+1.44z)| norm 0.2575 (-0.30z)| lr 9.27e-05 | 8443.85 ms | -100.0% bf16 MFU | 62098 tok/s +step 14711/19560 | loss 3.313848 (-0.40z)| norm 0.2572 (-0.31z)| lr 9.27e-05 | 8443.63 ms | -100.0% bf16 MFU | 62097 tok/s +step 14712/19560 | loss 3.324761 (-0.12z)| norm 0.2511 (-0.42z)| lr 9.27e-05 | 8443.15 ms | -100.0% bf16 MFU | 62097 tok/s +step 14713/19560 | loss 3.273252 (-1.48z)| norm 0.2570 (-0.31z)| lr 9.26e-05 | 8442.41 ms | -100.0% bf16 MFU | 62098 tok/s +step 14714/19560 | loss 3.326173 (-0.07z)| norm 0.2559 (-0.33z)| lr 9.26e-05 | 8439.78 ms | -100.0% bf16 MFU | 62099 tok/s +step 14715/19560 | loss 3.299668 (-0.80z)| norm 0.2720 (-0.04z)| lr 9.25e-05 | 8442.88 ms | -100.0% bf16 MFU | 62099 tok/s +step 14716/19560 | loss 3.385238 (+1.49z)| norm 0.2555 (-0.33z)| lr 9.25e-05 | 8439.21 ms | -100.0% bf16 MFU | 62100 tok/s +step 14717/19560 | loss 3.369492 (+1.05z)| norm 0.2617 (-0.22z)| lr 9.25e-05 | 8439.62 ms | -100.0% bf16 MFU | 62101 tok/s +step 14718/19560 | loss 3.317848 (-0.33z)| norm 0.2620 (-0.21z)| lr 9.24e-05 | 8441.46 ms | -100.0% bf16 MFU | 62102 tok/s +step 14719/19560 | loss 3.332961 (+0.07z)| norm 0.2495 (-0.44z)| lr 9.24e-05 | 8440.38 ms | -100.0% bf16 MFU | 62102 tok/s +step 14720/19560 | loss 3.349078 (+0.50z)| norm 0.2607 (-0.24z)| lr 9.24e-05 | 8440.29 ms | -100.0% bf16 MFU | 62103 tok/s +step 14721/19560 | loss 3.326634 (-0.11z)| norm 0.2552 (-0.33z)| lr 9.23e-05 | 8446.40 ms | -100.0% bf16 MFU | 62101 tok/s +step 14722/19560 | loss 3.359389 (+0.76z)| norm 0.2590 (-0.27z)| lr 9.23e-05 | 8440.55 ms | -100.0% bf16 MFU | 62102 tok/s +step 14723/19560 | loss 3.314874 (-0.46z)| norm 0.2510 (-0.41z)| lr 9.23e-05 | 8440.61 ms | -100.0% bf16 MFU | 62103 tok/s +step 14724/19560 | loss 3.332693 (+0.03z)| norm 0.2576 (-0.29z)| lr 9.22e-05 | 8440.51 ms | -100.0% bf16 MFU | 62103 tok/s +step 14725/19560 | loss 3.346966 (+0.43z)| norm 0.2531 (-0.37z)| lr 9.22e-05 | 8438.35 ms | -100.0% bf16 MFU | 62105 tok/s +step 14726/19560 | loss 3.263840 (-1.84z)| norm 0.2549 (-0.34z)| lr 9.22e-05 | 8438.72 ms | -100.0% bf16 MFU | 62106 tok/s +step 14727/19560 | loss 3.271750 (-1.60z)| norm 0.2657 (-0.15z)| lr 9.21e-05 | 8441.44 ms | -100.0% bf16 MFU | 62106 tok/s +step 14728/19560 | loss 3.328735 (-0.06z)| norm 0.2573 (-0.30z)| lr 9.21e-05 | 8440.89 ms | -100.0% bf16 MFU | 62107 tok/s +step 14729/19560 | loss 3.321730 (-0.26z)| norm 0.2608 (-0.23z)| lr 9.20e-05 | 8441.66 ms | -100.0% bf16 MFU | 62107 tok/s +step 14730/19560 | loss 3.359630 (+0.77z)| norm 0.2633 (-0.19z)| lr 9.20e-05 | 8440.89 ms | -100.0% bf16 MFU | 62107 tok/s +step 14731/19560 | loss 3.376902 (+1.22z)| norm 0.2581 (-0.28z)| lr 9.20e-05 | 8439.17 ms | -100.0% bf16 MFU | 62108 tok/s +step 14732/19560 | loss 3.349530 (+0.48z)| norm 0.2627 (-0.20z)| lr 9.19e-05 | 8438.82 ms | -100.0% bf16 MFU | 62109 tok/s +step 14733/19560 | loss 3.281958 (-1.32z)| norm 0.2726 (-0.02z)| lr 9.19e-05 | 8441.78 ms | -100.0% bf16 MFU | 62109 tok/s +step 14734/19560 | loss 3.321153 (-0.26z)| norm 0.2460 (-0.49z)| lr 9.19e-05 | 8438.88 ms | -100.0% bf16 MFU | 62110 tok/s +step 14735/19560 | loss 3.288972 (-1.12z)| norm 0.2521 (-0.38z)| lr 9.18e-05 | 8443.32 ms | -100.0% bf16 MFU | 62109 tok/s +step 14736/19560 | loss 3.431998 (+2.64z)| norm 0.2674 (-0.10z)| lr 9.18e-05 | 8440.57 ms | -100.0% bf16 MFU | 62109 tok/s +step 14737/19560 | loss 3.307841 (-0.63z)| norm 0.2558 (-0.31z)| lr 9.18e-05 | 8443.61 ms | -100.0% bf16 MFU | 62108 tok/s +step 14738/19560 | loss 3.360864 (+0.78z)| norm 0.2598 (-0.24z)| lr 9.17e-05 | 8442.31 ms | -100.0% bf16 MFU | 62108 tok/s +step 14739/19560 | loss 3.333766 (+0.05z)| norm 0.2758 (+0.04z)| lr 9.17e-05 | 8441.08 ms | -100.0% bf16 MFU | 62108 tok/s +step 14740/19560 | loss 3.314958 (-0.44z)| norm 0.2706 (-0.05z)| lr 9.16e-05 | 8440.36 ms | -100.0% bf16 MFU | 62109 tok/s +step 14741/19560 | loss 3.309198 (-0.60z)| norm 0.2703 (-0.06z)| lr 9.16e-05 | 8438.17 ms | -100.0% bf16 MFU | 62110 tok/s +step 14742/19560 | loss 3.335712 (+0.10z)| norm 0.2821 (+0.15z)| lr 9.16e-05 | 8439.47 ms | -100.0% bf16 MFU | 62111 tok/s +step 14743/19560 | loss 3.353534 (+0.57z)| norm 0.2654 (-0.14z)| lr 9.15e-05 | 8440.21 ms | -100.0% bf16 MFU | 62111 tok/s +step 14744/19560 | loss 3.240056 (-2.41z)| norm 0.2675 (-0.11z)| lr 9.15e-05 | 8440.80 ms | -100.0% bf16 MFU | 62111 tok/s +step 14745/19560 | loss 3.331806 (+0.04z)| norm 0.2680 (-0.10z)| lr 9.15e-05 | 8437.34 ms | -100.0% bf16 MFU | 62112 tok/s +step 14746/19560 | loss 3.299651 (-0.81z)| norm 0.2457 (-0.49z)| lr 9.14e-05 | 8440.88 ms | -100.0% bf16 MFU | 62112 tok/s +step 14747/19560 | loss 3.285277 (-1.17z)| norm 0.2530 (-0.36z)| lr 9.14e-05 | 8436.78 ms | -100.0% bf16 MFU | 62114 tok/s +step 14748/19560 | loss 3.339540 (+0.27z)| norm 0.2635 (-0.17z)| lr 9.14e-05 | 8439.15 ms | -100.0% bf16 MFU | 62115 tok/s +step 14749/19560 | loss 3.269655 (-1.65z)| norm 0.2817 (+0.15z)| lr 9.13e-05 | 8439.82 ms | -100.0% bf16 MFU | 62115 tok/s +step 14750/19560 | loss 3.341949 (+0.32z)| norm 0.2526 (-0.36z)| lr 9.13e-05 | 8440.36 ms | -100.0% bf16 MFU | 62115 tok/s +val loss 3.296276 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2973/10042 = 0.296057 +step 14751/19560 | loss 3.333108 (+0.09z)| norm 0.2654 (-0.13z)| lr 9.13e-05 | 8430.98 ms | -100.0% bf16 MFU | 62119 tok/s +step 14752/19560 | loss 3.297520 (-0.88z)| norm 0.2523 (-0.36z)| lr 9.12e-05 | 8431.30 ms | -100.0% bf16 MFU | 62122 tok/s +step 14753/19560 | loss 3.382545 (+1.43z)| norm 0.2622 (-0.19z)| lr 9.12e-05 | 8428.68 ms | -100.0% bf16 MFU | 62126 tok/s +step 14754/19560 | loss 3.358358 (+0.76z)| norm 0.2625 (-0.18z)| lr 9.11e-05 | 8433.44 ms | -100.0% bf16 MFU | 62128 tok/s +step 14755/19560 | loss 3.263436 (-1.81z)| norm 0.2696 (-0.05z)| lr 9.11e-05 | 8429.80 ms | -100.0% bf16 MFU | 62131 tok/s +step 14756/19560 | loss 3.290844 (-1.05z)| norm 0.2457 (-0.47z)| lr 9.11e-05 | 8434.98 ms | -100.0% bf16 MFU | 62133 tok/s +step 14757/19560 | loss 3.302938 (-0.71z)| norm 0.2671 (-0.09z)| lr 9.10e-05 | 8431.74 ms | -100.0% bf16 MFU | 62135 tok/s +step 14758/19560 | loss 3.356010 (+0.76z)| norm 0.2719 (-0.00z)| lr 9.10e-05 | 8432.07 ms | -100.0% bf16 MFU | 62137 tok/s +step 14759/19560 | loss 3.350376 (+0.60z)| norm 0.2547 (-0.31z)| lr 9.10e-05 | 8433.26 ms | -100.0% bf16 MFU | 62139 tok/s +step 14760/19560 | loss 3.297981 (-0.84z)| norm 0.2599 (-0.21z)| lr 9.09e-05 | 8434.42 ms | -100.0% bf16 MFU | 62140 tok/s +step 14761/19560 | loss 3.324867 (-0.09z)| norm 0.2609 (-0.19z)| lr 9.09e-05 | 8434.43 ms | -100.0% bf16 MFU | 62141 tok/s +step 14762/19560 | loss 3.280430 (-1.31z)| norm 0.2534 (-0.32z)| lr 9.09e-05 | 8429.66 ms | -100.0% bf16 MFU | 62144 tok/s +step 14763/19560 | loss 3.349190 (+0.61z)| norm 0.2623 (-0.16z)| lr 9.08e-05 | 8433.06 ms | -100.0% bf16 MFU | 62145 tok/s +step 14764/19560 | loss 3.321374 (-0.18z)| norm 0.2523 (-0.33z)| lr 9.08e-05 | 8429.21 ms | -100.0% bf16 MFU | 62148 tok/s +step 14765/19560 | loss 3.336249 (+0.23z)| norm 0.2582 (-0.23z)| lr 9.07e-05 | 8431.12 ms | -100.0% bf16 MFU | 62149 tok/s +step 14766/19560 | loss 3.391121 (+1.77z)| norm 0.2611 (-0.17z)| lr 9.07e-05 | 8433.41 ms | -100.0% bf16 MFU | 62150 tok/s +step 14767/19560 | loss 3.318740 (-0.29z)| norm 0.2692 (-0.03z)| lr 9.07e-05 | 8434.16 ms | -100.0% bf16 MFU | 62151 tok/s +step 14768/19560 | loss 3.314445 (-0.41z)| norm 0.2547 (-0.28z)| lr 9.06e-05 | 8432.89 ms | -100.0% bf16 MFU | 62152 tok/s +step 14769/19560 | loss 3.308618 (-0.56z)| norm 0.2703 (-0.01z)| lr 9.06e-05 | 8434.21 ms | -100.0% bf16 MFU | 62153 tok/s +step 14770/19560 | loss 3.352983 (+0.69z)| norm 0.2876 (+0.30z)| lr 9.06e-05 | 8437.29 ms | -100.0% bf16 MFU | 62152 tok/s +step 14771/19560 | loss 3.284893 (-1.24z)| norm 0.2722 (+0.02z)| lr 9.05e-05 | 8435.93 ms | -100.0% bf16 MFU | 62152 tok/s +step 14772/19560 | loss 3.371228 (+1.19z)| norm 0.2646 (-0.11z)| lr 9.05e-05 | 8432.93 ms | -100.0% bf16 MFU | 62153 tok/s +step 14773/19560 | loss 3.395642 (+1.84z)| norm 0.2825 (+0.21z)| lr 9.05e-05 | 8437.33 ms | -100.0% bf16 MFU | 62152 tok/s +step 14774/19560 | loss 3.377847 (+1.32z)| norm 0.2681 (-0.05z)| lr 9.04e-05 | 8434.84 ms | -100.0% bf16 MFU | 62152 tok/s +step 14775/19560 | loss 3.290616 (-1.07z)| norm 0.2653 (-0.06z)| lr 9.04e-05 | 8435.13 ms | -100.0% bf16 MFU | 62152 tok/s +step 14776/19560 | loss 3.284421 (-1.23z)| norm 0.2568 (-0.82z)| lr 9.04e-05 | 8438.40 ms | -100.0% bf16 MFU | 62151 tok/s +step 14777/19560 | loss 3.386723 (+1.56z)| norm 0.2663 (+0.07z)| lr 9.03e-05 | 8432.89 ms | -100.0% bf16 MFU | 62152 tok/s +step 14778/19560 | loss 3.375402 (+1.23z)| norm 0.2663 (+0.08z)| lr 9.03e-05 | 8436.82 ms | -100.0% bf16 MFU | 62152 tok/s +step 14779/19560 | loss 3.312299 (-0.48z)| norm 0.2573 (-0.75z)| lr 9.02e-05 | 8435.41 ms | -100.0% bf16 MFU | 62152 tok/s +step 14780/19560 | loss 3.278858 (-1.39z)| norm 0.2855 (+1.89z)| lr 9.02e-05 | 8436.06 ms | -100.0% bf16 MFU | 62152 tok/s +step 14781/19560 | loss 3.366251 (+0.99z)| norm 0.2880 (+2.07z)| lr 9.02e-05 | 8439.77 ms | -100.0% bf16 MFU | 62150 tok/s +step 14782/19560 | loss 3.309612 (-0.55z)| norm 0.2759 (+0.97z)| lr 9.01e-05 | 8437.79 ms | -100.0% bf16 MFU | 62150 tok/s +step 14783/19560 | loss 3.322357 (-0.21z)| norm 0.2905 (+2.27z)| lr 9.01e-05 | 8436.82 ms | -100.0% bf16 MFU | 62149 tok/s +step 14784/19560 | loss 3.344860 (+0.40z)| norm 0.2773 (+1.07z)| lr 9.01e-05 | 8439.34 ms | -100.0% bf16 MFU | 62148 tok/s +step 14785/19560 | loss 3.282895 (-1.28z)| norm 0.2856 (+1.78z)| lr 9.00e-05 | 8438.03 ms | -100.0% bf16 MFU | 62147 tok/s +step 14786/19560 | loss 3.279003 (-1.38z)| norm 0.2886 (+2.01z)| lr 9.00e-05 | 8439.38 ms | -100.0% bf16 MFU | 62146 tok/s +step 14787/19560 | loss 3.280931 (-1.31z)| norm 0.2823 (+1.48z)| lr 9.00e-05 | 8439.94 ms | -100.0% bf16 MFU | 62145 tok/s +step 14788/19560 | loss 3.391373 (+1.66z)| norm 0.2901 (+2.12z)| lr 8.99e-05 | 8437.23 ms | -100.0% bf16 MFU | 62145 tok/s +step 14789/19560 | loss 3.419503 (+2.36z)| norm 0.2817 (+1.38z)| lr 8.99e-05 | 8456.22 ms | -100.0% bf16 MFU | 62137 tok/s +step 14790/19560 | loss 3.342537 (+0.32z)| norm 0.2957 (+2.52z)| lr 8.99e-05 | 8464.08 ms | -100.0% bf16 MFU | 62128 tok/s +step 14791/19560 | loss 3.335401 (+0.12z)| norm 0.2806 (+1.21z)| lr 8.98e-05 | 8465.21 ms | -100.0% bf16 MFU | 62118 tok/s +step 14792/19560 | loss 3.325955 (-0.14z)| norm 0.2950 (+2.38z)| lr 8.98e-05 | 8463.69 ms | -100.0% bf16 MFU | 62109 tok/s +step 14793/19560 | loss 3.312734 (-0.48z)| norm 0.2915 (+2.04z)| lr 8.97e-05 | 8463.00 ms | -100.0% bf16 MFU | 62101 tok/s +step 14794/19560 | loss 3.386238 (+1.50z)| norm 0.2800 (+1.08z)| lr 8.97e-05 | 8463.71 ms | -100.0% bf16 MFU | 62094 tok/s +step 14795/19560 | loss 3.347278 (+0.44z)| norm 0.3020 (+2.77z)| lr 8.97e-05 | 8466.70 ms | -100.0% bf16 MFU | 62085 tok/s +step 14796/19560 | loss 3.376279 (+1.21z)| norm 0.2899 (+1.77z)| lr 8.96e-05 | 8461.42 ms | -100.0% bf16 MFU | 62079 tok/s +step 14797/19560 | loss 3.304545 (-0.72z)| norm 0.2618 (-0.43z)| lr 8.96e-05 | 8462.73 ms | -100.0% bf16 MFU | 62073 tok/s +step 14798/19560 | loss 3.294197 (-0.99z)| norm 0.2682 (+0.07z)| lr 8.96e-05 | 8459.90 ms | -100.0% bf16 MFU | 62068 tok/s +step 14799/19560 | loss 3.268275 (-1.65z)| norm 0.2762 (+0.69z)| lr 8.95e-05 | 8462.23 ms | -100.0% bf16 MFU | 62062 tok/s +step 14800/19560 | loss 3.311616 (-0.49z)| norm 0.2546 (-1.00z)| lr 8.95e-05 | 8463.48 ms | -100.0% bf16 MFU | 62056 tok/s +step 14801/19560 | loss 3.259075 (-1.86z)| norm 0.2702 (+0.22z)| lr 8.95e-05 | 8464.34 ms | -100.0% bf16 MFU | 62051 tok/s +step 14802/19560 | loss 3.292239 (-0.97z)| norm 0.2656 (-0.14z)| lr 8.94e-05 | 8461.78 ms | -100.0% bf16 MFU | 62046 tok/s +step 14803/19560 | loss 3.256139 (-1.88z)| norm 0.2565 (-0.84z)| lr 8.94e-05 | 8448.17 ms | -100.0% bf16 MFU | 62047 tok/s +step 14804/19560 | loss 3.282725 (-1.17z)| norm 0.2624 (-0.37z)| lr 8.94e-05 | 8458.44 ms | -100.0% bf16 MFU | 62044 tok/s +step 14805/19560 | loss 3.314227 (-0.34z)| norm 0.2793 (+0.93z)| lr 8.93e-05 | 8460.38 ms | -100.0% bf16 MFU | 62040 tok/s +step 14806/19560 | loss 3.333226 (+0.15z)| norm 0.2790 (+0.89z)| lr 8.93e-05 | 8454.27 ms | -100.0% bf16 MFU | 62039 tok/s +step 14807/19560 | loss 3.271857 (-1.45z)| norm 0.2503 (-1.33z)| lr 8.93e-05 | 8462.43 ms | -100.0% bf16 MFU | 62034 tok/s +step 14808/19560 | loss 3.320128 (-0.19z)| norm 0.2898 (+1.70z)| lr 8.92e-05 | 8458.58 ms | -100.0% bf16 MFU | 62032 tok/s +step 14809/19560 | loss 3.264441 (-1.62z)| norm 0.2800 (+0.93z)| lr 8.92e-05 | 8452.98 ms | -100.0% bf16 MFU | 62031 tok/s +step 14810/19560 | loss 3.227563 (-2.50z)| norm 0.2552 (-0.98z)| lr 8.91e-05 | 8453.89 ms | -100.0% bf16 MFU | 62031 tok/s +step 14811/19560 | loss 3.380980 (+1.39z)| norm 0.2892 (+1.63z)| lr 8.91e-05 | 8456.59 ms | -100.0% bf16 MFU | 62029 tok/s +step 14812/19560 | loss 3.258602 (-1.68z)| norm 0.2495 (-1.39z)| lr 8.91e-05 | 8462.76 ms | -100.0% bf16 MFU | 62025 tok/s +step 14813/19560 | loss 3.275008 (-1.25z)| norm 0.2555 (-0.94z)| lr 8.90e-05 | 8452.88 ms | -100.0% bf16 MFU | 62025 tok/s +step 14814/19560 | loss 3.308535 (-0.40z)| norm 0.2592 (-0.65z)| lr 8.90e-05 | 8452.38 ms | -100.0% bf16 MFU | 62025 tok/s +step 14815/19560 | loss 3.342471 (+0.44z)| norm 0.2700 (+0.18z)| lr 8.90e-05 | 8459.70 ms | -100.0% bf16 MFU | 62023 tok/s +step 14816/19560 | loss 3.309565 (-0.38z)| norm 0.2500 (-1.35z)| lr 8.89e-05 | 8456.13 ms | -100.0% bf16 MFU | 62022 tok/s +step 14817/19560 | loss 3.454834 (+3.09z)| norm 0.3027 (+2.66z)| lr 8.89e-05 | 8460.44 ms | -100.0% bf16 MFU | 62019 tok/s +step 14818/19560 | loss 3.319753 (-0.15z)| norm 0.2741 (+0.47z)| lr 8.89e-05 | 8456.87 ms | -100.0% bf16 MFU | 62018 tok/s +step 14819/19560 | loss 3.287931 (-0.90z)| norm 0.2746 (+0.51z)| lr 8.88e-05 | 8452.58 ms | -100.0% bf16 MFU | 62018 tok/s +step 14820/19560 | loss 3.329104 (+0.10z)| norm 0.2559 (-0.90z)| lr 8.88e-05 | 8455.20 ms | -100.0% bf16 MFU | 62018 tok/s +step 14821/19560 | loss 3.290960 (-0.82z)| norm 0.2629 (-0.37z)| lr 8.88e-05 | 8442.10 ms | -100.0% bf16 MFU | 62022 tok/s +step 14822/19560 | loss 3.288292 (-0.87z)| norm 0.2522 (-1.18z)| lr 8.87e-05 | 8442.01 ms | -100.0% bf16 MFU | 62026 tok/s +step 14823/19560 | loss 3.289371 (-0.83z)| norm 0.2493 (-1.39z)| lr 8.87e-05 | 8446.49 ms | -100.0% bf16 MFU | 62029 tok/s +step 14824/19560 | loss 3.306908 (-0.39z)| norm 0.2651 (-0.19z)| lr 8.86e-05 | 8442.54 ms | -100.0% bf16 MFU | 62032 tok/s +step 14825/19560 | loss 3.282249 (-0.98z)| norm 0.2525 (-1.13z)| lr 8.86e-05 | 8450.12 ms | -100.0% bf16 MFU | 62033 tok/s +step 14826/19560 | loss 3.330404 (+0.21z)| norm 0.2790 (+0.87z)| lr 8.86e-05 | 8440.97 ms | -100.0% bf16 MFU | 62037 tok/s +step 14827/19560 | loss 3.336766 (+0.36z)| norm 0.2539 (-1.01z)| lr 8.85e-05 | 8446.20 ms | -100.0% bf16 MFU | 62039 tok/s +step 14828/19560 | loss 3.332812 (+0.27z)| norm 0.2662 (-0.07z)| lr 8.85e-05 | 8444.16 ms | -100.0% bf16 MFU | 62041 tok/s +step 14829/19560 | loss 3.297647 (-0.59z)| norm 0.2588 (-0.63z)| lr 8.85e-05 | 8437.70 ms | -100.0% bf16 MFU | 62046 tok/s +step 14830/19560 | loss 3.313008 (-0.22z)| norm 0.2794 (+0.93z)| lr 8.84e-05 | 8438.69 ms | -100.0% bf16 MFU | 62050 tok/s +step 14831/19560 | loss 3.362924 (+1.00z)| norm 0.3523 (+5.59z)| lr 8.84e-05 | 8441.49 ms | -100.0% bf16 MFU | 62053 tok/s +step 14832/19560 | loss 3.349349 (+0.65z)| norm 0.2818 (+0.95z)| lr 8.84e-05 | 8443.36 ms | -100.0% bf16 MFU | 62055 tok/s +step 14833/19560 | loss 3.316830 (-0.16z)| norm 0.2637 (-0.25z)| lr 8.83e-05 | 8444.68 ms | -100.0% bf16 MFU | 62057 tok/s +step 14834/19560 | loss 3.248842 (-1.81z)| norm 0.2839 (+1.09z)| lr 8.83e-05 | 8441.68 ms | -100.0% bf16 MFU | 62059 tok/s +step 14835/19560 | loss 3.317927 (-0.11z)| norm 0.2699 (+0.16z)| lr 8.83e-05 | 8444.22 ms | -100.0% bf16 MFU | 62061 tok/s +step 14836/19560 | loss 3.261246 (-1.48z)| norm 0.2701 (+0.17z)| lr 8.82e-05 | 8440.66 ms | -100.0% bf16 MFU | 62063 tok/s +step 14837/19560 | loss 3.272056 (-1.21z)| norm 0.2611 (-0.43z)| lr 8.82e-05 | 8440.25 ms | -100.0% bf16 MFU | 62066 tok/s +step 14838/19560 | loss 3.257276 (-1.55z)| norm 0.2898 (+1.46z)| lr 8.82e-05 | 8442.19 ms | -100.0% bf16 MFU | 62068 tok/s +step 14839/19560 | loss 3.235150 (-2.05z)| norm 0.2583 (-0.63z)| lr 8.81e-05 | 8440.39 ms | -100.0% bf16 MFU | 62070 tok/s +step 14840/19560 | loss 3.273409 (-1.11z)| norm 0.2463 (-1.42z)| lr 8.81e-05 | 8438.98 ms | -100.0% bf16 MFU | 62073 tok/s +step 14841/19560 | loss 3.319993 (+0.00z)| norm 0.2635 (-0.28z)| lr 8.80e-05 | 8446.36 ms | -100.0% bf16 MFU | 62073 tok/s +step 14842/19560 | loss 3.245621 (-1.76z)| norm 0.2710 (+0.21z)| lr 8.80e-05 | 8443.91 ms | -100.0% bf16 MFU | 62074 tok/s +step 14843/19560 | loss 3.285347 (-0.80z)| norm 0.2706 (+0.18z)| lr 8.80e-05 | 8438.10 ms | -100.0% bf16 MFU | 62077 tok/s +step 14844/19560 | loss 3.323195 (+0.11z)| norm 0.2646 (-0.22z)| lr 8.79e-05 | 8441.51 ms | -100.0% bf16 MFU | 62078 tok/s +step 14845/19560 | loss 3.326123 (+0.19z)| norm 0.2485 (-1.29z)| lr 8.79e-05 | 8439.38 ms | -100.0% bf16 MFU | 62081 tok/s +step 14846/19560 | loss 3.309897 (-0.20z)| norm 0.2525 (-1.01z)| lr 8.79e-05 | 8440.67 ms | -100.0% bf16 MFU | 62082 tok/s +step 14847/19560 | loss 3.279000 (-0.94z)| norm 0.2882 (+1.32z)| lr 8.78e-05 | 8447.55 ms | -100.0% bf16 MFU | 62082 tok/s +step 14848/19560 | loss 3.338217 (+0.49z)| norm 0.2537 (-0.95z)| lr 8.78e-05 | 8448.82 ms | -100.0% bf16 MFU | 62080 tok/s +step 14849/19560 | loss 3.365163 (+1.13z)| norm 0.2736 (+0.36z)| lr 8.78e-05 | 8441.59 ms | -100.0% bf16 MFU | 62082 tok/s +step 14850/19560 | loss 3.337553 (+0.47z)| norm 0.2658 (-0.16z)| lr 8.77e-05 | 8443.96 ms | -100.0% bf16 MFU | 62082 tok/s +step 14851/19560 | loss 3.269706 (-1.15z)| norm 0.2628 (-0.37z)| lr 8.77e-05 | 8446.62 ms | -100.0% bf16 MFU | 62081 tok/s +step 14852/19560 | loss 3.311685 (-0.14z)| norm 0.2437 (-1.61z)| lr 8.77e-05 | 8445.28 ms | -100.0% bf16 MFU | 62081 tok/s +step 14853/19560 | loss 3.328281 (+0.27z)| norm 0.2784 (+0.66z)| lr 8.76e-05 | 8444.82 ms | -100.0% bf16 MFU | 62082 tok/s +step 14854/19560 | loss 3.293100 (-0.59z)| norm 0.2595 (-0.59z)| lr 8.76e-05 | 8446.37 ms | -100.0% bf16 MFU | 62081 tok/s +step 14855/19560 | loss 3.315975 (-0.04z)| norm 0.2550 (-0.88z)| lr 8.76e-05 | 8448.95 ms | -100.0% bf16 MFU | 62080 tok/s +step 14856/19560 | loss 3.292528 (-0.61z)| norm 0.2661 (-0.15z)| lr 8.75e-05 | 8443.56 ms | -100.0% bf16 MFU | 62080 tok/s +step 14857/19560 | loss 3.344538 (+0.65z)| norm 0.2453 (-1.51z)| lr 8.75e-05 | 8444.15 ms | -100.0% bf16 MFU | 62081 tok/s +step 14858/19560 | loss 3.337126 (+0.48z)| norm 0.2714 (+0.20z)| lr 8.74e-05 | 8441.16 ms | -100.0% bf16 MFU | 62082 tok/s +step 14859/19560 | loss 3.285315 (-0.77z)| norm 0.2549 (-0.88z)| lr 8.74e-05 | 8449.49 ms | -100.0% bf16 MFU | 62081 tok/s +step 14860/19560 | loss 3.297187 (-0.47z)| norm 0.2608 (-0.49z)| lr 8.74e-05 | 8442.33 ms | -100.0% bf16 MFU | 62082 tok/s +step 14861/19560 | loss 3.282511 (-0.83z)| norm 0.2528 (-1.00z)| lr 8.73e-05 | 8447.51 ms | -100.0% bf16 MFU | 62081 tok/s +step 14862/19560 | loss 3.295613 (-0.50z)| norm 0.2435 (-1.60z)| lr 8.73e-05 | 8442.50 ms | -100.0% bf16 MFU | 62082 tok/s +step 14863/19560 | loss 3.268358 (-1.16z)| norm 0.2550 (-0.86z)| lr 8.73e-05 | 8441.55 ms | -100.0% bf16 MFU | 62083 tok/s +step 14864/19560 | loss 3.259332 (-1.38z)| norm 0.2476 (-1.32z)| lr 8.72e-05 | 8445.19 ms | -100.0% bf16 MFU | 62083 tok/s +step 14865/19560 | loss 3.285954 (-0.71z)| norm 0.2530 (-0.97z)| lr 8.72e-05 | 8440.18 ms | -100.0% bf16 MFU | 62085 tok/s +step 14866/19560 | loss 3.261681 (-1.29z)| norm 0.2684 (+0.02z)| lr 8.72e-05 | 8444.65 ms | -100.0% bf16 MFU | 62085 tok/s +step 14867/19560 | loss 3.318886 (+0.13z)| norm 0.2553 (-0.82z)| lr 8.71e-05 | 8442.54 ms | -100.0% bf16 MFU | 62086 tok/s +step 14868/19560 | loss 3.345156 (+0.78z)| norm 0.2760 (+0.52z)| lr 8.71e-05 | 8448.35 ms | -100.0% bf16 MFU | 62084 tok/s +step 14869/19560 | loss 3.379225 (+1.59z)| norm 0.2854 (+1.11z)| lr 8.71e-05 | 8445.55 ms | -100.0% bf16 MFU | 62084 tok/s +step 14870/19560 | loss 3.386049 (+1.73z)| norm 0.2574 (-0.68z)| lr 8.70e-05 | 8445.08 ms | -100.0% bf16 MFU | 62084 tok/s +step 14871/19560 | loss 3.296007 (-0.45z)| norm 0.2776 (+0.62z)| lr 8.70e-05 | 8444.65 ms | -100.0% bf16 MFU | 62084 tok/s +step 14872/19560 | loss 3.335788 (+0.51z)| norm 0.2642 (-0.24z)| lr 8.70e-05 | 8449.81 ms | -100.0% bf16 MFU | 62082 tok/s +step 14873/19560 | loss 3.315236 (+0.01z)| norm 0.2757 (+0.50z)| lr 8.69e-05 | 8447.63 ms | -100.0% bf16 MFU | 62081 tok/s +step 14874/19560 | loss 3.350737 (+0.87z)| norm 0.2596 (-0.55z)| lr 8.69e-05 | 8443.38 ms | -100.0% bf16 MFU | 62082 tok/s +step 14875/19560 | loss 3.285512 (-0.74z)| norm 0.2603 (-0.51z)| lr 8.68e-05 | 8441.02 ms | -100.0% bf16 MFU | 62083 tok/s +step 14876/19560 | loss 3.275119 (-0.98z)| norm 0.2752 (+0.45z)| lr 8.68e-05 | 8442.02 ms | -100.0% bf16 MFU | 62084 tok/s +step 14877/19560 | loss 3.287237 (-0.68z)| norm 0.2504 (-1.14z)| lr 8.68e-05 | 8442.59 ms | -100.0% bf16 MFU | 62085 tok/s +step 14878/19560 | loss 3.287534 (-0.67z)| norm 0.2625 (-0.36z)| lr 8.67e-05 | 8440.16 ms | -100.0% bf16 MFU | 62087 tok/s +step 14879/19560 | loss 3.291014 (-0.57z)| norm 0.2500 (-1.16z)| lr 8.67e-05 | 8450.38 ms | -100.0% bf16 MFU | 62085 tok/s +step 14880/19560 | loss 3.449489 (+3.17z)| norm 0.4046 (+6.92z)| lr 8.67e-05 | 8442.82 ms | -100.0% bf16 MFU | 62085 tok/s +step 14881/19560 | loss 3.299625 (-0.36z)| norm 0.2609 (-0.42z)| lr 8.66e-05 | 8440.47 ms | -100.0% bf16 MFU | 62087 tok/s +step 14882/19560 | loss 3.382187 (+1.60z)| norm 0.2636 (-0.28z)| lr 8.66e-05 | 8441.23 ms | -100.0% bf16 MFU | 62088 tok/s +step 14883/19560 | loss 3.303791 (-0.27z)| norm 0.2698 (+0.03z)| lr 8.66e-05 | 8445.52 ms | -100.0% bf16 MFU | 62088 tok/s +step 14884/19560 | loss 3.255202 (-1.42z)| norm 0.2612 (-0.41z)| lr 8.65e-05 | 8444.53 ms | -100.0% bf16 MFU | 62088 tok/s +step 14885/19560 | loss 3.235052 (-1.86z)| norm 0.2710 (+0.09z)| lr 8.65e-05 | 8444.59 ms | -100.0% bf16 MFU | 62087 tok/s +step 14886/19560 | loss 3.266460 (-1.11z)| norm 0.2519 (-0.88z)| lr 8.65e-05 | 8441.52 ms | -100.0% bf16 MFU | 62088 tok/s +step 14887/19560 | loss 3.283453 (-0.70z)| norm 0.2713 (+0.10z)| lr 8.64e-05 | 8447.63 ms | -100.0% bf16 MFU | 62087 tok/s +step 14888/19560 | loss 3.273438 (-0.92z)| norm 0.2557 (-0.70z)| lr 8.64e-05 | 8443.54 ms | -100.0% bf16 MFU | 62088 tok/s +step 14889/19560 | loss 3.241079 (-1.65z)| norm 0.2566 (-0.65z)| lr 8.64e-05 | 8440.89 ms | -100.0% bf16 MFU | 62089 tok/s +step 14890/19560 | loss 3.298955 (-0.31z)| norm 0.2540 (-0.78z)| lr 8.63e-05 | 8440.24 ms | -100.0% bf16 MFU | 62090 tok/s +step 14891/19560 | loss 3.300937 (-0.26z)| norm 0.2553 (-0.71z)| lr 8.63e-05 | 8438.60 ms | -100.0% bf16 MFU | 62092 tok/s +step 14892/19560 | loss 3.284753 (-0.63z)| norm 0.2739 (+0.23z)| lr 8.62e-05 | 8443.14 ms | -100.0% bf16 MFU | 62092 tok/s +step 14893/19560 | loss 3.315575 (+0.09z)| norm 0.2533 (-0.82z)| lr 8.62e-05 | 8440.11 ms | -100.0% bf16 MFU | 62094 tok/s +step 14894/19560 | loss 3.379781 (+1.59z)| norm 0.2584 (-0.56z)| lr 8.62e-05 | 8439.83 ms | -100.0% bf16 MFU | 62095 tok/s +step 14895/19560 | loss 3.292464 (-0.44z)| norm 0.2670 (-0.12z)| lr 8.61e-05 | 8439.76 ms | -100.0% bf16 MFU | 62096 tok/s +step 14896/19560 | loss 3.252748 (-1.34z)| norm 0.2632 (-0.31z)| lr 8.61e-05 | 8444.90 ms | -100.0% bf16 MFU | 62096 tok/s +step 14897/19560 | loss 3.260737 (-1.15z)| norm 0.2694 (+0.00z)| lr 8.61e-05 | 8441.25 ms | -100.0% bf16 MFU | 62096 tok/s +step 14898/19560 | loss 3.265245 (-1.03z)| norm 0.2551 (-0.72z)| lr 8.60e-05 | 8439.73 ms | -100.0% bf16 MFU | 62098 tok/s +step 14899/19560 | loss 3.308485 (-0.04z)| norm 0.2745 (+0.28z)| lr 8.60e-05 | 8447.23 ms | -100.0% bf16 MFU | 62096 tok/s +step 14900/19560 | loss 3.350252 (+0.93z)| norm 0.2565 (-0.64z)| lr 8.60e-05 | 8441.72 ms | -100.0% bf16 MFU | 62097 tok/s +step 14901/19560 | loss 3.307846 (-0.03z)| norm 0.2834 (+0.74z)| lr 8.59e-05 | 8438.12 ms | -100.0% bf16 MFU | 62098 tok/s +step 14902/19560 | loss 3.301718 (-0.16z)| norm 0.2609 (-0.42z)| lr 8.59e-05 | 8443.39 ms | -100.0% bf16 MFU | 62098 tok/s +step 14903/19560 | loss 3.285840 (-0.54z)| norm 0.2616 (-0.38z)| lr 8.59e-05 | 8438.20 ms | -100.0% bf16 MFU | 62100 tok/s +step 14904/19560 | loss 3.345078 (+0.85z)| norm 0.2818 (+0.65z)| lr 8.58e-05 | 8442.50 ms | -100.0% bf16 MFU | 62100 tok/s +step 14905/19560 | loss 3.269722 (-0.92z)| norm 0.2624 (-0.34z)| lr 8.58e-05 | 8438.17 ms | -100.0% bf16 MFU | 62102 tok/s +step 14906/19560 | loss 3.357117 (+1.18z)| norm 0.2846 (+0.78z)| lr 8.58e-05 | 8444.84 ms | -100.0% bf16 MFU | 62101 tok/s +step 14907/19560 | loss 3.244972 (-1.49z)| norm 0.2541 (-0.77z)| lr 8.57e-05 | 8443.25 ms | -100.0% bf16 MFU | 62101 tok/s +step 14908/19560 | loss 3.327873 (+0.48z)| norm 0.2760 (+0.35z)| lr 8.57e-05 | 8437.17 ms | -100.0% bf16 MFU | 62103 tok/s +step 14909/19560 | loss 3.277799 (-0.70z)| norm 0.2675 (-0.08z)| lr 8.57e-05 | 8444.87 ms | -100.0% bf16 MFU | 62102 tok/s +step 14910/19560 | loss 3.357286 (+1.19z)| norm 0.2775 (+0.43z)| lr 8.56e-05 | 8441.93 ms | -100.0% bf16 MFU | 62102 tok/s +step 14911/19560 | loss 3.334816 (+0.65z)| norm 0.2599 (-0.46z)| lr 8.56e-05 | 8446.74 ms | -100.0% bf16 MFU | 62100 tok/s +step 14912/19560 | loss 3.269114 (-0.90z)| norm 0.2647 (-0.21z)| lr 8.55e-05 | 8438.35 ms | -100.0% bf16 MFU | 62102 tok/s +step 14913/19560 | loss 3.315025 (+0.18z)| norm 0.2542 (-0.73z)| lr 8.55e-05 | 8441.54 ms | -100.0% bf16 MFU | 62102 tok/s +step 14914/19560 | loss 3.301898 (-0.13z)| norm 0.2459 (-1.14z)| lr 8.55e-05 | 8442.61 ms | -100.0% bf16 MFU | 62102 tok/s +step 14915/19560 | loss 3.309645 (+0.05z)| norm 0.2566 (-0.58z)| lr 8.54e-05 | 8441.89 ms | -100.0% bf16 MFU | 62102 tok/s +step 14916/19560 | loss 3.308290 (+0.03z)| norm 0.2635 (-0.22z)| lr 8.54e-05 | 8436.84 ms | -100.0% bf16 MFU | 62104 tok/s +step 14917/19560 | loss 3.269219 (-0.91z)| norm 0.2546 (-0.67z)| lr 8.54e-05 | 8442.59 ms | -100.0% bf16 MFU | 62104 tok/s +step 14918/19560 | loss 3.247003 (-1.44z)| norm 0.2781 (+0.56z)| lr 8.53e-05 | 8439.00 ms | -100.0% bf16 MFU | 62105 tok/s +step 14919/19560 | loss 3.288322 (-0.41z)| norm 0.2780 (+0.56z)| lr 8.53e-05 | 8440.86 ms | -100.0% bf16 MFU | 62106 tok/s +step 14920/19560 | loss 3.291511 (-0.32z)| norm 0.2814 (+0.74z)| lr 8.53e-05 | 8443.26 ms | -100.0% bf16 MFU | 62105 tok/s +step 14921/19560 | loss 3.371800 (+1.65z)| norm 0.2937 (+1.39z)| lr 8.52e-05 | 8440.61 ms | -100.0% bf16 MFU | 62106 tok/s +step 14922/19560 | loss 3.225486 (-1.93z)| norm 0.2542 (-0.67z)| lr 8.52e-05 | 8443.72 ms | -100.0% bf16 MFU | 62105 tok/s +step 14923/19560 | loss 3.328068 (+0.61z)| norm 0.2732 (+0.34z)| lr 8.52e-05 | 8440.70 ms | -100.0% bf16 MFU | 62105 tok/s +step 14924/19560 | loss 3.280944 (-0.55z)| norm 0.2598 (-0.36z)| lr 8.51e-05 | 8442.31 ms | -100.0% bf16 MFU | 62105 tok/s +step 14925/19560 | loss 3.291878 (-0.27z)| norm 0.2671 (+0.03z)| lr 8.51e-05 | 8439.34 ms | -100.0% bf16 MFU | 62106 tok/s +step 14926/19560 | loss 3.261519 (-1.02z)| norm 0.2664 (-0.01z)| lr 8.51e-05 | 8443.85 ms | -100.0% bf16 MFU | 62105 tok/s +step 14927/19560 | loss 3.298165 (-0.11z)| norm 0.2583 (-0.44z)| lr 8.50e-05 | 8442.40 ms | -100.0% bf16 MFU | 62105 tok/s +step 14928/19560 | loss 3.289009 (-0.34z)| norm 0.2532 (-0.71z)| lr 8.50e-05 | 8439.71 ms | -100.0% bf16 MFU | 62106 tok/s +step 14929/19560 | loss 3.316662 (+0.34z)| norm 0.2697 (+0.17z)| lr 8.50e-05 | 8439.70 ms | -100.0% bf16 MFU | 62107 tok/s +step 14930/19560 | loss 3.239500 (-1.57z)| norm 0.2608 (-0.30z)| lr 8.49e-05 | 8440.98 ms | -100.0% bf16 MFU | 62107 tok/s +step 14931/19560 | loss 3.300591 (-0.06z)| norm 0.2609 (-0.29z)| lr 8.49e-05 | 8439.37 ms | -100.0% bf16 MFU | 62108 tok/s +step 14932/19560 | loss 3.261919 (-1.02z)| norm 0.2536 (-0.68z)| lr 8.49e-05 | 8442.78 ms | -100.0% bf16 MFU | 62107 tok/s +step 14933/19560 | loss 3.311622 (+0.22z)| norm 0.2620 (-0.23z)| lr 8.48e-05 | 8440.34 ms | -100.0% bf16 MFU | 62108 tok/s +step 14934/19560 | loss 3.237636 (-1.59z)| norm 0.2591 (-0.37z)| lr 8.48e-05 | 8441.23 ms | -100.0% bf16 MFU | 62108 tok/s +step 14935/19560 | loss 3.304564 (+0.06z)| norm 0.2492 (-0.90z)| lr 8.47e-05 | 8440.53 ms | -100.0% bf16 MFU | 62108 tok/s +step 14936/19560 | loss 3.352425 (+1.23z)| norm 0.2695 (+0.20z)| lr 8.47e-05 | 8438.78 ms | -100.0% bf16 MFU | 62109 tok/s +step 14937/19560 | loss 3.333145 (+0.74z)| norm 0.2642 (-0.09z)| lr 8.47e-05 | 8438.63 ms | -100.0% bf16 MFU | 62110 tok/s +step 14938/19560 | loss 3.359383 (+1.37z)| norm 0.2839 (+0.96z)| lr 8.46e-05 | 8442.63 ms | -100.0% bf16 MFU | 62110 tok/s +step 14939/19560 | loss 3.336334 (+0.82z)| norm 0.2715 (+0.30z)| lr 8.46e-05 | 8437.79 ms | -100.0% bf16 MFU | 62111 tok/s +step 14940/19560 | loss 3.307376 (+0.08z)| norm 0.2717 (+0.31z)| lr 8.46e-05 | 8442.11 ms | -100.0% bf16 MFU | 62111 tok/s +step 14941/19560 | loss 3.350994 (+1.17z)| norm 0.3177 (+2.70z)| lr 8.45e-05 | 8439.27 ms | -100.0% bf16 MFU | 62112 tok/s +step 14942/19560 | loss 3.264160 (-1.01z)| norm 0.2548 (-0.62z)| lr 8.45e-05 | 8437.82 ms | -100.0% bf16 MFU | 62113 tok/s +step 14943/19560 | loss 3.299250 (-0.12z)| norm 0.2822 (+0.82z)| lr 8.45e-05 | 8440.40 ms | -100.0% bf16 MFU | 62113 tok/s +step 14944/19560 | loss 3.454270 (+3.56z)| norm 0.2911 (+1.27z)| lr 8.44e-05 | 8438.43 ms | -100.0% bf16 MFU | 62114 tok/s +step 14945/19560 | loss 3.314687 (+0.27z)| norm 0.2751 (+0.45z)| lr 8.44e-05 | 8443.32 ms | -100.0% bf16 MFU | 62113 tok/s +step 14946/19560 | loss 3.313775 (+0.25z)| norm 0.2654 (-0.06z)| lr 8.44e-05 | 8440.52 ms | -100.0% bf16 MFU | 62113 tok/s +step 14947/19560 | loss 3.272538 (-0.79z)| norm 0.2553 (-0.59z)| lr 8.43e-05 | 8439.10 ms | -100.0% bf16 MFU | 62114 tok/s +step 14948/19560 | loss 3.324490 (+0.52z)| norm 0.2773 (+0.57z)| lr 8.43e-05 | 8444.17 ms | -100.0% bf16 MFU | 62112 tok/s +step 14949/19560 | loss 3.314428 (+0.26z)| norm 0.2568 (-0.52z)| lr 8.43e-05 | 8441.28 ms | -100.0% bf16 MFU | 62112 tok/s +step 14950/19560 | loss 3.313055 (+0.22z)| norm 0.2692 (+0.13z)| lr 8.42e-05 | 8440.20 ms | -100.0% bf16 MFU | 62113 tok/s +step 14951/19560 | loss 3.269006 (-0.88z)| norm 0.2609 (-0.32z)| lr 8.42e-05 | 8442.70 ms | -100.0% bf16 MFU | 62112 tok/s +step 14952/19560 | loss 3.319734 (+0.39z)| norm 0.2684 (+0.08z)| lr 8.42e-05 | 8441.61 ms | -100.0% bf16 MFU | 62112 tok/s +step 14953/19560 | loss 3.313907 (+0.24z)| norm 0.2496 (-0.92z)| lr 8.41e-05 | 8443.56 ms | -100.0% bf16 MFU | 62111 tok/s +step 14954/19560 | loss 3.300594 (-0.09z)| norm 0.2728 (+0.32z)| lr 8.41e-05 | 8439.01 ms | -100.0% bf16 MFU | 62112 tok/s +step 14955/19560 | loss 3.331460 (+0.69z)| norm 0.2447 (-1.17z)| lr 8.41e-05 | 8435.50 ms | -100.0% bf16 MFU | 62114 tok/s +step 14956/19560 | loss 3.289708 (-0.35z)| norm 0.2585 (-0.44z)| lr 8.40e-05 | 8441.69 ms | -100.0% bf16 MFU | 62113 tok/s +step 14957/19560 | loss 3.244118 (-1.48z)| norm 0.2402 (-1.39z)| lr 8.40e-05 | 8437.72 ms | -100.0% bf16 MFU | 62114 tok/s +step 14958/19560 | loss 3.300385 (-0.07z)| norm 0.2642 (-0.12z)| lr 8.39e-05 | 8439.62 ms | -100.0% bf16 MFU | 62115 tok/s +step 14959/19560 | loss 3.322760 (+0.50z)| norm 0.2584 (-0.42z)| lr 8.39e-05 | 8439.02 ms | -100.0% bf16 MFU | 62115 tok/s +step 14960/19560 | loss 3.310216 (+0.19z)| norm 0.2518 (-0.79z)| lr 8.39e-05 | 8438.02 ms | -100.0% bf16 MFU | 62116 tok/s +step 14961/19560 | loss 3.275250 (-0.68z)| norm 0.2575 (-0.45z)| lr 8.38e-05 | 8442.20 ms | -100.0% bf16 MFU | 62116 tok/s +step 14962/19560 | loss 3.271552 (-0.79z)| norm 0.2573 (-0.45z)| lr 8.38e-05 | 8440.47 ms | -100.0% bf16 MFU | 62116 tok/s +step 14963/19560 | loss 3.304576 (+0.06z)| norm 0.2493 (-0.91z)| lr 8.38e-05 | 8440.36 ms | -100.0% bf16 MFU | 62116 tok/s +step 14964/19560 | loss 3.216802 (-2.14z)| norm 0.2640 (-0.05z)| lr 8.37e-05 | 8441.37 ms | -100.0% bf16 MFU | 62115 tok/s +step 14965/19560 | loss 3.279677 (-0.56z)| norm 0.2530 (-0.69z)| lr 8.37e-05 | 8440.39 ms | -100.0% bf16 MFU | 62116 tok/s +step 14966/19560 | loss 3.303573 (+0.03z)| norm 0.2669 (+0.13z)| lr 8.37e-05 | 8441.85 ms | -100.0% bf16 MFU | 62115 tok/s +step 14967/19560 | loss 3.235761 (-1.68z)| norm 0.2511 (-0.79z)| lr 8.36e-05 | 8443.38 ms | -100.0% bf16 MFU | 62114 tok/s +step 14968/19560 | loss 3.277835 (-0.62z)| norm 0.2561 (-0.50z)| lr 8.36e-05 | 8438.09 ms | -100.0% bf16 MFU | 62115 tok/s +step 14969/19560 | loss 3.285205 (-0.43z)| norm 0.2456 (-1.10z)| lr 8.36e-05 | 8440.62 ms | -100.0% bf16 MFU | 62115 tok/s +step 14970/19560 | loss 3.313078 (+0.26z)| norm 0.2496 (-0.86z)| lr 8.35e-05 | 8438.23 ms | -100.0% bf16 MFU | 62116 tok/s +step 14971/19560 | loss 3.327574 (+0.62z)| norm 0.2537 (-0.61z)| lr 8.35e-05 | 8440.84 ms | -100.0% bf16 MFU | 62116 tok/s +step 14972/19560 | loss 3.311187 (+0.21z)| norm 0.2580 (-0.36z)| lr 8.35e-05 | 8437.30 ms | -100.0% bf16 MFU | 62117 tok/s +step 14973/19560 | loss 3.278172 (-0.62z)| norm 0.2480 (-0.94z)| lr 8.34e-05 | 8440.76 ms | -100.0% bf16 MFU | 62117 tok/s +step 14974/19560 | loss 3.304334 (+0.05z)| norm 0.2510 (-0.76z)| lr 8.34e-05 | 8437.24 ms | -100.0% bf16 MFU | 62118 tok/s +step 14975/19560 | loss 3.322792 (+0.51z)| norm 0.2724 (+0.48z)| lr 8.34e-05 | 8440.95 ms | -100.0% bf16 MFU | 62118 tok/s +step 14976/19560 | loss 3.395035 (+2.29z)| norm 0.2589 (-0.30z)| lr 8.33e-05 | 8439.58 ms | -100.0% bf16 MFU | 62118 tok/s +step 14977/19560 | loss 3.287944 (-0.37z)| norm 0.2502 (-0.80z)| lr 8.33e-05 | 8437.77 ms | -100.0% bf16 MFU | 62119 tok/s +step 14978/19560 | loss 3.334445 (+0.80z)| norm 0.2592 (-0.27z)| lr 8.33e-05 | 8438.27 ms | -100.0% bf16 MFU | 62119 tok/s +step 14979/19560 | loss 3.323253 (+0.51z)| norm 0.2523 (-0.67z)| lr 8.32e-05 | 8439.70 ms | -100.0% bf16 MFU | 62120 tok/s +step 14980/19560 | loss 3.327157 (+0.60z)| norm 0.2690 (+0.29z)| lr 8.32e-05 | 8462.28 ms | -100.0% bf16 MFU | 62111 tok/s +step 14981/19560 | loss 3.278666 (-0.61z)| norm 0.2462 (-1.02z)| lr 8.32e-05 | 8471.28 ms | -100.0% bf16 MFU | 62100 tok/s +step 14982/19560 | loss 3.290553 (-0.31z)| norm 0.2619 (-0.11z)| lr 8.31e-05 | 8465.50 ms | -100.0% bf16 MFU | 62092 tok/s +step 14983/19560 | loss 3.307818 (+0.13z)| norm 0.2819 (+1.04z)| lr 8.31e-05 | 8465.47 ms | -100.0% bf16 MFU | 62084 tok/s +step 14984/19560 | loss 3.294830 (-0.20z)| norm 0.2556 (-0.48z)| lr 8.30e-05 | 8466.29 ms | -100.0% bf16 MFU | 62076 tok/s +step 14985/19560 | loss 3.268473 (-0.85z)| norm 0.2657 (+0.10z)| lr 8.30e-05 | 8464.93 ms | -100.0% bf16 MFU | 62069 tok/s +step 14986/19560 | loss 3.276875 (-0.63z)| norm 0.2785 (+0.84z)| lr 8.30e-05 | 8465.02 ms | -100.0% bf16 MFU | 62062 tok/s +step 14987/19560 | loss 3.333263 (+0.79z)| norm 0.2954 (+1.78z)| lr 8.29e-05 | 8459.44 ms | -100.0% bf16 MFU | 62058 tok/s +step 14988/19560 | loss 3.291903 (-0.26z)| norm 0.2624 (-0.12z)| lr 8.29e-05 | 8463.48 ms | -100.0% bf16 MFU | 62053 tok/s +step 14989/19560 | loss 3.326847 (+0.62z)| norm 0.2555 (-0.52z)| lr 8.29e-05 | 8462.29 ms | -100.0% bf16 MFU | 62048 tok/s +step 14990/19560 | loss 3.283289 (-0.48z)| norm 0.2635 (-0.06z)| lr 8.28e-05 | 8463.75 ms | -100.0% bf16 MFU | 62043 tok/s +step 14991/19560 | loss 3.338644 (+0.90z)| norm 0.2699 (+0.30z)| lr 8.28e-05 | 8458.25 ms | -100.0% bf16 MFU | 62040 tok/s +step 14992/19560 | loss 3.255259 (-1.20z)| norm 0.2531 (-0.68z)| lr 8.28e-05 | 8462.54 ms | -100.0% bf16 MFU | 62035 tok/s +step 14993/19560 | loss 3.326972 (+0.60z)| norm 0.2636 (-0.07z)| lr 8.27e-05 | 8460.93 ms | -100.0% bf16 MFU | 62032 tok/s +step 14994/19560 | loss 3.267575 (-0.90z)| norm 0.2592 (-0.33z)| lr 8.27e-05 | 8446.55 ms | -100.0% bf16 MFU | 62034 tok/s +step 14995/19560 | loss 3.316701 (+0.34z)| norm 0.2395 (-1.46z)| lr 8.27e-05 | 8458.07 ms | -100.0% bf16 MFU | 62032 tok/s +step 14996/19560 | loss 3.336149 (+0.84z)| norm 0.2768 (+0.70z)| lr 8.26e-05 | 8455.77 ms | -100.0% bf16 MFU | 62030 tok/s +step 14997/19560 | loss 3.275158 (-0.70z)| norm 0.2557 (-0.51z)| lr 8.26e-05 | 8453.79 ms | -100.0% bf16 MFU | 62030 tok/s +step 14998/19560 | loss 3.337921 (+0.94z)| norm 0.2639 (-0.04z)| lr 8.26e-05 | 8450.67 ms | -100.0% bf16 MFU | 62030 tok/s +step 14999/19560 | loss 3.367414 (+1.67z)| norm 0.2625 (-0.11z)| lr 8.25e-05 | 8449.06 ms | -100.0% bf16 MFU | 62031 tok/s +step 15000/19560 | loss 3.236287 (-1.67z)| norm 0.2529 (-0.66z)| lr 8.25e-05 | 8461.59 ms | -100.0% bf16 MFU | 62028 tok/s +val loss 3.293582 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 3002/10042 = 0.298944 +Writing checkpoint at step 15000 +Writing model to log124M/model_00015000.bin +Writing state to log124M/state_00015000_00000.bin +step 15001/19560 | loss 3.250905 (-1.28z)| norm 0.2475 (-0.96z)| lr 8.25e-05 | 8450.23 ms | -100.0% bf16 MFU | 62029 tok/s +step 15002/19560 | loss 3.429152 (+3.12z)| norm 0.2775 (+0.77z)| lr 8.24e-05 | 8456.05 ms | -100.0% bf16 MFU | 62027 tok/s +step 15003/19560 | loss 3.341756 (+0.97z)| norm 0.3366 (+3.90z)| lr 8.24e-05 | 8453.90 ms | -100.0% bf16 MFU | 62027 tok/s +step 15004/19560 | loss 3.280603 (-0.53z)| norm 0.2774 (+0.68z)| lr 8.24e-05 | 8567.44 ms | -100.0% bf16 MFU | 61985 tok/s +step 15005/19560 | loss 3.342452 (+0.97z)| norm 0.2641 (-0.05z)| lr 8.23e-05 | 8480.66 ms | -100.0% bf16 MFU | 61977 tok/s +step 15006/19560 | loss 3.341898 (+0.94z)| norm 0.2604 (-0.24z)| lr 8.23e-05 | 8450.38 ms | -100.0% bf16 MFU | 61980 tok/s +step 15007/19560 | loss 3.298272 (-0.12z)| norm 0.2718 (+0.36z)| lr 8.23e-05 | 8460.12 ms | -100.0% bf16 MFU | 61980 tok/s +step 15008/19560 | loss 3.284066 (-0.46z)| norm 0.2529 (-0.81z)| lr 8.22e-05 | 8454.28 ms | -100.0% bf16 MFU | 61982 tok/s +step 15009/19560 | loss 3.247288 (-1.38z)| norm 0.2644 (+0.04z)| lr 8.22e-05 | 8459.53 ms | -100.0% bf16 MFU | 61981 tok/s +step 15010/19560 | loss 3.269892 (-0.79z)| norm 0.2508 (-0.96z)| lr 8.22e-05 | 8459.20 ms | -100.0% bf16 MFU | 61981 tok/s +step 15011/19560 | loss 3.295214 (-0.14z)| norm 0.2648 (+0.08z)| lr 8.21e-05 | 8453.70 ms | -100.0% bf16 MFU | 61983 tok/s +step 15012/19560 | loss 3.454917 (+3.74z)| norm 0.2919 (+2.01z)| lr 8.21e-05 | 8449.65 ms | -100.0% bf16 MFU | 61986 tok/s +step 15013/19560 | loss 3.360734 (+1.41z)| norm 0.2822 (+1.30z)| lr 8.20e-05 | 8456.26 ms | -100.0% bf16 MFU | 61987 tok/s +step 15014/19560 | loss 3.325074 (+0.53z)| norm 0.2730 (+0.63z)| lr 8.20e-05 | 8453.68 ms | -100.0% bf16 MFU | 61989 tok/s +step 15015/19560 | loss 3.290179 (-0.33z)| norm 0.2719 (+0.55z)| lr 8.20e-05 | 8453.10 ms | -100.0% bf16 MFU | 61990 tok/s +step 15016/19560 | loss 3.376929 (+1.76z)| norm 0.2801 (+1.12z)| lr 8.19e-05 | 8455.11 ms | -100.0% bf16 MFU | 61991 tok/s +step 15017/19560 | loss 3.285462 (-0.47z)| norm 0.2564 (-0.58z)| lr 8.19e-05 | 8449.49 ms | -100.0% bf16 MFU | 61994 tok/s +step 15018/19560 | loss 3.305462 (+0.02z)| norm 0.2712 (+0.47z)| lr 8.19e-05 | 8451.86 ms | -100.0% bf16 MFU | 61996 tok/s +step 15019/19560 | loss 3.218569 (-2.07z)| norm 0.2489 (-1.12z)| lr 8.18e-05 | 8455.04 ms | -100.0% bf16 MFU | 61997 tok/s +step 15020/19560 | loss 3.318299 (+0.33z)| norm 0.2692 (+0.33z)| lr 8.18e-05 | 8453.42 ms | -100.0% bf16 MFU | 61998 tok/s +step 15021/19560 | loss 3.265684 (-0.92z)| norm 0.2639 (-0.05z)| lr 8.18e-05 | 8448.21 ms | -100.0% bf16 MFU | 62001 tok/s +step 15022/19560 | loss 3.256521 (-1.13z)| norm 0.2515 (-0.94z)| lr 8.17e-05 | 8451.36 ms | -100.0% bf16 MFU | 62003 tok/s +step 15023/19560 | loss 3.294133 (-0.22z)| norm 0.2523 (-0.87z)| lr 8.17e-05 | 8449.75 ms | -100.0% bf16 MFU | 62005 tok/s +step 15024/19560 | loss 3.263222 (-0.97z)| norm 0.2645 (+0.00z)| lr 8.17e-05 | 8450.61 ms | -100.0% bf16 MFU | 62007 tok/s +step 15025/19560 | loss 3.339116 (+0.86z)| norm 0.2521 (-0.87z)| lr 8.16e-05 | 8450.60 ms | -100.0% bf16 MFU | 62009 tok/s +step 15026/19560 | loss 3.294724 (-0.23z)| norm 0.2485 (-1.12z)| lr 8.16e-05 | 8449.45 ms | -100.0% bf16 MFU | 62011 tok/s +step 15027/19560 | loss 3.276167 (-0.67z)| norm 0.2577 (-0.46z)| lr 8.16e-05 | 8450.46 ms | -100.0% bf16 MFU | 62012 tok/s +step 15028/19560 | loss 3.337752 (+0.83z)| norm 0.2512 (-0.91z)| lr 8.15e-05 | 8449.71 ms | -100.0% bf16 MFU | 62014 tok/s +step 15029/19560 | loss 3.388078 (+2.01z)| norm 0.2639 (-0.00z)| lr 8.15e-05 | 8445.67 ms | -100.0% bf16 MFU | 62017 tok/s +step 15030/19560 | loss 3.308738 (+0.11z)| norm 0.2545 (-0.67z)| lr 8.15e-05 | 8448.81 ms | -100.0% bf16 MFU | 62019 tok/s +step 15031/19560 | loss 3.247670 (-1.35z)| norm 0.2663 (+0.17z)| lr 8.14e-05 | 8447.78 ms | -100.0% bf16 MFU | 62021 tok/s +step 15032/19560 | loss 3.300324 (-0.08z)| norm 0.2485 (-1.09z)| lr 8.14e-05 | 8447.56 ms | -100.0% bf16 MFU | 62023 tok/s +step 15033/19560 | loss 3.304046 (+0.00z)| norm 0.2626 (-0.07z)| lr 8.14e-05 | 8449.30 ms | -100.0% bf16 MFU | 62025 tok/s +step 15034/19560 | loss 3.351139 (+1.14z)| norm 0.2678 (+0.31z)| lr 8.13e-05 | 8447.35 ms | -100.0% bf16 MFU | 62027 tok/s +step 15035/19560 | loss 3.302662 (-0.04z)| norm 0.2558 (-0.56z)| lr 8.13e-05 | 8442.61 ms | -100.0% bf16 MFU | 62030 tok/s +step 15036/19560 | loss 3.306367 (+0.05z)| norm 0.2576 (-0.42z)| lr 8.13e-05 | 8447.10 ms | -100.0% bf16 MFU | 62032 tok/s +step 15037/19560 | loss 3.250318 (-1.30z)| norm 0.2583 (-0.36z)| lr 8.12e-05 | 8448.29 ms | -100.0% bf16 MFU | 62034 tok/s +step 15038/19560 | loss 3.329398 (+0.62z)| norm 0.2629 (-0.03z)| lr 8.12e-05 | 8446.37 ms | -100.0% bf16 MFU | 62036 tok/s +step 15039/19560 | loss 3.246829 (-1.36z)| norm 0.2640 (+0.05z)| lr 8.12e-05 | 8458.68 ms | -100.0% bf16 MFU | 62033 tok/s +step 15040/19560 | loss 3.265384 (-0.91z)| norm 0.2591 (-0.30z)| lr 8.11e-05 | 8450.24 ms | -100.0% bf16 MFU | 62033 tok/s +step 15041/19560 | loss 3.296034 (-0.17z)| norm 0.2711 (+0.56z)| lr 8.11e-05 | 8449.09 ms | -100.0% bf16 MFU | 62034 tok/s +step 15042/19560 | loss 3.253283 (-1.18z)| norm 0.2536 (-0.72z)| lr 8.11e-05 | 8445.87 ms | -100.0% bf16 MFU | 62036 tok/s +step 15043/19560 | loss 3.230662 (-1.69z)| norm 0.2668 (+0.24z)| lr 8.10e-05 | 8448.93 ms | -100.0% bf16 MFU | 62037 tok/s +step 15044/19560 | loss 3.265643 (-0.85z)| norm 0.2662 (+0.19z)| lr 8.10e-05 | 8450.37 ms | -100.0% bf16 MFU | 62038 tok/s +step 15045/19560 | loss 3.264572 (-0.88z)| norm 0.2539 (-0.70z)| lr 8.10e-05 | 8445.68 ms | -100.0% bf16 MFU | 62040 tok/s +step 15046/19560 | loss 3.297588 (-0.10z)| norm 0.2713 (+0.58z)| lr 8.09e-05 | 8450.68 ms | -100.0% bf16 MFU | 62040 tok/s +step 15047/19560 | loss 3.266225 (-0.85z)| norm 0.2730 (+0.70z)| lr 8.09e-05 | 8440.79 ms | -100.0% bf16 MFU | 62043 tok/s +step 15048/19560 | loss 3.315703 (+0.33z)| norm 0.2679 (+0.34z)| lr 8.09e-05 | 8451.74 ms | -100.0% bf16 MFU | 62043 tok/s +step 15049/19560 | loss 3.235664 (-1.55z)| norm 0.2580 (-0.38z)| lr 8.08e-05 | 8448.73 ms | -100.0% bf16 MFU | 62043 tok/s +step 15050/19560 | loss 3.299975 (-0.04z)| norm 0.2654 (+0.17z)| lr 8.08e-05 | 8445.61 ms | -100.0% bf16 MFU | 62045 tok/s +step 15051/19560 | loss 3.237738 (-1.51z)| norm 0.2915 (+2.10z)| lr 8.07e-05 | 8447.44 ms | -100.0% bf16 MFU | 62046 tok/s +step 15052/19560 | loss 3.364276 (+1.49z)| norm 0.2668 (+0.26z)| lr 8.07e-05 | 8454.33 ms | -100.0% bf16 MFU | 62045 tok/s +step 15053/19560 | loss 3.248674 (-1.24z)| norm 0.2907 (+1.99z)| lr 8.07e-05 | 8441.24 ms | -100.0% bf16 MFU | 62048 tok/s +step 15054/19560 | loss 3.285306 (-0.38z)| norm 0.2771 (+0.99z)| lr 8.06e-05 | 8443.57 ms | -100.0% bf16 MFU | 62050 tok/s +step 15055/19560 | loss 3.255648 (-1.07z)| norm 0.2724 (+0.64z)| lr 8.06e-05 | 8444.82 ms | -100.0% bf16 MFU | 62052 tok/s +step 15056/19560 | loss 3.337499 (+0.85z)| norm 0.2860 (+1.59z)| lr 8.06e-05 | 8447.81 ms | -100.0% bf16 MFU | 62052 tok/s +step 15057/19560 | loss 3.304463 (+0.08z)| norm 0.2683 (+0.32z)| lr 8.05e-05 | 8441.48 ms | -100.0% bf16 MFU | 62055 tok/s +step 15058/19560 | loss 3.311561 (+0.23z)| norm 0.2918 (+1.96z)| lr 8.05e-05 | 8450.45 ms | -100.0% bf16 MFU | 62055 tok/s +step 15059/19560 | loss 3.255215 (-1.09z)| norm 0.2785 (+1.01z)| lr 8.05e-05 | 8442.16 ms | -100.0% bf16 MFU | 62057 tok/s +step 15060/19560 | loss 3.262145 (-0.93z)| norm 0.2818 (+1.22z)| lr 8.04e-05 | 8458.60 ms | -100.0% bf16 MFU | 62053 tok/s +step 15061/19560 | loss 3.277203 (-0.57z)| norm 0.2861 (+1.50z)| lr 8.04e-05 | 8438.69 ms | -100.0% bf16 MFU | 62057 tok/s +step 15062/19560 | loss 3.310189 (+0.20z)| norm 0.2695 (+0.33z)| lr 8.04e-05 | 8449.68 ms | -100.0% bf16 MFU | 62057 tok/s +step 15063/19560 | loss 3.254831 (-1.10z)| norm 0.2803 (+1.07z)| lr 8.03e-05 | 8447.32 ms | -100.0% bf16 MFU | 62057 tok/s +step 15064/19560 | loss 3.273498 (-0.65z)| norm 0.2670 (+0.14z)| lr 8.03e-05 | 8442.11 ms | -100.0% bf16 MFU | 62059 tok/s +step 15065/19560 | loss 3.284557 (-0.38z)| norm 0.2521 (-0.90z)| lr 8.03e-05 | 8444.64 ms | -100.0% bf16 MFU | 62061 tok/s +step 15066/19560 | loss 3.338336 (+0.91z)| norm 0.2762 (+0.80z)| lr 8.02e-05 | 8445.77 ms | -100.0% bf16 MFU | 62062 tok/s +step 15067/19560 | loss 3.392373 (+2.16z)| norm 0.2672 (+0.16z)| lr 8.02e-05 | 8443.33 ms | -100.0% bf16 MFU | 62063 tok/s +step 15068/19560 | loss 3.308405 (+0.18z)| norm 0.2580 (-0.48z)| lr 8.02e-05 | 8448.89 ms | -100.0% bf16 MFU | 62063 tok/s +step 15069/19560 | loss 3.330090 (+0.70z)| norm 0.2860 (+1.59z)| lr 8.01e-05 | 8443.06 ms | -100.0% bf16 MFU | 62064 tok/s +step 15070/19560 | loss 3.315119 (+0.34z)| norm 0.2766 (+0.88z)| lr 8.01e-05 | 8446.99 ms | -100.0% bf16 MFU | 62065 tok/s +step 15071/19560 | loss 3.359281 (+1.36z)| norm 0.2747 (+0.75z)| lr 8.01e-05 | 8443.26 ms | -100.0% bf16 MFU | 62066 tok/s +step 15072/19560 | loss 3.278495 (-0.53z)| norm 0.2730 (+0.64z)| lr 8.00e-05 | 8440.72 ms | -100.0% bf16 MFU | 62069 tok/s +step 15073/19560 | loss 3.350245 (+1.23z)| norm 0.2809 (+1.22z)| lr 8.00e-05 | 8440.24 ms | -100.0% bf16 MFU | 62071 tok/s +step 15074/19560 | loss 3.417112 (+2.78z)| norm 1.2541 (+11.14z)| lr 8.00e-05 | 8447.02 ms | -100.0% bf16 MFU | 62071 tok/s +step 15075/19560 | loss 3.268559 (-0.78z)| norm 0.2927 (+0.23z)| lr 7.99e-05 | 8440.10 ms | -100.0% bf16 MFU | 62073 tok/s +step 15076/19560 | loss 3.267612 (-0.79z)| norm 0.3020 (+0.33z)| lr 7.99e-05 | 8441.60 ms | -100.0% bf16 MFU | 62075 tok/s +step 15077/19560 | loss 3.294161 (-0.15z)| norm 0.2865 (+0.15z)| lr 7.99e-05 | 8445.77 ms | -100.0% bf16 MFU | 62075 tok/s +step 15078/19560 | loss 3.308150 (+0.19z)| norm 0.3058 (+0.37z)| lr 7.98e-05 | 8447.34 ms | -100.0% bf16 MFU | 62075 tok/s +step 15079/19560 | loss 3.327075 (+0.63z)| norm 0.2852 (+0.13z)| lr 7.98e-05 | 8440.34 ms | -100.0% bf16 MFU | 62077 tok/s +step 15080/19560 | loss 3.296752 (-0.09z)| norm 0.2776 (+0.05z)| lr 7.98e-05 | 8441.66 ms | -100.0% bf16 MFU | 62078 tok/s +step 15081/19560 | loss 3.236157 (-1.51z)| norm 0.2947 (+0.24z)| lr 7.97e-05 | 8444.13 ms | -100.0% bf16 MFU | 62079 tok/s +step 15082/19560 | loss 3.275761 (-0.57z)| norm 0.2636 (-0.12z)| lr 7.97e-05 | 8440.59 ms | -100.0% bf16 MFU | 62081 tok/s +step 15083/19560 | loss 3.272294 (-0.64z)| norm 0.2762 (+0.03z)| lr 7.97e-05 | 8439.45 ms | -100.0% bf16 MFU | 62083 tok/s +step 15084/19560 | loss 3.293602 (-0.14z)| norm 0.2721 (-0.02z)| lr 7.96e-05 | 8443.22 ms | -100.0% bf16 MFU | 62083 tok/s +step 15085/19560 | loss 3.288088 (-0.28z)| norm 0.2739 (-0.01z)| lr 7.96e-05 | 8439.17 ms | -100.0% bf16 MFU | 62085 tok/s +step 15086/19560 | loss 3.316005 (+0.38z)| norm 0.2745 (+0.00z)| lr 7.96e-05 | 8436.75 ms | -100.0% bf16 MFU | 62088 tok/s +step 15087/19560 | loss 3.318396 (+0.44z)| norm 0.2679 (-0.08z)| lr 7.95e-05 | 8440.25 ms | -100.0% bf16 MFU | 62090 tok/s +step 15088/19560 | loss 3.269471 (-0.72z)| norm 0.2846 (+0.11z)| lr 7.95e-05 | 8439.77 ms | -100.0% bf16 MFU | 62091 tok/s +step 15089/19560 | loss 3.281548 (-0.43z)| norm 0.2550 (-0.22z)| lr 7.95e-05 | 8443.14 ms | -100.0% bf16 MFU | 62092 tok/s +step 15090/19560 | loss 3.301957 (+0.05z)| norm 0.2548 (-0.23z)| lr 7.94e-05 | 8444.70 ms | -100.0% bf16 MFU | 62091 tok/s +step 15091/19560 | loss 3.260720 (-0.92z)| norm 0.2760 (+0.01z)| lr 7.94e-05 | 8442.61 ms | -100.0% bf16 MFU | 62092 tok/s +step 15092/19560 | loss 3.330852 (+0.73z)| norm 0.2655 (-0.11z)| lr 7.94e-05 | 8443.00 ms | -100.0% bf16 MFU | 62092 tok/s +step 15093/19560 | loss 3.263604 (-0.88z)| norm 0.2530 (-0.25z)| lr 7.93e-05 | 8438.54 ms | -100.0% bf16 MFU | 62094 tok/s +step 15094/19560 | loss 3.341729 (+0.99z)| norm 0.2908 (+0.18z)| lr 7.93e-05 | 8442.42 ms | -100.0% bf16 MFU | 62094 tok/s +step 15095/19560 | loss 3.230366 (-1.68z)| norm 0.2714 (-0.04z)| lr 7.93e-05 | 8438.02 ms | -100.0% bf16 MFU | 62096 tok/s +step 15096/19560 | loss 3.246727 (-1.27z)| norm 0.2611 (-0.16z)| lr 7.92e-05 | 8445.10 ms | -100.0% bf16 MFU | 62096 tok/s +step 15097/19560 | loss 3.261714 (-0.91z)| norm 0.2630 (-0.14z)| lr 7.92e-05 | 8444.24 ms | -100.0% bf16 MFU | 62095 tok/s +step 15098/19560 | loss 3.314755 (+0.35z)| norm 0.2601 (-0.18z)| lr 7.92e-05 | 8443.00 ms | -100.0% bf16 MFU | 62095 tok/s +step 15099/19560 | loss 3.248731 (-1.20z)| norm 0.2556 (-0.23z)| lr 7.91e-05 | 8438.49 ms | -100.0% bf16 MFU | 62097 tok/s +step 15100/19560 | loss 3.251066 (-1.13z)| norm 0.2780 (+0.03z)| lr 7.91e-05 | 8442.18 ms | -100.0% bf16 MFU | 62097 tok/s +step 15101/19560 | loss 3.359527 (+1.40z)| norm 0.2655 (-0.12z)| lr 7.91e-05 | 8438.99 ms | -100.0% bf16 MFU | 62099 tok/s +step 15102/19560 | loss 3.240444 (-1.36z)| norm 0.2771 (+0.01z)| lr 7.90e-05 | 8440.99 ms | -100.0% bf16 MFU | 62100 tok/s +step 15103/19560 | loss 3.330851 (+0.73z)| norm 0.2658 (-0.12z)| lr 7.90e-05 | 8441.48 ms | -100.0% bf16 MFU | 62100 tok/s +step 15104/19560 | loss 3.297517 (-0.02z)| norm 0.2599 (-0.18z)| lr 7.90e-05 | 8441.81 ms | -100.0% bf16 MFU | 62100 tok/s +step 15105/19560 | loss 3.320724 (+0.52z)| norm 0.2742 (-0.02z)| lr 7.89e-05 | 8444.00 ms | -100.0% bf16 MFU | 62100 tok/s +step 15106/19560 | loss 3.316450 (+0.42z)| norm 0.2530 (-0.26z)| lr 7.89e-05 | 8442.77 ms | -100.0% bf16 MFU | 62100 tok/s +step 15107/19560 | loss 3.298450 (+0.00z)| norm 0.2521 (-0.27z)| lr 7.88e-05 | 8438.66 ms | -100.0% bf16 MFU | 62101 tok/s +step 15108/19560 | loss 3.338882 (+0.96z)| norm 0.2630 (-0.15z)| lr 7.88e-05 | 8440.70 ms | -100.0% bf16 MFU | 62102 tok/s +step 15109/19560 | loss 3.245649 (-1.24z)| norm 0.2591 (-0.19z)| lr 7.88e-05 | 8438.55 ms | -100.0% bf16 MFU | 62103 tok/s +step 15110/19560 | loss 3.277547 (-0.48z)| norm 0.2687 (-0.09z)| lr 7.87e-05 | 8438.35 ms | -100.0% bf16 MFU | 62105 tok/s +step 15111/19560 | loss 3.238789 (-1.37z)| norm 0.2647 (-0.13z)| lr 7.87e-05 | 8435.94 ms | -100.0% bf16 MFU | 62107 tok/s +step 15112/19560 | loss 3.355298 (+1.33z)| norm 0.2541 (-0.25z)| lr 7.87e-05 | 8439.83 ms | -100.0% bf16 MFU | 62108 tok/s +step 15113/19560 | loss 3.260636 (-0.86z)| norm 0.2627 (-0.15z)| lr 7.86e-05 | 8439.49 ms | -100.0% bf16 MFU | 62108 tok/s +step 15114/19560 | loss 3.337775 (+0.91z)| norm 0.2756 (-0.01z)| lr 7.86e-05 | 8440.46 ms | -100.0% bf16 MFU | 62109 tok/s +step 15115/19560 | loss 3.287653 (-0.24z)| norm 0.2575 (-0.21z)| lr 7.86e-05 | 8438.98 ms | -100.0% bf16 MFU | 62110 tok/s +step 15116/19560 | loss 3.242390 (-1.27z)| norm 0.2682 (-0.09z)| lr 7.85e-05 | 8437.55 ms | -100.0% bf16 MFU | 62111 tok/s +step 15117/19560 | loss 3.307748 (+0.23z)| norm 0.2615 (-0.16z)| lr 7.85e-05 | 8441.37 ms | -100.0% bf16 MFU | 62111 tok/s +step 15118/19560 | loss 3.257840 (-0.91z)| norm 0.2636 (-0.14z)| lr 7.85e-05 | 8440.65 ms | -100.0% bf16 MFU | 62111 tok/s +step 15119/19560 | loss 3.230346 (-1.51z)| norm 0.2557 (-0.23z)| lr 7.84e-05 | 8440.12 ms | -100.0% bf16 MFU | 62112 tok/s +step 15120/19560 | loss 3.269586 (-0.62z)| norm 0.2648 (-0.13z)| lr 7.84e-05 | 8439.82 ms | -100.0% bf16 MFU | 62112 tok/s +step 15121/19560 | loss 3.343465 (+1.07z)| norm 0.2683 (-0.09z)| lr 7.84e-05 | 8439.42 ms | -100.0% bf16 MFU | 62113 tok/s +step 15122/19560 | loss 3.314381 (+0.39z)| norm 0.2537 (-0.25z)| lr 7.83e-05 | 8441.34 ms | -100.0% bf16 MFU | 62112 tok/s +step 15123/19560 | loss 3.265284 (-0.72z)| norm 0.2743 (-0.02z)| lr 7.83e-05 | 8440.04 ms | -100.0% bf16 MFU | 62113 tok/s +step 15124/19560 | loss 3.294944 (-0.03z)| norm 0.2705 (-0.06z)| lr 7.83e-05 | 8440.07 ms | -100.0% bf16 MFU | 62113 tok/s +step 15125/19560 | loss 3.257883 (-0.88z)| norm 0.2642 (-0.14z)| lr 7.82e-05 | 8440.32 ms | -100.0% bf16 MFU | 62113 tok/s +step 15126/19560 | loss 3.313213 (+0.39z)| norm 0.2761 (-0.00z)| lr 7.82e-05 | 8439.41 ms | -100.0% bf16 MFU | 62114 tok/s +step 15127/19560 | loss 3.268390 (-0.62z)| norm 0.2675 (-0.10z)| lr 7.82e-05 | 8439.01 ms | -100.0% bf16 MFU | 62114 tok/s +step 15128/19560 | loss 3.247455 (-1.11z)| norm 0.2484 (-0.32z)| lr 7.81e-05 | 8436.50 ms | -100.0% bf16 MFU | 62116 tok/s +step 15129/19560 | loss 3.311162 (+0.36z)| norm 0.2776 (+0.01z)| lr 7.81e-05 | 8436.38 ms | -100.0% bf16 MFU | 62117 tok/s +step 15130/19560 | loss 3.343070 (+1.15z)| norm 0.2853 (+0.10z)| lr 7.81e-05 | 8441.88 ms | -100.0% bf16 MFU | 62117 tok/s +step 15131/19560 | loss 3.285128 (-0.23z)| norm 0.2791 (+0.03z)| lr 7.80e-05 | 8439.44 ms | -100.0% bf16 MFU | 62117 tok/s +step 15132/19560 | loss 3.263699 (-0.75z)| norm 0.2740 (-0.02z)| lr 7.80e-05 | 8438.40 ms | -100.0% bf16 MFU | 62118 tok/s +step 15133/19560 | loss 3.278404 (-0.38z)| norm 0.2639 (-0.14z)| lr 7.80e-05 | 8441.02 ms | -100.0% bf16 MFU | 62118 tok/s +step 15134/19560 | loss 3.318310 (+0.59z)| norm 0.2705 (-0.06z)| lr 7.79e-05 | 8437.63 ms | -100.0% bf16 MFU | 62119 tok/s +step 15135/19560 | loss 3.371292 (+1.85z)| norm 0.2735 (-0.03z)| lr 7.79e-05 | 8440.04 ms | -100.0% bf16 MFU | 62119 tok/s +step 15136/19560 | loss 3.255374 (-0.93z)| norm 0.2564 (-0.23z)| lr 7.79e-05 | 8438.78 ms | -100.0% bf16 MFU | 62119 tok/s +step 15137/19560 | loss 3.368748 (+1.75z)| norm 0.2684 (-0.09z)| lr 7.78e-05 | 8438.73 ms | -100.0% bf16 MFU | 62120 tok/s +step 15138/19560 | loss 3.329789 (+0.81z)| norm 0.2826 (+0.07z)| lr 7.78e-05 | 8442.56 ms | -100.0% bf16 MFU | 62119 tok/s +step 15139/19560 | loss 3.275308 (-0.48z)| norm 0.2625 (-0.16z)| lr 7.78e-05 | 8440.55 ms | -100.0% bf16 MFU | 62118 tok/s +step 15140/19560 | loss 3.237637 (-1.41z)| norm 0.2639 (-0.14z)| lr 7.77e-05 | 8440.83 ms | -100.0% bf16 MFU | 62118 tok/s +step 15141/19560 | loss 3.294016 (+0.02z)| norm 0.2717 (-0.05z)| lr 7.77e-05 | 8440.01 ms | -100.0% bf16 MFU | 62118 tok/s +step 15142/19560 | loss 3.246873 (-1.16z)| norm 0.2544 (-0.25z)| lr 7.77e-05 | 8439.38 ms | -100.0% bf16 MFU | 62119 tok/s +step 15143/19560 | loss 3.301363 (+0.22z)| norm 0.2750 (-0.01z)| lr 7.76e-05 | 8438.60 ms | -100.0% bf16 MFU | 62119 tok/s +step 15144/19560 | loss 3.304183 (+0.31z)| norm 0.2677 (-0.09z)| lr 7.76e-05 | 8439.78 ms | -100.0% bf16 MFU | 62119 tok/s +step 15145/19560 | loss 3.282800 (-0.24z)| norm 0.2575 (-0.21z)| lr 7.76e-05 | 8439.37 ms | -100.0% bf16 MFU | 62119 tok/s +step 15146/19560 | loss 3.257990 (-0.87z)| norm 0.2660 (-0.11z)| lr 7.75e-05 | 8438.73 ms | -100.0% bf16 MFU | 62120 tok/s +step 15147/19560 | loss 3.290765 (-0.04z)| norm 0.2717 (-0.05z)| lr 7.75e-05 | 8438.14 ms | -100.0% bf16 MFU | 62121 tok/s +step 15148/19560 | loss 3.255170 (-0.95z)| norm 0.2535 (-0.26z)| lr 7.75e-05 | 8439.39 ms | -100.0% bf16 MFU | 62121 tok/s +step 15149/19560 | loss 3.289499 (-0.07z)| norm 0.2586 (-0.20z)| lr 7.74e-05 | 8441.10 ms | -100.0% bf16 MFU | 62120 tok/s +step 15150/19560 | loss 3.300830 (+0.22z)| norm 0.2753 (-0.01z)| lr 7.74e-05 | 8441.24 ms | -100.0% bf16 MFU | 62120 tok/s +step 15151/19560 | loss 3.227605 (-1.66z)| norm 0.2780 (+0.02z)| lr 7.74e-05 | 8441.07 ms | -100.0% bf16 MFU | 62119 tok/s +step 15152/19560 | loss 3.280570 (-0.30z)| norm 0.2612 (-0.17z)| lr 7.73e-05 | 8437.86 ms | -100.0% bf16 MFU | 62120 tok/s +step 15153/19560 | loss 3.317291 (+0.66z)| norm 0.2584 (-0.21z)| lr 7.73e-05 | 8441.42 ms | -100.0% bf16 MFU | 62120 tok/s +step 15154/19560 | loss 3.326137 (+0.88z)| norm 0.2549 (-0.25z)| lr 7.73e-05 | 8439.27 ms | -100.0% bf16 MFU | 62120 tok/s +step 15155/19560 | loss 3.303484 (+0.29z)| norm 0.3063 (+0.34z)| lr 7.72e-05 | 8438.92 ms | -100.0% bf16 MFU | 62120 tok/s +step 15156/19560 | loss 3.255547 (-0.94z)| norm 0.2596 (-0.20z)| lr 7.72e-05 | 8436.71 ms | -100.0% bf16 MFU | 62121 tok/s +step 15157/19560 | loss 3.289188 (-0.05z)| norm 0.2847 (+0.09z)| lr 7.72e-05 | 8438.77 ms | -100.0% bf16 MFU | 62122 tok/s +step 15158/19560 | loss 3.323715 (+0.87z)| norm 0.2931 (+0.18z)| lr 7.71e-05 | 8438.54 ms | -100.0% bf16 MFU | 62122 tok/s +step 15159/19560 | loss 3.300457 (+0.24z)| norm 0.2555 (-0.25z)| lr 7.71e-05 | 8437.84 ms | -100.0% bf16 MFU | 62123 tok/s +step 15160/19560 | loss 3.274354 (-0.45z)| norm 0.2655 (-0.13z)| lr 7.71e-05 | 8437.19 ms | -100.0% bf16 MFU | 62124 tok/s +step 15161/19560 | loss 3.277207 (-0.37z)| norm 0.2778 (+0.00z)| lr 7.70e-05 | 8439.69 ms | -100.0% bf16 MFU | 62124 tok/s +step 15162/19560 | loss 3.247524 (-1.14z)| norm 0.2641 (-0.15z)| lr 7.70e-05 | 8439.84 ms | -100.0% bf16 MFU | 62123 tok/s +step 15163/19560 | loss 3.280053 (-0.27z)| norm 0.2963 (+0.21z)| lr 7.70e-05 | 8439.72 ms | -100.0% bf16 MFU | 62123 tok/s +step 15164/19560 | loss 3.262600 (-0.72z)| norm 0.2810 (+0.03z)| lr 7.69e-05 | 8439.71 ms | -100.0% bf16 MFU | 62123 tok/s +step 15165/19560 | loss 3.245815 (-1.17z)| norm 0.2633 (-0.17z)| lr 7.69e-05 | 8439.87 ms | -100.0% bf16 MFU | 62123 tok/s +step 15166/19560 | loss 3.292810 (+0.09z)| norm 0.2593 (-0.21z)| lr 7.69e-05 | 8439.52 ms | -100.0% bf16 MFU | 62123 tok/s +step 15167/19560 | loss 3.269805 (-0.53z)| norm 0.2702 (-0.09z)| lr 7.68e-05 | 8436.29 ms | -100.0% bf16 MFU | 62124 tok/s +step 15168/19560 | loss 3.291306 (+0.04z)| norm 0.2948 (+0.19z)| lr 7.68e-05 | 8437.57 ms | -100.0% bf16 MFU | 62125 tok/s +step 15169/19560 | loss 3.307679 (+0.48z)| norm 0.2557 (-0.26z)| lr 7.68e-05 | 8437.83 ms | -100.0% bf16 MFU | 62125 tok/s +step 15170/19560 | loss 3.312588 (+0.60z)| norm 0.2764 (-0.02z)| lr 7.67e-05 | 8443.42 ms | -100.0% bf16 MFU | 62124 tok/s +step 15171/19560 | loss 3.303627 (+0.35z)| norm 0.2777 (-0.01z)| lr 7.67e-05 | 8468.57 ms | -100.0% bf16 MFU | 62113 tok/s +step 15172/19560 | loss 3.336542 (+1.23z)| norm 0.3007 (+0.25z)| lr 7.67e-05 | 8466.52 ms | -100.0% bf16 MFU | 62104 tok/s +step 15173/19560 | loss 3.242974 (-1.31z)| norm 0.2639 (-0.17z)| lr 7.66e-05 | 8470.93 ms | -100.0% bf16 MFU | 62093 tok/s +step 15174/19560 | loss 3.303153 (+0.32z)| norm 0.2668 (-0.14z)| lr 7.66e-05 | 8457.70 ms | -100.0% bf16 MFU | 62088 tok/s +step 15175/19560 | loss 3.315135 (+0.63z)| norm 0.2680 (-0.12z)| lr 7.66e-05 | 8463.93 ms | -100.0% bf16 MFU | 62081 tok/s +step 15176/19560 | loss 3.242894 (-1.30z)| norm 0.2698 (-0.10z)| lr 7.65e-05 | 8461.26 ms | -100.0% bf16 MFU | 62075 tok/s +step 15177/19560 | loss 3.341517 (+1.34z)| norm 0.2667 (-0.14z)| lr 7.65e-05 | 8461.95 ms | -100.0% bf16 MFU | 62069 tok/s +step 15178/19560 | loss 3.313196 (+0.57z)| norm 0.2628 (-0.18z)| lr 7.65e-05 | 8463.68 ms | -100.0% bf16 MFU | 62063 tok/s +step 15179/19560 | loss 3.289490 (-0.08z)| norm 0.2439 (-0.39z)| lr 7.64e-05 | 8460.06 ms | -100.0% bf16 MFU | 62058 tok/s +step 15180/19560 | loss 3.311919 (+0.55z)| norm 0.2556 (-0.26z)| lr 7.64e-05 | 8460.45 ms | -100.0% bf16 MFU | 62054 tok/s +step 15181/19560 | loss 3.336494 (+1.20z)| norm 0.2564 (-0.25z)| lr 7.64e-05 | 8461.80 ms | -100.0% bf16 MFU | 62049 tok/s +step 15182/19560 | loss 3.275445 (-0.47z)| norm 0.2485 (-0.33z)| lr 7.63e-05 | 8458.41 ms | -100.0% bf16 MFU | 62046 tok/s +step 15183/19560 | loss 3.233567 (-1.61z)| norm 0.2540 (-0.27z)| lr 7.63e-05 | 8457.55 ms | -100.0% bf16 MFU | 62043 tok/s +step 15184/19560 | loss 3.260606 (-0.86z)| norm 0.2519 (-0.29z)| lr 7.63e-05 | 8448.79 ms | -100.0% bf16 MFU | 62044 tok/s +step 15185/19560 | loss 3.260713 (-0.84z)| norm 0.2842 (+0.08z)| lr 7.62e-05 | 8463.46 ms | -100.0% bf16 MFU | 62039 tok/s +step 15186/19560 | loss 3.346892 (+1.50z)| norm 0.2665 (-0.12z)| lr 7.62e-05 | 8452.97 ms | -100.0% bf16 MFU | 62038 tok/s +step 15187/19560 | loss 3.303771 (+0.32z)| norm 0.2763 (-0.01z)| lr 7.62e-05 | 8461.17 ms | -100.0% bf16 MFU | 62034 tok/s +step 15188/19560 | loss 3.271180 (-0.58z)| norm 0.2713 (-0.07z)| lr 7.61e-05 | 8456.68 ms | -100.0% bf16 MFU | 62033 tok/s +step 15189/19560 | loss 3.295734 (+0.09z)| norm 0.2600 (-0.19z)| lr 7.61e-05 | 8457.17 ms | -100.0% bf16 MFU | 62031 tok/s +step 15190/19560 | loss 3.350213 (+1.56z)| norm 0.2832 (+0.07z)| lr 7.61e-05 | 8464.84 ms | -100.0% bf16 MFU | 62026 tok/s +step 15191/19560 | loss 3.313130 (+0.54z)| norm 0.2537 (-0.27z)| lr 7.60e-05 | 8455.31 ms | -100.0% bf16 MFU | 62025 tok/s +step 15192/19560 | loss 3.315324 (+0.59z)| norm 0.2688 (-0.09z)| lr 7.60e-05 | 8456.93 ms | -100.0% bf16 MFU | 62024 tok/s +step 15193/19560 | loss 3.344493 (+1.36z)| norm 0.2495 (-0.31z)| lr 7.60e-05 | 8454.34 ms | -100.0% bf16 MFU | 62023 tok/s +step 15194/19560 | loss 3.318419 (+0.67z)| norm 0.2622 (-0.17z)| lr 7.59e-05 | 8459.44 ms | -100.0% bf16 MFU | 62021 tok/s +step 15195/19560 | loss 3.263057 (-0.83z)| norm 0.2819 (+0.06z)| lr 7.59e-05 | 8450.28 ms | -100.0% bf16 MFU | 62022 tok/s +step 15196/19560 | loss 3.292841 (+0.00z)| norm 0.2705 (-0.07z)| lr 7.59e-05 | 8456.49 ms | -100.0% bf16 MFU | 62021 tok/s +step 15197/19560 | loss 3.252587 (-1.10z)| norm 0.2502 (-0.30z)| lr 7.58e-05 | 8458.58 ms | -100.0% bf16 MFU | 62019 tok/s +step 15198/19560 | loss 3.290603 (-0.04z)| norm 0.2626 (-0.16z)| lr 7.58e-05 | 8454.26 ms | -100.0% bf16 MFU | 62019 tok/s +step 15199/19560 | loss 3.345680 (+1.51z)| norm 0.2601 (-0.19z)| lr 7.58e-05 | 8451.78 ms | -100.0% bf16 MFU | 62019 tok/s +step 15200/19560 | loss 3.339499 (+1.31z)| norm 0.2501 (-0.30z)| lr 7.57e-05 | 8458.93 ms | -100.0% bf16 MFU | 62017 tok/s +step 15201/19560 | loss 3.268122 (-0.66z)| norm 0.2704 (-0.07z)| lr 7.57e-05 | 8448.79 ms | -100.0% bf16 MFU | 62019 tok/s +step 15202/19560 | loss 3.319443 (+0.84z)| norm 0.2588 (-0.76z)| lr 7.57e-05 | 8448.56 ms | -100.0% bf16 MFU | 62021 tok/s +step 15203/19560 | loss 3.335753 (+1.30z)| norm 0.2729 (+0.36z)| lr 7.56e-05 | 8448.26 ms | -100.0% bf16 MFU | 62023 tok/s +step 15204/19560 | loss 3.311123 (+0.57z)| norm 0.2543 (-1.12z)| lr 7.56e-05 | 8453.95 ms | -100.0% bf16 MFU | 62023 tok/s +step 15205/19560 | loss 3.258389 (-0.96z)| norm 0.2673 (-0.04z)| lr 7.56e-05 | 8453.84 ms | -100.0% bf16 MFU | 62022 tok/s +step 15206/19560 | loss 3.281829 (-0.27z)| norm 0.2591 (-0.71z)| lr 7.55e-05 | 8447.75 ms | -100.0% bf16 MFU | 62024 tok/s +step 15207/19560 | loss 3.305992 (+0.44z)| norm 0.2644 (-0.25z)| lr 7.55e-05 | 8456.66 ms | -100.0% bf16 MFU | 62023 tok/s +step 15208/19560 | loss 3.339798 (+1.41z)| norm 0.2755 (+0.71z)| lr 7.55e-05 | 8447.38 ms | -100.0% bf16 MFU | 62025 tok/s +step 15209/19560 | loss 3.288798 (-0.09z)| norm 0.2630 (-0.35z)| lr 7.54e-05 | 8450.99 ms | -100.0% bf16 MFU | 62026 tok/s +step 15210/19560 | loss 3.352552 (+1.75z)| norm 0.2750 (+0.70z)| lr 7.54e-05 | 8454.63 ms | -100.0% bf16 MFU | 62025 tok/s +step 15211/19560 | loss 3.264786 (-0.80z)| norm 0.2837 (+1.45z)| lr 7.54e-05 | 8449.61 ms | -100.0% bf16 MFU | 62026 tok/s +step 15212/19560 | loss 3.382833 (+2.54z)| norm 0.2890 (+1.88z)| lr 7.53e-05 | 8452.14 ms | -100.0% bf16 MFU | 62027 tok/s +step 15213/19560 | loss 3.234935 (-1.61z)| norm 0.2944 (+2.29z)| lr 7.53e-05 | 8450.52 ms | -100.0% bf16 MFU | 62027 tok/s +step 15214/19560 | loss 3.242297 (-1.38z)| norm 0.2622 (-0.43z)| lr 7.53e-05 | 8445.39 ms | -100.0% bf16 MFU | 62030 tok/s +step 15215/19560 | loss 3.263481 (-0.78z)| norm 0.2584 (-0.75z)| lr 7.52e-05 | 8451.61 ms | -100.0% bf16 MFU | 62030 tok/s +step 15216/19560 | loss 3.349085 (+1.57z)| norm 0.2731 (+0.50z)| lr 7.52e-05 | 8452.33 ms | -100.0% bf16 MFU | 62030 tok/s +step 15217/19560 | loss 3.404641 (+2.97z)| norm 0.2700 (+0.24z)| lr 7.52e-05 | 8447.10 ms | -100.0% bf16 MFU | 62032 tok/s +step 15218/19560 | loss 3.362261 (+1.80z)| norm 0.2615 (-0.50z)| lr 7.51e-05 | 8448.80 ms | -100.0% bf16 MFU | 62033 tok/s +step 15219/19560 | loss 3.292415 (-0.04z)| norm 0.2563 (-0.94z)| lr 7.51e-05 | 8442.32 ms | -100.0% bf16 MFU | 62037 tok/s +step 15220/19560 | loss 3.375264 (+2.10z)| norm 0.2757 (+0.73z)| lr 7.51e-05 | 8457.46 ms | -100.0% bf16 MFU | 62034 tok/s +step 15221/19560 | loss 3.273857 (-0.53z)| norm 0.2648 (-0.22z)| lr 7.50e-05 | 8454.17 ms | -100.0% bf16 MFU | 62033 tok/s +step 15222/19560 | loss 3.380000 (+2.19z)| norm 0.2589 (-0.72z)| lr 7.50e-05 | 8456.14 ms | -100.0% bf16 MFU | 62032 tok/s +step 15223/19560 | loss 3.322832 (+0.71z)| norm 0.2534 (-1.18z)| lr 7.50e-05 | 8452.31 ms | -100.0% bf16 MFU | 62032 tok/s +step 15224/19560 | loss 3.362426 (+1.70z)| norm 0.2723 (+0.45z)| lr 7.49e-05 | 8443.86 ms | -100.0% bf16 MFU | 62035 tok/s +step 15225/19560 | loss 3.310996 (+0.37z)| norm 0.2633 (-0.33z)| lr 7.49e-05 | 8445.39 ms | -100.0% bf16 MFU | 62037 tok/s +step 15226/19560 | loss 3.309629 (+0.34z)| norm 0.2665 (-0.05z)| lr 7.49e-05 | 8449.14 ms | -100.0% bf16 MFU | 62038 tok/s +step 15227/19560 | loss 3.303061 (+0.16z)| norm 0.2758 (+0.74z)| lr 7.48e-05 | 8442.90 ms | -100.0% bf16 MFU | 62041 tok/s +step 15228/19560 | loss 3.307966 (+0.27z)| norm 0.2766 (+0.81z)| lr 7.48e-05 | 8442.43 ms | -100.0% bf16 MFU | 62044 tok/s +step 15229/19560 | loss 3.302964 (+0.16z)| norm 0.2816 (+1.23z)| lr 7.48e-05 | 8445.18 ms | -100.0% bf16 MFU | 62046 tok/s +step 15230/19560 | loss 3.248265 (-1.29z)| norm 0.2636 (-0.32z)| lr 7.47e-05 | 8446.40 ms | -100.0% bf16 MFU | 62047 tok/s +step 15231/19560 | loss 3.328974 (+0.84z)| norm 0.2681 (+0.07z)| lr 7.47e-05 | 8442.01 ms | -100.0% bf16 MFU | 62050 tok/s +step 15232/19560 | loss 3.292087 (-0.13z)| norm 0.2583 (-0.78z)| lr 7.47e-05 | 8445.11 ms | -100.0% bf16 MFU | 62051 tok/s +step 15233/19560 | loss 3.343827 (+1.23z)| norm 0.2875 (+1.73z)| lr 7.46e-05 | 8447.30 ms | -100.0% bf16 MFU | 62052 tok/s +step 15234/19560 | loss 3.359621 (+1.62z)| norm 0.2665 (-0.09z)| lr 7.46e-05 | 8442.18 ms | -100.0% bf16 MFU | 62055 tok/s +step 15235/19560 | loss 3.320153 (+0.58z)| norm 0.2590 (-0.75z)| lr 7.46e-05 | 8445.34 ms | -100.0% bf16 MFU | 62056 tok/s +step 15236/19560 | loss 3.331956 (+0.89z)| norm 0.2558 (-1.01z)| lr 7.45e-05 | 8443.32 ms | -100.0% bf16 MFU | 62058 tok/s +step 15237/19560 | loss 3.347780 (+1.29z)| norm 0.2654 (-0.18z)| lr 7.45e-05 | 8442.22 ms | -100.0% bf16 MFU | 62060 tok/s +step 15238/19560 | loss 3.302870 (+0.11z)| norm 0.2583 (-0.80z)| lr 7.45e-05 | 8439.03 ms | -100.0% bf16 MFU | 62063 tok/s +step 15239/19560 | loss 3.322117 (+0.60z)| norm 0.2560 (-0.98z)| lr 7.44e-05 | 8436.83 ms | -100.0% bf16 MFU | 62067 tok/s +step 15240/19560 | loss 3.316265 (+0.46z)| norm 0.2603 (-0.62z)| lr 7.44e-05 | 8438.40 ms | -100.0% bf16 MFU | 62071 tok/s +step 15241/19560 | loss 3.290090 (-0.25z)| norm 0.2730 (+0.47z)| lr 7.44e-05 | 8439.88 ms | -100.0% bf16 MFU | 62073 tok/s +step 15242/19560 | loss 3.327079 (+0.75z)| norm 0.2664 (-0.09z)| lr 7.43e-05 | 8435.92 ms | -100.0% bf16 MFU | 62077 tok/s +step 15243/19560 | loss 3.292235 (-0.19z)| norm 0.2726 (+0.44z)| lr 7.43e-05 | 8438.48 ms | -100.0% bf16 MFU | 62080 tok/s +step 15244/19560 | loss 3.393181 (+2.45z)| norm 0.2839 (+1.40z)| lr 7.43e-05 | 8440.12 ms | -100.0% bf16 MFU | 62082 tok/s +step 15245/19560 | loss 3.263964 (-0.95z)| norm 0.2532 (-1.24z)| lr 7.42e-05 | 8440.61 ms | -100.0% bf16 MFU | 62083 tok/s +step 15246/19560 | loss 3.294643 (-0.15z)| norm 0.2827 (+1.28z)| lr 7.42e-05 | 8439.74 ms | -100.0% bf16 MFU | 62085 tok/s +step 15247/19560 | loss 3.342557 (+1.10z)| norm 0.2867 (+1.58z)| lr 7.42e-05 | 8442.19 ms | -100.0% bf16 MFU | 62086 tok/s +step 15248/19560 | loss 3.273493 (-0.74z)| norm 0.2595 (-0.72z)| lr 7.41e-05 | 8437.68 ms | -100.0% bf16 MFU | 62089 tok/s +step 15249/19560 | loss 3.289554 (-0.30z)| norm 0.2737 (+0.48z)| lr 7.41e-05 | 8442.08 ms | -100.0% bf16 MFU | 62089 tok/s +step 15250/19560 | loss 3.377068 (+2.00z)| norm 0.2717 (+0.30z)| lr 7.41e-05 | 8441.09 ms | -100.0% bf16 MFU | 62090 tok/s +val loss 3.292042 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 3001/10042 = 0.298845 +step 15251/19560 | loss 3.333869 (+0.84z)| norm 0.2666 (-0.13z)| lr 7.41e-05 | 8442.07 ms | -100.0% bf16 MFU | 62091 tok/s +step 15252/19560 | loss 3.337636 (+0.93z)| norm 0.2554 (-1.07z)| lr 7.40e-05 | 8439.31 ms | -100.0% bf16 MFU | 62093 tok/s +step 15253/19560 | loss 3.328500 (+0.68z)| norm 0.2590 (-0.76z)| lr 7.40e-05 | 8441.31 ms | -100.0% bf16 MFU | 62094 tok/s +step 15254/19560 | loss 3.361660 (+1.53z)| norm 0.2675 (-0.03z)| lr 7.40e-05 | 8441.99 ms | -100.0% bf16 MFU | 62094 tok/s +step 15255/19560 | loss 3.324057 (+0.54z)| norm 0.2524 (-1.30z)| lr 7.39e-05 | 8439.64 ms | -100.0% bf16 MFU | 62096 tok/s +step 15256/19560 | loss 3.298218 (-0.15z)| norm 0.2611 (-0.58z)| lr 7.39e-05 | 8441.04 ms | -100.0% bf16 MFU | 62096 tok/s +step 15257/19560 | loss 3.299462 (-0.12z)| norm 0.2592 (-0.73z)| lr 7.39e-05 | 8441.58 ms | -100.0% bf16 MFU | 62097 tok/s +step 15258/19560 | loss 3.271600 (-0.84z)| norm 0.2733 (+0.48z)| lr 7.38e-05 | 8441.06 ms | -100.0% bf16 MFU | 62098 tok/s +step 15259/19560 | loss 3.301887 (-0.04z)| norm 0.2551 (-1.06z)| lr 7.38e-05 | 8443.24 ms | -100.0% bf16 MFU | 62098 tok/s +step 15260/19560 | loss 3.315043 (+0.30z)| norm 0.2577 (-0.82z)| lr 7.38e-05 | 8441.21 ms | -100.0% bf16 MFU | 62098 tok/s +step 15261/19560 | loss 3.346428 (+1.12z)| norm 0.2621 (-0.44z)| lr 7.37e-05 | 8447.19 ms | -100.0% bf16 MFU | 62097 tok/s +step 15262/19560 | loss 3.333457 (+0.77z)| norm 0.2488 (-1.55z)| lr 7.37e-05 | 8441.79 ms | -100.0% bf16 MFU | 62097 tok/s +step 15263/19560 | loss 3.297301 (-0.18z)| norm 0.2578 (-0.78z)| lr 7.37e-05 | 8441.68 ms | -100.0% bf16 MFU | 62098 tok/s +step 15264/19560 | loss 3.333036 (+0.77z)| norm 0.2466 (-1.71z)| lr 7.36e-05 | 8442.25 ms | -100.0% bf16 MFU | 62098 tok/s +step 15265/19560 | loss 3.295587 (-0.23z)| norm 0.2666 (-0.02z)| lr 7.36e-05 | 8450.54 ms | -100.0% bf16 MFU | 62095 tok/s +step 15266/19560 | loss 3.281817 (-0.59z)| norm 0.2790 (+1.02z)| lr 7.36e-05 | 8439.12 ms | -100.0% bf16 MFU | 62097 tok/s +step 15267/19560 | loss 3.282999 (-0.56z)| norm 0.2509 (-1.33z)| lr 7.35e-05 | 8442.02 ms | -100.0% bf16 MFU | 62097 tok/s +step 15268/19560 | loss 3.298646 (-0.15z)| norm 0.2654 (-0.12z)| lr 7.35e-05 | 8444.53 ms | -100.0% bf16 MFU | 62096 tok/s +step 15269/19560 | loss 3.374814 (+1.91z)| norm 0.2787 (+0.99z)| lr 7.35e-05 | 8441.02 ms | -100.0% bf16 MFU | 62097 tok/s +step 15270/19560 | loss 3.284345 (-0.57z)| norm 0.2492 (-1.46z)| lr 7.34e-05 | 8442.75 ms | -100.0% bf16 MFU | 62097 tok/s +step 15271/19560 | loss 3.324078 (+0.52z)| norm 0.2643 (-0.21z)| lr 7.34e-05 | 8440.09 ms | -100.0% bf16 MFU | 62098 tok/s +step 15272/19560 | loss 3.268801 (-0.99z)| norm 0.2528 (-1.15z)| lr 7.34e-05 | 8443.12 ms | -100.0% bf16 MFU | 62098 tok/s +step 15273/19560 | loss 3.321162 (+0.44z)| norm 0.2661 (-0.04z)| lr 7.33e-05 | 8445.41 ms | -100.0% bf16 MFU | 62097 tok/s +step 15274/19560 | loss 3.320346 (+0.40z)| norm 0.2493 (-1.42z)| lr 7.33e-05 | 8441.78 ms | -100.0% bf16 MFU | 62098 tok/s +step 15275/19560 | loss 3.382363 (+2.06z)| norm 0.2777 (+0.91z)| lr 7.33e-05 | 8443.24 ms | -100.0% bf16 MFU | 62098 tok/s +step 15276/19560 | loss 3.464213 (+3.99z)| norm 0.2913 (+1.98z)| lr 7.32e-05 | 8443.86 ms | -100.0% bf16 MFU | 62097 tok/s +step 15277/19560 | loss 3.289967 (-0.46z)| norm 0.2627 (-0.34z)| lr 7.32e-05 | 8438.43 ms | -100.0% bf16 MFU | 62099 tok/s +step 15278/19560 | loss 3.290065 (-0.46z)| norm 0.2884 (+1.72z)| lr 7.32e-05 | 8441.34 ms | -100.0% bf16 MFU | 62100 tok/s +step 15279/19560 | loss 3.363127 (+1.39z)| norm 0.2720 (+0.41z)| lr 7.31e-05 | 8445.93 ms | -100.0% bf16 MFU | 62098 tok/s +step 15280/19560 | loss 3.314772 (+0.14z)| norm 0.2649 (-0.17z)| lr 7.31e-05 | 8440.25 ms | -100.0% bf16 MFU | 62099 tok/s +step 15281/19560 | loss 3.358513 (+1.25z)| norm 0.2708 (+0.30z)| lr 7.31e-05 | 8439.85 ms | -100.0% bf16 MFU | 62100 tok/s +step 15282/19560 | loss 3.341685 (+0.82z)| norm 0.2526 (-1.17z)| lr 7.30e-05 | 8446.98 ms | -100.0% bf16 MFU | 62099 tok/s +step 15283/19560 | loss 3.339803 (+0.76z)| norm 0.2816 (+1.23z)| lr 7.30e-05 | 8438.39 ms | -100.0% bf16 MFU | 62100 tok/s +step 15284/19560 | loss 3.323306 (+0.33z)| norm 0.2568 (-0.84z)| lr 7.30e-05 | 8441.34 ms | -100.0% bf16 MFU | 62101 tok/s +step 15285/19560 | loss 3.293755 (-0.43z)| norm 0.2776 (+0.91z)| lr 7.29e-05 | 8440.62 ms | -100.0% bf16 MFU | 62102 tok/s +step 15286/19560 | loss 3.331950 (+0.55z)| norm 0.2822 (+1.31z)| lr 7.29e-05 | 8440.45 ms | -100.0% bf16 MFU | 62102 tok/s +step 15287/19560 | loss 3.259238 (-1.31z)| norm 0.2606 (-0.52z)| lr 7.29e-05 | 8440.14 ms | -100.0% bf16 MFU | 62103 tok/s +step 15288/19560 | loss 3.267837 (-1.08z)| norm 0.2647 (-0.18z)| lr 7.28e-05 | 8441.42 ms | -100.0% bf16 MFU | 62103 tok/s +step 15289/19560 | loss 3.225698 (-2.12z)| norm 0.2863 (+1.65z)| lr 7.28e-05 | 8440.44 ms | -100.0% bf16 MFU | 62104 tok/s +step 15290/19560 | loss 3.299460 (-0.28z)| norm 0.2696 (+0.23z)| lr 7.28e-05 | 8436.74 ms | -100.0% bf16 MFU | 62106 tok/s +step 15291/19560 | loss 3.420445 (+2.69z)| norm 0.3056 (+3.22z)| lr 7.27e-05 | 8434.71 ms | -100.0% bf16 MFU | 62109 tok/s +step 15292/19560 | loss 3.247164 (-1.58z)| norm 0.2933 (+2.15z)| lr 7.27e-05 | 8434.67 ms | -100.0% bf16 MFU | 62111 tok/s +step 15293/19560 | loss 3.313450 (+0.04z)| norm 0.2860 (+1.53z)| lr 7.27e-05 | 8431.60 ms | -100.0% bf16 MFU | 62115 tok/s +step 15294/19560 | loss 3.370132 (+1.42z)| norm 0.2776 (+0.83z)| lr 7.26e-05 | 8433.10 ms | -100.0% bf16 MFU | 62117 tok/s +step 15295/19560 | loss 3.318586 (+0.14z)| norm 0.2778 (+0.84z)| lr 7.26e-05 | 8431.71 ms | -100.0% bf16 MFU | 62121 tok/s +step 15296/19560 | loss 3.247818 (-1.59z)| norm 0.2598 (-0.61z)| lr 7.26e-05 | 8429.43 ms | -100.0% bf16 MFU | 62124 tok/s +step 15297/19560 | loss 3.324527 (+0.29z)| norm 0.2597 (-0.62z)| lr 7.25e-05 | 8431.99 ms | -100.0% bf16 MFU | 62127 tok/s +step 15298/19560 | loss 3.249109 (-1.53z)| norm 0.2701 (+0.24z)| lr 7.25e-05 | 8433.26 ms | -100.0% bf16 MFU | 62129 tok/s +step 15299/19560 | loss 3.359651 (+1.14z)| norm 0.2998 (+2.63z)| lr 7.25e-05 | 8432.21 ms | -100.0% bf16 MFU | 62132 tok/s +step 15300/19560 | loss 3.254717 (-1.38z)| norm 0.2614 (-0.47z)| lr 7.24e-05 | 8430.87 ms | -100.0% bf16 MFU | 62134 tok/s +step 15301/19560 | loss 3.380932 (+1.64z)| norm 0.2600 (-0.58z)| lr 7.24e-05 | 8433.77 ms | -100.0% bf16 MFU | 62136 tok/s +step 15302/19560 | loss 3.259059 (-1.28z)| norm 0.3007 (+2.70z)| lr 7.24e-05 | 8432.95 ms | -100.0% bf16 MFU | 62138 tok/s +step 15303/19560 | loss 3.351190 (+0.91z)| norm 0.2515 (-1.25z)| lr 7.23e-05 | 8433.09 ms | -100.0% bf16 MFU | 62139 tok/s +step 15304/19560 | loss 3.320926 (+0.18z)| norm 0.2659 (-0.09z)| lr 7.23e-05 | 8435.61 ms | -100.0% bf16 MFU | 62140 tok/s +step 15305/19560 | loss 3.267432 (-1.09z)| norm 0.2760 (+0.71z)| lr 7.23e-05 | 8435.62 ms | -100.0% bf16 MFU | 62140 tok/s +step 15306/19560 | loss 3.281026 (-0.76z)| norm 0.2846 (+1.37z)| lr 7.23e-05 | 8432.20 ms | -100.0% bf16 MFU | 62142 tok/s +step 15307/19560 | loss 3.268116 (-1.06z)| norm 0.2605 (-0.56z)| lr 7.22e-05 | 8434.79 ms | -100.0% bf16 MFU | 62143 tok/s +step 15308/19560 | loss 3.391932 (+1.86z)| norm 0.2768 (+0.74z)| lr 7.22e-05 | 8431.73 ms | -100.0% bf16 MFU | 62145 tok/s +step 15309/19560 | loss 3.251565 (-1.43z)| norm 0.2517 (-1.28z)| lr 7.22e-05 | 8433.29 ms | -100.0% bf16 MFU | 62146 tok/s +step 15310/19560 | loss 3.288629 (-0.56z)| norm 0.2646 (-0.25z)| lr 7.21e-05 | 8435.24 ms | -100.0% bf16 MFU | 62147 tok/s +step 15311/19560 | loss 3.343116 (+0.70z)| norm 0.2811 (+1.07z)| lr 7.21e-05 | 8432.80 ms | -100.0% bf16 MFU | 62148 tok/s +step 15312/19560 | loss 3.312325 (-0.04z)| norm 0.2819 (+1.12z)| lr 7.21e-05 | 8436.85 ms | -100.0% bf16 MFU | 62148 tok/s +step 15313/19560 | loss 3.584488 (+5.59z)| norm 0.2879 (+1.60z)| lr 7.20e-05 | 8436.60 ms | -100.0% bf16 MFU | 62147 tok/s +step 15314/19560 | loss 3.309611 (-0.14z)| norm 0.2689 (+0.06z)| lr 7.20e-05 | 8435.67 ms | -100.0% bf16 MFU | 62148 tok/s +step 15315/19560 | loss 3.327503 (+0.23z)| norm 0.2774 (+0.74z)| lr 7.20e-05 | 8438.85 ms | -100.0% bf16 MFU | 62147 tok/s +step 15316/19560 | loss 3.244957 (-1.48z)| norm 0.2568 (-0.91z)| lr 7.19e-05 | 8436.38 ms | -100.0% bf16 MFU | 62147 tok/s +step 15317/19560 | loss 3.378456 (+1.28z)| norm 0.2991 (+2.42z)| lr 7.19e-05 | 8435.83 ms | -100.0% bf16 MFU | 62147 tok/s +step 15318/19560 | loss 3.315925 (-0.01z)| norm 0.2769 (+0.68z)| lr 7.19e-05 | 8438.63 ms | -100.0% bf16 MFU | 62146 tok/s +step 15319/19560 | loss 3.330579 (+0.29z)| norm 0.2600 (-0.67z)| lr 7.18e-05 | 8437.41 ms | -100.0% bf16 MFU | 62146 tok/s +step 15320/19560 | loss 3.384132 (+1.38z)| norm 0.2847 (+1.28z)| lr 7.18e-05 | 8438.37 ms | -100.0% bf16 MFU | 62145 tok/s +step 15321/19560 | loss 3.292233 (-0.50z)| norm 0.2699 (+0.10z)| lr 7.18e-05 | 8438.46 ms | -100.0% bf16 MFU | 62144 tok/s +step 15322/19560 | loss 3.337439 (+0.42z)| norm 0.2652 (-0.28z)| lr 7.17e-05 | 8434.80 ms | -100.0% bf16 MFU | 62145 tok/s +step 15323/19560 | loss 3.247272 (-1.42z)| norm 0.2837 (+1.19z)| lr 7.17e-05 | 8437.47 ms | -100.0% bf16 MFU | 62144 tok/s +step 15324/19560 | loss 3.301903 (-0.31z)| norm 0.2756 (+0.54z)| lr 7.17e-05 | 8438.40 ms | -100.0% bf16 MFU | 62144 tok/s +step 15325/19560 | loss 3.420743 (+2.08z)| norm 0.2876 (+1.47z)| lr 7.16e-05 | 8441.00 ms | -100.0% bf16 MFU | 62142 tok/s +step 15326/19560 | loss 3.305317 (-0.26z)| norm 0.2769 (+0.62z)| lr 7.16e-05 | 8435.91 ms | -100.0% bf16 MFU | 62143 tok/s +step 15327/19560 | loss 3.250717 (-1.35z)| norm 0.2607 (-0.67z)| lr 7.16e-05 | 8435.65 ms | -100.0% bf16 MFU | 62143 tok/s +step 15328/19560 | loss 3.300236 (-0.34z)| norm 0.2488 (-1.62z)| lr 7.15e-05 | 8435.28 ms | -100.0% bf16 MFU | 62144 tok/s +step 15329/19560 | loss 3.361430 (+0.88z)| norm 0.2777 (+0.67z)| lr 7.15e-05 | 8434.62 ms | -100.0% bf16 MFU | 62144 tok/s +step 15330/19560 | loss 3.288979 (-0.58z)| norm 0.2595 (-0.77z)| lr 7.15e-05 | 8437.01 ms | -100.0% bf16 MFU | 62144 tok/s +step 15331/19560 | loss 3.298681 (-0.38z)| norm 0.2615 (-0.60z)| lr 7.14e-05 | 8435.46 ms | -100.0% bf16 MFU | 62145 tok/s +step 15332/19560 | loss 3.317147 (-0.01z)| norm 0.2570 (-0.96z)| lr 7.14e-05 | 8434.83 ms | -100.0% bf16 MFU | 62145 tok/s +step 15333/19560 | loss 3.333955 (+0.32z)| norm 0.2631 (-0.48z)| lr 7.14e-05 | 8436.07 ms | -100.0% bf16 MFU | 62145 tok/s +step 15334/19560 | loss 3.391078 (+1.45z)| norm 0.2811 (+0.94z)| lr 7.13e-05 | 8434.47 ms | -100.0% bf16 MFU | 62146 tok/s +step 15335/19560 | loss 3.349886 (+0.62z)| norm 0.2610 (-0.65z)| lr 7.13e-05 | 8437.06 ms | -100.0% bf16 MFU | 62146 tok/s +step 15336/19560 | loss 3.298565 (-0.41z)| norm 0.2628 (-0.50z)| lr 7.13e-05 | 8435.05 ms | -100.0% bf16 MFU | 62146 tok/s +step 15337/19560 | loss 3.276968 (-0.84z)| norm 0.2757 (+0.51z)| lr 7.12e-05 | 8435.40 ms | -100.0% bf16 MFU | 62147 tok/s +step 15338/19560 | loss 3.338471 (+0.40z)| norm 0.2597 (-0.75z)| lr 7.12e-05 | 8434.33 ms | -100.0% bf16 MFU | 62147 tok/s +step 15339/19560 | loss 3.347877 (+0.58z)| norm 0.2643 (-0.37z)| lr 7.12e-05 | 8435.67 ms | -100.0% bf16 MFU | 62148 tok/s +step 15340/19560 | loss 3.308549 (-0.21z)| norm 0.2587 (-0.80z)| lr 7.11e-05 | 8438.05 ms | -100.0% bf16 MFU | 62147 tok/s +step 15341/19560 | loss 3.330104 (+0.22z)| norm 0.2719 (+0.27z)| lr 7.11e-05 | 8437.41 ms | -100.0% bf16 MFU | 62147 tok/s +step 15342/19560 | loss 3.336350 (+0.33z)| norm 0.2534 (-1.23z)| lr 7.11e-05 | 8438.25 ms | -100.0% bf16 MFU | 62146 tok/s +step 15343/19560 | loss 3.254971 (-1.35z)| norm 0.2666 (-0.16z)| lr 7.11e-05 | 8439.86 ms | -100.0% bf16 MFU | 62145 tok/s +step 15344/19560 | loss 3.368409 (+0.99z)| norm 0.2707 (+0.17z)| lr 7.10e-05 | 8438.64 ms | -100.0% bf16 MFU | 62144 tok/s +step 15345/19560 | loss 3.318780 (-0.02z)| norm 0.2655 (-0.24z)| lr 7.10e-05 | 8438.82 ms | -100.0% bf16 MFU | 62143 tok/s +step 15346/19560 | loss 3.359619 (+0.84z)| norm 0.2795 (+0.88z)| lr 7.10e-05 | 8437.33 ms | -100.0% bf16 MFU | 62143 tok/s +step 15347/19560 | loss 3.305303 (-0.30z)| norm 0.2581 (-0.86z)| lr 7.09e-05 | 8436.66 ms | -100.0% bf16 MFU | 62143 tok/s +step 15348/19560 | loss 3.275758 (-0.91z)| norm 0.2671 (-0.12z)| lr 7.09e-05 | 8438.08 ms | -100.0% bf16 MFU | 62142 tok/s +step 15349/19560 | loss 3.365676 (+0.96z)| norm 0.2635 (-0.41z)| lr 7.09e-05 | 8436.92 ms | -100.0% bf16 MFU | 62142 tok/s +step 15350/19560 | loss 3.391086 (+1.49z)| norm 0.2687 (+0.00z)| lr 7.08e-05 | 8437.15 ms | -100.0% bf16 MFU | 62142 tok/s +step 15351/19560 | loss 3.358432 (+0.80z)| norm 0.2792 (+0.84z)| lr 7.08e-05 | 8438.93 ms | -100.0% bf16 MFU | 62142 tok/s +step 15352/19560 | loss 3.278801 (-0.85z)| norm 0.2601 (-0.71z)| lr 7.08e-05 | 8437.47 ms | -100.0% bf16 MFU | 62141 tok/s +step 15353/19560 | loss 3.238597 (-1.66z)| norm 0.2714 (+0.21z)| lr 7.07e-05 | 8440.53 ms | -100.0% bf16 MFU | 62140 tok/s +step 15354/19560 | loss 3.287524 (-0.64z)| norm 0.2867 (+1.43z)| lr 7.07e-05 | 8441.20 ms | -100.0% bf16 MFU | 62139 tok/s +step 15355/19560 | loss 3.301103 (-0.36z)| norm 0.3057 (+2.86z)| lr 7.07e-05 | 8438.42 ms | -100.0% bf16 MFU | 62138 tok/s +step 15356/19560 | loss 3.366396 (+0.97z)| norm 0.2476 (-1.67z)| lr 7.06e-05 | 8438.91 ms | -100.0% bf16 MFU | 62138 tok/s +step 15357/19560 | loss 3.306062 (-0.27z)| norm 0.2699 (+0.08z)| lr 7.06e-05 | 8441.04 ms | -100.0% bf16 MFU | 62136 tok/s +step 15358/19560 | loss 3.233922 (-1.74z)| norm 0.2675 (-0.11z)| lr 7.06e-05 | 8438.68 ms | -100.0% bf16 MFU | 62136 tok/s +step 15359/19560 | loss 3.320285 (+0.03z)| norm 0.2558 (-1.02z)| lr 7.05e-05 | 8436.29 ms | -100.0% bf16 MFU | 62137 tok/s +step 15360/19560 | loss 3.352437 (+0.68z)| norm 0.2707 (+0.14z)| lr 7.05e-05 | 8438.98 ms | -100.0% bf16 MFU | 62136 tok/s +step 15361/19560 | loss 3.308692 (-0.21z)| norm 0.2757 (+0.54z)| lr 7.05e-05 | 8451.93 ms | -100.0% bf16 MFU | 62131 tok/s +step 15362/19560 | loss 3.282715 (-0.73z)| norm 0.2608 (-0.63z)| lr 7.04e-05 | 8460.84 ms | -100.0% bf16 MFU | 62123 tok/s +step 15363/19560 | loss 3.270382 (-0.97z)| norm 0.2612 (-0.60z)| lr 7.04e-05 | 8466.26 ms | -100.0% bf16 MFU | 62113 tok/s +step 15364/19560 | loss 3.245573 (-1.46z)| norm 0.2588 (-0.79z)| lr 7.04e-05 | 8461.63 ms | -100.0% bf16 MFU | 62105 tok/s +step 15365/19560 | loss 3.256675 (-1.21z)| norm 0.2562 (-0.99z)| lr 7.03e-05 | 8465.44 ms | -100.0% bf16 MFU | 62097 tok/s +step 15366/19560 | loss 3.288815 (-0.56z)| norm 0.2804 (+0.90z)| lr 7.03e-05 | 8461.28 ms | -100.0% bf16 MFU | 62090 tok/s +step 15367/19560 | loss 3.340443 (+0.48z)| norm 0.2601 (-0.69z)| lr 7.03e-05 | 8462.78 ms | -100.0% bf16 MFU | 62083 tok/s +step 15368/19560 | loss 3.384246 (+1.34z)| norm 0.2690 (-0.00z)| lr 7.02e-05 | 8465.23 ms | -100.0% bf16 MFU | 62076 tok/s +step 15369/19560 | loss 3.318655 (+0.02z)| norm 0.2644 (-0.36z)| lr 7.02e-05 | 8459.76 ms | -100.0% bf16 MFU | 62071 tok/s +step 15370/19560 | loss 3.267898 (-0.98z)| norm 0.2551 (-1.08z)| lr 7.02e-05 | 8461.56 ms | -100.0% bf16 MFU | 62065 tok/s +step 15371/19560 | loss 3.349274 (+0.63z)| norm 0.2909 (+1.69z)| lr 7.02e-05 | 8460.07 ms | -100.0% bf16 MFU | 62060 tok/s +step 15372/19560 | loss 3.311750 (-0.10z)| norm 0.2551 (-1.06z)| lr 7.01e-05 | 8459.46 ms | -100.0% bf16 MFU | 62056 tok/s +step 15373/19560 | loss 3.276804 (-0.81z)| norm 0.2645 (-0.34z)| lr 7.01e-05 | 8460.66 ms | -100.0% bf16 MFU | 62052 tok/s +step 15374/19560 | loss 3.378987 (+1.23z)| norm 0.2844 (+1.20z)| lr 7.01e-05 | 8456.08 ms | -100.0% bf16 MFU | 62049 tok/s +step 15375/19560 | loss 3.333746 (+0.32z)| norm 0.2563 (-0.96z)| lr 7.00e-05 | 8455.52 ms | -100.0% bf16 MFU | 62047 tok/s +step 15376/19560 | loss 3.301387 (-0.33z)| norm 0.2684 (-0.03z)| lr 7.00e-05 | 8456.65 ms | -100.0% bf16 MFU | 62045 tok/s +step 15377/19560 | loss 3.319867 (+0.04z)| norm 0.2780 (+0.72z)| lr 7.00e-05 | 8450.39 ms | -100.0% bf16 MFU | 62045 tok/s +step 15378/19560 | loss 3.312624 (-0.10z)| norm 0.2677 (-0.08z)| lr 6.99e-05 | 8459.79 ms | -100.0% bf16 MFU | 62041 tok/s +step 15379/19560 | loss 3.251409 (-1.32z)| norm 0.2731 (+0.33z)| lr 6.99e-05 | 8457.47 ms | -100.0% bf16 MFU | 62039 tok/s +step 15380/19560 | loss 3.326587 (+0.20z)| norm 0.2613 (-0.59z)| lr 6.99e-05 | 8460.82 ms | -100.0% bf16 MFU | 62035 tok/s +step 15381/19560 | loss 3.272275 (-0.88z)| norm 0.3040 (+2.65z)| lr 6.98e-05 | 8456.67 ms | -100.0% bf16 MFU | 62033 tok/s +step 15382/19560 | loss 3.291750 (-0.48z)| norm 0.2617 (-0.56z)| lr 6.98e-05 | 8458.41 ms | -100.0% bf16 MFU | 62031 tok/s +step 15383/19560 | loss 3.273122 (-0.85z)| norm 0.2516 (-1.33z)| lr 6.98e-05 | 8457.50 ms | -100.0% bf16 MFU | 62029 tok/s +step 15384/19560 | loss 3.293914 (-0.43z)| norm 0.2749 (+0.43z)| lr 6.97e-05 | 8459.53 ms | -100.0% bf16 MFU | 62026 tok/s +step 15385/19560 | loss 3.264066 (-1.02z)| norm 0.2546 (-1.11z)| lr 6.97e-05 | 8452.62 ms | -100.0% bf16 MFU | 62026 tok/s +step 15386/19560 | loss 3.264303 (-1.01z)| norm 0.2540 (-1.14z)| lr 6.97e-05 | 8454.74 ms | -100.0% bf16 MFU | 62025 tok/s +step 15387/19560 | loss 3.268529 (-0.92z)| norm 0.2489 (-1.51z)| lr 6.96e-05 | 8453.82 ms | -100.0% bf16 MFU | 62025 tok/s +step 15388/19560 | loss 3.298043 (-0.33z)| norm 0.2460 (-1.71z)| lr 6.96e-05 | 8453.23 ms | -100.0% bf16 MFU | 62025 tok/s +step 15389/19560 | loss 3.320961 (+0.13z)| norm 0.2579 (-0.82z)| lr 6.96e-05 | 8447.86 ms | -100.0% bf16 MFU | 62027 tok/s +step 15390/19560 | loss 3.283232 (-0.61z)| norm 0.2477 (-1.58z)| lr 6.95e-05 | 8454.04 ms | -100.0% bf16 MFU | 62026 tok/s +step 15391/19560 | loss 3.260469 (-1.06z)| norm 0.2677 (-0.10z)| lr 6.95e-05 | 8456.88 ms | -100.0% bf16 MFU | 62025 tok/s +step 15392/19560 | loss 3.281527 (-0.63z)| norm 0.2422 (-1.98z)| lr 6.95e-05 | 8453.29 ms | -100.0% bf16 MFU | 62024 tok/s +step 15393/19560 | loss 3.289803 (-0.47z)| norm 0.2493 (-1.44z)| lr 6.94e-05 | 8452.30 ms | -100.0% bf16 MFU | 62025 tok/s +step 15394/19560 | loss 3.275449 (-0.75z)| norm 0.2669 (-0.13z)| lr 6.94e-05 | 8452.67 ms | -100.0% bf16 MFU | 62025 tok/s +step 15395/19560 | loss 3.280063 (-0.66z)| norm 0.2503 (-1.35z)| lr 6.94e-05 | 8456.79 ms | -100.0% bf16 MFU | 62023 tok/s +step 15396/19560 | loss 3.311631 (-0.03z)| norm 0.2597 (-0.66z)| lr 6.94e-05 | 8455.07 ms | -100.0% bf16 MFU | 62023 tok/s +step 15397/19560 | loss 3.305257 (-0.15z)| norm 0.2608 (-0.57z)| lr 6.93e-05 | 8451.30 ms | -100.0% bf16 MFU | 62023 tok/s +step 15398/19560 | loss 3.332556 (+0.39z)| norm 0.2744 (+0.42z)| lr 6.93e-05 | 8451.85 ms | -100.0% bf16 MFU | 62024 tok/s +step 15399/19560 | loss 3.307858 (-0.10z)| norm 0.2744 (+0.42z)| lr 6.93e-05 | 8456.86 ms | -100.0% bf16 MFU | 62022 tok/s +step 15400/19560 | loss 3.292686 (-0.41z)| norm 0.2417 (-1.99z)| lr 6.92e-05 | 8452.00 ms | -100.0% bf16 MFU | 62023 tok/s +step 15401/19560 | loss 3.296813 (-0.32z)| norm 0.2518 (-1.23z)| lr 6.92e-05 | 8450.65 ms | -100.0% bf16 MFU | 62024 tok/s +step 15402/19560 | loss 3.326146 (+0.26z)| norm 0.2615 (-0.53z)| lr 6.92e-05 | 8451.46 ms | -100.0% bf16 MFU | 62024 tok/s +step 15403/19560 | loss 3.337666 (+0.50z)| norm 0.2666 (-0.15z)| lr 6.91e-05 | 8449.98 ms | -100.0% bf16 MFU | 62025 tok/s +step 15404/19560 | loss 3.321419 (+0.20z)| norm 0.2574 (-0.81z)| lr 6.91e-05 | 8449.78 ms | -100.0% bf16 MFU | 62026 tok/s +step 15405/19560 | loss 3.293134 (-0.39z)| norm 0.2765 (+0.60z)| lr 6.91e-05 | 8454.39 ms | -100.0% bf16 MFU | 62026 tok/s +step 15406/19560 | loss 3.341769 (+0.62z)| norm 0.2561 (-0.91z)| lr 6.90e-05 | 8443.74 ms | -100.0% bf16 MFU | 62029 tok/s +step 15407/19560 | loss 3.317518 (+0.12z)| norm 0.2456 (-1.66z)| lr 6.90e-05 | 8442.56 ms | -100.0% bf16 MFU | 62033 tok/s +step 15408/19560 | loss 3.282501 (-0.61z)| norm 0.2683 (+0.02z)| lr 6.90e-05 | 8443.02 ms | -100.0% bf16 MFU | 62036 tok/s +step 15409/19560 | loss 3.207134 (-2.12z)| norm 0.2509 (-1.24z)| lr 6.89e-05 | 8446.47 ms | -100.0% bf16 MFU | 62038 tok/s +step 15410/19560 | loss 3.253469 (-1.15z)| norm 0.2620 (-0.44z)| lr 6.89e-05 | 8441.94 ms | -100.0% bf16 MFU | 62041 tok/s +step 15411/19560 | loss 3.300767 (-0.18z)| norm 0.2518 (-1.17z)| lr 6.89e-05 | 8441.71 ms | -100.0% bf16 MFU | 62044 tok/s +step 15412/19560 | loss 3.326728 (+0.36z)| norm 0.2453 (-1.63z)| lr 6.88e-05 | 8441.92 ms | -100.0% bf16 MFU | 62047 tok/s +step 15413/19560 | loss 3.220774 (-1.79z)| norm 0.2647 (-0.21z)| lr 6.88e-05 | 8441.58 ms | -100.0% bf16 MFU | 62050 tok/s +step 15414/19560 | loss 3.326520 (+0.36z)| norm 0.2524 (-1.09z)| lr 6.88e-05 | 8434.68 ms | -100.0% bf16 MFU | 62056 tok/s +step 15415/19560 | loss 3.277037 (-0.65z)| norm 0.2566 (-0.78z)| lr 6.87e-05 | 8439.50 ms | -100.0% bf16 MFU | 62059 tok/s +step 15416/19560 | loss 3.339268 (+0.61z)| norm 0.2625 (-0.34z)| lr 6.87e-05 | 8441.38 ms | -100.0% bf16 MFU | 62062 tok/s +step 15417/19560 | loss 3.298720 (-0.23z)| norm 0.2669 (-0.01z)| lr 6.87e-05 | 8439.94 ms | -100.0% bf16 MFU | 62065 tok/s +step 15418/19560 | loss 3.333828 (+0.49z)| norm 0.2741 (+0.52z)| lr 6.86e-05 | 8440.36 ms | -100.0% bf16 MFU | 62067 tok/s +step 15419/19560 | loss 3.280406 (-0.60z)| norm 0.2480 (-1.41z)| lr 6.86e-05 | 8438.29 ms | -100.0% bf16 MFU | 62070 tok/s +step 15420/19560 | loss 3.254771 (-1.14z)| norm 0.2815 (+1.14z)| lr 6.86e-05 | 8443.23 ms | -100.0% bf16 MFU | 62072 tok/s +step 15421/19560 | loss 3.286062 (-0.48z)| norm 0.2653 (-0.09z)| lr 6.86e-05 | 8445.46 ms | -100.0% bf16 MFU | 62072 tok/s +step 15422/19560 | loss 3.291279 (-0.36z)| norm 0.2508 (-1.18z)| lr 6.85e-05 | 8438.07 ms | -100.0% bf16 MFU | 62075 tok/s +step 15423/19560 | loss 3.273532 (-0.73z)| norm 0.3021 (+2.67z)| lr 6.85e-05 | 8448.79 ms | -100.0% bf16 MFU | 62074 tok/s +step 15424/19560 | loss 3.282357 (-0.55z)| norm 0.2535 (-0.95z)| lr 6.85e-05 | 8442.45 ms | -100.0% bf16 MFU | 62076 tok/s +step 15425/19560 | loss 3.252447 (-1.17z)| norm 0.2826 (+1.20z)| lr 6.84e-05 | 8446.61 ms | -100.0% bf16 MFU | 62075 tok/s +step 15426/19560 | loss 3.279738 (-0.60z)| norm 0.2612 (-0.39z)| lr 6.84e-05 | 8440.65 ms | -100.0% bf16 MFU | 62077 tok/s +step 15427/19560 | loss 3.250069 (-1.21z)| norm 0.2663 (+0.01z)| lr 6.84e-05 | 8437.55 ms | -100.0% bf16 MFU | 62080 tok/s +step 15428/19560 | loss 3.303932 (-0.08z)| norm 0.2536 (-0.95z)| lr 6.83e-05 | 8441.18 ms | -100.0% bf16 MFU | 62082 tok/s +step 15429/19560 | loss 3.316626 (+0.21z)| norm 0.2635 (-0.20z)| lr 6.83e-05 | 8441.95 ms | -100.0% bf16 MFU | 62083 tok/s +step 15430/19560 | loss 3.250197 (-1.22z)| norm 0.2479 (-1.38z)| lr 6.83e-05 | 8445.98 ms | -100.0% bf16 MFU | 62083 tok/s +step 15431/19560 | loss 3.290571 (-0.34z)| norm 0.2748 (+0.69z)| lr 6.82e-05 | 8446.94 ms | -100.0% bf16 MFU | 62082 tok/s +step 15432/19560 | loss 3.352683 (+0.99z)| norm 0.2666 (+0.05z)| lr 6.82e-05 | 8436.57 ms | -100.0% bf16 MFU | 62085 tok/s +step 15433/19560 | loss 3.310374 (+0.07z)| norm 0.2732 (+0.57z)| lr 6.82e-05 | 8444.86 ms | -100.0% bf16 MFU | 62085 tok/s +step 15434/19560 | loss 3.294240 (-0.28z)| norm 0.2778 (+0.93z)| lr 6.81e-05 | 8438.47 ms | -100.0% bf16 MFU | 62087 tok/s +step 15435/19560 | loss 3.294723 (-0.27z)| norm 0.2775 (+0.90z)| lr 6.81e-05 | 8443.24 ms | -100.0% bf16 MFU | 62088 tok/s +step 15436/19560 | loss 3.327455 (+0.45z)| norm 0.2660 (+0.01z)| lr 6.81e-05 | 8443.36 ms | -100.0% bf16 MFU | 62088 tok/s +step 15437/19560 | loss 3.303521 (-0.08z)| norm 0.2826 (+1.28z)| lr 6.80e-05 | 8447.30 ms | -100.0% bf16 MFU | 62087 tok/s +step 15438/19560 | loss 3.271068 (-0.79z)| norm 0.2448 (-1.63z)| lr 6.80e-05 | 8441.65 ms | -100.0% bf16 MFU | 62088 tok/s +step 15439/19560 | loss 3.334503 (+0.60z)| norm 0.2587 (-0.55z)| lr 6.80e-05 | 8445.57 ms | -100.0% bf16 MFU | 62087 tok/s +step 15440/19560 | loss 3.304406 (-0.06z)| norm 0.2573 (-0.65z)| lr 6.80e-05 | 8440.79 ms | -100.0% bf16 MFU | 62089 tok/s +step 15441/19560 | loss 3.262456 (-1.09z)| norm 0.2647 (-0.06z)| lr 6.79e-05 | 8443.42 ms | -100.0% bf16 MFU | 62089 tok/s +step 15442/19560 | loss 3.317750 (+0.34z)| norm 0.2580 (-0.58z)| lr 6.79e-05 | 8440.38 ms | -100.0% bf16 MFU | 62090 tok/s +step 15443/19560 | loss 3.374817 (+1.80z)| norm 0.2818 (+1.29z)| lr 6.79e-05 | 8441.83 ms | -100.0% bf16 MFU | 62091 tok/s +step 15444/19560 | loss 3.348079 (+1.09z)| norm 0.2616 (-0.30z)| lr 6.78e-05 | 8450.85 ms | -100.0% bf16 MFU | 62089 tok/s +step 15445/19560 | loss 3.350107 (+1.16z)| norm 0.2569 (-0.67z)| lr 6.78e-05 | 8445.51 ms | -100.0% bf16 MFU | 62088 tok/s +step 15446/19560 | loss 3.264038 (-1.06z)| norm 0.2601 (-0.39z)| lr 6.78e-05 | 8440.41 ms | -100.0% bf16 MFU | 62090 tok/s +step 15447/19560 | loss 3.330719 (+0.66z)| norm 0.2759 (+0.87z)| lr 6.77e-05 | 8443.49 ms | -100.0% bf16 MFU | 62090 tok/s +step 15448/19560 | loss 3.271199 (-0.87z)| norm 0.2600 (-0.40z)| lr 6.77e-05 | 8444.69 ms | -100.0% bf16 MFU | 62090 tok/s +step 15449/19560 | loss 3.304133 (-0.00z)| norm 0.2779 (+1.05z)| lr 6.77e-05 | 8440.48 ms | -100.0% bf16 MFU | 62091 tok/s +step 15450/19560 | loss 3.271156 (-0.86z)| norm 0.2562 (-0.71z)| lr 6.76e-05 | 8438.13 ms | -100.0% bf16 MFU | 62093 tok/s +step 15451/19560 | loss 3.412336 (+2.76z)| norm 0.2678 (+0.25z)| lr 6.76e-05 | 8439.46 ms | -100.0% bf16 MFU | 62094 tok/s +step 15452/19560 | loss 3.274257 (-0.78z)| norm 0.2855 (+1.68z)| lr 6.76e-05 | 8446.46 ms | -100.0% bf16 MFU | 62093 tok/s +step 15453/19560 | loss 3.353291 (+1.29z)| norm 0.2730 (+0.68z)| lr 6.75e-05 | 8442.45 ms | -100.0% bf16 MFU | 62094 tok/s +step 15454/19560 | loss 3.325327 (+0.55z)| norm 0.2690 (+0.36z)| lr 6.75e-05 | 8443.04 ms | -100.0% bf16 MFU | 62094 tok/s +step 15455/19560 | loss 3.307992 (+0.08z)| norm 0.2733 (+0.71z)| lr 6.75e-05 | 8438.85 ms | -100.0% bf16 MFU | 62096 tok/s +step 15456/19560 | loss 3.281188 (-0.63z)| norm 0.2732 (+0.68z)| lr 6.74e-05 | 8443.59 ms | -100.0% bf16 MFU | 62095 tok/s +step 15457/19560 | loss 3.284898 (-0.52z)| norm 0.2620 (-0.24z)| lr 6.74e-05 | 8446.36 ms | -100.0% bf16 MFU | 62094 tok/s +step 15458/19560 | loss 3.302118 (-0.06z)| norm 0.2715 (+0.55z)| lr 6.74e-05 | 8441.56 ms | -100.0% bf16 MFU | 62095 tok/s +step 15459/19560 | loss 3.281543 (-0.60z)| norm 0.2666 (+0.14z)| lr 6.73e-05 | 8444.56 ms | -100.0% bf16 MFU | 62095 tok/s +step 15460/19560 | loss 3.351846 (+1.26z)| norm 0.2558 (-0.76z)| lr 6.73e-05 | 8438.26 ms | -100.0% bf16 MFU | 62096 tok/s +step 15461/19560 | loss 3.262480 (-1.10z)| norm 0.2725 (+0.62z)| lr 6.73e-05 | 8441.10 ms | -100.0% bf16 MFU | 62097 tok/s +step 15462/19560 | loss 3.314922 (+0.32z)| norm 0.2616 (-0.27z)| lr 6.73e-05 | 8440.48 ms | -100.0% bf16 MFU | 62098 tok/s +step 15463/19560 | loss 3.289963 (-0.35z)| norm 0.2633 (-0.13z)| lr 6.72e-05 | 8437.18 ms | -100.0% bf16 MFU | 62100 tok/s +step 15464/19560 | loss 3.327601 (+0.67z)| norm 0.2775 (+1.04z)| lr 6.72e-05 | 8439.46 ms | -100.0% bf16 MFU | 62101 tok/s +step 15465/19560 | loss 3.332079 (+0.78z)| norm 0.2584 (-0.54z)| lr 6.72e-05 | 8447.10 ms | -100.0% bf16 MFU | 62100 tok/s +step 15466/19560 | loss 3.317477 (+0.39z)| norm 0.2667 (+0.15z)| lr 6.71e-05 | 8437.99 ms | -100.0% bf16 MFU | 62101 tok/s +step 15467/19560 | loss 3.288471 (-0.39z)| norm 0.2882 (+1.90z)| lr 6.71e-05 | 8444.35 ms | -100.0% bf16 MFU | 62101 tok/s +step 15468/19560 | loss 3.248631 (-1.46z)| norm 0.2653 (+0.01z)| lr 6.71e-05 | 8441.33 ms | -100.0% bf16 MFU | 62101 tok/s +step 15469/19560 | loss 3.272362 (-0.80z)| norm 0.2584 (-0.55z)| lr 6.70e-05 | 8440.37 ms | -100.0% bf16 MFU | 62102 tok/s +step 15470/19560 | loss 3.305942 (+0.12z)| norm 0.2817 (+1.35z)| lr 6.70e-05 | 8447.18 ms | -100.0% bf16 MFU | 62100 tok/s +step 15471/19560 | loss 3.287250 (-0.40z)| norm 0.2647 (-0.05z)| lr 6.70e-05 | 8438.08 ms | -100.0% bf16 MFU | 62102 tok/s +step 15472/19560 | loss 3.324707 (+0.64z)| norm 0.2992 (+2.68z)| lr 6.69e-05 | 8438.20 ms | -100.0% bf16 MFU | 62103 tok/s +step 15473/19560 | loss 3.286804 (-0.40z)| norm 0.2685 (+0.24z)| lr 6.69e-05 | 8439.71 ms | -100.0% bf16 MFU | 62104 tok/s +step 15474/19560 | loss 3.232455 (-1.88z)| norm 0.2494 (-1.26z)| lr 6.69e-05 | 8444.15 ms | -100.0% bf16 MFU | 62104 tok/s +step 15475/19560 | loss 3.297987 (-0.06z)| norm 0.2586 (-0.53z)| lr 6.68e-05 | 8438.33 ms | -100.0% bf16 MFU | 62105 tok/s +step 15476/19560 | loss 3.261676 (-1.06z)| norm 0.2691 (+0.30z)| lr 6.68e-05 | 8439.75 ms | -100.0% bf16 MFU | 62106 tok/s +step 15477/19560 | loss 3.335787 (+1.00z)| norm 0.2716 (+0.50z)| lr 6.68e-05 | 8441.13 ms | -100.0% bf16 MFU | 62106 tok/s +step 15478/19560 | loss 3.216727 (-2.29z)| norm 0.2654 (+0.00z)| lr 6.68e-05 | 8440.87 ms | -100.0% bf16 MFU | 62106 tok/s +step 15479/19560 | loss 3.277572 (-0.58z)| norm 0.2578 (-0.59z)| lr 6.67e-05 | 8442.90 ms | -100.0% bf16 MFU | 62106 tok/s +step 15480/19560 | loss 3.297008 (-0.03z)| norm 0.2693 (+0.32z)| lr 6.67e-05 | 8443.25 ms | -100.0% bf16 MFU | 62105 tok/s +step 15481/19560 | loss 3.286064 (-0.35z)| norm 0.2530 (-0.96z)| lr 6.67e-05 | 8442.76 ms | -100.0% bf16 MFU | 62105 tok/s +step 15482/19560 | loss 3.409157 (+3.03z)| norm 0.2663 (+0.11z)| lr 6.66e-05 | 8442.04 ms | -100.0% bf16 MFU | 62105 tok/s +step 15483/19560 | loss 3.285131 (-0.39z)| norm 0.2482 (-1.36z)| lr 6.66e-05 | 8443.83 ms | -100.0% bf16 MFU | 62104 tok/s +step 15484/19560 | loss 3.295106 (-0.10z)| norm 0.2446 (-1.65z)| lr 6.66e-05 | 8433.95 ms | -100.0% bf16 MFU | 62107 tok/s +step 15485/19560 | loss 3.216805 (-2.22z)| norm 0.2704 (+0.49z)| lr 6.65e-05 | 8441.86 ms | -100.0% bf16 MFU | 62107 tok/s +step 15486/19560 | loss 3.305255 (+0.19z)| norm 0.2535 (-0.90z)| lr 6.65e-05 | 8439.58 ms | -100.0% bf16 MFU | 62108 tok/s +step 15487/19560 | loss 3.292941 (-0.15z)| norm 0.2537 (-0.88z)| lr 6.65e-05 | 8440.08 ms | -100.0% bf16 MFU | 62109 tok/s +step 15488/19560 | loss 3.245199 (-1.45z)| norm 0.2607 (-0.29z)| lr 6.64e-05 | 8440.75 ms | -100.0% bf16 MFU | 62109 tok/s +step 15489/19560 | loss 3.295011 (-0.07z)| norm 0.2443 (-1.62z)| lr 6.64e-05 | 8439.54 ms | -100.0% bf16 MFU | 62110 tok/s +step 15490/19560 | loss 3.329963 (+0.89z)| norm 0.2631 (-0.07z)| lr 6.64e-05 | 8439.88 ms | -100.0% bf16 MFU | 62110 tok/s +step 15491/19560 | loss 3.313633 (+0.43z)| norm 0.2545 (-0.78z)| lr 6.63e-05 | 8439.44 ms | -100.0% bf16 MFU | 62111 tok/s +step 15492/19560 | loss 3.262201 (-1.00z)| norm 0.2610 (-0.25z)| lr 6.63e-05 | 8437.98 ms | -100.0% bf16 MFU | 62112 tok/s +step 15493/19560 | loss 3.341329 (+1.18z)| norm 0.2695 (+0.45z)| lr 6.63e-05 | 8438.67 ms | -100.0% bf16 MFU | 62113 tok/s +step 15494/19560 | loss 3.317637 (+0.52z)| norm 0.2631 (-0.07z)| lr 6.62e-05 | 8439.42 ms | -100.0% bf16 MFU | 62113 tok/s +step 15495/19560 | loss 3.297734 (-0.03z)| norm 0.2522 (-0.97z)| lr 6.62e-05 | 8438.21 ms | -100.0% bf16 MFU | 62114 tok/s +step 15496/19560 | loss 3.310110 (+0.34z)| norm 0.2633 (-0.05z)| lr 6.62e-05 | 8439.04 ms | -100.0% bf16 MFU | 62115 tok/s +step 15497/19560 | loss 3.254200 (-1.24z)| norm 0.2650 (+0.10z)| lr 6.62e-05 | 8440.95 ms | -100.0% bf16 MFU | 62115 tok/s +step 15498/19560 | loss 3.334619 (+1.04z)| norm 0.2483 (-1.28z)| lr 6.61e-05 | 8442.26 ms | -100.0% bf16 MFU | 62114 tok/s +step 15499/19560 | loss 3.278078 (-0.56z)| norm 0.2622 (-0.12z)| lr 6.61e-05 | 8438.60 ms | -100.0% bf16 MFU | 62115 tok/s +step 15500/19560 | loss 3.317129 (+0.56z)| norm 0.2427 (-1.73z)| lr 6.61e-05 | 8443.00 ms | -100.0% bf16 MFU | 62114 tok/s +val loss 3.288217 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2999/10042 = 0.298646 +step 15501/19560 | loss 3.317425 (+0.56z)| norm 0.2710 (+0.62z)| lr 6.60e-05 | 8440.79 ms | -100.0% bf16 MFU | 62114 tok/s +step 15502/19560 | loss 3.244263 (-1.53z)| norm 0.2427 (-1.71z)| lr 6.60e-05 | 8437.28 ms | -100.0% bf16 MFU | 62115 tok/s +step 15503/19560 | loss 3.251485 (-1.30z)| norm 0.2460 (-1.41z)| lr 6.60e-05 | 8441.65 ms | -100.0% bf16 MFU | 62115 tok/s +step 15504/19560 | loss 3.321975 (+0.74z)| norm 0.2586 (-0.36z)| lr 6.59e-05 | 8444.01 ms | -100.0% bf16 MFU | 62114 tok/s +step 15505/19560 | loss 3.308506 (+0.35z)| norm 0.2572 (-0.47z)| lr 6.59e-05 | 8438.47 ms | -100.0% bf16 MFU | 62115 tok/s +step 15506/19560 | loss 3.289226 (-0.20z)| norm 0.2694 (+0.54z)| lr 6.59e-05 | 8438.41 ms | -100.0% bf16 MFU | 62115 tok/s +step 15507/19560 | loss 3.262105 (-0.99z)| norm 0.2702 (+0.61z)| lr 6.58e-05 | 8439.42 ms | -100.0% bf16 MFU | 62116 tok/s +step 15508/19560 | loss 3.303142 (+0.20z)| norm 0.2523 (-0.87z)| lr 6.58e-05 | 8437.35 ms | -100.0% bf16 MFU | 62117 tok/s +step 15509/19560 | loss 3.245597 (-1.46z)| norm 0.2671 (+0.40z)| lr 6.58e-05 | 8437.80 ms | -100.0% bf16 MFU | 62118 tok/s +step 15510/19560 | loss 3.344811 (+1.39z)| norm 0.2669 (+0.37z)| lr 6.57e-05 | 8438.71 ms | -100.0% bf16 MFU | 62118 tok/s +step 15511/19560 | loss 3.297962 (+0.04z)| norm 0.2579 (-0.41z)| lr 6.57e-05 | 8437.38 ms | -100.0% bf16 MFU | 62119 tok/s +step 15512/19560 | loss 3.294938 (-0.05z)| norm 0.2675 (+0.43z)| lr 6.57e-05 | 8436.89 ms | -100.0% bf16 MFU | 62121 tok/s +step 15513/19560 | loss 3.297838 (+0.03z)| norm 0.2744 (+1.02z)| lr 6.57e-05 | 8440.51 ms | -100.0% bf16 MFU | 62120 tok/s +step 15514/19560 | loss 3.313511 (+0.47z)| norm 0.2571 (-0.49z)| lr 6.56e-05 | 8440.33 ms | -100.0% bf16 MFU | 62120 tok/s +step 15515/19560 | loss 3.258993 (-1.10z)| norm 0.2603 (-0.23z)| lr 6.56e-05 | 8435.76 ms | -100.0% bf16 MFU | 62122 tok/s +step 15516/19560 | loss 3.281904 (-0.44z)| norm 0.2679 (+0.43z)| lr 6.56e-05 | 8436.15 ms | -100.0% bf16 MFU | 62123 tok/s +step 15517/19560 | loss 3.311218 (+0.41z)| norm 0.2564 (-0.58z)| lr 6.55e-05 | 8441.00 ms | -100.0% bf16 MFU | 62122 tok/s +step 15518/19560 | loss 3.386168 (+2.49z)| norm 0.3065 (+3.63z)| lr 6.55e-05 | 8442.07 ms | -100.0% bf16 MFU | 62122 tok/s +step 15519/19560 | loss 3.346530 (+1.35z)| norm 0.2789 (+1.29z)| lr 6.55e-05 | 8440.79 ms | -100.0% bf16 MFU | 62121 tok/s +step 15520/19560 | loss 3.319512 (+0.58z)| norm 0.2823 (+1.56z)| lr 6.54e-05 | 8438.87 ms | -100.0% bf16 MFU | 62121 tok/s +step 15521/19560 | loss 3.320035 (+0.59z)| norm 0.2766 (+1.06z)| lr 6.54e-05 | 8442.20 ms | -100.0% bf16 MFU | 62121 tok/s +step 15522/19560 | loss 3.310065 (+0.31z)| norm 0.2670 (+0.25z)| lr 6.54e-05 | 8438.28 ms | -100.0% bf16 MFU | 62121 tok/s +step 15523/19560 | loss 3.275434 (-0.67z)| norm 0.2645 (+0.03z)| lr 6.53e-05 | 8438.78 ms | -100.0% bf16 MFU | 62121 tok/s +step 15524/19560 | loss 3.324450 (+0.71z)| norm 0.2801 (+1.33z)| lr 6.53e-05 | 8437.01 ms | -100.0% bf16 MFU | 62122 tok/s +step 15525/19560 | loss 3.312330 (+0.36z)| norm 0.2729 (+0.72z)| lr 6.53e-05 | 8435.54 ms | -100.0% bf16 MFU | 62124 tok/s +step 15526/19560 | loss 3.361291 (+1.72z)| norm 0.2800 (+1.30z)| lr 6.53e-05 | 8439.88 ms | -100.0% bf16 MFU | 62124 tok/s +step 15527/19560 | loss 3.245945 (-1.46z)| norm 0.2830 (+1.54z)| lr 6.52e-05 | 8443.84 ms | -100.0% bf16 MFU | 62122 tok/s +step 15528/19560 | loss 3.269052 (-0.82z)| norm 0.2632 (-0.12z)| lr 6.52e-05 | 8437.36 ms | -100.0% bf16 MFU | 62123 tok/s +step 15529/19560 | loss 3.290720 (-0.22z)| norm 0.2775 (+1.06z)| lr 6.52e-05 | 8436.70 ms | -100.0% bf16 MFU | 62124 tok/s +step 15530/19560 | loss 3.255679 (-1.17z)| norm 0.2759 (+0.91z)| lr 6.51e-05 | 8441.45 ms | -100.0% bf16 MFU | 62123 tok/s +step 15531/19560 | loss 3.267952 (-0.82z)| norm 0.2576 (-0.61z)| lr 6.51e-05 | 8438.42 ms | -100.0% bf16 MFU | 62124 tok/s +step 15532/19560 | loss 3.358628 (+1.65z)| norm 0.2715 (+0.55z)| lr 6.51e-05 | 8438.55 ms | -100.0% bf16 MFU | 62124 tok/s +step 15533/19560 | loss 3.271498 (-0.72z)| norm 0.2887 (+1.95z)| lr 6.50e-05 | 8439.94 ms | -100.0% bf16 MFU | 62124 tok/s +step 15534/19560 | loss 3.289335 (-0.22z)| norm 0.2727 (+0.62z)| lr 6.50e-05 | 8440.17 ms | -100.0% bf16 MFU | 62123 tok/s +step 15535/19560 | loss 3.292595 (-0.13z)| norm 0.2682 (+0.23z)| lr 6.50e-05 | 8437.39 ms | -100.0% bf16 MFU | 62124 tok/s +step 15536/19560 | loss 3.261344 (-0.97z)| norm 0.2544 (-0.91z)| lr 6.49e-05 | 8440.42 ms | -100.0% bf16 MFU | 62124 tok/s +step 15537/19560 | loss 3.295374 (-0.07z)| norm 0.2787 (+1.10z)| lr 6.49e-05 | 8438.60 ms | -100.0% bf16 MFU | 62124 tok/s +step 15538/19560 | loss 3.311219 (+0.36z)| norm 0.2706 (+0.41z)| lr 6.49e-05 | 8439.66 ms | -100.0% bf16 MFU | 62124 tok/s +step 15539/19560 | loss 3.255416 (-1.18z)| norm 0.2727 (+0.58z)| lr 6.48e-05 | 8439.92 ms | -100.0% bf16 MFU | 62124 tok/s +step 15540/19560 | loss 3.291179 (-0.18z)| norm 0.2616 (-0.37z)| lr 6.48e-05 | 8438.28 ms | -100.0% bf16 MFU | 62124 tok/s +step 15541/19560 | loss 3.338706 (+1.14z)| norm 0.2702 (+0.36z)| lr 6.48e-05 | 8437.82 ms | -100.0% bf16 MFU | 62125 tok/s +step 15542/19560 | loss 3.306213 (+0.22z)| norm 0.2694 (+0.28z)| lr 6.48e-05 | 8441.21 ms | -100.0% bf16 MFU | 62124 tok/s +step 15543/19560 | loss 3.303166 (+0.13z)| norm 0.2709 (+0.40z)| lr 6.47e-05 | 8439.43 ms | -100.0% bf16 MFU | 62124 tok/s +step 15544/19560 | loss 3.284418 (-0.39z)| norm 0.2658 (-0.04z)| lr 6.47e-05 | 8438.83 ms | -100.0% bf16 MFU | 62124 tok/s +step 15545/19560 | loss 3.328174 (+0.85z)| norm 0.2572 (-0.76z)| lr 6.47e-05 | 8439.39 ms | -100.0% bf16 MFU | 62124 tok/s +step 15546/19560 | loss 3.310377 (+0.35z)| norm 0.2618 (-0.37z)| lr 6.46e-05 | 8438.27 ms | -100.0% bf16 MFU | 62125 tok/s +step 15547/19560 | loss 3.221867 (-2.13z)| norm 0.2638 (-0.20z)| lr 6.46e-05 | 8436.97 ms | -100.0% bf16 MFU | 62125 tok/s +step 15548/19560 | loss 3.342723 (+1.25z)| norm 0.2474 (-1.60z)| lr 6.46e-05 | 8438.87 ms | -100.0% bf16 MFU | 62126 tok/s +step 15549/19560 | loss 3.330582 (+0.89z)| norm 0.2817 (+1.34z)| lr 6.45e-05 | 8437.76 ms | -100.0% bf16 MFU | 62126 tok/s +step 15550/19560 | loss 3.303793 (+0.14z)| norm 0.2878 (+1.82z)| lr 6.45e-05 | 8438.05 ms | -100.0% bf16 MFU | 62127 tok/s +step 15551/19560 | loss 3.298134 (-0.03z)| norm 0.2470 (-1.65z)| lr 6.45e-05 | 8439.32 ms | -100.0% bf16 MFU | 62126 tok/s +step 15552/19560 | loss 3.321987 (+0.64z)| norm 0.2691 (+0.27z)| lr 6.44e-05 | 8440.40 ms | -100.0% bf16 MFU | 62126 tok/s +step 15553/19560 | loss 3.356067 (+1.57z)| norm 0.2681 (+0.19z)| lr 6.44e-05 | 8438.67 ms | -100.0% bf16 MFU | 62126 tok/s +step 15554/19560 | loss 3.354117 (+1.48z)| norm 0.2592 (-0.60z)| lr 6.44e-05 | 8439.90 ms | -100.0% bf16 MFU | 62126 tok/s +step 15555/19560 | loss 3.340928 (+1.10z)| norm 0.2662 (+0.03z)| lr 6.44e-05 | 8436.03 ms | -100.0% bf16 MFU | 62127 tok/s +step 15556/19560 | loss 3.355717 (+1.49z)| norm 0.2540 (-1.05z)| lr 6.43e-05 | 8438.48 ms | -100.0% bf16 MFU | 62127 tok/s +step 15557/19560 | loss 3.241464 (-1.64z)| norm 0.2587 (-0.64z)| lr 6.43e-05 | 8437.77 ms | -100.0% bf16 MFU | 62128 tok/s +step 15558/19560 | loss 3.263989 (-1.03z)| norm 0.2555 (-0.92z)| lr 6.43e-05 | 8439.47 ms | -100.0% bf16 MFU | 62127 tok/s +step 15559/19560 | loss 3.313787 (+0.34z)| norm 0.2759 (+0.88z)| lr 6.42e-05 | 8435.63 ms | -100.0% bf16 MFU | 62129 tok/s +step 15560/19560 | loss 3.344491 (+1.18z)| norm 0.2582 (-0.68z)| lr 6.42e-05 | 8438.18 ms | -100.0% bf16 MFU | 62129 tok/s +step 15561/19560 | loss 3.312281 (+0.30z)| norm 0.2396 (-2.26z)| lr 6.42e-05 | 8436.12 ms | -100.0% bf16 MFU | 62130 tok/s +step 15562/19560 | loss 3.286003 (-0.42z)| norm 0.2588 (-0.58z)| lr 6.41e-05 | 8438.51 ms | -100.0% bf16 MFU | 62130 tok/s +step 15563/19560 | loss 3.326922 (+0.69z)| norm 0.2752 (+0.85z)| lr 6.41e-05 | 8437.07 ms | -100.0% bf16 MFU | 62130 tok/s +step 15564/19560 | loss 3.379812 (+2.10z)| norm 0.2686 (+0.28z)| lr 6.41e-05 | 8437.31 ms | -100.0% bf16 MFU | 62131 tok/s +step 15565/19560 | loss 3.314862 (+0.34z)| norm 0.2655 (+0.02z)| lr 6.40e-05 | 8437.05 ms | -100.0% bf16 MFU | 62131 tok/s +step 15566/19560 | loss 3.450783 (+3.76z)| norm 0.2685 (+0.27z)| lr 6.40e-05 | 8437.51 ms | -100.0% bf16 MFU | 62132 tok/s +step 15567/19560 | loss 3.341104 (+0.96z)| norm 0.2518 (-1.21z)| lr 6.40e-05 | 8438.41 ms | -100.0% bf16 MFU | 62132 tok/s +step 15568/19560 | loss 3.321337 (+0.45z)| norm 0.2554 (-0.90z)| lr 6.39e-05 | 8438.94 ms | -100.0% bf16 MFU | 62131 tok/s +step 15569/19560 | loss 3.309042 (+0.13z)| norm 0.2745 (+0.80z)| lr 6.39e-05 | 8440.03 ms | -100.0% bf16 MFU | 62131 tok/s +step 15570/19560 | loss 3.349434 (+1.15z)| norm 0.2501 (-1.35z)| lr 6.39e-05 | 8436.01 ms | -100.0% bf16 MFU | 62132 tok/s +step 15571/19560 | loss 3.310869 (+0.18z)| norm 0.2686 (+0.29z)| lr 6.39e-05 | 8436.76 ms | -100.0% bf16 MFU | 62132 tok/s +step 15572/19560 | loss 3.340404 (+0.94z)| norm 0.2575 (-0.69z)| lr 6.38e-05 | 8439.27 ms | -100.0% bf16 MFU | 62132 tok/s +step 15573/19560 | loss 3.257618 (-1.18z)| norm 0.2528 (-1.10z)| lr 6.38e-05 | 8438.34 ms | -100.0% bf16 MFU | 62132 tok/s +step 15574/19560 | loss 3.308541 (+0.13z)| norm 0.2504 (-1.30z)| lr 6.38e-05 | 8439.54 ms | -100.0% bf16 MFU | 62131 tok/s +step 15575/19560 | loss 3.335083 (+0.82z)| norm 0.2484 (-1.45z)| lr 6.37e-05 | 8437.81 ms | -100.0% bf16 MFU | 62132 tok/s +step 15576/19560 | loss 3.337375 (+0.86z)| norm 0.2576 (-0.64z)| lr 6.37e-05 | 8437.23 ms | -100.0% bf16 MFU | 62132 tok/s +step 15577/19560 | loss 3.391884 (+2.22z)| norm 0.2713 (+0.56z)| lr 6.37e-05 | 8438.13 ms | -100.0% bf16 MFU | 62132 tok/s +step 15578/19560 | loss 3.271117 (-0.85z)| norm 0.2607 (-0.37z)| lr 6.36e-05 | 8438.49 ms | -100.0% bf16 MFU | 62132 tok/s +step 15579/19560 | loss 3.360260 (+1.45z)| norm 0.2613 (-0.32z)| lr 6.36e-05 | 8438.35 ms | -100.0% bf16 MFU | 62132 tok/s +step 15580/19560 | loss 3.283118 (-0.55z)| norm 0.2622 (-0.23z)| lr 6.36e-05 | 8440.62 ms | -100.0% bf16 MFU | 62131 tok/s +step 15581/19560 | loss 3.344352 (+1.04z)| norm 0.2525 (-1.07z)| lr 6.35e-05 | 8441.64 ms | -100.0% bf16 MFU | 62130 tok/s +step 15582/19560 | loss 3.317982 (+0.36z)| norm 0.2522 (-1.08z)| lr 6.35e-05 | 8439.51 ms | -100.0% bf16 MFU | 62130 tok/s +step 15583/19560 | loss 3.321278 (+0.44z)| norm 0.2630 (-0.12z)| lr 6.35e-05 | 8440.22 ms | -100.0% bf16 MFU | 62129 tok/s +step 15584/19560 | loss 3.337345 (+0.85z)| norm 0.2404 (-2.07z)| lr 6.35e-05 | 8435.17 ms | -100.0% bf16 MFU | 62130 tok/s +step 15585/19560 | loss 3.327334 (+0.58z)| norm 0.2585 (-0.49z)| lr 6.34e-05 | 8437.73 ms | -100.0% bf16 MFU | 62131 tok/s +step 15586/19560 | loss 3.349719 (+1.15z)| norm 0.2572 (-0.59z)| lr 6.34e-05 | 8435.04 ms | -100.0% bf16 MFU | 62132 tok/s +step 15587/19560 | loss 3.350468 (+1.15z)| norm 0.2691 (+0.44z)| lr 6.34e-05 | 8437.96 ms | -100.0% bf16 MFU | 62132 tok/s +step 15588/19560 | loss 3.309159 (+0.09z)| norm 0.2432 (-1.78z)| lr 6.33e-05 | 8436.34 ms | -100.0% bf16 MFU | 62133 tok/s +step 15589/19560 | loss 3.255493 (-1.30z)| norm 0.2453 (-1.57z)| lr 6.33e-05 | 8437.76 ms | -100.0% bf16 MFU | 62133 tok/s +step 15590/19560 | loss 3.319028 (+0.35z)| norm 0.2602 (-0.30z)| lr 6.33e-05 | 8436.45 ms | -100.0% bf16 MFU | 62133 tok/s +step 15591/19560 | loss 3.332837 (+0.69z)| norm 0.2566 (-0.60z)| lr 6.32e-05 | 8437.49 ms | -100.0% bf16 MFU | 62134 tok/s +step 15592/19560 | loss 3.333643 (+0.71z)| norm 0.2595 (-0.35z)| lr 6.32e-05 | 8439.51 ms | -100.0% bf16 MFU | 62133 tok/s +step 15593/19560 | loss 3.233551 (-1.83z)| norm 0.2556 (-0.67z)| lr 6.32e-05 | 8435.41 ms | -100.0% bf16 MFU | 62134 tok/s +step 15594/19560 | loss 3.312878 (+0.20z)| norm 0.2518 (-0.99z)| lr 6.31e-05 | 8436.98 ms | -100.0% bf16 MFU | 62135 tok/s +step 15595/19560 | loss 3.298026 (-0.19z)| norm 0.2549 (-0.71z)| lr 6.31e-05 | 8439.35 ms | -100.0% bf16 MFU | 62134 tok/s +step 15596/19560 | loss 3.318425 (+0.32z)| norm 0.2506 (-1.07z)| lr 6.31e-05 | 8439.08 ms | -100.0% bf16 MFU | 62134 tok/s +step 15597/19560 | loss 3.231328 (-1.89z)| norm 0.2589 (-0.36z)| lr 6.31e-05 | 8438.26 ms | -100.0% bf16 MFU | 62134 tok/s +step 15598/19560 | loss 3.518486 (+4.86z)| norm 0.2951 (+2.72z)| lr 6.30e-05 | 8439.06 ms | -100.0% bf16 MFU | 62133 tok/s +step 15599/19560 | loss 3.326180 (+0.43z)| norm 0.2523 (-0.90z)| lr 6.30e-05 | 8437.64 ms | -100.0% bf16 MFU | 62133 tok/s +step 15600/19560 | loss 3.350693 (+0.98z)| norm 0.2621 (-0.05z)| lr 6.30e-05 | 8436.97 ms | -100.0% bf16 MFU | 62134 tok/s +step 15601/19560 | loss 3.381753 (+1.66z)| norm 0.2497 (-1.12z)| lr 6.29e-05 | 8440.10 ms | -100.0% bf16 MFU | 62133 tok/s +step 15602/19560 | loss 3.299858 (-0.21z)| norm 0.2631 (+0.04z)| lr 6.29e-05 | 8437.05 ms | -100.0% bf16 MFU | 62133 tok/s +step 15603/19560 | loss 3.312162 (+0.07z)| norm 0.2543 (-0.73z)| lr 6.29e-05 | 8438.61 ms | -100.0% bf16 MFU | 62133 tok/s +step 15604/19560 | loss 3.307530 (-0.04z)| norm 0.2620 (-0.05z)| lr 6.28e-05 | 8438.80 ms | -100.0% bf16 MFU | 62133 tok/s +step 15605/19560 | loss 3.315454 (+0.14z)| norm 0.2607 (-0.15z)| lr 6.28e-05 | 8438.96 ms | -100.0% bf16 MFU | 62133 tok/s +step 15606/19560 | loss 3.369470 (+1.38z)| norm 0.2619 (-0.05z)| lr 6.28e-05 | 8437.79 ms | -100.0% bf16 MFU | 62133 tok/s +step 15607/19560 | loss 3.334116 (+0.54z)| norm 0.2683 (+0.51z)| lr 6.28e-05 | 8437.68 ms | -100.0% bf16 MFU | 62133 tok/s +step 15608/19560 | loss 3.313309 (+0.05z)| norm 0.2565 (-0.52z)| lr 6.27e-05 | 8440.04 ms | -100.0% bf16 MFU | 62132 tok/s +step 15609/19560 | loss 3.319992 (+0.20z)| norm 0.2710 (+0.74z)| lr 6.27e-05 | 8435.18 ms | -100.0% bf16 MFU | 62133 tok/s +step 15610/19560 | loss 3.281836 (-0.68z)| norm 0.2597 (-0.25z)| lr 6.27e-05 | 8439.58 ms | -100.0% bf16 MFU | 62133 tok/s +step 15611/19560 | loss 3.301052 (-0.22z)| norm 0.2592 (-0.31z)| lr 6.26e-05 | 8438.33 ms | -100.0% bf16 MFU | 62133 tok/s +step 15612/19560 | loss 3.251511 (-1.39z)| norm 0.2547 (-0.71z)| lr 6.26e-05 | 8440.29 ms | -100.0% bf16 MFU | 62132 tok/s +step 15613/19560 | loss 3.281730 (-0.70z)| norm 0.2652 (+0.23z)| lr 6.26e-05 | 8438.14 ms | -100.0% bf16 MFU | 62132 tok/s +step 15614/19560 | loss 3.338659 (+0.67z)| norm 0.2566 (-0.54z)| lr 6.25e-05 | 8437.60 ms | -100.0% bf16 MFU | 62132 tok/s +step 15615/19560 | loss 3.341279 (+0.72z)| norm 0.2514 (-1.01z)| lr 6.25e-05 | 8438.03 ms | -100.0% bf16 MFU | 62132 tok/s +step 15616/19560 | loss 3.394056 (+1.96z)| norm 0.2649 (+0.20z)| lr 6.25e-05 | 8439.25 ms | -100.0% bf16 MFU | 62132 tok/s +step 15617/19560 | loss 3.257294 (-1.31z)| norm 0.2729 (+0.90z)| lr 6.24e-05 | 8438.94 ms | -100.0% bf16 MFU | 62132 tok/s +step 15618/19560 | loss 3.297000 (-0.35z)| norm 0.2621 (-0.07z)| lr 6.24e-05 | 8438.02 ms | -100.0% bf16 MFU | 62132 tok/s +step 15619/19560 | loss 3.347266 (+0.83z)| norm 0.2836 (+1.83z)| lr 6.24e-05 | 8439.36 ms | -100.0% bf16 MFU | 62132 tok/s +step 15620/19560 | loss 3.274632 (-0.89z)| norm 0.2858 (+1.98z)| lr 6.24e-05 | 8437.74 ms | -100.0% bf16 MFU | 62132 tok/s +step 15621/19560 | loss 3.266095 (-1.08z)| norm 0.2532 (-0.88z)| lr 6.23e-05 | 8439.96 ms | -100.0% bf16 MFU | 62131 tok/s +step 15622/19560 | loss 3.351471 (+0.94z)| norm 0.2746 (+0.99z)| lr 6.23e-05 | 8435.79 ms | -100.0% bf16 MFU | 62132 tok/s +step 15623/19560 | loss 3.272217 (-0.93z)| norm 0.2668 (+0.30z)| lr 6.23e-05 | 8437.76 ms | -100.0% bf16 MFU | 62132 tok/s +step 15624/19560 | loss 3.284249 (-0.64z)| norm 0.2611 (-0.20z)| lr 6.22e-05 | 8438.81 ms | -100.0% bf16 MFU | 62132 tok/s +step 15625/19560 | loss 3.306008 (-0.14z)| norm 0.2730 (+0.84z)| lr 6.22e-05 | 8439.51 ms | -100.0% bf16 MFU | 62132 tok/s +step 15626/19560 | loss 3.310963 (-0.02z)| norm 0.2876 (+2.07z)| lr 6.22e-05 | 8437.48 ms | -100.0% bf16 MFU | 62132 tok/s +step 15627/19560 | loss 3.355210 (+1.02z)| norm 0.2871 (+1.98z)| lr 6.21e-05 | 8437.92 ms | -100.0% bf16 MFU | 62132 tok/s +step 15628/19560 | loss 3.329744 (+0.41z)| norm 0.2740 (+0.85z)| lr 6.21e-05 | 8438.38 ms | -100.0% bf16 MFU | 62132 tok/s +step 15629/19560 | loss 3.267279 (-1.06z)| norm 0.2681 (+0.34z)| lr 6.21e-05 | 8436.11 ms | -100.0% bf16 MFU | 62133 tok/s +step 15630/19560 | loss 3.210048 (-2.37z)| norm 0.2797 (+1.32z)| lr 6.20e-05 | 8437.68 ms | -100.0% bf16 MFU | 62133 tok/s +step 15631/19560 | loss 3.284731 (-0.64z)| norm 0.2527 (-1.03z)| lr 6.20e-05 | 8438.57 ms | -100.0% bf16 MFU | 62133 tok/s +step 15632/19560 | loss 3.272981 (-0.90z)| norm 0.2505 (-1.21z)| lr 6.20e-05 | 8437.81 ms | -100.0% bf16 MFU | 62133 tok/s +step 15633/19560 | loss 3.298864 (-0.30z)| norm 0.2704 (+0.51z)| lr 6.20e-05 | 8441.64 ms | -100.0% bf16 MFU | 62132 tok/s +step 15634/19560 | loss 3.262074 (-1.15z)| norm 0.2568 (-0.67z)| lr 6.19e-05 | 8439.76 ms | -100.0% bf16 MFU | 62131 tok/s +step 15635/19560 | loss 3.317052 (+0.12z)| norm 0.2686 (+0.36z)| lr 6.19e-05 | 8440.87 ms | -100.0% bf16 MFU | 62130 tok/s +step 15636/19560 | loss 3.302034 (-0.23z)| norm 0.2645 (-0.00z)| lr 6.19e-05 | 8436.63 ms | -100.0% bf16 MFU | 62131 tok/s +step 15637/19560 | loss 3.302438 (-0.23z)| norm 0.2580 (-0.57z)| lr 6.18e-05 | 8436.76 ms | -100.0% bf16 MFU | 62132 tok/s +step 15638/19560 | loss 3.243819 (-1.58z)| norm 0.2686 (+0.36z)| lr 6.18e-05 | 8439.29 ms | -100.0% bf16 MFU | 62131 tok/s +step 15639/19560 | loss 3.333828 (+0.52z)| norm 0.2672 (+0.23z)| lr 6.18e-05 | 8437.60 ms | -100.0% bf16 MFU | 62132 tok/s +step 15640/19560 | loss 3.326532 (+0.34z)| norm 0.2617 (-0.25z)| lr 6.17e-05 | 8438.71 ms | -100.0% bf16 MFU | 62131 tok/s +step 15641/19560 | loss 3.256310 (-1.29z)| norm 0.2690 (+0.40z)| lr 6.17e-05 | 8437.21 ms | -100.0% bf16 MFU | 62132 tok/s +step 15642/19560 | loss 3.321575 (+0.23z)| norm 0.2786 (+1.22z)| lr 6.17e-05 | 8438.82 ms | -100.0% bf16 MFU | 62132 tok/s +step 15643/19560 | loss 3.301242 (-0.25z)| norm 0.2533 (-0.98z)| lr 6.17e-05 | 8436.40 ms | -100.0% bf16 MFU | 62132 tok/s +step 15644/19560 | loss 3.269277 (-1.00z)| norm 0.2685 (+0.34z)| lr 6.16e-05 | 8436.33 ms | -100.0% bf16 MFU | 62133 tok/s +step 15645/19560 | loss 3.290402 (-0.50z)| norm 0.2657 (+0.09z)| lr 6.16e-05 | 8437.98 ms | -100.0% bf16 MFU | 62133 tok/s +step 15646/19560 | loss 3.336484 (+0.59z)| norm 0.2739 (+0.87z)| lr 6.16e-05 | 8437.95 ms | -100.0% bf16 MFU | 62133 tok/s +step 15647/19560 | loss 3.390436 (+1.84z)| norm 0.2740 (+0.89z)| lr 6.15e-05 | 8438.75 ms | -100.0% bf16 MFU | 62133 tok/s +step 15648/19560 | loss 3.313053 (+0.03z)| norm 0.2659 (+0.16z)| lr 6.15e-05 | 8439.91 ms | -100.0% bf16 MFU | 62132 tok/s +step 15649/19560 | loss 3.441797 (+2.91z)| norm 0.2574 (-0.63z)| lr 6.15e-05 | 8439.97 ms | -100.0% bf16 MFU | 62132 tok/s +step 15650/19560 | loss 3.330180 (+0.39z)| norm 0.2685 (+0.42z)| lr 6.14e-05 | 8437.43 ms | -100.0% bf16 MFU | 62132 tok/s +step 15651/19560 | loss 3.284407 (-0.64z)| norm 0.2648 (+0.07z)| lr 6.14e-05 | 8437.68 ms | -100.0% bf16 MFU | 62132 tok/s +step 15652/19560 | loss 3.306779 (-0.13z)| norm 0.2652 (+0.11z)| lr 6.14e-05 | 8439.67 ms | -100.0% bf16 MFU | 62132 tok/s +step 15653/19560 | loss 3.356106 (+0.97z)| norm 0.2502 (-1.28z)| lr 6.14e-05 | 8434.99 ms | -100.0% bf16 MFU | 62133 tok/s +step 15654/19560 | loss 3.297927 (-0.33z)| norm 0.2520 (-1.09z)| lr 6.13e-05 | 8439.54 ms | -100.0% bf16 MFU | 62132 tok/s +step 15655/19560 | loss 3.346633 (+0.76z)| norm 0.2593 (-0.39z)| lr 6.13e-05 | 8438.15 ms | -100.0% bf16 MFU | 62132 tok/s +step 15656/19560 | loss 3.245729 (-1.52z)| norm 0.2563 (-0.67z)| lr 6.13e-05 | 8440.03 ms | -100.0% bf16 MFU | 62132 tok/s +step 15657/19560 | loss 3.503403 (+3.99z)| norm 0.2816 (+1.73z)| lr 6.12e-05 | 8438.37 ms | -100.0% bf16 MFU | 62132 tok/s +step 15658/19560 | loss 3.333110 (+0.38z)| norm 0.2593 (-0.38z)| lr 6.12e-05 | 8436.88 ms | -100.0% bf16 MFU | 62132 tok/s +step 15659/19560 | loss 3.435801 (+2.48z)| norm 0.2881 (+2.30z)| lr 6.12e-05 | 8438.34 ms | -100.0% bf16 MFU | 62132 tok/s +step 15660/19560 | loss 3.349509 (+0.69z)| norm 0.2558 (-0.70z)| lr 6.11e-05 | 8437.48 ms | -100.0% bf16 MFU | 62133 tok/s +step 15661/19560 | loss 3.291683 (-0.53z)| norm 0.2484 (-1.39z)| lr 6.11e-05 | 8437.87 ms | -100.0% bf16 MFU | 62133 tok/s +step 15662/19560 | loss 3.282819 (-0.71z)| norm 0.2482 (-1.38z)| lr 6.11e-05 | 8437.02 ms | -100.0% bf16 MFU | 62133 tok/s +step 15663/19560 | loss 3.287592 (-0.61z)| norm 0.2871 (+2.23z)| lr 6.10e-05 | 8436.77 ms | -100.0% bf16 MFU | 62134 tok/s +step 15664/19560 | loss 3.304376 (-0.27z)| norm 0.2581 (-0.46z)| lr 6.10e-05 | 8437.92 ms | -100.0% bf16 MFU | 62134 tok/s +step 15665/19560 | loss 3.299778 (-0.36z)| norm 0.2522 (-0.99z)| lr 6.10e-05 | 8437.55 ms | -100.0% bf16 MFU | 62134 tok/s +step 15666/19560 | loss 3.298210 (-0.39z)| norm 0.2734 (+0.99z)| lr 6.10e-05 | 8436.50 ms | -100.0% bf16 MFU | 62134 tok/s +step 15667/19560 | loss 3.239724 (-1.61z)| norm 0.2544 (-0.77z)| lr 6.09e-05 | 8439.11 ms | -100.0% bf16 MFU | 62134 tok/s +step 15668/19560 | loss 3.251190 (-1.36z)| norm 0.2535 (-0.84z)| lr 6.09e-05 | 8435.94 ms | -100.0% bf16 MFU | 62135 tok/s +step 15669/19560 | loss 3.361398 (+0.93z)| norm 0.2536 (-0.83z)| lr 6.09e-05 | 8443.01 ms | -100.0% bf16 MFU | 62133 tok/s +step 15670/19560 | loss 3.236063 (-1.64z)| norm 0.2664 (+0.36z)| lr 6.08e-05 | 8438.19 ms | -100.0% bf16 MFU | 62133 tok/s +step 15671/19560 | loss 3.317624 (+0.03z)| norm 0.2495 (-1.19z)| lr 6.08e-05 | 8438.18 ms | -100.0% bf16 MFU | 62133 tok/s +step 15672/19560 | loss 3.365109 (+0.99z)| norm 0.2505 (-1.08z)| lr 6.08e-05 | 8435.25 ms | -100.0% bf16 MFU | 62134 tok/s +step 15673/19560 | loss 3.394334 (+1.56z)| norm 0.2581 (-0.37z)| lr 6.07e-05 | 8439.57 ms | -100.0% bf16 MFU | 62133 tok/s +step 15674/19560 | loss 3.307258 (-0.21z)| norm 0.2516 (-0.97z)| lr 6.07e-05 | 8439.25 ms | -100.0% bf16 MFU | 62133 tok/s +step 15675/19560 | loss 3.271908 (-0.94z)| norm 0.2573 (-0.44z)| lr 6.07e-05 | 8438.60 ms | -100.0% bf16 MFU | 62133 tok/s +step 15676/19560 | loss 3.308638 (-0.18z)| norm 0.2512 (-1.00z)| lr 6.07e-05 | 8438.78 ms | -100.0% bf16 MFU | 62133 tok/s +step 15677/19560 | loss 3.248747 (-1.39z)| norm 0.2699 (+0.74z)| lr 6.06e-05 | 8438.77 ms | -100.0% bf16 MFU | 62132 tok/s +step 15678/19560 | loss 3.329522 (+0.25z)| norm 0.2603 (-0.14z)| lr 6.06e-05 | 8439.11 ms | -100.0% bf16 MFU | 62132 tok/s +step 15679/19560 | loss 3.294657 (-0.46z)| norm 0.2561 (-0.55z)| lr 6.06e-05 | 8439.83 ms | -100.0% bf16 MFU | 62132 tok/s +step 15680/19560 | loss 3.290882 (-0.53z)| norm 0.2816 (+1.86z)| lr 6.05e-05 | 8439.45 ms | -100.0% bf16 MFU | 62131 tok/s +step 15681/19560 | loss 3.291942 (-0.50z)| norm 0.2497 (-1.15z)| lr 6.05e-05 | 8435.39 ms | -100.0% bf16 MFU | 62132 tok/s +step 15682/19560 | loss 3.333023 (+0.34z)| norm 0.2722 (+0.97z)| lr 6.05e-05 | 8437.98 ms | -100.0% bf16 MFU | 62132 tok/s +step 15683/19560 | loss 3.357521 (+0.84z)| norm 0.2493 (-1.17z)| lr 6.04e-05 | 8437.25 ms | -100.0% bf16 MFU | 62133 tok/s +step 15684/19560 | loss 3.247125 (-1.39z)| norm 0.2778 (+1.48z)| lr 6.04e-05 | 8437.40 ms | -100.0% bf16 MFU | 62133 tok/s +step 15685/19560 | loss 3.309012 (-0.14z)| norm 0.2668 (+0.45z)| lr 6.04e-05 | 8436.55 ms | -100.0% bf16 MFU | 62134 tok/s +step 15686/19560 | loss 3.292826 (-0.48z)| norm 0.2616 (-0.05z)| lr 6.04e-05 | 8437.26 ms | -100.0% bf16 MFU | 62134 tok/s +step 15687/19560 | loss 3.348612 (+0.66z)| norm 0.2519 (-0.94z)| lr 6.03e-05 | 8437.46 ms | -100.0% bf16 MFU | 62134 tok/s +step 15688/19560 | loss 3.304401 (-0.24z)| norm 0.2680 (+0.57z)| lr 6.03e-05 | 8437.00 ms | -100.0% bf16 MFU | 62134 tok/s +step 15689/19560 | loss 3.342185 (+0.53z)| norm 0.2696 (+0.71z)| lr 6.03e-05 | 8436.04 ms | -100.0% bf16 MFU | 62135 tok/s +step 15690/19560 | loss 3.346901 (+0.62z)| norm 0.2551 (-0.67z)| lr 6.02e-05 | 8436.98 ms | -100.0% bf16 MFU | 62135 tok/s +step 15691/19560 | loss 3.275540 (-0.84z)| norm 0.2568 (-0.50z)| lr 6.02e-05 | 8435.13 ms | -100.0% bf16 MFU | 62136 tok/s +step 15692/19560 | loss 3.330143 (+0.29z)| norm 0.2761 (+1.33z)| lr 6.02e-05 | 8437.75 ms | -100.0% bf16 MFU | 62136 tok/s +step 15693/19560 | loss 3.293833 (-0.45z)| norm 0.2549 (-0.68z)| lr 6.01e-05 | 8436.16 ms | -100.0% bf16 MFU | 62137 tok/s +step 15694/19560 | loss 3.311893 (-0.06z)| norm 0.2667 (+0.45z)| lr 6.01e-05 | 8438.81 ms | -100.0% bf16 MFU | 62137 tok/s +step 15695/19560 | loss 3.275214 (-0.83z)| norm 0.2705 (+0.79z)| lr 6.01e-05 | 8439.01 ms | -100.0% bf16 MFU | 62136 tok/s +step 15696/19560 | loss 3.346458 (+0.68z)| norm 0.2639 (+0.16z)| lr 6.01e-05 | 8437.82 ms | -100.0% bf16 MFU | 62136 tok/s +step 15697/19560 | loss 3.240519 (-1.54z)| norm 0.2598 (-0.22z)| lr 6.00e-05 | 8436.09 ms | -100.0% bf16 MFU | 62137 tok/s +step 15698/19560 | loss 3.319873 (+0.13z)| norm 0.2710 (+0.84z)| lr 6.00e-05 | 8438.34 ms | -100.0% bf16 MFU | 62136 tok/s +step 15699/19560 | loss 3.264805 (-1.02z)| norm 0.2642 (+0.19z)| lr 6.00e-05 | 8436.04 ms | -100.0% bf16 MFU | 62137 tok/s +step 15700/19560 | loss 3.341019 (+0.58z)| norm 0.2713 (+0.86z)| lr 5.99e-05 | 8438.01 ms | -100.0% bf16 MFU | 62137 tok/s +step 15701/19560 | loss 3.355921 (+0.88z)| norm 0.2935 (+2.86z)| lr 5.99e-05 | 8437.84 ms | -100.0% bf16 MFU | 62137 tok/s +step 15702/19560 | loss 3.305343 (-0.18z)| norm 0.2613 (-0.14z)| lr 5.99e-05 | 8438.32 ms | -100.0% bf16 MFU | 62137 tok/s +step 15703/19560 | loss 3.283309 (-0.64z)| norm 0.2612 (-0.16z)| lr 5.98e-05 | 8437.01 ms | -100.0% bf16 MFU | 62137 tok/s +step 15704/19560 | loss 3.317800 (+0.09z)| norm 0.2660 (+0.29z)| lr 5.98e-05 | 8437.08 ms | -100.0% bf16 MFU | 62137 tok/s +step 15705/19560 | loss 3.308495 (-0.09z)| norm 0.2700 (+0.67z)| lr 5.98e-05 | 8436.53 ms | -100.0% bf16 MFU | 62137 tok/s +step 15706/19560 | loss 3.239902 (-1.53z)| norm 0.2517 (-1.04z)| lr 5.98e-05 | 8437.69 ms | -100.0% bf16 MFU | 62137 tok/s +step 15707/19560 | loss 3.320101 (+0.16z)| norm 0.2425 (-1.87z)| lr 5.97e-05 | 8440.46 ms | -100.0% bf16 MFU | 62136 tok/s +step 15708/19560 | loss 3.290736 (-0.46z)| norm 0.2645 (+0.17z)| lr 5.97e-05 | 8437.35 ms | -100.0% bf16 MFU | 62136 tok/s +step 15709/19560 | loss 3.226189 (-1.79z)| norm 0.2662 (+0.32z)| lr 5.97e-05 | 8437.71 ms | -100.0% bf16 MFU | 62136 tok/s +step 15710/19560 | loss 3.265466 (-0.95z)| norm 0.2562 (-0.61z)| lr 5.96e-05 | 8436.80 ms | -100.0% bf16 MFU | 62137 tok/s +step 15711/19560 | loss 3.263465 (-0.98z)| norm 0.2582 (-0.42z)| lr 5.96e-05 | 8437.87 ms | -100.0% bf16 MFU | 62137 tok/s +step 15712/19560 | loss 3.264704 (-0.94z)| norm 0.2662 (+0.30z)| lr 5.96e-05 | 8437.90 ms | -100.0% bf16 MFU | 62137 tok/s +step 15713/19560 | loss 3.289945 (-0.41z)| norm 0.2492 (-1.29z)| lr 5.95e-05 | 8437.56 ms | -100.0% bf16 MFU | 62137 tok/s +step 15714/19560 | loss 3.297251 (-0.25z)| norm 0.2445 (-1.70z)| lr 5.95e-05 | 8436.73 ms | -100.0% bf16 MFU | 62137 tok/s +step 15715/19560 | loss 3.292464 (-0.34z)| norm 0.2658 (+0.28z)| lr 5.95e-05 | 8438.25 ms | -100.0% bf16 MFU | 62137 tok/s +step 15716/19560 | loss 3.272476 (-0.75z)| norm 0.2599 (-0.28z)| lr 5.95e-05 | 8440.43 ms | -100.0% bf16 MFU | 62136 tok/s +step 15717/19560 | loss 3.303925 (-0.11z)| norm 0.2662 (+0.29z)| lr 5.94e-05 | 8437.10 ms | -100.0% bf16 MFU | 62136 tok/s +step 15718/19560 | loss 3.411193 (+2.08z)| norm 0.2647 (+0.15z)| lr 5.94e-05 | 8440.21 ms | -100.0% bf16 MFU | 62135 tok/s +step 15719/19560 | loss 3.351542 (+0.85z)| norm 0.2455 (-1.65z)| lr 5.94e-05 | 8436.86 ms | -100.0% bf16 MFU | 62135 tok/s +step 15720/19560 | loss 3.317747 (+0.16z)| norm 0.2622 (-0.08z)| lr 5.93e-05 | 8439.84 ms | -100.0% bf16 MFU | 62135 tok/s +step 15721/19560 | loss 3.302646 (-0.16z)| norm 0.2594 (-0.35z)| lr 5.93e-05 | 8436.67 ms | -100.0% bf16 MFU | 62135 tok/s +step 15722/19560 | loss 3.222503 (-1.78z)| norm 0.3078 (+3.93z)| lr 5.93e-05 | 8437.91 ms | -100.0% bf16 MFU | 62135 tok/s +step 15723/19560 | loss 3.304075 (-0.11z)| norm 0.2694 (+0.51z)| lr 5.92e-05 | 8437.20 ms | -100.0% bf16 MFU | 62135 tok/s +step 15724/19560 | loss 3.336201 (+0.54z)| norm 0.2522 (-1.02z)| lr 5.92e-05 | 8439.22 ms | -100.0% bf16 MFU | 62135 tok/s +step 15725/19560 | loss 3.321253 (+0.22z)| norm 0.2561 (-0.67z)| lr 5.92e-05 | 8439.62 ms | -100.0% bf16 MFU | 62134 tok/s +step 15726/19560 | loss 3.297631 (-0.25z)| norm 0.2519 (-1.04z)| lr 5.92e-05 | 8436.78 ms | -100.0% bf16 MFU | 62135 tok/s +step 15727/19560 | loss 3.292723 (-0.35z)| norm 0.2510 (-1.12z)| lr 5.91e-05 | 8437.83 ms | -100.0% bf16 MFU | 62135 tok/s +step 15728/19560 | loss 3.270398 (-0.84z)| norm 0.2574 (-0.53z)| lr 5.91e-05 | 8435.26 ms | -100.0% bf16 MFU | 62136 tok/s +step 15729/19560 | loss 3.333705 (+0.59z)| norm 0.2661 (+0.25z)| lr 5.91e-05 | 8435.48 ms | -100.0% bf16 MFU | 62137 tok/s +step 15730/19560 | loss 3.251164 (-1.25z)| norm 0.2547 (-0.79z)| lr 5.90e-05 | 8437.01 ms | -100.0% bf16 MFU | 62137 tok/s +step 15731/19560 | loss 3.275893 (-0.69z)| norm 0.2636 (+0.02z)| lr 5.90e-05 | 8437.75 ms | -100.0% bf16 MFU | 62137 tok/s +step 15732/19560 | loss 3.315647 (+0.20z)| norm 0.2706 (+0.66z)| lr 5.90e-05 | 8436.16 ms | -100.0% bf16 MFU | 62137 tok/s +step 15733/19560 | loss 3.288185 (-0.41z)| norm 0.2618 (-0.15z)| lr 5.90e-05 | 8439.90 ms | -100.0% bf16 MFU | 62136 tok/s +step 15734/19560 | loss 3.332037 (+0.58z)| norm 0.2708 (+0.67z)| lr 5.89e-05 | 8436.58 ms | -100.0% bf16 MFU | 62137 tok/s +step 15735/19560 | loss 3.290505 (-0.35z)| norm 0.2558 (-0.70z)| lr 5.89e-05 | 8438.84 ms | -100.0% bf16 MFU | 62136 tok/s +step 15736/19560 | loss 3.276840 (-0.65z)| norm 0.2443 (-1.72z)| lr 5.89e-05 | 8437.42 ms | -100.0% bf16 MFU | 62136 tok/s +step 15737/19560 | loss 3.346710 (+0.91z)| norm 0.2508 (-1.11z)| lr 5.88e-05 | 8437.56 ms | -100.0% bf16 MFU | 62137 tok/s +step 15738/19560 | loss 3.310540 (+0.10z)| norm 0.2436 (-1.73z)| lr 5.88e-05 | 8439.68 ms | -100.0% bf16 MFU | 62136 tok/s +step 15739/19560 | loss 3.331529 (+0.56z)| norm 0.2488 (-1.25z)| lr 5.88e-05 | 8438.54 ms | -100.0% bf16 MFU | 62136 tok/s +step 15740/19560 | loss 3.330316 (+0.52z)| norm 0.2573 (-0.51z)| lr 5.87e-05 | 8437.41 ms | -100.0% bf16 MFU | 62136 tok/s +step 15741/19560 | loss 3.358054 (+1.13z)| norm 0.2447 (-1.59z)| lr 5.87e-05 | 8436.57 ms | -100.0% bf16 MFU | 62136 tok/s +step 15742/19560 | loss 3.309597 (+0.05z)| norm 0.2470 (-1.37z)| lr 5.87e-05 | 8443.99 ms | -100.0% bf16 MFU | 62134 tok/s +step 15743/19560 | loss 3.314617 (+0.17z)| norm 0.2395 (-2.00z)| lr 5.87e-05 | 8462.79 ms | -100.0% bf16 MFU | 62125 tok/s +step 15744/19560 | loss 3.343801 (+0.84z)| norm 0.2574 (-0.45z)| lr 5.86e-05 | 8463.44 ms | -100.0% bf16 MFU | 62116 tok/s +step 15745/19560 | loss 3.283375 (-0.54z)| norm 0.2631 (+0.05z)| lr 5.86e-05 | 8462.53 ms | -100.0% bf16 MFU | 62108 tok/s +step 15746/19560 | loss 3.251386 (-1.26z)| norm 0.2587 (-0.33z)| lr 5.86e-05 | 8465.42 ms | -100.0% bf16 MFU | 62099 tok/s +step 15747/19560 | loss 3.312302 (+0.13z)| norm 0.2540 (-0.72z)| lr 5.85e-05 | 8465.19 ms | -100.0% bf16 MFU | 62091 tok/s +step 15748/19560 | loss 3.289264 (-0.39z)| norm 0.2478 (-1.25z)| lr 5.85e-05 | 8465.61 ms | -100.0% bf16 MFU | 62083 tok/s +step 15749/19560 | loss 3.314114 (+0.17z)| norm 0.2472 (-1.29z)| lr 5.85e-05 | 8461.50 ms | -100.0% bf16 MFU | 62077 tok/s +step 15750/19560 | loss 3.342926 (+0.83z)| norm 0.2502 (-1.02z)| lr 5.84e-05 | 8460.05 ms | -100.0% bf16 MFU | 62072 tok/s +val loss 3.284510 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2987/10042 = 0.297451 +step 15751/19560 | loss 3.289322 (-0.41z)| norm 0.2391 (-1.94z)| lr 5.84e-05 | 8458.43 ms | -100.0% bf16 MFU | 62067 tok/s +step 15752/19560 | loss 3.303201 (-0.09z)| norm 0.2530 (-0.73z)| lr 5.84e-05 | 8461.76 ms | -100.0% bf16 MFU | 62062 tok/s +step 15753/19560 | loss 3.344394 (+0.85z)| norm 0.2501 (-0.97z)| lr 5.84e-05 | 8458.75 ms | -100.0% bf16 MFU | 62058 tok/s +step 15754/19560 | loss 3.361325 (+1.22z)| norm 0.2447 (-1.42z)| lr 5.83e-05 | 8461.94 ms | -100.0% bf16 MFU | 62053 tok/s +step 15755/19560 | loss 3.291352 (-0.37z)| norm 0.2508 (-0.89z)| lr 5.83e-05 | 8459.35 ms | -100.0% bf16 MFU | 62049 tok/s +step 15756/19560 | loss 3.279407 (-0.63z)| norm 0.2477 (-1.15z)| lr 5.83e-05 | 8459.48 ms | -100.0% bf16 MFU | 62045 tok/s +step 15757/19560 | loss 3.288127 (-0.44z)| norm 0.2647 (+0.38z)| lr 5.82e-05 | 8458.19 ms | -100.0% bf16 MFU | 62042 tok/s +step 15758/19560 | loss 3.333385 (+0.59z)| norm 0.2518 (-0.76z)| lr 5.82e-05 | 8457.17 ms | -100.0% bf16 MFU | 62040 tok/s +step 15759/19560 | loss 3.307271 (-0.02z)| norm 0.2581 (-0.19z)| lr 5.82e-05 | 8461.37 ms | -100.0% bf16 MFU | 62036 tok/s +step 15760/19560 | loss 3.304874 (-0.08z)| norm 0.2526 (-0.70z)| lr 5.81e-05 | 8456.96 ms | -100.0% bf16 MFU | 62034 tok/s +step 15761/19560 | loss 3.282230 (-0.61z)| norm 0.2519 (-0.74z)| lr 5.81e-05 | 8452.42 ms | -100.0% bf16 MFU | 62034 tok/s +step 15762/19560 | loss 3.292598 (-0.38z)| norm 0.2591 (-0.09z)| lr 5.81e-05 | 8459.72 ms | -100.0% bf16 MFU | 62031 tok/s +step 15763/19560 | loss 3.329604 (+0.49z)| norm 0.2563 (-0.34z)| lr 5.81e-05 | 8457.56 ms | -100.0% bf16 MFU | 62029 tok/s +step 15764/19560 | loss 3.284986 (-0.55z)| norm 0.2700 (+0.90z)| lr 5.80e-05 | 8457.67 ms | -100.0% bf16 MFU | 62027 tok/s +step 15765/19560 | loss 3.314800 (+0.15z)| norm 0.2547 (-0.49z)| lr 5.80e-05 | 8455.45 ms | -100.0% bf16 MFU | 62026 tok/s +step 15766/19560 | loss 3.335129 (+0.61z)| norm 0.2602 (+0.02z)| lr 5.80e-05 | 8455.34 ms | -100.0% bf16 MFU | 62025 tok/s +step 15767/19560 | loss 3.298424 (-0.25z)| norm 0.2682 (+0.74z)| lr 5.79e-05 | 8455.31 ms | -100.0% bf16 MFU | 62024 tok/s +step 15768/19560 | loss 3.309708 (+0.02z)| norm 0.2585 (-0.13z)| lr 5.79e-05 | 8453.10 ms | -100.0% bf16 MFU | 62024 tok/s +step 15769/19560 | loss 3.476747 (+3.74z)| norm 0.2869 (+2.39z)| lr 5.79e-05 | 8456.10 ms | -100.0% bf16 MFU | 62023 tok/s +step 15770/19560 | loss 3.320174 (+0.21z)| norm 0.2548 (-0.46z)| lr 5.79e-05 | 8463.99 ms | -100.0% bf16 MFU | 62019 tok/s +step 15771/19560 | loss 3.228818 (-1.81z)| norm 0.2555 (-0.40z)| lr 5.78e-05 | 8449.45 ms | -100.0% bf16 MFU | 62020 tok/s +step 15772/19560 | loss 3.303494 (-0.15z)| norm 0.2652 (+0.47z)| lr 5.78e-05 | 8451.90 ms | -100.0% bf16 MFU | 62021 tok/s +step 15773/19560 | loss 3.387472 (+1.69z)| norm 0.2776 (+1.57z)| lr 5.78e-05 | 8450.06 ms | -100.0% bf16 MFU | 62022 tok/s +step 15774/19560 | loss 3.318695 (+0.17z)| norm 0.2545 (-0.48z)| lr 5.77e-05 | 8456.89 ms | -100.0% bf16 MFU | 62021 tok/s +step 15775/19560 | loss 3.341562 (+0.69z)| norm 0.2693 (+0.85z)| lr 5.77e-05 | 8448.70 ms | -100.0% bf16 MFU | 62023 tok/s +step 15776/19560 | loss 3.326584 (+0.36z)| norm 0.2799 (+1.78z)| lr 5.77e-05 | 8461.50 ms | -100.0% bf16 MFU | 62019 tok/s +step 15777/19560 | loss 3.374039 (+1.46z)| norm 0.2707 (+0.94z)| lr 5.76e-05 | 8454.75 ms | -100.0% bf16 MFU | 62019 tok/s +step 15778/19560 | loss 3.326322 (+0.37z)| norm 0.2846 (+2.14z)| lr 5.76e-05 | 8449.83 ms | -100.0% bf16 MFU | 62020 tok/s +step 15779/19560 | loss 3.451891 (+3.10z)| norm 0.2761 (+1.38z)| lr 5.76e-05 | 8454.56 ms | -100.0% bf16 MFU | 62020 tok/s +step 15780/19560 | loss 3.293717 (-0.39z)| norm 0.2620 (+0.15z)| lr 5.76e-05 | 8448.95 ms | -100.0% bf16 MFU | 62022 tok/s +step 15781/19560 | loss 3.334865 (+0.52z)| norm 0.2729 (+1.08z)| lr 5.75e-05 | 8458.33 ms | -100.0% bf16 MFU | 62020 tok/s +step 15782/19560 | loss 3.297633 (-0.30z)| norm 0.2599 (-0.05z)| lr 5.75e-05 | 8456.48 ms | -100.0% bf16 MFU | 62019 tok/s +step 15783/19560 | loss 3.298824 (-0.27z)| norm 0.2588 (-0.14z)| lr 5.75e-05 | 8450.65 ms | -100.0% bf16 MFU | 62020 tok/s +step 15784/19560 | loss 3.289706 (-0.48z)| norm 0.2473 (-1.13z)| lr 5.74e-05 | 8450.95 ms | -100.0% bf16 MFU | 62021 tok/s +step 15785/19560 | loss 3.256109 (-1.27z)| norm 0.2706 (+0.90z)| lr 5.74e-05 | 8455.32 ms | -100.0% bf16 MFU | 62020 tok/s +step 15786/19560 | loss 3.280670 (-0.67z)| norm 0.2685 (+0.71z)| lr 5.74e-05 | 8450.97 ms | -100.0% bf16 MFU | 62021 tok/s +step 15787/19560 | loss 3.241297 (-1.63z)| norm 0.2642 (+0.36z)| lr 5.74e-05 | 8443.51 ms | -100.0% bf16 MFU | 62025 tok/s +step 15788/19560 | loss 3.275630 (-0.77z)| norm 0.2731 (+1.14z)| lr 5.73e-05 | 8447.87 ms | -100.0% bf16 MFU | 62027 tok/s +step 15789/19560 | loss 3.357273 (+1.23z)| norm 0.2631 (+0.24z)| lr 5.73e-05 | 8443.05 ms | -100.0% bf16 MFU | 62030 tok/s +step 15790/19560 | loss 3.363093 (+1.35z)| norm 0.2534 (-0.63z)| lr 5.73e-05 | 8444.24 ms | -100.0% bf16 MFU | 62033 tok/s +step 15791/19560 | loss 3.359642 (+1.25z)| norm 0.2534 (-0.62z)| lr 5.72e-05 | 8454.60 ms | -100.0% bf16 MFU | 62032 tok/s +step 15792/19560 | loss 3.243511 (-1.55z)| norm 0.2566 (-0.33z)| lr 5.72e-05 | 8456.27 ms | -100.0% bf16 MFU | 62030 tok/s +step 15793/19560 | loss 3.302087 (-0.14z)| norm 0.2508 (-0.86z)| lr 5.72e-05 | 8446.54 ms | -100.0% bf16 MFU | 62032 tok/s +step 15794/19560 | loss 3.282865 (-0.60z)| norm 0.2421 (-1.62z)| lr 5.71e-05 | 8448.50 ms | -100.0% bf16 MFU | 62034 tok/s +step 15795/19560 | loss 3.327450 (+0.46z)| norm 0.2936 (+2.93z)| lr 5.71e-05 | 8446.85 ms | -100.0% bf16 MFU | 62035 tok/s +step 15796/19560 | loss 3.260764 (-1.16z)| norm 0.2681 (+0.68z)| lr 5.71e-05 | 8446.84 ms | -100.0% bf16 MFU | 62037 tok/s +step 15797/19560 | loss 3.306365 (-0.04z)| norm 0.2892 (+2.45z)| lr 5.71e-05 | 8451.63 ms | -100.0% bf16 MFU | 62037 tok/s +step 15798/19560 | loss 3.265014 (-1.07z)| norm 0.2560 (-0.39z)| lr 5.70e-05 | 8447.11 ms | -100.0% bf16 MFU | 62038 tok/s +step 15799/19560 | loss 3.263110 (-1.10z)| norm 0.2614 (+0.07z)| lr 5.70e-05 | 8449.16 ms | -100.0% bf16 MFU | 62039 tok/s +step 15800/19560 | loss 3.305372 (-0.05z)| norm 0.2751 (+1.23z)| lr 5.70e-05 | 8447.86 ms | -100.0% bf16 MFU | 62040 tok/s +step 15801/19560 | loss 3.285359 (-0.54z)| norm 0.2665 (+0.48z)| lr 5.69e-05 | 8450.15 ms | -100.0% bf16 MFU | 62040 tok/s +step 15802/19560 | loss 3.241122 (-1.62z)| norm 0.2740 (+1.11z)| lr 5.69e-05 | 8447.42 ms | -100.0% bf16 MFU | 62042 tok/s +step 15803/19560 | loss 3.318656 (+0.30z)| norm 0.2711 (+0.85z)| lr 5.69e-05 | 8450.61 ms | -100.0% bf16 MFU | 62042 tok/s +step 15804/19560 | loss 3.301208 (-0.13z)| norm 0.2521 (-0.78z)| lr 5.69e-05 | 8452.67 ms | -100.0% bf16 MFU | 62041 tok/s +step 15805/19560 | loss 3.307935 (+0.03z)| norm 0.2556 (-0.47z)| lr 5.68e-05 | 8448.63 ms | -100.0% bf16 MFU | 62042 tok/s +step 15806/19560 | loss 3.301249 (-0.14z)| norm 0.2523 (-0.75z)| lr 5.68e-05 | 8445.21 ms | -100.0% bf16 MFU | 62044 tok/s +step 15807/19560 | loss 3.248863 (-1.44z)| norm 0.2621 (+0.09z)| lr 5.68e-05 | 8454.41 ms | -100.0% bf16 MFU | 62042 tok/s +step 15808/19560 | loss 3.285749 (-0.51z)| norm 0.2413 (-1.67z)| lr 5.67e-05 | 8448.30 ms | -100.0% bf16 MFU | 62043 tok/s +step 15809/19560 | loss 3.278211 (-0.70z)| norm 0.2686 (+0.66z)| lr 5.67e-05 | 8444.49 ms | -100.0% bf16 MFU | 62045 tok/s +step 15810/19560 | loss 3.302836 (-0.08z)| norm 0.2546 (-0.53z)| lr 5.67e-05 | 8445.40 ms | -100.0% bf16 MFU | 62047 tok/s +step 15811/19560 | loss 3.325108 (+0.49z)| norm 0.2672 (+0.54z)| lr 5.67e-05 | 8448.18 ms | -100.0% bf16 MFU | 62047 tok/s +step 15812/19560 | loss 3.331826 (+0.65z)| norm 0.2680 (+0.62z)| lr 5.66e-05 | 8449.18 ms | -100.0% bf16 MFU | 62048 tok/s +step 15813/19560 | loss 3.365685 (+1.48z)| norm 0.2791 (+1.56z)| lr 5.66e-05 | 8447.53 ms | -100.0% bf16 MFU | 62049 tok/s +step 15814/19560 | loss 3.253306 (-1.33z)| norm 0.2682 (+0.62z)| lr 5.66e-05 | 8448.57 ms | -100.0% bf16 MFU | 62049 tok/s +step 15815/19560 | loss 3.316858 (+0.27z)| norm 0.2847 (+1.99z)| lr 5.65e-05 | 8447.41 ms | -100.0% bf16 MFU | 62050 tok/s +step 15816/19560 | loss 3.326077 (+0.49z)| norm 0.2534 (-0.66z)| lr 5.65e-05 | 8446.22 ms | -100.0% bf16 MFU | 62051 tok/s +step 15817/19560 | loss 3.342541 (+0.91z)| norm 0.2535 (-0.63z)| lr 5.65e-05 | 8442.20 ms | -100.0% bf16 MFU | 62054 tok/s +step 15818/19560 | loss 3.312114 (+0.15z)| norm 0.2820 (+1.74z)| lr 5.64e-05 | 8442.60 ms | -100.0% bf16 MFU | 62056 tok/s +step 15819/19560 | loss 3.295829 (-0.26z)| norm 0.2461 (-1.25z)| lr 5.64e-05 | 8438.20 ms | -100.0% bf16 MFU | 62060 tok/s +step 15820/19560 | loss 3.324131 (+0.45z)| norm 0.2622 (+0.10z)| lr 5.64e-05 | 8437.18 ms | -100.0% bf16 MFU | 62064 tok/s +step 15821/19560 | loss 3.279771 (-0.66z)| norm 0.2593 (-0.15z)| lr 5.64e-05 | 8437.39 ms | -100.0% bf16 MFU | 62067 tok/s +step 15822/19560 | loss 3.378970 (+1.80z)| norm 0.2732 (+1.02z)| lr 5.63e-05 | 8434.83 ms | -100.0% bf16 MFU | 62072 tok/s +step 15823/19560 | loss 3.288748 (-0.45z)| norm 0.2567 (-0.36z)| lr 5.63e-05 | 8431.53 ms | -100.0% bf16 MFU | 62077 tok/s +step 15824/19560 | loss 3.319632 (+0.33z)| norm 0.2709 (+0.83z)| lr 5.63e-05 | 8442.66 ms | -100.0% bf16 MFU | 62079 tok/s +step 15825/19560 | loss 3.332600 (+0.64z)| norm 0.2519 (-0.76z)| lr 5.62e-05 | 8435.43 ms | -100.0% bf16 MFU | 62082 tok/s +step 15826/19560 | loss 3.309359 (+0.06z)| norm 0.2557 (-0.43z)| lr 5.62e-05 | 8433.29 ms | -100.0% bf16 MFU | 62087 tok/s +step 15827/19560 | loss 3.404954 (+2.40z)| norm 0.2660 (+0.43z)| lr 5.62e-05 | 8438.43 ms | -100.0% bf16 MFU | 62089 tok/s +step 15828/19560 | loss 3.306528 (-0.04z)| norm 0.2577 (-0.26z)| lr 5.62e-05 | 8441.20 ms | -100.0% bf16 MFU | 62090 tok/s +step 15829/19560 | loss 3.269547 (-0.94z)| norm 0.2726 (+1.03z)| lr 5.61e-05 | 8434.99 ms | -100.0% bf16 MFU | 62093 tok/s +step 15830/19560 | loss 3.287520 (-0.49z)| norm 0.2630 (+0.21z)| lr 5.61e-05 | 8430.40 ms | -100.0% bf16 MFU | 62098 tok/s +step 15831/19560 | loss 3.315528 (+0.20z)| norm 0.2559 (-0.40z)| lr 5.61e-05 | 8433.94 ms | -100.0% bf16 MFU | 62101 tok/s +step 15832/19560 | loss 3.297738 (-0.24z)| norm 0.2639 (+0.29z)| lr 5.60e-05 | 8434.95 ms | -100.0% bf16 MFU | 62104 tok/s +step 15833/19560 | loss 3.289613 (-0.44z)| norm 0.3057 (+3.66z)| lr 5.60e-05 | 8430.34 ms | -100.0% bf16 MFU | 62108 tok/s +step 15834/19560 | loss 3.322628 (+0.37z)| norm 0.2640 (+0.25z)| lr 5.60e-05 | 8430.57 ms | -100.0% bf16 MFU | 62113 tok/s +step 15835/19560 | loss 3.267938 (-0.99z)| norm 0.2648 (+0.31z)| lr 5.60e-05 | 8441.18 ms | -100.0% bf16 MFU | 62112 tok/s +step 15836/19560 | loss 3.298184 (-0.23z)| norm 0.2740 (+1.06z)| lr 5.59e-05 | 8432.91 ms | -100.0% bf16 MFU | 62115 tok/s +step 15837/19560 | loss 3.296524 (-0.29z)| norm 0.2642 (+0.25z)| lr 5.59e-05 | 8437.53 ms | -100.0% bf16 MFU | 62116 tok/s +step 15838/19560 | loss 3.306974 (-0.03z)| norm 0.2581 (-0.26z)| lr 5.59e-05 | 8436.33 ms | -100.0% bf16 MFU | 62118 tok/s +step 15839/19560 | loss 3.334440 (+0.66z)| norm 0.2726 (+0.92z)| lr 5.58e-05 | 8435.62 ms | -100.0% bf16 MFU | 62120 tok/s +step 15840/19560 | loss 3.325626 (+0.42z)| norm 0.2624 (+0.09z)| lr 5.58e-05 | 8439.10 ms | -100.0% bf16 MFU | 62120 tok/s +step 15841/19560 | loss 3.331796 (+0.57z)| norm 0.2845 (+1.86z)| lr 5.58e-05 | 8442.13 ms | -100.0% bf16 MFU | 62119 tok/s +step 15842/19560 | loss 3.322829 (+0.33z)| norm 0.2633 (+0.13z)| lr 5.57e-05 | 8436.00 ms | -100.0% bf16 MFU | 62121 tok/s +step 15843/19560 | loss 3.396449 (+2.17z)| norm 0.2726 (+0.89z)| lr 5.57e-05 | 8434.94 ms | -100.0% bf16 MFU | 62122 tok/s +step 15844/19560 | loss 3.256495 (-1.36z)| norm 0.2780 (+1.31z)| lr 5.57e-05 | 8439.13 ms | -100.0% bf16 MFU | 62123 tok/s +step 15845/19560 | loss 3.306160 (-0.11z)| norm 0.2775 (+1.25z)| lr 5.57e-05 | 8437.14 ms | -100.0% bf16 MFU | 62124 tok/s +step 15846/19560 | loss 3.308135 (-0.04z)| norm 0.2839 (+1.74z)| lr 5.56e-05 | 8438.75 ms | -100.0% bf16 MFU | 62124 tok/s +step 15847/19560 | loss 3.330667 (+0.55z)| norm 0.2887 (+2.06z)| lr 5.56e-05 | 8440.18 ms | -100.0% bf16 MFU | 62123 tok/s +step 15848/19560 | loss 3.322715 (+0.34z)| norm 0.2550 (-0.59z)| lr 5.56e-05 | 8439.45 ms | -100.0% bf16 MFU | 62123 tok/s +step 15849/19560 | loss 3.306150 (-0.09z)| norm 0.2888 (+2.02z)| lr 5.55e-05 | 8442.90 ms | -100.0% bf16 MFU | 62122 tok/s +step 15850/19560 | loss 3.338265 (+0.73z)| norm 0.2854 (+1.84z)| lr 5.55e-05 | 8438.48 ms | -100.0% bf16 MFU | 62123 tok/s +step 15851/19560 | loss 3.381943 (+1.85z)| norm 0.2724 (+0.79z)| lr 5.55e-05 | 8440.16 ms | -100.0% bf16 MFU | 62122 tok/s +step 15852/19560 | loss 3.326426 (+0.40z)| norm 0.2542 (-0.66z)| lr 5.55e-05 | 8446.26 ms | -100.0% bf16 MFU | 62120 tok/s +step 15853/19560 | loss 3.354259 (+1.12z)| norm 0.2765 (+1.11z)| lr 5.54e-05 | 8441.58 ms | -100.0% bf16 MFU | 62119 tok/s +step 15854/19560 | loss 3.263545 (-1.23z)| norm 0.2522 (-0.83z)| lr 5.54e-05 | 8440.87 ms | -100.0% bf16 MFU | 62119 tok/s +step 15855/19560 | loss 3.275029 (-0.93z)| norm 0.2552 (-0.60z)| lr 5.54e-05 | 8440.48 ms | -100.0% bf16 MFU | 62119 tok/s +step 15856/19560 | loss 3.297063 (-0.37z)| norm 0.2609 (-0.15z)| lr 5.53e-05 | 8439.71 ms | -100.0% bf16 MFU | 62119 tok/s +step 15857/19560 | loss 3.366973 (+1.43z)| norm 0.2472 (-1.23z)| lr 5.53e-05 | 8443.83 ms | -100.0% bf16 MFU | 62118 tok/s +step 15858/19560 | loss 3.300254 (-0.30z)| norm 0.2577 (-0.39z)| lr 5.53e-05 | 8436.92 ms | -100.0% bf16 MFU | 62119 tok/s +step 15859/19560 | loss 3.280254 (-0.82z)| norm 0.2589 (-0.30z)| lr 5.53e-05 | 8444.35 ms | -100.0% bf16 MFU | 62117 tok/s +step 15860/19560 | loss 3.218205 (-2.36z)| norm 0.2613 (-0.09z)| lr 5.52e-05 | 8437.96 ms | -100.0% bf16 MFU | 62118 tok/s +step 15861/19560 | loss 3.267848 (-1.09z)| norm 0.2778 (+1.21z)| lr 5.52e-05 | 8437.18 ms | -100.0% bf16 MFU | 62119 tok/s +step 15862/19560 | loss 3.298967 (-0.30z)| norm 0.2582 (-0.34z)| lr 5.52e-05 | 8439.03 ms | -100.0% bf16 MFU | 62120 tok/s +step 15863/19560 | loss 3.291190 (-0.49z)| norm 0.2437 (-1.48z)| lr 5.51e-05 | 8440.93 ms | -100.0% bf16 MFU | 62119 tok/s +step 15864/19560 | loss 3.283319 (-0.70z)| norm 0.2577 (-0.38z)| lr 5.51e-05 | 8439.73 ms | -100.0% bf16 MFU | 62119 tok/s +step 15865/19560 | loss 3.269927 (-1.02z)| norm 0.2669 (+0.34z)| lr 5.51e-05 | 8441.67 ms | -100.0% bf16 MFU | 62119 tok/s +step 15866/19560 | loss 3.365290 (+1.38z)| norm 0.2612 (-0.13z)| lr 5.51e-05 | 8437.42 ms | -100.0% bf16 MFU | 62120 tok/s +step 15867/19560 | loss 3.245653 (-1.60z)| norm 0.2578 (-0.41z)| lr 5.50e-05 | 8441.68 ms | -100.0% bf16 MFU | 62119 tok/s +step 15868/19560 | loss 3.254965 (-1.35z)| norm 0.2664 (+0.28z)| lr 5.50e-05 | 8440.90 ms | -100.0% bf16 MFU | 62119 tok/s +step 15869/19560 | loss 3.359612 (+1.25z)| norm 0.2712 (+0.65z)| lr 5.50e-05 | 8439.74 ms | -100.0% bf16 MFU | 62119 tok/s +step 15870/19560 | loss 3.293597 (-0.39z)| norm 0.2605 (-0.22z)| lr 5.49e-05 | 8442.33 ms | -100.0% bf16 MFU | 62118 tok/s +step 15871/19560 | loss 3.326874 (+0.44z)| norm 0.2646 (+0.10z)| lr 5.49e-05 | 8440.73 ms | -100.0% bf16 MFU | 62118 tok/s +step 15872/19560 | loss 3.272261 (-0.90z)| norm 0.2562 (-0.60z)| lr 5.49e-05 | 8437.85 ms | -100.0% bf16 MFU | 62119 tok/s +step 15873/19560 | loss 3.289962 (-0.47z)| norm 0.2730 (+0.79z)| lr 5.49e-05 | 8439.56 ms | -100.0% bf16 MFU | 62119 tok/s +step 15874/19560 | loss 3.285018 (-0.60z)| norm 0.2569 (-0.55z)| lr 5.48e-05 | 8441.55 ms | -100.0% bf16 MFU | 62118 tok/s +step 15875/19560 | loss 3.273728 (-0.87z)| norm 0.2530 (-0.87z)| lr 5.48e-05 | 8441.90 ms | -100.0% bf16 MFU | 62118 tok/s +step 15876/19560 | loss 3.348889 (+0.98z)| norm 0.2690 (+0.44z)| lr 5.48e-05 | 8440.57 ms | -100.0% bf16 MFU | 62118 tok/s +step 15877/19560 | loss 3.298725 (-0.26z)| norm 0.2594 (-0.36z)| lr 5.47e-05 | 8441.15 ms | -100.0% bf16 MFU | 62117 tok/s +step 15878/19560 | loss 3.350161 (+1.02z)| norm 0.2506 (-1.10z)| lr 5.47e-05 | 8442.61 ms | -100.0% bf16 MFU | 62116 tok/s +step 15879/19560 | loss 3.321483 (+0.30z)| norm 0.2526 (-0.96z)| lr 5.47e-05 | 8441.50 ms | -100.0% bf16 MFU | 62116 tok/s +step 15880/19560 | loss 3.258247 (-1.25z)| norm 0.2603 (-0.31z)| lr 5.46e-05 | 8441.73 ms | -100.0% bf16 MFU | 62116 tok/s +step 15881/19560 | loss 3.244964 (-1.55z)| norm 0.2528 (-0.95z)| lr 5.46e-05 | 8436.57 ms | -100.0% bf16 MFU | 62117 tok/s +step 15882/19560 | loss 3.352355 (+1.08z)| norm 0.2478 (-1.39z)| lr 5.46e-05 | 8440.53 ms | -100.0% bf16 MFU | 62117 tok/s +step 15883/19560 | loss 3.268456 (-0.97z)| norm 0.2668 (+0.24z)| lr 5.46e-05 | 8440.36 ms | -100.0% bf16 MFU | 62117 tok/s +step 15884/19560 | loss 3.282307 (-0.63z)| norm 0.2594 (-0.41z)| lr 5.45e-05 | 8443.79 ms | -100.0% bf16 MFU | 62116 tok/s +step 15885/19560 | loss 3.325922 (+0.43z)| norm 0.2514 (-1.09z)| lr 5.45e-05 | 8437.35 ms | -100.0% bf16 MFU | 62117 tok/s +step 15886/19560 | loss 3.292446 (-0.38z)| norm 0.2597 (-0.38z)| lr 5.45e-05 | 8444.11 ms | -100.0% bf16 MFU | 62115 tok/s +step 15887/19560 | loss 3.331613 (+0.57z)| norm 0.2490 (-1.30z)| lr 5.44e-05 | 8440.20 ms | -100.0% bf16 MFU | 62116 tok/s +step 15888/19560 | loss 3.289641 (-0.45z)| norm 0.2548 (-0.80z)| lr 5.44e-05 | 8436.94 ms | -100.0% bf16 MFU | 62117 tok/s +step 15889/19560 | loss 3.326053 (+0.43z)| norm 0.2384 (-2.18z)| lr 5.44e-05 | 8439.24 ms | -100.0% bf16 MFU | 62117 tok/s +step 15890/19560 | loss 3.307085 (-0.04z)| norm 0.2541 (-0.84z)| lr 5.44e-05 | 8433.24 ms | -100.0% bf16 MFU | 62120 tok/s +step 15891/19560 | loss 3.322186 (+0.34z)| norm 0.2627 (-0.11z)| lr 5.43e-05 | 8441.11 ms | -100.0% bf16 MFU | 62119 tok/s +step 15892/19560 | loss 3.360934 (+1.26z)| norm 0.2465 (-1.46z)| lr 5.43e-05 | 8439.23 ms | -100.0% bf16 MFU | 62120 tok/s +step 15893/19560 | loss 3.252347 (-1.36z)| norm 0.2654 (+0.13z)| lr 5.43e-05 | 8440.20 ms | -100.0% bf16 MFU | 62120 tok/s +step 15894/19560 | loss 3.258990 (-1.18z)| norm 0.2546 (-0.78z)| lr 5.42e-05 | 8436.93 ms | -100.0% bf16 MFU | 62121 tok/s +step 15895/19560 | loss 3.362607 (+1.29z)| norm 0.2498 (-1.17z)| lr 5.42e-05 | 8440.54 ms | -100.0% bf16 MFU | 62120 tok/s +step 15896/19560 | loss 3.258927 (-1.17z)| norm 0.2599 (-0.32z)| lr 5.42e-05 | 8442.65 ms | -100.0% bf16 MFU | 62119 tok/s +step 15897/19560 | loss 3.306349 (-0.01z)| norm 0.2631 (-0.04z)| lr 5.42e-05 | 8439.89 ms | -100.0% bf16 MFU | 62120 tok/s +step 15898/19560 | loss 3.350276 (+1.10z)| norm 0.2493 (-1.21z)| lr 5.41e-05 | 8441.20 ms | -100.0% bf16 MFU | 62119 tok/s +step 15899/19560 | loss 3.280539 (-0.69z)| norm 0.2721 (+0.72z)| lr 5.41e-05 | 8439.23 ms | -100.0% bf16 MFU | 62119 tok/s +step 15900/19560 | loss 3.257881 (-1.26z)| norm 0.2686 (+0.42z)| lr 5.41e-05 | 8440.09 ms | -100.0% bf16 MFU | 62119 tok/s +step 15901/19560 | loss 3.318635 (+0.32z)| norm 0.2656 (+0.18z)| lr 5.40e-05 | 8442.79 ms | -100.0% bf16 MFU | 62118 tok/s +step 15902/19560 | loss 3.265445 (-1.05z)| norm 0.2703 (+0.57z)| lr 5.40e-05 | 8438.19 ms | -100.0% bf16 MFU | 62119 tok/s +step 15903/19560 | loss 3.280677 (-0.65z)| norm 0.2765 (+1.09z)| lr 5.40e-05 | 8439.91 ms | -100.0% bf16 MFU | 62119 tok/s +step 15904/19560 | loss 3.306343 (+0.02z)| norm 0.2572 (-0.55z)| lr 5.40e-05 | 8437.71 ms | -100.0% bf16 MFU | 62120 tok/s +step 15905/19560 | loss 3.309871 (+0.13z)| norm 0.2828 (+1.63z)| lr 5.39e-05 | 8436.69 ms | -100.0% bf16 MFU | 62121 tok/s +step 15906/19560 | loss 3.356399 (+1.34z)| norm 0.2634 (-0.01z)| lr 5.39e-05 | 8439.10 ms | -100.0% bf16 MFU | 62121 tok/s +step 15907/19560 | loss 3.257365 (-1.28z)| norm 0.2598 (-0.31z)| lr 5.39e-05 | 8437.94 ms | -100.0% bf16 MFU | 62122 tok/s +step 15908/19560 | loss 3.261508 (-1.15z)| norm 0.2792 (+1.35z)| lr 5.38e-05 | 8440.34 ms | -100.0% bf16 MFU | 62122 tok/s +step 15909/19560 | loss 3.298803 (-0.12z)| norm 0.2526 (-0.92z)| lr 5.38e-05 | 8438.45 ms | -100.0% bf16 MFU | 62122 tok/s +step 15910/19560 | loss 3.298028 (-0.14z)| norm 0.2503 (-1.11z)| lr 5.38e-05 | 8438.76 ms | -100.0% bf16 MFU | 62123 tok/s +step 15911/19560 | loss 3.282608 (-0.56z)| norm 0.2762 (+1.09z)| lr 5.38e-05 | 8439.34 ms | -100.0% bf16 MFU | 62123 tok/s +step 15912/19560 | loss 3.284055 (-0.52z)| norm 0.2535 (-0.85z)| lr 5.37e-05 | 8434.98 ms | -100.0% bf16 MFU | 62124 tok/s +step 15913/19560 | loss 3.234955 (-1.85z)| norm 0.2526 (-0.91z)| lr 5.37e-05 | 8438.30 ms | -100.0% bf16 MFU | 62125 tok/s +step 15914/19560 | loss 3.446495 (+3.68z)| norm 0.2482 (-1.27z)| lr 5.37e-05 | 8439.08 ms | -100.0% bf16 MFU | 62125 tok/s +step 15915/19560 | loss 3.280043 (-0.63z)| norm 0.3483 (+6.06z)| lr 5.36e-05 | 8437.10 ms | -100.0% bf16 MFU | 62126 tok/s +step 15916/19560 | loss 3.269818 (-0.90z)| norm 0.2671 (+0.24z)| lr 5.36e-05 | 8439.78 ms | -100.0% bf16 MFU | 62125 tok/s +step 15917/19560 | loss 3.295838 (-0.21z)| norm 0.2656 (+0.13z)| lr 5.36e-05 | 8436.08 ms | -100.0% bf16 MFU | 62126 tok/s +step 15918/19560 | loss 3.270329 (-0.87z)| norm 0.2703 (+0.46z)| lr 5.36e-05 | 8434.08 ms | -100.0% bf16 MFU | 62128 tok/s +step 15919/19560 | loss 3.244287 (-1.53z)| norm 0.2546 (-0.67z)| lr 5.35e-05 | 8433.92 ms | -100.0% bf16 MFU | 62130 tok/s +step 15920/19560 | loss 3.237313 (-1.71z)| norm 0.2623 (-0.12z)| lr 5.35e-05 | 8430.50 ms | -100.0% bf16 MFU | 62133 tok/s +step 15921/19560 | loss 3.275221 (-0.70z)| norm 0.2631 (-0.07z)| lr 5.35e-05 | 8430.09 ms | -100.0% bf16 MFU | 62136 tok/s +step 15922/19560 | loss 3.512075 (+4.93z)| norm 0.2807 (+1.18z)| lr 5.34e-05 | 8428.85 ms | -100.0% bf16 MFU | 62139 tok/s +step 15923/19560 | loss 3.310787 (+0.17z)| norm 0.2506 (-0.99z)| lr 5.34e-05 | 8425.26 ms | -100.0% bf16 MFU | 62144 tok/s +step 15924/19560 | loss 3.279639 (-0.57z)| norm 0.2563 (-0.56z)| lr 5.34e-05 | 8427.83 ms | -100.0% bf16 MFU | 62147 tok/s +step 15925/19560 | loss 3.385967 (+1.91z)| norm 0.2543 (-0.70z)| lr 5.34e-05 | 8429.28 ms | -100.0% bf16 MFU | 62150 tok/s +step 15926/19560 | loss 3.323399 (+0.44z)| norm 0.2458 (-1.31z)| lr 5.33e-05 | 8428.67 ms | -100.0% bf16 MFU | 62152 tok/s +step 15927/19560 | loss 3.298459 (-0.16z)| norm 0.2599 (-0.27z)| lr 5.33e-05 | 8425.89 ms | -100.0% bf16 MFU | 62156 tok/s +step 15928/19560 | loss 3.305397 (+0.01z)| norm 0.2545 (-0.66z)| lr 5.33e-05 | 8429.29 ms | -100.0% bf16 MFU | 62158 tok/s +step 15929/19560 | loss 3.255929 (-1.15z)| norm 0.2459 (-1.28z)| lr 5.32e-05 | 8431.61 ms | -100.0% bf16 MFU | 62159 tok/s +step 15930/19560 | loss 3.282932 (-0.53z)| norm 0.2502 (-0.94z)| lr 5.32e-05 | 8432.28 ms | -100.0% bf16 MFU | 62160 tok/s +step 15931/19560 | loss 3.297353 (-0.18z)| norm 0.2606 (-0.17z)| lr 5.32e-05 | 8431.93 ms | -100.0% bf16 MFU | 62161 tok/s +step 15932/19560 | loss 3.310944 (+0.14z)| norm 0.2469 (-1.17z)| lr 5.32e-05 | 8429.92 ms | -100.0% bf16 MFU | 62163 tok/s +step 15933/19560 | loss 3.266846 (-0.90z)| norm 0.2449 (-1.31z)| lr 5.31e-05 | 8446.46 ms | -100.0% bf16 MFU | 62158 tok/s +step 15934/19560 | loss 3.270247 (-0.81z)| norm 0.2458 (-1.23z)| lr 5.31e-05 | 8454.38 ms | -100.0% bf16 MFU | 62151 tok/s +step 15935/19560 | loss 3.309802 (+0.11z)| norm 0.2516 (-0.81z)| lr 5.31e-05 | 8455.48 ms | -100.0% bf16 MFU | 62144 tok/s +step 15936/19560 | loss 3.284694 (-0.48z)| norm 0.2418 (-1.52z)| lr 5.31e-05 | 8461.20 ms | -100.0% bf16 MFU | 62135 tok/s +step 15937/19560 | loss 3.300324 (-0.12z)| norm 0.2481 (-1.05z)| lr 5.30e-05 | 8456.24 ms | -100.0% bf16 MFU | 62128 tok/s +step 15938/19560 | loss 3.262914 (-0.99z)| norm 0.2527 (-0.71z)| lr 5.30e-05 | 8462.57 ms | -100.0% bf16 MFU | 62119 tok/s +step 15939/19560 | loss 3.319387 (+0.34z)| norm 0.2641 (+0.12z)| lr 5.30e-05 | 8459.73 ms | -100.0% bf16 MFU | 62112 tok/s +step 15940/19560 | loss 3.257546 (-1.10z)| norm 0.2574 (-0.36z)| lr 5.29e-05 | 8458.31 ms | -100.0% bf16 MFU | 62106 tok/s +step 15941/19560 | loss 3.294904 (-0.21z)| norm 0.2644 (+0.15z)| lr 5.29e-05 | 8451.82 ms | -100.0% bf16 MFU | 62102 tok/s +step 15942/19560 | loss 3.340041 (+0.85z)| norm 0.2707 (+0.61z)| lr 5.29e-05 | 8454.21 ms | -100.0% bf16 MFU | 62098 tok/s +step 15943/19560 | loss 3.283916 (-0.48z)| norm 0.2821 (+1.44z)| lr 5.29e-05 | 8456.67 ms | -100.0% bf16 MFU | 62093 tok/s +step 15944/19560 | loss 3.234129 (-1.63z)| norm 0.2572 (-0.38z)| lr 5.28e-05 | 8462.86 ms | -100.0% bf16 MFU | 62085 tok/s +step 15945/19560 | loss 3.348372 (+1.06z)| norm 0.2589 (-0.26z)| lr 5.28e-05 | 8451.75 ms | -100.0% bf16 MFU | 62083 tok/s +step 15946/19560 | loss 3.357943 (+1.27z)| norm 0.2665 (+0.31z)| lr 5.28e-05 | 8458.18 ms | -100.0% bf16 MFU | 62078 tok/s +step 15947/19560 | loss 3.283980 (-0.46z)| norm 0.2692 (+0.50z)| lr 5.27e-05 | 8455.07 ms | -100.0% bf16 MFU | 62075 tok/s +step 15948/19560 | loss 3.318129 (+0.34z)| norm 0.2456 (-1.23z)| lr 5.27e-05 | 8464.99 ms | -100.0% bf16 MFU | 62068 tok/s +step 15949/19560 | loss 3.341655 (+0.87z)| norm 0.2828 (+1.48z)| lr 5.27e-05 | 8457.36 ms | -100.0% bf16 MFU | 62064 tok/s +step 15950/19560 | loss 3.293782 (-0.23z)| norm 0.2814 (+1.36z)| lr 5.27e-05 | 8457.96 ms | -100.0% bf16 MFU | 62060 tok/s +step 15951/19560 | loss 3.273589 (-0.70z)| norm 0.2576 (-0.36z)| lr 5.26e-05 | 8456.12 ms | -100.0% bf16 MFU | 62057 tok/s +step 15952/19560 | loss 3.312009 (+0.20z)| norm 0.2687 (+0.44z)| lr 5.26e-05 | 8455.42 ms | -100.0% bf16 MFU | 62055 tok/s +step 15953/19560 | loss 3.230320 (-1.69z)| norm 0.2903 (+1.97z)| lr 5.26e-05 | 8452.63 ms | -100.0% bf16 MFU | 62053 tok/s +step 15954/19560 | loss 3.353076 (+1.17z)| norm 0.2767 (+0.98z)| lr 5.25e-05 | 8452.40 ms | -100.0% bf16 MFU | 62052 tok/s +step 15955/19560 | loss 3.326959 (+0.59z)| norm 0.2567 (-0.45z)| lr 5.25e-05 | 8454.32 ms | -100.0% bf16 MFU | 62050 tok/s +step 15956/19560 | loss 3.370125 (+1.58z)| norm 0.2736 (+0.75z)| lr 5.25e-05 | 8453.88 ms | -100.0% bf16 MFU | 62048 tok/s +step 15957/19560 | loss 3.296157 (-0.16z)| norm 0.2791 (+1.14z)| lr 5.25e-05 | 8452.19 ms | -100.0% bf16 MFU | 62047 tok/s +step 15958/19560 | loss 3.252592 (-1.18z)| norm 0.2832 (+1.41z)| lr 5.24e-05 | 8451.19 ms | -100.0% bf16 MFU | 62047 tok/s +step 15959/19560 | loss 3.321543 (+0.44z)| norm 0.2554 (-0.55z)| lr 5.24e-05 | 8453.07 ms | -100.0% bf16 MFU | 62046 tok/s +step 15960/19560 | loss 3.311622 (+0.21z)| norm 0.2556 (-0.53z)| lr 5.24e-05 | 8453.71 ms | -100.0% bf16 MFU | 62044 tok/s +step 15961/19560 | loss 3.297795 (-0.12z)| norm 0.2648 (+0.14z)| lr 5.23e-05 | 8449.67 ms | -100.0% bf16 MFU | 62045 tok/s +step 15962/19560 | loss 3.316395 (+0.32z)| norm 0.2709 (+0.58z)| lr 5.23e-05 | 8454.80 ms | -100.0% bf16 MFU | 62043 tok/s +step 15963/19560 | loss 3.333766 (+0.71z)| norm 0.2688 (+0.42z)| lr 5.23e-05 | 8453.85 ms | -100.0% bf16 MFU | 62042 tok/s +step 15964/19560 | loss 3.261024 (-0.98z)| norm 0.2575 (-0.39z)| lr 5.23e-05 | 8452.15 ms | -100.0% bf16 MFU | 62041 tok/s +step 15965/19560 | loss 3.264682 (-0.89z)| norm 0.2565 (-0.46z)| lr 5.22e-05 | 8450.95 ms | -100.0% bf16 MFU | 62041 tok/s +step 15966/19560 | loss 3.328644 (+0.60z)| norm 0.2453 (-1.26z)| lr 5.22e-05 | 8454.34 ms | -100.0% bf16 MFU | 62040 tok/s +step 15967/19560 | loss 3.270185 (-0.75z)| norm 0.2713 (+0.63z)| lr 5.22e-05 | 8457.81 ms | -100.0% bf16 MFU | 62037 tok/s +step 15968/19560 | loss 3.240951 (-1.41z)| norm 0.2425 (-1.44z)| lr 5.21e-05 | 8451.39 ms | -100.0% bf16 MFU | 62037 tok/s +step 15969/19560 | loss 3.317243 (+0.36z)| norm 0.2556 (-0.49z)| lr 5.21e-05 | 8453.31 ms | -100.0% bf16 MFU | 62036 tok/s +step 15970/19560 | loss 3.298899 (-0.06z)| norm 0.2670 (+0.34z)| lr 5.21e-05 | 8451.78 ms | -100.0% bf16 MFU | 62036 tok/s +step 15971/19560 | loss 3.269233 (-0.74z)| norm 0.2475 (-1.06z)| lr 5.21e-05 | 8443.18 ms | -100.0% bf16 MFU | 62039 tok/s +step 15972/19560 | loss 3.264325 (-0.86z)| norm 0.2604 (-0.12z)| lr 5.20e-05 | 8453.27 ms | -100.0% bf16 MFU | 62038 tok/s +step 15973/19560 | loss 3.333435 (+0.77z)| norm 0.2474 (-1.05z)| lr 5.20e-05 | 8457.71 ms | -100.0% bf16 MFU | 62036 tok/s +step 15974/19560 | loss 3.226524 (-1.72z)| norm 0.3070 (+3.18z)| lr 5.20e-05 | 8454.20 ms | -100.0% bf16 MFU | 62035 tok/s +step 15975/19560 | loss 3.315612 (+0.36z)| norm 0.2629 (+0.08z)| lr 5.19e-05 | 8450.14 ms | -100.0% bf16 MFU | 62035 tok/s +step 15976/19560 | loss 3.315358 (+0.36z)| norm 0.2689 (+0.51z)| lr 5.19e-05 | 8450.46 ms | -100.0% bf16 MFU | 62036 tok/s +step 15977/19560 | loss 3.326620 (+0.62z)| norm 0.2636 (+0.14z)| lr 5.19e-05 | 8450.24 ms | -100.0% bf16 MFU | 62036 tok/s +step 15978/19560 | loss 3.326023 (+0.61z)| norm 0.2674 (+0.43z)| lr 5.19e-05 | 8450.10 ms | -100.0% bf16 MFU | 62036 tok/s +step 15979/19560 | loss 3.287637 (-0.28z)| norm 0.2694 (+0.58z)| lr 5.18e-05 | 8445.19 ms | -100.0% bf16 MFU | 62039 tok/s +step 15980/19560 | loss 3.337959 (+0.91z)| norm 0.2611 (-0.03z)| lr 5.18e-05 | 8445.70 ms | -100.0% bf16 MFU | 62041 tok/s +step 15981/19560 | loss 3.299852 (+0.02z)| norm 0.2660 (+0.34z)| lr 5.18e-05 | 8449.37 ms | -100.0% bf16 MFU | 62041 tok/s +step 15982/19560 | loss 3.295211 (-0.10z)| norm 0.3178 (+3.88z)| lr 5.18e-05 | 8448.04 ms | -100.0% bf16 MFU | 62042 tok/s +step 15983/19560 | loss 3.274411 (-0.59z)| norm 0.2846 (+1.55z)| lr 5.17e-05 | 8447.24 ms | -100.0% bf16 MFU | 62043 tok/s +step 15984/19560 | loss 3.333078 (+0.80z)| norm 0.2690 (+0.47z)| lr 5.17e-05 | 8445.92 ms | -100.0% bf16 MFU | 62045 tok/s +step 15985/19560 | loss 3.315423 (+0.39z)| norm 0.2655 (+0.21z)| lr 5.17e-05 | 8447.20 ms | -100.0% bf16 MFU | 62046 tok/s +step 15986/19560 | loss 3.289947 (-0.22z)| norm 0.2708 (+0.58z)| lr 5.16e-05 | 8448.55 ms | -100.0% bf16 MFU | 62047 tok/s +step 15987/19560 | loss 3.274088 (-0.60z)| norm 0.2793 (+1.15z)| lr 5.16e-05 | 8447.18 ms | -100.0% bf16 MFU | 62048 tok/s +step 15988/19560 | loss 3.293724 (-0.14z)| norm 0.2525 (-0.69z)| lr 5.16e-05 | 8446.00 ms | -100.0% bf16 MFU | 62049 tok/s +step 15989/19560 | loss 3.377129 (+1.85z)| norm 0.2694 (+0.47z)| lr 5.16e-05 | 8447.18 ms | -100.0% bf16 MFU | 62050 tok/s +step 15990/19560 | loss 3.277023 (-0.56z)| norm 0.2705 (+0.54z)| lr 5.15e-05 | 8447.42 ms | -100.0% bf16 MFU | 62051 tok/s +step 15991/19560 | loss 3.351405 (+1.21z)| norm 0.2623 (-0.03z)| lr 5.15e-05 | 8449.70 ms | -100.0% bf16 MFU | 62050 tok/s +step 15992/19560 | loss 3.348449 (+1.13z)| norm 0.2729 (+0.69z)| lr 5.15e-05 | 8443.79 ms | -100.0% bf16 MFU | 62052 tok/s +step 15993/19560 | loss 3.341562 (+0.95z)| norm 0.2846 (+1.48z)| lr 5.14e-05 | 8445.58 ms | -100.0% bf16 MFU | 62054 tok/s +step 15994/19560 | loss 3.280313 (-0.50z)| norm 0.2455 (-1.19z)| lr 5.14e-05 | 8442.09 ms | -100.0% bf16 MFU | 62056 tok/s +step 15995/19560 | loss 3.314406 (+0.31z)| norm 0.4967 (+9.18z)| lr 5.14e-05 | 8450.20 ms | -100.0% bf16 MFU | 62056 tok/s +step 15996/19560 | loss 3.343067 (+0.98z)| norm 0.2548 (-0.39z)| lr 5.14e-05 | 8440.51 ms | -100.0% bf16 MFU | 62059 tok/s +step 15997/19560 | loss 3.323142 (+0.51z)| norm 0.2727 (+0.32z)| lr 5.13e-05 | 8444.85 ms | -100.0% bf16 MFU | 62060 tok/s +step 15998/19560 | loss 3.281425 (-0.50z)| norm 0.2623 (-0.10z)| lr 5.13e-05 | 8442.48 ms | -100.0% bf16 MFU | 62062 tok/s +step 15999/19560 | loss 3.310113 (+0.20z)| norm 0.2500 (-0.58z)| lr 5.13e-05 | 8444.69 ms | -100.0% bf16 MFU | 62063 tok/s +step 16000/19560 | loss 3.312900 (+0.26z)| norm 0.2616 (-0.12z)| lr 5.12e-05 | 8446.51 ms | -100.0% bf16 MFU | 62064 tok/s +val loss 3.282171 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2996/10042 = 0.298347 +step 16001/19560 | loss 3.328535 (+0.63z)| norm 0.2578 (-0.27z)| lr 5.12e-05 | 8445.95 ms | -100.0% bf16 MFU | 62064 tok/s +step 16002/19560 | loss 3.325320 (+0.55z)| norm 0.2544 (-0.40z)| lr 5.12e-05 | 8440.51 ms | -100.0% bf16 MFU | 62067 tok/s +step 16003/19560 | loss 3.310502 (+0.18z)| norm 0.3008 (+1.41z)| lr 5.12e-05 | 8443.69 ms | -100.0% bf16 MFU | 62068 tok/s +step 16004/19560 | loss 3.338227 (+0.86z)| norm 0.2868 (+0.85z)| lr 5.11e-05 | 8443.67 ms | -100.0% bf16 MFU | 62069 tok/s +step 16005/19560 | loss 3.356292 (+1.28z)| norm 0.2580 (-0.27z)| lr 5.11e-05 | 8442.93 ms | -100.0% bf16 MFU | 62071 tok/s +step 16006/19560 | loss 3.296058 (-0.17z)| norm 0.2839 (+0.73z)| lr 5.11e-05 | 8450.23 ms | -100.0% bf16 MFU | 62069 tok/s +step 16007/19560 | loss 3.305910 (+0.07z)| norm 0.2634 (-0.08z)| lr 5.11e-05 | 8440.95 ms | -100.0% bf16 MFU | 62072 tok/s +step 16008/19560 | loss 3.331198 (+0.68z)| norm 0.2651 (-0.01z)| lr 5.10e-05 | 8448.12 ms | -100.0% bf16 MFU | 62071 tok/s +step 16009/19560 | loss 3.303769 (-0.00z)| norm 0.2722 (+0.26z)| lr 5.10e-05 | 8439.31 ms | -100.0% bf16 MFU | 62074 tok/s +step 16010/19560 | loss 3.328172 (+0.61z)| norm 0.2662 (+0.02z)| lr 5.10e-05 | 8442.59 ms | -100.0% bf16 MFU | 62075 tok/s +step 16011/19560 | loss 3.286145 (-0.44z)| norm 0.2767 (+0.43z)| lr 5.09e-05 | 8438.04 ms | -100.0% bf16 MFU | 62078 tok/s +step 16012/19560 | loss 3.275397 (-0.71z)| norm 0.2532 (-0.49z)| lr 5.09e-05 | 8443.54 ms | -100.0% bf16 MFU | 62079 tok/s +step 16013/19560 | loss 3.283815 (-0.49z)| norm 0.2755 (+0.37z)| lr 5.09e-05 | 8445.75 ms | -100.0% bf16 MFU | 62079 tok/s +step 16014/19560 | loss 3.235506 (-1.66z)| norm 0.2534 (-0.49z)| lr 5.09e-05 | 8446.25 ms | -100.0% bf16 MFU | 62078 tok/s +step 16015/19560 | loss 3.287404 (-0.38z)| norm 0.2754 (+0.37z)| lr 5.08e-05 | 8445.08 ms | -100.0% bf16 MFU | 62079 tok/s +step 16016/19560 | loss 3.262589 (-0.98z)| norm 0.2490 (-0.67z)| lr 5.08e-05 | 8440.08 ms | -100.0% bf16 MFU | 62081 tok/s +step 16017/19560 | loss 3.260735 (-1.01z)| norm 0.2534 (-0.50z)| lr 5.08e-05 | 8440.24 ms | -100.0% bf16 MFU | 62082 tok/s +step 16018/19560 | loss 3.282160 (-0.48z)| norm 0.2694 (+0.12z)| lr 5.07e-05 | 8442.60 ms | -100.0% bf16 MFU | 62083 tok/s +step 16019/19560 | loss 3.293761 (-0.19z)| norm 0.2617 (-0.18z)| lr 5.07e-05 | 8440.72 ms | -100.0% bf16 MFU | 62085 tok/s +step 16020/19560 | loss 3.248393 (-1.28z)| norm 0.2536 (-0.50z)| lr 5.07e-05 | 8442.80 ms | -100.0% bf16 MFU | 62086 tok/s +step 16021/19560 | loss 3.269831 (-0.76z)| norm 0.2503 (-0.62z)| lr 5.07e-05 | 8445.01 ms | -100.0% bf16 MFU | 62085 tok/s +step 16022/19560 | loss 3.263184 (-0.93z)| norm 0.2596 (-0.26z)| lr 5.06e-05 | 8447.81 ms | -100.0% bf16 MFU | 62084 tok/s +step 16023/19560 | loss 3.325230 (+0.61z)| norm 0.2589 (-0.29z)| lr 5.06e-05 | 8443.84 ms | -100.0% bf16 MFU | 62085 tok/s +step 16024/19560 | loss 3.307717 (+0.17z)| norm 0.2483 (-0.70z)| lr 5.06e-05 | 8441.49 ms | -100.0% bf16 MFU | 62086 tok/s +step 16025/19560 | loss 3.309906 (+0.22z)| norm 0.2541 (-0.47z)| lr 5.06e-05 | 8443.90 ms | -100.0% bf16 MFU | 62086 tok/s +step 16026/19560 | loss 3.276122 (-0.61z)| norm 0.2723 (+0.24z)| lr 5.05e-05 | 8441.20 ms | -100.0% bf16 MFU | 62087 tok/s +step 16027/19560 | loss 3.295954 (-0.11z)| norm 0.2499 (-0.64z)| lr 5.05e-05 | 8446.32 ms | -100.0% bf16 MFU | 62087 tok/s +step 16028/19560 | loss 3.315244 (+0.36z)| norm 0.2595 (-0.26z)| lr 5.05e-05 | 8442.84 ms | -100.0% bf16 MFU | 62087 tok/s +step 16029/19560 | loss 3.318787 (+0.45z)| norm 0.2559 (-0.40z)| lr 5.04e-05 | 8445.51 ms | -100.0% bf16 MFU | 62087 tok/s +step 16030/19560 | loss 3.352764 (+1.28z)| norm 0.2895 (+0.92z)| lr 5.04e-05 | 8443.14 ms | -100.0% bf16 MFU | 62087 tok/s +step 16031/19560 | loss 3.329576 (+0.69z)| norm 0.2561 (-0.39z)| lr 5.04e-05 | 8445.50 ms | -100.0% bf16 MFU | 62087 tok/s +step 16032/19560 | loss 3.231452 (-1.73z)| norm 0.2658 (-0.01z)| lr 5.04e-05 | 8440.35 ms | -100.0% bf16 MFU | 62088 tok/s +step 16033/19560 | loss 3.287494 (-0.34z)| norm 0.2637 (-0.09z)| lr 5.03e-05 | 8443.53 ms | -100.0% bf16 MFU | 62089 tok/s +step 16034/19560 | loss 3.256918 (-1.08z)| norm 0.2743 (+0.33z)| lr 5.03e-05 | 8441.84 ms | -100.0% bf16 MFU | 62089 tok/s +step 16035/19560 | loss 3.314673 (+0.34z)| norm 0.2627 (-0.13z)| lr 5.03e-05 | 8444.19 ms | -100.0% bf16 MFU | 62089 tok/s +step 16036/19560 | loss 3.328138 (+0.67z)| norm 0.2473 (-0.72z)| lr 5.02e-05 | 8448.56 ms | -100.0% bf16 MFU | 62088 tok/s +step 16037/19560 | loss 3.347250 (+1.13z)| norm 0.2575 (-0.32z)| lr 5.02e-05 | 8443.11 ms | -100.0% bf16 MFU | 62088 tok/s +step 16038/19560 | loss 3.197316 (-2.51z)| norm 0.2630 (-0.11z)| lr 5.02e-05 | 8441.49 ms | -100.0% bf16 MFU | 62089 tok/s +step 16039/19560 | loss 3.322018 (+0.50z)| norm 0.2468 (-0.74z)| lr 5.02e-05 | 8440.32 ms | -100.0% bf16 MFU | 62091 tok/s +step 16040/19560 | loss 3.253196 (-1.16z)| norm 0.2529 (-0.50z)| lr 5.01e-05 | 8439.97 ms | -100.0% bf16 MFU | 62092 tok/s +step 16041/19560 | loss 3.298047 (-0.09z)| norm 0.2489 (-0.66z)| lr 5.01e-05 | 8439.25 ms | -100.0% bf16 MFU | 62094 tok/s +step 16042/19560 | loss 3.256074 (-1.12z)| norm 0.2438 (-0.85z)| lr 5.01e-05 | 8441.86 ms | -100.0% bf16 MFU | 62094 tok/s +step 16043/19560 | loss 3.293173 (-0.18z)| norm 0.2622 (-0.11z)| lr 5.01e-05 | 8442.22 ms | -100.0% bf16 MFU | 62095 tok/s +step 16044/19560 | loss 3.317626 (+0.43z)| norm 0.2624 (-0.10z)| lr 5.00e-05 | 8440.66 ms | -100.0% bf16 MFU | 62096 tok/s +step 16045/19560 | loss 3.289673 (-0.28z)| norm 0.2770 (+0.49z)| lr 5.00e-05 | 8443.97 ms | -100.0% bf16 MFU | 62095 tok/s +step 16046/19560 | loss 3.296339 (-0.11z)| norm 0.2480 (-0.68z)| lr 5.00e-05 | 8439.74 ms | -100.0% bf16 MFU | 62097 tok/s +step 16047/19560 | loss 3.302434 (+0.03z)| norm 0.2668 (+0.08z)| lr 4.99e-05 | 8440.43 ms | -100.0% bf16 MFU | 62098 tok/s +step 16048/19560 | loss 3.284789 (-0.44z)| norm 0.2699 (+0.20z)| lr 4.99e-05 | 8439.47 ms | -100.0% bf16 MFU | 62099 tok/s +step 16049/19560 | loss 3.287292 (-0.38z)| norm 0.2689 (+0.16z)| lr 4.99e-05 | 8440.55 ms | -100.0% bf16 MFU | 62100 tok/s +step 16050/19560 | loss 3.263914 (-1.07z)| norm 0.2482 (-0.68z)| lr 4.99e-05 | 8440.32 ms | -100.0% bf16 MFU | 62101 tok/s +step 16051/19560 | loss 3.304453 (+0.14z)| norm 0.2561 (-0.36z)| lr 4.98e-05 | 8444.06 ms | -100.0% bf16 MFU | 62100 tok/s +step 16052/19560 | loss 3.305668 (+0.17z)| norm 0.2652 (+0.01z)| lr 4.98e-05 | 8438.00 ms | -100.0% bf16 MFU | 62102 tok/s +step 16053/19560 | loss 3.293051 (-0.19z)| norm 0.2544 (-0.42z)| lr 4.98e-05 | 8440.70 ms | -100.0% bf16 MFU | 62102 tok/s +step 16054/19560 | loss 3.313997 (+0.46z)| norm 0.2804 (+0.62z)| lr 4.97e-05 | 8439.76 ms | -100.0% bf16 MFU | 62103 tok/s +step 16055/19560 | loss 3.398490 (+2.92z)| norm 0.2685 (+0.14z)| lr 4.97e-05 | 8441.38 ms | -100.0% bf16 MFU | 62104 tok/s +step 16056/19560 | loss 3.265299 (-1.01z)| norm 0.2837 (+0.74z)| lr 4.97e-05 | 8436.54 ms | -100.0% bf16 MFU | 62106 tok/s +step 16057/19560 | loss 3.274533 (-0.75z)| norm 0.2785 (+0.52z)| lr 4.97e-05 | 8438.48 ms | -100.0% bf16 MFU | 62107 tok/s +step 16058/19560 | loss 3.229068 (-2.05z)| norm 0.2647 (-0.04z)| lr 4.96e-05 | 8441.64 ms | -100.0% bf16 MFU | 62107 tok/s +step 16059/19560 | loss 3.282630 (-0.48z)| norm 0.2797 (+0.56z)| lr 4.96e-05 | 8439.39 ms | -100.0% bf16 MFU | 62108 tok/s +step 16060/19560 | loss 3.279542 (-0.57z)| norm 0.2780 (+0.49z)| lr 4.96e-05 | 8440.18 ms | -100.0% bf16 MFU | 62108 tok/s +step 16061/19560 | loss 3.305121 (+0.17z)| norm 0.2594 (-0.28z)| lr 4.96e-05 | 8439.97 ms | -100.0% bf16 MFU | 62109 tok/s +step 16062/19560 | loss 3.219149 (-2.29z)| norm 0.2991 (+1.33z)| lr 4.95e-05 | 8437.67 ms | -100.0% bf16 MFU | 62110 tok/s +step 16063/19560 | loss 3.266706 (-0.91z)| norm 0.2603 (-0.27z)| lr 4.95e-05 | 8441.70 ms | -100.0% bf16 MFU | 62110 tok/s +step 16064/19560 | loss 3.331880 (+0.94z)| norm 0.2657 (-0.05z)| lr 4.95e-05 | 8443.75 ms | -100.0% bf16 MFU | 62109 tok/s +step 16065/19560 | loss 3.298735 (-0.00z)| norm 0.2549 (-0.50z)| lr 4.94e-05 | 8436.66 ms | -100.0% bf16 MFU | 62111 tok/s +step 16066/19560 | loss 3.299553 (+0.01z)| norm 0.2549 (-0.50z)| lr 4.94e-05 | 8440.93 ms | -100.0% bf16 MFU | 62111 tok/s +step 16067/19560 | loss 3.321614 (+0.64z)| norm 0.2655 (-0.06z)| lr 4.94e-05 | 8440.01 ms | -100.0% bf16 MFU | 62111 tok/s +step 16068/19560 | loss 3.301771 (+0.07z)| norm 0.2612 (-0.24z)| lr 4.94e-05 | 8441.18 ms | -100.0% bf16 MFU | 62111 tok/s +step 16069/19560 | loss 3.357154 (+1.63z)| norm 0.2473 (-0.81z)| lr 4.93e-05 | 8442.51 ms | -100.0% bf16 MFU | 62111 tok/s +step 16070/19560 | loss 3.335692 (+1.02z)| norm 0.2530 (-0.57z)| lr 4.93e-05 | 8441.45 ms | -100.0% bf16 MFU | 62111 tok/s +step 16071/19560 | loss 3.300274 (+0.01z)| norm 0.2545 (-0.50z)| lr 4.93e-05 | 8438.03 ms | -100.0% bf16 MFU | 62112 tok/s +step 16072/19560 | loss 3.274077 (-0.76z)| norm 0.2546 (-0.49z)| lr 4.93e-05 | 8439.23 ms | -100.0% bf16 MFU | 62113 tok/s +step 16073/19560 | loss 3.316850 (+0.49z)| norm 0.2783 (+0.48z)| lr 4.92e-05 | 8437.41 ms | -100.0% bf16 MFU | 62114 tok/s +step 16074/19560 | loss 3.314983 (+0.45z)| norm 0.2781 (+0.46z)| lr 4.92e-05 | 8437.96 ms | -100.0% bf16 MFU | 62115 tok/s +step 16075/19560 | loss 3.269076 (-0.90z)| norm 0.2593 (-0.30z)| lr 4.92e-05 | 8440.45 ms | -100.0% bf16 MFU | 62115 tok/s +step 16076/19560 | loss 3.305939 (+0.19z)| norm 0.2616 (-0.22z)| lr 4.91e-05 | 8440.12 ms | -100.0% bf16 MFU | 62115 tok/s +step 16077/19560 | loss 3.306506 (+0.21z)| norm 0.2685 (+0.07z)| lr 4.91e-05 | 8436.09 ms | -100.0% bf16 MFU | 62117 tok/s +step 16078/19560 | loss 3.399428 (+2.84z)| norm 0.2699 (+0.14z)| lr 4.91e-05 | 8437.84 ms | -100.0% bf16 MFU | 62118 tok/s +step 16079/19560 | loss 3.304594 (+0.12z)| norm 0.2560 (-0.44z)| lr 4.91e-05 | 8442.34 ms | -100.0% bf16 MFU | 62117 tok/s +step 16080/19560 | loss 3.326314 (+0.74z)| norm 0.2595 (-0.29z)| lr 4.90e-05 | 8436.58 ms | -100.0% bf16 MFU | 62118 tok/s +step 16081/19560 | loss 3.298610 (-0.07z)| norm 0.2725 (+0.25z)| lr 4.90e-05 | 8436.52 ms | -100.0% bf16 MFU | 62120 tok/s +step 16082/19560 | loss 3.328199 (+0.80z)| norm 0.2471 (-0.79z)| lr 4.90e-05 | 8438.14 ms | -100.0% bf16 MFU | 62120 tok/s +step 16083/19560 | loss 3.390003 (+2.53z)| norm 0.2582 (-0.33z)| lr 4.90e-05 | 8440.77 ms | -100.0% bf16 MFU | 62120 tok/s +step 16084/19560 | loss 3.288700 (-0.35z)| norm 0.2564 (-0.40z)| lr 4.89e-05 | 8437.79 ms | -100.0% bf16 MFU | 62121 tok/s +step 16085/19560 | loss 3.320358 (+0.56z)| norm 0.2611 (-0.20z)| lr 4.89e-05 | 8439.71 ms | -100.0% bf16 MFU | 62121 tok/s +step 16086/19560 | loss 3.268327 (-0.95z)| norm 0.2555 (-0.42z)| lr 4.89e-05 | 8439.03 ms | -100.0% bf16 MFU | 62121 tok/s +step 16087/19560 | loss 3.294522 (-0.18z)| norm 0.2507 (-0.62z)| lr 4.88e-05 | 8442.31 ms | -100.0% bf16 MFU | 62120 tok/s +step 16088/19560 | loss 3.300633 (-0.00z)| norm 0.2505 (-0.63z)| lr 4.88e-05 | 8438.00 ms | -100.0% bf16 MFU | 62121 tok/s +step 16089/19560 | loss 3.295128 (-0.16z)| norm 0.2449 (-0.85z)| lr 4.88e-05 | 8439.43 ms | -100.0% bf16 MFU | 62121 tok/s +step 16090/19560 | loss 3.349901 (+1.42z)| norm 0.2461 (-0.79z)| lr 4.88e-05 | 8438.39 ms | -100.0% bf16 MFU | 62122 tok/s +step 16091/19560 | loss 3.311354 (+0.31z)| norm 0.2818 (+0.67z)| lr 4.87e-05 | 8439.79 ms | -100.0% bf16 MFU | 62122 tok/s +step 16092/19560 | loss 3.299755 (-0.04z)| norm 0.2726 (+0.29z)| lr 4.87e-05 | 8438.01 ms | -100.0% bf16 MFU | 62122 tok/s +step 16093/19560 | loss 3.344634 (+1.25z)| norm 0.2526 (-0.53z)| lr 4.87e-05 | 8435.53 ms | -100.0% bf16 MFU | 62124 tok/s +step 16094/19560 | loss 3.291041 (-0.30z)| norm 0.2485 (-0.70z)| lr 4.87e-05 | 8437.12 ms | -100.0% bf16 MFU | 62124 tok/s +step 16095/19560 | loss 3.289732 (-0.35z)| norm 0.2718 (+0.26z)| lr 4.86e-05 | 8438.48 ms | -100.0% bf16 MFU | 62125 tok/s +step 16096/19560 | loss 3.307031 (+0.15z)| norm 0.2582 (-0.31z)| lr 4.86e-05 | 8436.51 ms | -100.0% bf16 MFU | 62126 tok/s +step 16097/19560 | loss 3.295288 (-0.20z)| norm 0.2598 (-0.24z)| lr 4.86e-05 | 8440.91 ms | -100.0% bf16 MFU | 62125 tok/s +step 16098/19560 | loss 3.294194 (-0.23z)| norm 0.2749 (+0.38z)| lr 4.85e-05 | 8440.19 ms | -100.0% bf16 MFU | 62125 tok/s +step 16099/19560 | loss 3.325074 (+0.68z)| norm 0.2489 (-0.69z)| lr 4.85e-05 | 8441.01 ms | -100.0% bf16 MFU | 62124 tok/s +step 16100/19560 | loss 3.293433 (-0.27z)| norm 0.2621 (-0.15z)| lr 4.85e-05 | 8440.85 ms | -100.0% bf16 MFU | 62124 tok/s +step 16101/19560 | loss 3.356871 (+1.61z)| norm 0.2624 (-0.14z)| lr 4.85e-05 | 8439.15 ms | -100.0% bf16 MFU | 62124 tok/s +step 16102/19560 | loss 3.316001 (+0.38z)| norm 0.2551 (-0.43z)| lr 4.84e-05 | 8438.78 ms | -100.0% bf16 MFU | 62124 tok/s +step 16103/19560 | loss 3.309278 (+0.18z)| norm 0.2758 (+0.43z)| lr 4.84e-05 | 8434.44 ms | -100.0% bf16 MFU | 62126 tok/s +step 16104/19560 | loss 3.314901 (+0.35z)| norm 0.2710 (+0.23z)| lr 4.84e-05 | 8440.33 ms | -100.0% bf16 MFU | 62125 tok/s +step 16105/19560 | loss 3.269450 (-1.01z)| norm 0.2522 (-0.55z)| lr 4.84e-05 | 8435.66 ms | -100.0% bf16 MFU | 62127 tok/s +step 16106/19560 | loss 3.263134 (-1.18z)| norm 0.2682 (+0.11z)| lr 4.83e-05 | 8439.33 ms | -100.0% bf16 MFU | 62127 tok/s +step 16107/19560 | loss 3.293345 (-0.28z)| norm 0.2500 (-0.64z)| lr 4.83e-05 | 8438.39 ms | -100.0% bf16 MFU | 62127 tok/s +step 16108/19560 | loss 3.285121 (-0.51z)| norm 0.2564 (-0.37z)| lr 4.83e-05 | 8437.55 ms | -100.0% bf16 MFU | 62127 tok/s +step 16109/19560 | loss 3.273790 (-0.85z)| norm 0.2620 (-0.14z)| lr 4.82e-05 | 8439.24 ms | -100.0% bf16 MFU | 62127 tok/s +step 16110/19560 | loss 3.266624 (-1.05z)| norm 0.2527 (-0.51z)| lr 4.82e-05 | 8438.93 ms | -100.0% bf16 MFU | 62127 tok/s +step 16111/19560 | loss 3.296734 (-0.15z)| norm 0.2586 (-0.25z)| lr 4.82e-05 | 8435.72 ms | -100.0% bf16 MFU | 62128 tok/s +step 16112/19560 | loss 3.361782 (+1.78z)| norm 0.2747 (+0.43z)| lr 4.82e-05 | 8437.03 ms | -100.0% bf16 MFU | 62129 tok/s +step 16113/19560 | loss 3.278314 (-0.70z)| norm 0.2540 (-0.45z)| lr 4.81e-05 | 8438.70 ms | -100.0% bf16 MFU | 62129 tok/s +step 16114/19560 | loss 3.301239 (-0.02z)| norm 0.2629 (-0.06z)| lr 4.81e-05 | 8436.82 ms | -100.0% bf16 MFU | 62130 tok/s +step 16115/19560 | loss 3.268466 (-0.99z)| norm 0.2682 (+0.16z)| lr 4.81e-05 | 8436.68 ms | -100.0% bf16 MFU | 62130 tok/s +step 16116/19560 | loss 3.282573 (-0.57z)| norm 0.2565 (-0.34z)| lr 4.81e-05 | 8436.66 ms | -100.0% bf16 MFU | 62131 tok/s +step 16117/19560 | loss 3.288823 (-0.37z)| norm 0.2523 (-0.51z)| lr 4.80e-05 | 8438.74 ms | -100.0% bf16 MFU | 62131 tok/s +step 16118/19560 | loss 3.288639 (-0.38z)| norm 0.2687 (+0.19z)| lr 4.80e-05 | 8435.27 ms | -100.0% bf16 MFU | 62132 tok/s +step 16119/19560 | loss 3.256072 (-1.35z)| norm 0.2505 (-0.58z)| lr 4.80e-05 | 8437.87 ms | -100.0% bf16 MFU | 62132 tok/s +step 16120/19560 | loss 3.328934 (+0.88z)| norm 0.2496 (-0.61z)| lr 4.79e-05 | 8439.37 ms | -100.0% bf16 MFU | 62132 tok/s +step 16121/19560 | loss 3.287601 (-0.37z)| norm 0.2665 (+0.11z)| lr 4.79e-05 | 8438.14 ms | -100.0% bf16 MFU | 62132 tok/s +step 16122/19560 | loss 3.286450 (-0.41z)| norm 0.2490 (-0.63z)| lr 4.79e-05 | 8434.11 ms | -100.0% bf16 MFU | 62133 tok/s +step 16123/19560 | loss 3.321870 (+0.68z)| norm 0.2391 (-1.99z)| lr 4.79e-05 | 8438.03 ms | -100.0% bf16 MFU | 62134 tok/s +step 16124/19560 | loss 3.279784 (-0.61z)| norm 0.2708 (+0.77z)| lr 4.78e-05 | 8459.35 ms | -100.0% bf16 MFU | 62126 tok/s +step 16125/19560 | loss 3.252902 (-1.41z)| norm 0.2563 (-0.49z)| lr 4.78e-05 | 8462.09 ms | -100.0% bf16 MFU | 62117 tok/s +step 16126/19560 | loss 3.336861 (+1.15z)| norm 0.2857 (+2.04z)| lr 4.78e-05 | 8466.02 ms | -100.0% bf16 MFU | 62108 tok/s +step 16127/19560 | loss 3.319922 (+0.63z)| norm 0.2581 (-0.34z)| lr 4.78e-05 | 8457.73 ms | -100.0% bf16 MFU | 62102 tok/s +step 16128/19560 | loss 3.346816 (+1.43z)| norm 0.2621 (-0.00z)| lr 4.77e-05 | 8457.50 ms | -100.0% bf16 MFU | 62096 tok/s +step 16129/19560 | loss 3.296304 (-0.09z)| norm 0.2686 (+0.56z)| lr 4.77e-05 | 8462.79 ms | -100.0% bf16 MFU | 62089 tok/s +step 16130/19560 | loss 3.418976 (+3.45z)| norm 0.2553 (-0.60z)| lr 4.77e-05 | 8461.73 ms | -100.0% bf16 MFU | 62083 tok/s +step 16131/19560 | loss 3.344693 (+1.28z)| norm 0.2469 (-1.33z)| lr 4.76e-05 | 8458.79 ms | -100.0% bf16 MFU | 62078 tok/s +step 16132/19560 | loss 3.250364 (-1.42z)| norm 0.2722 (+0.97z)| lr 4.76e-05 | 8461.42 ms | -100.0% bf16 MFU | 62072 tok/s +step 16133/19560 | loss 3.304006 (+0.14z)| norm 0.2673 (+0.51z)| lr 4.76e-05 | 8455.97 ms | -100.0% bf16 MFU | 62068 tok/s +step 16134/19560 | loss 3.315430 (+0.46z)| norm 0.2535 (-0.74z)| lr 4.76e-05 | 8455.77 ms | -100.0% bf16 MFU | 62065 tok/s +step 16135/19560 | loss 3.272934 (-0.76z)| norm 0.2548 (-0.61z)| lr 4.75e-05 | 8458.42 ms | -100.0% bf16 MFU | 62061 tok/s +step 16136/19560 | loss 3.324397 (+0.73z)| norm 0.2629 (+0.14z)| lr 4.75e-05 | 8454.07 ms | -100.0% bf16 MFU | 62059 tok/s +step 16137/19560 | loss 3.293335 (-0.17z)| norm 0.2503 (-1.02z)| lr 4.75e-05 | 8458.72 ms | -100.0% bf16 MFU | 62055 tok/s +step 16138/19560 | loss 3.268486 (-0.87z)| norm 0.2445 (-1.53z)| lr 4.75e-05 | 8456.29 ms | -100.0% bf16 MFU | 62052 tok/s +step 16139/19560 | loss 3.309870 (+0.32z)| norm 0.2568 (-0.38z)| lr 4.74e-05 | 8458.71 ms | -100.0% bf16 MFU | 62049 tok/s +step 16140/19560 | loss 3.323919 (+0.72z)| norm 0.2620 (+0.09z)| lr 4.74e-05 | 8458.52 ms | -100.0% bf16 MFU | 62045 tok/s +step 16141/19560 | loss 3.282456 (-0.48z)| norm 0.2544 (-0.60z)| lr 4.74e-05 | 8465.26 ms | -100.0% bf16 MFU | 62040 tok/s +step 16142/19560 | loss 3.304870 (+0.15z)| norm 0.2437 (-1.57z)| lr 4.74e-05 | 8452.11 ms | -100.0% bf16 MFU | 62039 tok/s +step 16143/19560 | loss 3.348024 (+1.40z)| norm 0.2646 (+0.36z)| lr 4.73e-05 | 8455.27 ms | -100.0% bf16 MFU | 62038 tok/s +step 16144/19560 | loss 3.340229 (+1.15z)| norm 0.2497 (-1.02z)| lr 4.73e-05 | 8460.97 ms | -100.0% bf16 MFU | 62034 tok/s +step 16145/19560 | loss 3.293857 (-0.21z)| norm 0.2514 (-0.86z)| lr 4.73e-05 | 8454.12 ms | -100.0% bf16 MFU | 62033 tok/s +step 16146/19560 | loss 3.427943 (+3.51z)| norm 0.2669 (+0.58z)| lr 4.72e-05 | 8459.06 ms | -100.0% bf16 MFU | 62031 tok/s +step 16147/19560 | loss 3.332509 (+0.84z)| norm 0.2501 (-0.96z)| lr 4.72e-05 | 8458.36 ms | -100.0% bf16 MFU | 62028 tok/s +step 16148/19560 | loss 3.368507 (+1.81z)| norm 0.2460 (-1.34z)| lr 4.72e-05 | 8454.68 ms | -100.0% bf16 MFU | 62027 tok/s +step 16149/19560 | loss 3.343681 (+1.10z)| norm 0.2522 (-0.77z)| lr 4.72e-05 | 8453.62 ms | -100.0% bf16 MFU | 62027 tok/s +step 16150/19560 | loss 3.356584 (+1.43z)| norm 0.2484 (-1.10z)| lr 4.71e-05 | 8454.94 ms | -100.0% bf16 MFU | 62026 tok/s +step 16151/19560 | loss 3.264046 (-1.11z)| norm 0.2516 (-0.80z)| lr 4.71e-05 | 8446.52 ms | -100.0% bf16 MFU | 62028 tok/s +step 16152/19560 | loss 3.313325 (+0.25z)| norm 0.2806 (+1.82z)| lr 4.71e-05 | 8447.89 ms | -100.0% bf16 MFU | 62030 tok/s +step 16153/19560 | loss 3.254285 (-1.35z)| norm 0.2560 (-0.42z)| lr 4.71e-05 | 8450.43 ms | -100.0% bf16 MFU | 62031 tok/s +step 16154/19560 | loss 3.301689 (-0.06z)| norm 0.2472 (-1.21z)| lr 4.70e-05 | 8456.85 ms | -100.0% bf16 MFU | 62029 tok/s +step 16155/19560 | loss 3.341287 (+1.01z)| norm 0.2628 (+0.21z)| lr 4.70e-05 | 8451.62 ms | -100.0% bf16 MFU | 62029 tok/s +step 16156/19560 | loss 3.250208 (-1.45z)| norm 0.2490 (-1.04z)| lr 4.70e-05 | 8451.08 ms | -100.0% bf16 MFU | 62030 tok/s +step 16157/19560 | loss 3.292878 (-0.29z)| norm 0.2542 (-0.57z)| lr 4.69e-05 | 8451.71 ms | -100.0% bf16 MFU | 62030 tok/s +step 16158/19560 | loss 3.330504 (+0.73z)| norm 0.2632 (+0.28z)| lr 4.69e-05 | 8448.97 ms | -100.0% bf16 MFU | 62031 tok/s +step 16159/19560 | loss 3.358165 (+1.47z)| norm 0.2475 (-1.18z)| lr 4.69e-05 | 8450.71 ms | -100.0% bf16 MFU | 62031 tok/s +step 16160/19560 | loss 3.350757 (+1.26z)| norm 0.2610 (+0.08z)| lr 4.69e-05 | 8454.94 ms | -100.0% bf16 MFU | 62030 tok/s +step 16161/19560 | loss 3.267063 (-1.02z)| norm 0.2476 (-1.15z)| lr 4.68e-05 | 8455.95 ms | -100.0% bf16 MFU | 62029 tok/s +step 16162/19560 | loss 3.293571 (-0.31z)| norm 0.2505 (-0.87z)| lr 4.68e-05 | 8447.62 ms | -100.0% bf16 MFU | 62031 tok/s +step 16163/19560 | loss 3.288745 (-0.43z)| norm 0.2542 (-0.51z)| lr 4.68e-05 | 8452.19 ms | -100.0% bf16 MFU | 62031 tok/s +step 16164/19560 | loss 3.309130 (+0.13z)| norm 0.2554 (-0.41z)| lr 4.68e-05 | 8446.23 ms | -100.0% bf16 MFU | 62033 tok/s +step 16165/19560 | loss 3.368068 (+1.73z)| norm 0.2559 (-0.36z)| lr 4.67e-05 | 8447.95 ms | -100.0% bf16 MFU | 62034 tok/s +step 16166/19560 | loss 3.413051 (+2.90z)| norm 0.2559 (-0.36z)| lr 4.67e-05 | 8446.75 ms | -100.0% bf16 MFU | 62036 tok/s +step 16167/19560 | loss 3.318539 (+0.33z)| norm 0.2577 (-0.20z)| lr 4.67e-05 | 8448.34 ms | -100.0% bf16 MFU | 62037 tok/s +step 16168/19560 | loss 3.291640 (-0.41z)| norm 0.2622 (+0.21z)| lr 4.67e-05 | 8453.78 ms | -100.0% bf16 MFU | 62036 tok/s +step 16169/19560 | loss 3.267439 (-1.06z)| norm 0.2489 (-1.03z)| lr 4.66e-05 | 8450.77 ms | -100.0% bf16 MFU | 62036 tok/s +step 16170/19560 | loss 3.364553 (+1.56z)| norm 0.2528 (-0.68z)| lr 4.66e-05 | 8448.37 ms | -100.0% bf16 MFU | 62037 tok/s +step 16171/19560 | loss 3.306510 (-0.02z)| norm 0.2575 (-0.23z)| lr 4.66e-05 | 8456.05 ms | -100.0% bf16 MFU | 62036 tok/s +step 16172/19560 | loss 3.339928 (+0.88z)| norm 0.2643 (+0.41z)| lr 4.65e-05 | 8452.70 ms | -100.0% bf16 MFU | 62035 tok/s +step 16173/19560 | loss 3.235489 (-1.92z)| norm 0.2721 (+1.16z)| lr 4.65e-05 | 8454.37 ms | -100.0% bf16 MFU | 62034 tok/s +step 16174/19560 | loss 3.315656 (+0.23z)| norm 0.2512 (-0.83z)| lr 4.65e-05 | 8447.07 ms | -100.0% bf16 MFU | 62036 tok/s +step 16175/19560 | loss 3.342184 (+0.93z)| norm 0.2702 (+0.97z)| lr 4.65e-05 | 8450.05 ms | -100.0% bf16 MFU | 62036 tok/s +step 16176/19560 | loss 3.287965 (-0.52z)| norm 0.2443 (-1.47z)| lr 4.64e-05 | 8452.79 ms | -100.0% bf16 MFU | 62036 tok/s +step 16177/19560 | loss 3.287478 (-0.54z)| norm 0.2632 (+0.33z)| lr 4.64e-05 | 8447.60 ms | -100.0% bf16 MFU | 62037 tok/s +step 16178/19560 | loss 3.270455 (-0.99z)| norm 0.2569 (-0.27z)| lr 4.64e-05 | 8440.59 ms | -100.0% bf16 MFU | 62041 tok/s +step 16179/19560 | loss 3.300435 (-0.19z)| norm 0.2452 (-1.37z)| lr 4.64e-05 | 8443.91 ms | -100.0% bf16 MFU | 62043 tok/s +step 16180/19560 | loss 3.317646 (+0.27z)| norm 0.2579 (-0.16z)| lr 4.63e-05 | 8447.42 ms | -100.0% bf16 MFU | 62045 tok/s +step 16181/19560 | loss 3.291864 (-0.42z)| norm 0.2543 (-0.50z)| lr 4.63e-05 | 8451.27 ms | -100.0% bf16 MFU | 62044 tok/s +step 16182/19560 | loss 3.417038 (+2.82z)| norm 0.2558 (-0.35z)| lr 4.63e-05 | 8447.81 ms | -100.0% bf16 MFU | 62045 tok/s +step 16183/19560 | loss 3.295198 (-0.33z)| norm 0.2580 (-0.13z)| lr 4.63e-05 | 8445.17 ms | -100.0% bf16 MFU | 62047 tok/s +step 16184/19560 | loss 3.256744 (-1.34z)| norm 0.2467 (-1.21z)| lr 4.62e-05 | 8448.97 ms | -100.0% bf16 MFU | 62047 tok/s +step 16185/19560 | loss 3.296193 (-0.30z)| norm 0.2567 (-0.22z)| lr 4.62e-05 | 8444.52 ms | -100.0% bf16 MFU | 62049 tok/s +step 16186/19560 | loss 3.268211 (-1.07z)| norm 0.2595 (+0.06z)| lr 4.62e-05 | 8447.57 ms | -100.0% bf16 MFU | 62050 tok/s +step 16187/19560 | loss 3.289181 (-0.51z)| norm 0.2452 (-1.35z)| lr 4.61e-05 | 8448.56 ms | -100.0% bf16 MFU | 62050 tok/s +step 16188/19560 | loss 3.322341 (+0.38z)| norm 0.2388 (-1.96z)| lr 4.61e-05 | 8446.17 ms | -100.0% bf16 MFU | 62051 tok/s +step 16189/19560 | loss 3.378685 (+1.85z)| norm 0.2609 (+0.26z)| lr 4.61e-05 | 8443.48 ms | -100.0% bf16 MFU | 62054 tok/s +step 16190/19560 | loss 3.296868 (-0.34z)| norm 0.2659 (+0.85z)| lr 4.61e-05 | 8444.40 ms | -100.0% bf16 MFU | 62055 tok/s +step 16191/19560 | loss 3.331654 (+0.59z)| norm 0.2559 (-0.22z)| lr 4.60e-05 | 8444.95 ms | -100.0% bf16 MFU | 62057 tok/s +step 16192/19560 | loss 3.248981 (-1.63z)| norm 0.2584 (+0.05z)| lr 4.60e-05 | 8443.05 ms | -100.0% bf16 MFU | 62059 tok/s +step 16193/19560 | loss 3.252981 (-1.50z)| norm 0.2452 (-1.35z)| lr 4.60e-05 | 8444.36 ms | -100.0% bf16 MFU | 62060 tok/s +step 16194/19560 | loss 3.326564 (+0.46z)| norm 0.2427 (-1.61z)| lr 4.60e-05 | 8440.19 ms | -100.0% bf16 MFU | 62063 tok/s +step 16195/19560 | loss 3.341453 (+0.85z)| norm 0.2497 (-0.85z)| lr 4.59e-05 | 8443.79 ms | -100.0% bf16 MFU | 62064 tok/s +step 16196/19560 | loss 3.365625 (+1.47z)| norm 0.2532 (-0.47z)| lr 4.59e-05 | 8444.91 ms | -100.0% bf16 MFU | 62065 tok/s +step 16197/19560 | loss 3.274063 (-0.93z)| norm 0.2420 (-1.64z)| lr 4.59e-05 | 8448.22 ms | -100.0% bf16 MFU | 62065 tok/s +step 16198/19560 | loss 3.307641 (-0.04z)| norm 0.2603 (+0.29z)| lr 4.59e-05 | 8443.43 ms | -100.0% bf16 MFU | 62066 tok/s +step 16199/19560 | loss 3.265261 (-1.15z)| norm 0.2486 (-0.94z)| lr 4.58e-05 | 8443.44 ms | -100.0% bf16 MFU | 62068 tok/s +step 16200/19560 | loss 3.283822 (-0.66z)| norm 0.2489 (-0.91z)| lr 4.58e-05 | 8444.39 ms | -100.0% bf16 MFU | 62069 tok/s +step 16201/19560 | loss 3.355554 (+1.22z)| norm 0.2570 (-0.04z)| lr 4.58e-05 | 8442.42 ms | -100.0% bf16 MFU | 62070 tok/s +step 16202/19560 | loss 3.234570 (-1.92z)| norm 0.2433 (-1.49z)| lr 4.57e-05 | 8442.21 ms | -100.0% bf16 MFU | 62072 tok/s +step 16203/19560 | loss 3.290993 (-0.46z)| norm 0.2534 (-0.40z)| lr 4.57e-05 | 8439.85 ms | -100.0% bf16 MFU | 62075 tok/s +step 16204/19560 | loss 3.271085 (-0.97z)| norm 0.2652 (+0.88z)| lr 4.57e-05 | 8442.21 ms | -100.0% bf16 MFU | 62076 tok/s +step 16205/19560 | loss 3.356512 (+1.23z)| norm 0.2619 (+0.53z)| lr 4.57e-05 | 8442.16 ms | -100.0% bf16 MFU | 62077 tok/s +step 16206/19560 | loss 3.271518 (-0.95z)| norm 0.2518 (-0.55z)| lr 4.56e-05 | 8443.82 ms | -100.0% bf16 MFU | 62078 tok/s +step 16207/19560 | loss 3.304815 (-0.08z)| norm 0.2445 (-1.33z)| lr 4.56e-05 | 8442.52 ms | -100.0% bf16 MFU | 62079 tok/s +step 16208/19560 | loss 3.260659 (-1.22z)| norm 0.2567 (-0.00z)| lr 4.56e-05 | 8443.52 ms | -100.0% bf16 MFU | 62080 tok/s +step 16209/19560 | loss 3.387920 (+2.05z)| norm 0.2568 (+0.02z)| lr 4.56e-05 | 8435.74 ms | -100.0% bf16 MFU | 62083 tok/s +step 16210/19560 | loss 3.301557 (-0.16z)| norm 0.2501 (-0.73z)| lr 4.55e-05 | 8444.30 ms | -100.0% bf16 MFU | 62084 tok/s +step 16211/19560 | loss 3.336956 (+0.77z)| norm 0.2500 (-0.72z)| lr 4.55e-05 | 8439.77 ms | -100.0% bf16 MFU | 62086 tok/s +step 16212/19560 | loss 3.272214 (-0.91z)| norm 0.2398 (-1.80z)| lr 4.55e-05 | 8441.38 ms | -100.0% bf16 MFU | 62087 tok/s +step 16213/19560 | loss 3.313305 (+0.16z)| norm 0.2523 (-0.45z)| lr 4.55e-05 | 8440.97 ms | -100.0% bf16 MFU | 62088 tok/s +step 16214/19560 | loss 3.235989 (-1.83z)| norm 0.2701 (+1.46z)| lr 4.54e-05 | 8439.00 ms | -100.0% bf16 MFU | 62090 tok/s +step 16215/19560 | loss 3.347892 (+1.04z)| norm 0.2471 (-1.01z)| lr 4.54e-05 | 8438.51 ms | -100.0% bf16 MFU | 62092 tok/s +step 16216/19560 | loss 3.332587 (+0.64z)| norm 0.2637 (+0.77z)| lr 4.54e-05 | 8443.61 ms | -100.0% bf16 MFU | 62092 tok/s +step 16217/19560 | loss 3.363160 (+1.40z)| norm 0.2618 (+0.55z)| lr 4.54e-05 | 8441.39 ms | -100.0% bf16 MFU | 62093 tok/s +step 16218/19560 | loss 3.254497 (-1.35z)| norm 0.2692 (+1.32z)| lr 4.53e-05 | 8442.75 ms | -100.0% bf16 MFU | 62093 tok/s +step 16219/19560 | loss 3.350843 (+1.09z)| norm 0.2740 (+1.87z)| lr 4.53e-05 | 8442.65 ms | -100.0% bf16 MFU | 62093 tok/s +step 16220/19560 | loss 3.264297 (-1.09z)| norm 0.2607 (+0.44z)| lr 4.53e-05 | 8440.18 ms | -100.0% bf16 MFU | 62095 tok/s +step 16221/19560 | loss 3.278640 (-0.71z)| norm 0.2517 (-0.55z)| lr 4.52e-05 | 8442.57 ms | -100.0% bf16 MFU | 62095 tok/s +step 16222/19560 | loss 3.301893 (-0.13z)| norm 0.2643 (+0.82z)| lr 4.52e-05 | 8445.89 ms | -100.0% bf16 MFU | 62094 tok/s +step 16223/19560 | loss 3.324090 (+0.42z)| norm 0.2713 (+1.59z)| lr 4.52e-05 | 8439.78 ms | -100.0% bf16 MFU | 62095 tok/s +step 16224/19560 | loss 3.375588 (+1.69z)| norm 0.2468 (-1.09z)| lr 4.52e-05 | 8435.98 ms | -100.0% bf16 MFU | 62098 tok/s +step 16225/19560 | loss 3.410232 (+2.47z)| norm 0.2647 (+0.86z)| lr 4.51e-05 | 8437.43 ms | -100.0% bf16 MFU | 62100 tok/s +step 16226/19560 | loss 3.384491 (+1.81z)| norm 0.2654 (+0.96z)| lr 4.51e-05 | 8438.43 ms | -100.0% bf16 MFU | 62102 tok/s +step 16227/19560 | loss 3.259972 (-1.17z)| norm 0.2650 (+0.91z)| lr 4.51e-05 | 8439.28 ms | -100.0% bf16 MFU | 62103 tok/s +step 16228/19560 | loss 3.278545 (-0.72z)| norm 0.2509 (-0.65z)| lr 4.51e-05 | 8437.93 ms | -100.0% bf16 MFU | 62104 tok/s +step 16229/19560 | loss 3.353199 (+1.06z)| norm 0.2552 (-0.16z)| lr 4.50e-05 | 8440.98 ms | -100.0% bf16 MFU | 62105 tok/s +step 16230/19560 | loss 3.275972 (-0.78z)| norm 0.2544 (-0.25z)| lr 4.50e-05 | 8437.47 ms | -100.0% bf16 MFU | 62106 tok/s +step 16231/19560 | loss 3.272577 (-0.85z)| norm 0.2542 (-0.27z)| lr 4.50e-05 | 8442.71 ms | -100.0% bf16 MFU | 62106 tok/s +step 16232/19560 | loss 3.341452 (+0.79z)| norm 0.2384 (-2.01z)| lr 4.50e-05 | 8441.67 ms | -100.0% bf16 MFU | 62106 tok/s +step 16233/19560 | loss 3.292201 (-0.39z)| norm 0.2573 (+0.11z)| lr 4.49e-05 | 8440.41 ms | -100.0% bf16 MFU | 62107 tok/s +step 16234/19560 | loss 3.352026 (+1.02z)| norm 0.2524 (-0.42z)| lr 4.49e-05 | 8444.31 ms | -100.0% bf16 MFU | 62106 tok/s +step 16235/19560 | loss 3.284264 (-0.59z)| norm 0.2535 (-0.31z)| lr 4.49e-05 | 8438.04 ms | -100.0% bf16 MFU | 62107 tok/s +step 16236/19560 | loss 3.350749 (+0.98z)| norm 0.2555 (-0.08z)| lr 4.48e-05 | 8438.16 ms | -100.0% bf16 MFU | 62108 tok/s +step 16237/19560 | loss 3.280429 (-0.70z)| norm 0.2448 (-1.27z)| lr 4.48e-05 | 8443.61 ms | -100.0% bf16 MFU | 62108 tok/s +step 16238/19560 | loss 3.266048 (-1.04z)| norm 0.2409 (-1.68z)| lr 4.48e-05 | 8439.16 ms | -100.0% bf16 MFU | 62109 tok/s +step 16239/19560 | loss 3.382802 (+1.70z)| norm 0.2523 (-0.41z)| lr 4.48e-05 | 8441.08 ms | -100.0% bf16 MFU | 62109 tok/s +step 16240/19560 | loss 3.295817 (-0.33z)| norm 0.2626 (+0.77z)| lr 4.47e-05 | 8442.49 ms | -100.0% bf16 MFU | 62108 tok/s +step 16241/19560 | loss 3.337235 (+0.64z)| norm 0.2415 (-1.59z)| lr 4.47e-05 | 8437.88 ms | -100.0% bf16 MFU | 62110 tok/s +step 16242/19560 | loss 3.337026 (+0.62z)| norm 0.2417 (-1.55z)| lr 4.47e-05 | 8436.53 ms | -100.0% bf16 MFU | 62111 tok/s +step 16243/19560 | loss 3.335500 (+0.58z)| norm 0.2588 (+0.37z)| lr 4.47e-05 | 8437.87 ms | -100.0% bf16 MFU | 62113 tok/s +step 16244/19560 | loss 3.285983 (-0.60z)| norm 0.2549 (-0.07z)| lr 4.46e-05 | 8441.47 ms | -100.0% bf16 MFU | 62112 tok/s +step 16245/19560 | loss 3.382815 (+1.66z)| norm 0.2500 (-0.61z)| lr 4.46e-05 | 8440.32 ms | -100.0% bf16 MFU | 62113 tok/s +step 16246/19560 | loss 3.295047 (-0.40z)| norm 0.2679 (+1.40z)| lr 4.46e-05 | 8441.22 ms | -100.0% bf16 MFU | 62113 tok/s +step 16247/19560 | loss 3.237592 (-1.73z)| norm 0.2745 (+2.08z)| lr 4.46e-05 | 8439.65 ms | -100.0% bf16 MFU | 62113 tok/s +step 16248/19560 | loss 3.251182 (-1.39z)| norm 0.2559 (+0.02z)| lr 4.45e-05 | 8444.31 ms | -100.0% bf16 MFU | 62112 tok/s +step 16249/19560 | loss 3.352139 (+0.94z)| norm 0.2681 (+1.37z)| lr 4.45e-05 | 8440.66 ms | -100.0% bf16 MFU | 62112 tok/s +step 16250/19560 | loss 3.332302 (+0.47z)| norm 0.2491 (-0.73z)| lr 4.45e-05 | 8434.52 ms | -100.0% bf16 MFU | 62114 tok/s +val loss 3.278258 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 3007/10042 = 0.299442 +step 16251/19560 | loss 3.310106 (-0.04z)| norm 0.2740 (+1.99z)| lr 4.45e-05 | 8439.44 ms | -100.0% bf16 MFU | 62115 tok/s +step 16252/19560 | loss 3.282190 (-0.69z)| norm 0.2556 (-0.03z)| lr 4.44e-05 | 8441.03 ms | -100.0% bf16 MFU | 62115 tok/s +step 16253/19560 | loss 3.327457 (+0.35z)| norm 0.2564 (+0.06z)| lr 4.44e-05 | 8438.96 ms | -100.0% bf16 MFU | 62115 tok/s +step 16254/19560 | loss 3.266720 (-1.05z)| norm 0.2588 (+0.37z)| lr 4.44e-05 | 8438.70 ms | -100.0% bf16 MFU | 62116 tok/s +step 16255/19560 | loss 3.274431 (-0.86z)| norm 0.2663 (+1.23z)| lr 4.44e-05 | 8439.64 ms | -100.0% bf16 MFU | 62116 tok/s +step 16256/19560 | loss 3.408844 (+2.20z)| norm 0.2515 (-0.48z)| lr 4.43e-05 | 8434.28 ms | -100.0% bf16 MFU | 62118 tok/s +step 16257/19560 | loss 3.303263 (-0.20z)| norm 0.2483 (-0.84z)| lr 4.43e-05 | 8430.73 ms | -100.0% bf16 MFU | 62122 tok/s +step 16258/19560 | loss 3.320774 (+0.22z)| norm 0.2648 (+1.07z)| lr 4.43e-05 | 8429.33 ms | -100.0% bf16 MFU | 62126 tok/s +step 16259/19560 | loss 3.244273 (-1.54z)| norm 0.2587 (+0.35z)| lr 4.42e-05 | 8430.12 ms | -100.0% bf16 MFU | 62129 tok/s +step 16260/19560 | loss 3.298046 (-0.30z)| norm 0.2546 (-0.10z)| lr 4.42e-05 | 8431.65 ms | -100.0% bf16 MFU | 62132 tok/s +step 16261/19560 | loss 3.305528 (-0.13z)| norm 0.2533 (-0.25z)| lr 4.42e-05 | 8429.94 ms | -100.0% bf16 MFU | 62135 tok/s +step 16262/19560 | loss 3.328400 (+0.40z)| norm 0.2603 (+0.58z)| lr 4.42e-05 | 8428.27 ms | -100.0% bf16 MFU | 62138 tok/s +step 16263/19560 | loss 3.291691 (-0.46z)| norm 0.2510 (-0.53z)| lr 4.41e-05 | 8432.74 ms | -100.0% bf16 MFU | 62140 tok/s +step 16264/19560 | loss 3.346624 (+0.82z)| norm 0.2738 (+2.15z)| lr 4.41e-05 | 8429.94 ms | -100.0% bf16 MFU | 62143 tok/s +step 16265/19560 | loss 3.260071 (-1.19z)| norm 0.2761 (+2.34z)| lr 4.41e-05 | 8431.68 ms | -100.0% bf16 MFU | 62145 tok/s +step 16266/19560 | loss 3.317124 (+0.13z)| norm 0.2609 (+0.59z)| lr 4.41e-05 | 8431.04 ms | -100.0% bf16 MFU | 62147 tok/s +step 16267/19560 | loss 3.288267 (-0.54z)| norm 0.2563 (+0.06z)| lr 4.40e-05 | 8434.96 ms | -100.0% bf16 MFU | 62147 tok/s +step 16268/19560 | loss 3.307872 (-0.08z)| norm 0.3039 (+4.95z)| lr 4.40e-05 | 8434.23 ms | -100.0% bf16 MFU | 62148 tok/s +step 16269/19560 | loss 3.271164 (-0.93z)| norm 0.2521 (-0.42z)| lr 4.40e-05 | 8433.77 ms | -100.0% bf16 MFU | 62149 tok/s +step 16270/19560 | loss 3.331720 (+0.47z)| norm 0.2585 (+0.23z)| lr 4.40e-05 | 8433.74 ms | -100.0% bf16 MFU | 62150 tok/s +step 16271/19560 | loss 3.351223 (+0.92z)| norm 0.2612 (+0.52z)| lr 4.39e-05 | 8430.15 ms | -100.0% bf16 MFU | 62152 tok/s +step 16272/19560 | loss 3.327580 (+0.38z)| norm 0.2716 (+1.58z)| lr 4.39e-05 | 8432.65 ms | -100.0% bf16 MFU | 62153 tok/s +step 16273/19560 | loss 3.322066 (+0.24z)| norm 0.2613 (+0.50z)| lr 4.39e-05 | 8433.34 ms | -100.0% bf16 MFU | 62154 tok/s +step 16274/19560 | loss 3.366947 (+1.33z)| norm 0.2541 (-0.24z)| lr 4.39e-05 | 8431.82 ms | -100.0% bf16 MFU | 62155 tok/s +step 16275/19560 | loss 3.337775 (+0.63z)| norm 0.2631 (+0.69z)| lr 4.38e-05 | 8434.19 ms | -100.0% bf16 MFU | 62155 tok/s +step 16276/19560 | loss 3.337203 (+0.63z)| norm 0.2564 (-0.02z)| lr 4.38e-05 | 8435.90 ms | -100.0% bf16 MFU | 62155 tok/s +step 16277/19560 | loss 3.295281 (-0.37z)| norm 0.2474 (-0.95z)| lr 4.38e-05 | 8434.46 ms | -100.0% bf16 MFU | 62155 tok/s +step 16278/19560 | loss 3.247250 (-1.49z)| norm 0.2572 (+0.06z)| lr 4.38e-05 | 8436.88 ms | -100.0% bf16 MFU | 62155 tok/s +step 16279/19560 | loss 3.286152 (-0.57z)| norm 0.2501 (-0.68z)| lr 4.37e-05 | 8437.02 ms | -100.0% bf16 MFU | 62154 tok/s +step 16280/19560 | loss 3.282745 (-0.64z)| norm 0.2353 (-2.20z)| lr 4.37e-05 | 8433.71 ms | -100.0% bf16 MFU | 62155 tok/s +step 16281/19560 | loss 3.277933 (-0.77z)| norm 0.2458 (-1.08z)| lr 4.37e-05 | 8439.12 ms | -100.0% bf16 MFU | 62153 tok/s +step 16282/19560 | loss 3.343398 (+0.80z)| norm 0.2539 (-0.24z)| lr 4.36e-05 | 8435.24 ms | -100.0% bf16 MFU | 62153 tok/s +step 16283/19560 | loss 3.283804 (-0.62z)| norm 0.2418 (-1.49z)| lr 4.36e-05 | 8437.19 ms | -100.0% bf16 MFU | 62153 tok/s +step 16284/19560 | loss 3.305659 (-0.11z)| norm 0.2613 (+0.54z)| lr 4.36e-05 | 8433.04 ms | -100.0% bf16 MFU | 62153 tok/s +step 16285/19560 | loss 3.266450 (-1.05z)| norm 0.2446 (-1.19z)| lr 4.36e-05 | 8435.59 ms | -100.0% bf16 MFU | 62153 tok/s +step 16286/19560 | loss 3.346433 (+0.88z)| norm 0.2425 (-1.39z)| lr 4.35e-05 | 8434.48 ms | -100.0% bf16 MFU | 62154 tok/s +step 16287/19560 | loss 3.327825 (+0.44z)| norm 0.2476 (-0.86z)| lr 4.35e-05 | 8437.35 ms | -100.0% bf16 MFU | 62153 tok/s +step 16288/19560 | loss 3.324723 (+0.37z)| norm 0.2500 (-0.60z)| lr 4.35e-05 | 8436.39 ms | -100.0% bf16 MFU | 62153 tok/s +step 16289/19560 | loss 3.279404 (-0.74z)| norm 0.2513 (-0.47z)| lr 4.35e-05 | 8436.82 ms | -100.0% bf16 MFU | 62152 tok/s +step 16290/19560 | loss 3.308132 (-0.04z)| norm 0.2451 (-1.10z)| lr 4.34e-05 | 8433.24 ms | -100.0% bf16 MFU | 62153 tok/s +step 16291/19560 | loss 3.262714 (-1.13z)| norm 0.2420 (-1.41z)| lr 4.34e-05 | 8438.85 ms | -100.0% bf16 MFU | 62152 tok/s +step 16292/19560 | loss 3.331915 (+0.54z)| norm 0.2461 (-0.97z)| lr 4.34e-05 | 8439.67 ms | -100.0% bf16 MFU | 62150 tok/s +step 16293/19560 | loss 3.296601 (-0.31z)| norm 0.2449 (-1.08z)| lr 4.34e-05 | 8438.07 ms | -100.0% bf16 MFU | 62149 tok/s +step 16294/19560 | loss 3.304535 (-0.10z)| norm 0.2437 (-1.19z)| lr 4.33e-05 | 8437.77 ms | -100.0% bf16 MFU | 62149 tok/s +step 16295/19560 | loss 3.311823 (+0.09z)| norm 0.2602 (+0.48z)| lr 4.33e-05 | 8438.68 ms | -100.0% bf16 MFU | 62148 tok/s +step 16296/19560 | loss 3.331241 (+0.57z)| norm 0.2538 (-0.17z)| lr 4.33e-05 | 8436.29 ms | -100.0% bf16 MFU | 62148 tok/s +step 16297/19560 | loss 3.308476 (-0.01z)| norm 0.2636 (+0.82z)| lr 4.33e-05 | 8438.00 ms | -100.0% bf16 MFU | 62147 tok/s +step 16298/19560 | loss 3.274931 (-0.84z)| norm 0.2646 (+0.91z)| lr 4.32e-05 | 8434.90 ms | -100.0% bf16 MFU | 62148 tok/s +step 16299/19560 | loss 3.341989 (+0.84z)| norm 0.2522 (-0.34z)| lr 4.32e-05 | 8434.44 ms | -100.0% bf16 MFU | 62148 tok/s +step 16300/19560 | loss 3.275718 (-0.81z)| norm 0.2438 (-1.17z)| lr 4.32e-05 | 8435.94 ms | -100.0% bf16 MFU | 62148 tok/s +step 16301/19560 | loss 3.237391 (-1.78z)| norm 0.2436 (-1.17z)| lr 4.32e-05 | 8437.20 ms | -100.0% bf16 MFU | 62148 tok/s +step 16302/19560 | loss 3.282796 (-0.63z)| norm 0.2461 (-0.91z)| lr 4.31e-05 | 8437.90 ms | -100.0% bf16 MFU | 62147 tok/s +step 16303/19560 | loss 3.254353 (-1.32z)| norm 0.2682 (+1.32z)| lr 4.31e-05 | 8437.95 ms | -100.0% bf16 MFU | 62147 tok/s +step 16304/19560 | loss 3.260495 (-1.16z)| norm 0.2553 (+0.00z)| lr 4.31e-05 | 8436.63 ms | -100.0% bf16 MFU | 62146 tok/s +step 16305/19560 | loss 3.349609 (+1.05z)| norm 0.2563 (+0.12z)| lr 4.31e-05 | 8437.95 ms | -100.0% bf16 MFU | 62146 tok/s +step 16306/19560 | loss 3.285450 (-0.55z)| norm 0.2523 (-0.29z)| lr 4.30e-05 | 8438.12 ms | -100.0% bf16 MFU | 62145 tok/s +step 16307/19560 | loss 3.310594 (+0.08z)| norm 0.2488 (-0.65z)| lr 4.30e-05 | 8435.82 ms | -100.0% bf16 MFU | 62145 tok/s +step 16308/19560 | loss 3.458295 (+3.54z)| norm 0.2532 (-0.19z)| lr 4.30e-05 | 8436.87 ms | -100.0% bf16 MFU | 62145 tok/s +step 16309/19560 | loss 3.401601 (+2.14z)| norm 0.2660 (+1.10z)| lr 4.29e-05 | 8436.00 ms | -100.0% bf16 MFU | 62145 tok/s +step 16310/19560 | loss 3.248685 (-1.41z)| norm 0.2543 (-0.09z)| lr 4.29e-05 | 8434.95 ms | -100.0% bf16 MFU | 62146 tok/s +step 16311/19560 | loss 3.265209 (-1.01z)| norm 0.2516 (-0.36z)| lr 4.29e-05 | 8438.72 ms | -100.0% bf16 MFU | 62145 tok/s +step 16312/19560 | loss 3.309044 (+0.02z)| norm 0.2468 (-0.85z)| lr 4.29e-05 | 8437.41 ms | -100.0% bf16 MFU | 62145 tok/s +step 16313/19560 | loss 3.304112 (-0.10z)| norm 0.2645 (+0.93z)| lr 4.28e-05 | 8435.37 ms | -100.0% bf16 MFU | 62145 tok/s +step 16314/19560 | loss 3.201406 (-2.47z)| norm 0.2583 (+0.31z)| lr 4.28e-05 | 8437.09 ms | -100.0% bf16 MFU | 62145 tok/s +step 16315/19560 | loss 3.251470 (-1.29z)| norm 0.2536 (-0.17z)| lr 4.28e-05 | 8462.58 ms | -100.0% bf16 MFU | 62135 tok/s +step 16316/19560 | loss 3.278165 (-0.67z)| norm 0.2602 (+0.49z)| lr 4.28e-05 | 8458.27 ms | -100.0% bf16 MFU | 62128 tok/s +step 16317/19560 | loss 3.305060 (-0.04z)| norm 0.2556 (+0.02z)| lr 4.27e-05 | 8461.25 ms | -100.0% bf16 MFU | 62120 tok/s +step 16318/19560 | loss 3.284455 (-0.51z)| norm 0.2549 (-0.04z)| lr 4.27e-05 | 8463.50 ms | -100.0% bf16 MFU | 62111 tok/s +step 16319/19560 | loss 3.366333 (+1.37z)| norm 0.2679 (+1.28z)| lr 4.27e-05 | 8460.69 ms | -100.0% bf16 MFU | 62104 tok/s +step 16320/19560 | loss 3.388830 (+1.86z)| norm 0.2524 (-0.30z)| lr 4.27e-05 | 8462.61 ms | -100.0% bf16 MFU | 62096 tok/s +step 16321/19560 | loss 3.298095 (-0.24z)| norm 0.2566 (+0.12z)| lr 4.26e-05 | 8456.23 ms | -100.0% bf16 MFU | 62092 tok/s +step 16322/19560 | loss 3.257436 (-1.16z)| norm 0.2554 (-0.02z)| lr 4.26e-05 | 8454.65 ms | -100.0% bf16 MFU | 62088 tok/s +step 16323/19560 | loss 3.433296 (+2.79z)| norm 0.2523 (-0.33z)| lr 4.26e-05 | 8462.93 ms | -100.0% bf16 MFU | 62081 tok/s +step 16324/19560 | loss 3.390505 (+1.82z)| norm 0.2500 (-0.57z)| lr 4.26e-05 | 8464.20 ms | -100.0% bf16 MFU | 62074 tok/s +step 16325/19560 | loss 3.407817 (+2.14z)| norm 0.2859 (+3.03z)| lr 4.25e-05 | 8457.11 ms | -100.0% bf16 MFU | 62070 tok/s +step 16326/19560 | loss 3.360573 (+1.10z)| norm 0.2493 (-0.66z)| lr 4.25e-05 | 8460.95 ms | -100.0% bf16 MFU | 62065 tok/s +step 16327/19560 | loss 3.293607 (-0.37z)| norm 0.2533 (-0.26z)| lr 4.25e-05 | 8455.69 ms | -100.0% bf16 MFU | 62062 tok/s +step 16328/19560 | loss 3.328755 (+0.39z)| norm 0.2669 (+1.09z)| lr 4.25e-05 | 8458.08 ms | -100.0% bf16 MFU | 62058 tok/s +step 16329/19560 | loss 3.351035 (+0.88z)| norm 0.2427 (-1.31z)| lr 4.24e-05 | 8455.60 ms | -100.0% bf16 MFU | 62055 tok/s +step 16330/19560 | loss 3.288985 (-0.49z)| norm 0.2578 (+0.19z)| lr 4.24e-05 | 8461.77 ms | -100.0% bf16 MFU | 62050 tok/s +step 16331/19560 | loss 3.273242 (-0.83z)| norm 0.2590 (+0.30z)| lr 4.24e-05 | 8453.81 ms | -100.0% bf16 MFU | 62049 tok/s +step 16332/19560 | loss 3.362542 (+1.12z)| norm 0.2605 (+0.45z)| lr 4.24e-05 | 8455.42 ms | -100.0% bf16 MFU | 62047 tok/s +step 16333/19560 | loss 3.315660 (+0.09z)| norm 0.2452 (-1.07z)| lr 4.23e-05 | 8449.60 ms | -100.0% bf16 MFU | 62047 tok/s +step 16334/19560 | loss 3.342688 (+0.68z)| norm 0.2615 (+0.56z)| lr 4.23e-05 | 8452.62 ms | -100.0% bf16 MFU | 62046 tok/s +step 16335/19560 | loss 3.299837 (-0.27z)| norm 0.2645 (+0.85z)| lr 4.23e-05 | 8448.95 ms | -100.0% bf16 MFU | 62046 tok/s +step 16336/19560 | loss 3.330875 (+0.41z)| norm 0.2534 (-0.27z)| lr 4.23e-05 | 8454.39 ms | -100.0% bf16 MFU | 62045 tok/s +step 16337/19560 | loss 3.281933 (-0.66z)| norm 0.2606 (+0.45z)| lr 4.22e-05 | 8455.41 ms | -100.0% bf16 MFU | 62043 tok/s +step 16338/19560 | loss 3.321199 (+0.21z)| norm 0.2405 (-1.54z)| lr 4.22e-05 | 8456.08 ms | -100.0% bf16 MFU | 62041 tok/s +step 16339/19560 | loss 3.341451 (+0.66z)| norm 0.2488 (-0.72z)| lr 4.22e-05 | 8454.87 ms | -100.0% bf16 MFU | 62039 tok/s +step 16340/19560 | loss 3.322953 (+0.24z)| norm 0.2442 (-1.19z)| lr 4.22e-05 | 8450.81 ms | -100.0% bf16 MFU | 62039 tok/s +step 16341/19560 | loss 3.306332 (-0.13z)| norm 0.2456 (-1.03z)| lr 4.21e-05 | 8455.86 ms | -100.0% bf16 MFU | 62037 tok/s +step 16342/19560 | loss 3.299760 (-0.29z)| norm 0.2550 (-0.09z)| lr 4.21e-05 | 8452.46 ms | -100.0% bf16 MFU | 62037 tok/s +step 16343/19560 | loss 3.398700 (+1.92z)| norm 0.2546 (-0.13z)| lr 4.21e-05 | 8452.44 ms | -100.0% bf16 MFU | 62036 tok/s +step 16344/19560 | loss 3.339391 (+0.59z)| norm 0.2670 (+1.11z)| lr 4.21e-05 | 8451.54 ms | -100.0% bf16 MFU | 62036 tok/s +step 16345/19560 | loss 3.339408 (+0.60z)| norm 0.2455 (-1.04z)| lr 4.20e-05 | 8455.30 ms | -100.0% bf16 MFU | 62035 tok/s +step 16346/19560 | loss 3.311972 (-0.03z)| norm 0.2575 (+0.18z)| lr 4.20e-05 | 8449.75 ms | -100.0% bf16 MFU | 62035 tok/s +step 16347/19560 | loss 3.290514 (-0.51z)| norm 0.2613 (+0.58z)| lr 4.20e-05 | 8447.27 ms | -100.0% bf16 MFU | 62037 tok/s +step 16348/19560 | loss 3.394243 (+1.81z)| norm 0.2531 (-0.25z)| lr 4.20e-05 | 8448.96 ms | -100.0% bf16 MFU | 62038 tok/s +step 16349/19560 | loss 3.339625 (+0.57z)| norm 0.2779 (+2.22z)| lr 4.19e-05 | 8448.64 ms | -100.0% bf16 MFU | 62039 tok/s +step 16350/19560 | loss 3.254157 (-1.34z)| norm 0.2582 (+0.25z)| lr 4.19e-05 | 8454.53 ms | -100.0% bf16 MFU | 62037 tok/s +step 16351/19560 | loss 3.381049 (+1.48z)| norm 0.2667 (+1.12z)| lr 4.19e-05 | 8450.37 ms | -100.0% bf16 MFU | 62038 tok/s +step 16352/19560 | loss 3.235960 (-1.71z)| norm 0.2760 (+2.00z)| lr 4.18e-05 | 8447.62 ms | -100.0% bf16 MFU | 62039 tok/s +step 16353/19560 | loss 3.357567 (+1.00z)| norm 0.2586 (+0.27z)| lr 4.18e-05 | 8450.65 ms | -100.0% bf16 MFU | 62039 tok/s +step 16354/19560 | loss 3.310923 (-0.03z)| norm 0.2560 (+0.01z)| lr 4.18e-05 | 8449.31 ms | -100.0% bf16 MFU | 62040 tok/s +step 16355/19560 | loss 3.300812 (-0.27z)| norm 0.2545 (-0.12z)| lr 4.18e-05 | 8451.61 ms | -100.0% bf16 MFU | 62039 tok/s +step 16356/19560 | loss 3.350775 (+0.85z)| norm 0.2425 (-1.31z)| lr 4.17e-05 | 8447.41 ms | -100.0% bf16 MFU | 62041 tok/s +step 16357/19560 | loss 3.316241 (+0.08z)| norm 0.2552 (-0.05z)| lr 4.17e-05 | 8456.11 ms | -100.0% bf16 MFU | 62039 tok/s +step 16358/19560 | loss 3.302779 (-0.24z)| norm 0.2520 (-0.36z)| lr 4.17e-05 | 8452.57 ms | -100.0% bf16 MFU | 62038 tok/s +step 16359/19560 | loss 3.364406 (+1.15z)| norm 0.2569 (+0.13z)| lr 4.17e-05 | 8446.74 ms | -100.0% bf16 MFU | 62040 tok/s +step 16360/19560 | loss 3.322291 (+0.20z)| norm 0.2457 (-1.01z)| lr 4.16e-05 | 8453.94 ms | -100.0% bf16 MFU | 62039 tok/s +step 16361/19560 | loss 3.318409 (+0.10z)| norm 0.2487 (-0.70z)| lr 4.16e-05 | 8448.12 ms | -100.0% bf16 MFU | 62040 tok/s +step 16362/19560 | loss 3.288386 (-0.57z)| norm 0.2479 (-0.78z)| lr 4.16e-05 | 8450.11 ms | -100.0% bf16 MFU | 62040 tok/s +step 16363/19560 | loss 3.328754 (+0.34z)| norm 0.2553 (-0.04z)| lr 4.16e-05 | 8446.76 ms | -100.0% bf16 MFU | 62041 tok/s +step 16364/19560 | loss 3.336780 (+0.53z)| norm 0.2491 (-0.65z)| lr 4.15e-05 | 8448.38 ms | -100.0% bf16 MFU | 62042 tok/s +step 16365/19560 | loss 3.294075 (-0.45z)| norm 0.2469 (-0.87z)| lr 4.15e-05 | 8446.91 ms | -100.0% bf16 MFU | 62044 tok/s +step 16366/19560 | loss 3.281379 (-0.75z)| norm 0.2508 (-0.49z)| lr 4.15e-05 | 8446.98 ms | -100.0% bf16 MFU | 62045 tok/s +step 16367/19560 | loss 3.336237 (+0.53z)| norm 0.2530 (-0.27z)| lr 4.15e-05 | 8445.96 ms | -100.0% bf16 MFU | 62046 tok/s +step 16368/19560 | loss 3.342623 (+0.67z)| norm 0.2379 (-1.76z)| lr 4.14e-05 | 8444.64 ms | -100.0% bf16 MFU | 62048 tok/s +step 16369/19560 | loss 3.381628 (+1.55z)| norm 0.2670 (+1.14z)| lr 4.14e-05 | 8445.39 ms | -100.0% bf16 MFU | 62050 tok/s +step 16370/19560 | loss 3.349483 (+0.81z)| norm 0.2523 (-0.35z)| lr 4.14e-05 | 8450.86 ms | -100.0% bf16 MFU | 62049 tok/s +step 16371/19560 | loss 3.351138 (+0.84z)| norm 0.2432 (-1.26z)| lr 4.14e-05 | 8447.88 ms | -100.0% bf16 MFU | 62050 tok/s +step 16372/19560 | loss 3.350134 (+0.81z)| norm 0.2440 (-1.16z)| lr 4.13e-05 | 8445.00 ms | -100.0% bf16 MFU | 62052 tok/s +step 16373/19560 | loss 3.380637 (+1.51z)| norm 0.2570 (+0.14z)| lr 4.13e-05 | 8444.75 ms | -100.0% bf16 MFU | 62053 tok/s +step 16374/19560 | loss 3.415285 (+2.24z)| norm 0.3056 (+4.58z)| lr 4.13e-05 | 8447.55 ms | -100.0% bf16 MFU | 62054 tok/s +step 16375/19560 | loss 3.315333 (-0.03z)| norm 0.2469 (-0.82z)| lr 4.13e-05 | 8441.67 ms | -100.0% bf16 MFU | 62056 tok/s +step 16376/19560 | loss 3.312613 (-0.10z)| norm 0.2559 (+0.02z)| lr 4.12e-05 | 8443.79 ms | -100.0% bf16 MFU | 62058 tok/s +step 16377/19560 | loss 3.332780 (+0.37z)| norm 0.2527 (-0.27z)| lr 4.12e-05 | 8442.23 ms | -100.0% bf16 MFU | 62060 tok/s +step 16378/19560 | loss 3.451597 (+2.98z)| norm 0.2647 (+0.84z)| lr 4.12e-05 | 8439.19 ms | -100.0% bf16 MFU | 62064 tok/s +step 16379/19560 | loss 3.278436 (-0.87z)| norm 0.3452 (+6.73z)| lr 4.12e-05 | 8448.33 ms | -100.0% bf16 MFU | 62063 tok/s +step 16380/19560 | loss 3.322181 (+0.10z)| norm 0.2814 (+1.86z)| lr 4.11e-05 | 8441.03 ms | -100.0% bf16 MFU | 62066 tok/s +step 16381/19560 | loss 3.415820 (+2.13z)| norm 0.2593 (+0.21z)| lr 4.11e-05 | 8441.19 ms | -100.0% bf16 MFU | 62068 tok/s +step 16382/19560 | loss 3.309204 (-0.21z)| norm 0.2624 (+0.44z)| lr 4.11e-05 | 8440.89 ms | -100.0% bf16 MFU | 62070 tok/s +step 16383/19560 | loss 3.300311 (-0.41z)| norm 0.2579 (+0.11z)| lr 4.11e-05 | 8439.84 ms | -100.0% bf16 MFU | 62073 tok/s +step 16384/19560 | loss 3.288997 (-0.65z)| norm 0.2418 (-1.09z)| lr 4.10e-05 | 8451.67 ms | -100.0% bf16 MFU | 62071 tok/s +step 16385/19560 | loss 3.299900 (-0.41z)| norm 0.2608 (+0.33z)| lr 4.10e-05 | 8445.44 ms | -100.0% bf16 MFU | 62071 tok/s +step 16386/19560 | loss 3.279730 (-0.85z)| norm 0.2349 (-1.58z)| lr 4.10e-05 | 8443.34 ms | -100.0% bf16 MFU | 62072 tok/s +step 16387/19560 | loss 3.294872 (-0.52z)| norm 0.2477 (-0.62z)| lr 4.10e-05 | 8445.61 ms | -100.0% bf16 MFU | 62073 tok/s +step 16388/19560 | loss 3.302595 (-0.35z)| norm 0.2468 (-0.68z)| lr 4.09e-05 | 8442.02 ms | -100.0% bf16 MFU | 62074 tok/s +step 16389/19560 | loss 3.292753 (-0.57z)| norm 0.2407 (-1.12z)| lr 4.09e-05 | 8444.50 ms | -100.0% bf16 MFU | 62075 tok/s +step 16390/19560 | loss 3.564636 (+4.94z)| norm 0.3331 (+5.03z)| lr 4.09e-05 | 8446.06 ms | -100.0% bf16 MFU | 62075 tok/s +step 16391/19560 | loss 3.376173 (+1.12z)| norm 0.2457 (-0.71z)| lr 4.09e-05 | 8439.74 ms | -100.0% bf16 MFU | 62077 tok/s +step 16392/19560 | loss 3.326830 (+0.13z)| norm 0.2636 (+0.47z)| lr 4.08e-05 | 8446.65 ms | -100.0% bf16 MFU | 62077 tok/s +step 16393/19560 | loss 3.397252 (+1.52z)| norm 0.2935 (+2.39z)| lr 4.08e-05 | 8447.74 ms | -100.0% bf16 MFU | 62076 tok/s +step 16394/19560 | loss 3.279712 (-0.83z)| norm 0.2475 (-0.58z)| lr 4.08e-05 | 8440.57 ms | -100.0% bf16 MFU | 62078 tok/s +step 16395/19560 | loss 3.356788 (+0.70z)| norm 0.2646 (+0.52z)| lr 4.08e-05 | 8445.08 ms | -100.0% bf16 MFU | 62078 tok/s +step 16396/19560 | loss 3.326483 (+0.09z)| norm 0.2535 (-0.18z)| lr 4.07e-05 | 8447.88 ms | -100.0% bf16 MFU | 62078 tok/s +step 16397/19560 | loss 3.310071 (-0.24z)| norm 0.2423 (-0.92z)| lr 4.07e-05 | 8445.15 ms | -100.0% bf16 MFU | 62078 tok/s +step 16398/19560 | loss 3.307169 (-0.30z)| norm 0.2467 (-0.62z)| lr 4.07e-05 | 8440.54 ms | -100.0% bf16 MFU | 62080 tok/s +step 16399/19560 | loss 3.262176 (-1.18z)| norm 0.2388 (-1.13z)| lr 4.07e-05 | 8438.40 ms | -100.0% bf16 MFU | 62082 tok/s +step 16400/19560 | loss 3.277863 (-0.86z)| norm 0.2576 (+0.13z)| lr 4.06e-05 | 8444.16 ms | -100.0% bf16 MFU | 62083 tok/s +step 16401/19560 | loss 3.313108 (-0.15z)| norm 0.2462 (-0.62z)| lr 4.06e-05 | 8444.43 ms | -100.0% bf16 MFU | 62083 tok/s +step 16402/19560 | loss 3.391821 (+1.41z)| norm 0.2662 (+0.70z)| lr 4.06e-05 | 8441.00 ms | -100.0% bf16 MFU | 62084 tok/s +step 16403/19560 | loss 3.266484 (-1.07z)| norm 0.2557 (+0.01z)| lr 4.06e-05 | 8448.13 ms | -100.0% bf16 MFU | 62083 tok/s +step 16404/19560 | loss 3.294787 (-0.50z)| norm 0.2823 (+1.75z)| lr 4.05e-05 | 8441.85 ms | -100.0% bf16 MFU | 62084 tok/s +step 16405/19560 | loss 3.347340 (+0.53z)| norm 0.2579 (+0.14z)| lr 4.05e-05 | 8440.97 ms | -100.0% bf16 MFU | 62086 tok/s +step 16406/19560 | loss 3.263009 (-1.14z)| norm 0.2546 (-0.08z)| lr 4.05e-05 | 8441.83 ms | -100.0% bf16 MFU | 62087 tok/s +step 16407/19560 | loss 3.344021 (+0.45z)| norm 0.2552 (-0.05z)| lr 4.05e-05 | 8443.52 ms | -100.0% bf16 MFU | 62087 tok/s +step 16408/19560 | loss 3.375056 (+1.05z)| norm 0.2845 (+1.85z)| lr 4.04e-05 | 8445.84 ms | -100.0% bf16 MFU | 62086 tok/s +step 16409/19560 | loss 3.336951 (+0.29z)| norm 0.2527 (-0.24z)| lr 4.04e-05 | 8442.69 ms | -100.0% bf16 MFU | 62087 tok/s +step 16410/19560 | loss 3.406244 (+1.64z)| norm 0.2551 (-0.08z)| lr 4.04e-05 | 8441.76 ms | -100.0% bf16 MFU | 62088 tok/s +step 16411/19560 | loss 3.303132 (-0.39z)| norm 0.2543 (-0.14z)| lr 4.04e-05 | 8445.46 ms | -100.0% bf16 MFU | 62088 tok/s +step 16412/19560 | loss 3.397276 (+1.44z)| norm 0.2546 (-0.12z)| lr 4.03e-05 | 8440.15 ms | -100.0% bf16 MFU | 62089 tok/s +step 16413/19560 | loss 3.318515 (-0.11z)| norm 0.2434 (-0.86z)| lr 4.03e-05 | 8441.02 ms | -100.0% bf16 MFU | 62090 tok/s +step 16414/19560 | loss 3.319089 (-0.09z)| norm 0.2533 (-0.21z)| lr 4.03e-05 | 8443.01 ms | -100.0% bf16 MFU | 62091 tok/s +step 16415/19560 | loss 3.292909 (-0.60z)| norm 0.2498 (-0.44z)| lr 4.03e-05 | 8437.99 ms | -100.0% bf16 MFU | 62093 tok/s +step 16416/19560 | loss 3.386149 (+1.21z)| norm 0.2535 (-0.20z)| lr 4.02e-05 | 8441.99 ms | -100.0% bf16 MFU | 62093 tok/s +step 16417/19560 | loss 3.286264 (-0.74z)| norm 0.2423 (-0.93z)| lr 4.02e-05 | 8442.23 ms | -100.0% bf16 MFU | 62094 tok/s +step 16418/19560 | loss 3.312005 (-0.24z)| norm 0.2393 (-1.13z)| lr 4.02e-05 | 8439.10 ms | -100.0% bf16 MFU | 62095 tok/s +step 16419/19560 | loss 3.351293 (+0.52z)| norm 0.2468 (-0.63z)| lr 4.02e-05 | 8443.98 ms | -100.0% bf16 MFU | 62095 tok/s +step 16420/19560 | loss 3.259240 (-1.27z)| norm 0.2397 (-1.09z)| lr 4.01e-05 | 8445.06 ms | -100.0% bf16 MFU | 62095 tok/s +step 16421/19560 | loss 3.251830 (-1.40z)| norm 0.2542 (-0.15z)| lr 4.01e-05 | 8440.95 ms | -100.0% bf16 MFU | 62095 tok/s +step 16422/19560 | loss 3.393275 (+1.32z)| norm 0.2521 (-0.29z)| lr 4.01e-05 | 8440.72 ms | -100.0% bf16 MFU | 62096 tok/s +step 16423/19560 | loss 3.309915 (-0.28z)| norm 0.2473 (-0.60z)| lr 4.01e-05 | 8440.92 ms | -100.0% bf16 MFU | 62097 tok/s +step 16424/19560 | loss 3.282253 (-0.81z)| norm 0.2479 (-0.56z)| lr 4.00e-05 | 8441.35 ms | -100.0% bf16 MFU | 62098 tok/s +step 16425/19560 | loss 3.309419 (-0.28z)| norm 0.2486 (-0.50z)| lr 4.00e-05 | 8433.85 ms | -100.0% bf16 MFU | 62101 tok/s +step 16426/19560 | loss 3.357323 (+0.62z)| norm 0.2430 (-0.86z)| lr 4.00e-05 | 8439.83 ms | -100.0% bf16 MFU | 62102 tok/s +step 16427/19560 | loss 3.377613 (+1.01z)| norm 0.2532 (-0.19z)| lr 4.00e-05 | 8436.81 ms | -100.0% bf16 MFU | 62104 tok/s +step 16428/19560 | loss 3.285867 (-0.76z)| norm 0.2425 (-0.89z)| lr 3.99e-05 | 8443.25 ms | -100.0% bf16 MFU | 62104 tok/s +step 16429/19560 | loss 3.419334 (+1.78z)| norm 0.2652 (+0.59z)| lr 3.99e-05 | 8437.81 ms | -100.0% bf16 MFU | 62105 tok/s +step 16430/19560 | loss 3.341294 (+0.27z)| norm 0.2709 (+0.95z)| lr 3.99e-05 | 8440.66 ms | -100.0% bf16 MFU | 62106 tok/s +step 16431/19560 | loss 3.324911 (-0.05z)| norm 0.2759 (+1.27z)| lr 3.99e-05 | 8440.06 ms | -100.0% bf16 MFU | 62106 tok/s +step 16432/19560 | loss 3.364497 (+0.70z)| norm 0.2628 (+0.41z)| lr 3.98e-05 | 8436.07 ms | -100.0% bf16 MFU | 62109 tok/s +step 16433/19560 | loss 3.266417 (-1.19z)| norm 0.2546 (-0.13z)| lr 3.98e-05 | 8438.05 ms | -100.0% bf16 MFU | 62110 tok/s +step 16434/19560 | loss 3.359607 (+0.60z)| norm 0.2708 (+0.92z)| lr 3.98e-05 | 8436.57 ms | -100.0% bf16 MFU | 62112 tok/s +step 16435/19560 | loss 3.345674 (+0.33z)| norm 0.2719 (+0.98z)| lr 3.98e-05 | 8440.50 ms | -100.0% bf16 MFU | 62112 tok/s +step 16436/19560 | loss 3.315015 (-0.25z)| norm 0.2473 (-0.62z)| lr 3.97e-05 | 8440.95 ms | -100.0% bf16 MFU | 62112 tok/s +step 16437/19560 | loss 3.383983 (+1.13z)| norm 0.2901 (+2.11z)| lr 3.97e-05 | 8442.27 ms | -100.0% bf16 MFU | 62111 tok/s +step 16438/19560 | loss 3.331896 (+0.08z)| norm 0.2561 (-0.06z)| lr 3.97e-05 | 8435.93 ms | -100.0% bf16 MFU | 62113 tok/s +step 16439/19560 | loss 3.277132 (-1.03z)| norm 0.2876 (+1.91z)| lr 3.97e-05 | 8437.76 ms | -100.0% bf16 MFU | 62114 tok/s +step 16440/19560 | loss 3.358664 (+0.61z)| norm 0.2690 (+0.72z)| lr 3.96e-05 | 8439.82 ms | -100.0% bf16 MFU | 62115 tok/s +step 16441/19560 | loss 3.331801 (+0.06z)| norm 0.2594 (+0.13z)| lr 3.96e-05 | 8439.86 ms | -100.0% bf16 MFU | 62115 tok/s +step 16442/19560 | loss 3.328949 (-0.02z)| norm 0.2665 (+0.57z)| lr 3.96e-05 | 8438.68 ms | -100.0% bf16 MFU | 62116 tok/s +step 16443/19560 | loss 3.280303 (-1.04z)| norm 0.2657 (+0.51z)| lr 3.96e-05 | 8438.18 ms | -100.0% bf16 MFU | 62117 tok/s +step 16444/19560 | loss 3.303112 (-0.57z)| norm 0.2499 (-0.48z)| lr 3.95e-05 | 8443.59 ms | -100.0% bf16 MFU | 62115 tok/s +step 16445/19560 | loss 3.303760 (-0.55z)| norm 0.2653 (+0.48z)| lr 3.95e-05 | 8436.78 ms | -100.0% bf16 MFU | 62117 tok/s +step 16446/19560 | loss 3.380030 (+1.03z)| norm 0.2854 (+1.71z)| lr 3.95e-05 | 8437.53 ms | -100.0% bf16 MFU | 62118 tok/s +step 16447/19560 | loss 3.280962 (-1.03z)| norm 0.2446 (-0.81z)| lr 3.95e-05 | 8436.57 ms | -100.0% bf16 MFU | 62119 tok/s +step 16448/19560 | loss 3.317915 (-0.25z)| norm 0.2428 (-0.91z)| lr 3.94e-05 | 8443.20 ms | -100.0% bf16 MFU | 62118 tok/s +step 16449/19560 | loss 3.418156 (+1.82z)| norm 0.2867 (+1.76z)| lr 3.94e-05 | 8435.83 ms | -100.0% bf16 MFU | 62120 tok/s +step 16450/19560 | loss 3.262839 (-1.41z)| norm 0.2402 (-1.07z)| lr 3.94e-05 | 8438.88 ms | -100.0% bf16 MFU | 62120 tok/s +step 16451/19560 | loss 3.310321 (-0.41z)| norm 0.2538 (-0.24z)| lr 3.94e-05 | 8438.24 ms | -100.0% bf16 MFU | 62121 tok/s +step 16452/19560 | loss 3.349285 (+0.42z)| norm 0.2420 (-0.95z)| lr 3.93e-05 | 8437.96 ms | -100.0% bf16 MFU | 62121 tok/s +step 16453/19560 | loss 3.261778 (-1.42z)| norm 0.2445 (-0.79z)| lr 3.93e-05 | 8435.81 ms | -100.0% bf16 MFU | 62123 tok/s +step 16454/19560 | loss 3.308998 (-0.40z)| norm 0.2461 (-0.68z)| lr 3.93e-05 | 8437.77 ms | -100.0% bf16 MFU | 62123 tok/s +step 16455/19560 | loss 3.343516 (+0.33z)| norm 0.2670 (+0.58z)| lr 3.93e-05 | 8440.04 ms | -100.0% bf16 MFU | 62123 tok/s +step 16456/19560 | loss 3.318257 (-0.21z)| norm 0.2579 (+0.03z)| lr 3.92e-05 | 8437.43 ms | -100.0% bf16 MFU | 62124 tok/s +step 16457/19560 | loss 3.364474 (+0.77z)| norm 0.2639 (+0.39z)| lr 3.92e-05 | 8440.54 ms | -100.0% bf16 MFU | 62124 tok/s +step 16458/19560 | loss 3.398829 (+1.48z)| norm 0.2674 (+0.60z)| lr 3.92e-05 | 8440.52 ms | -100.0% bf16 MFU | 62123 tok/s +step 16459/19560 | loss 3.296155 (-0.71z)| norm 0.2548 (-0.17z)| lr 3.92e-05 | 8439.52 ms | -100.0% bf16 MFU | 62123 tok/s +step 16460/19560 | loss 3.271605 (-1.21z)| norm 0.2529 (-0.28z)| lr 3.91e-05 | 8436.84 ms | -100.0% bf16 MFU | 62124 tok/s +step 16461/19560 | loss 3.318660 (-0.21z)| norm 0.2539 (-0.23z)| lr 3.91e-05 | 8442.54 ms | -100.0% bf16 MFU | 62123 tok/s +step 16462/19560 | loss 3.349208 (+0.44z)| norm 0.2654 (+0.48z)| lr 3.91e-05 | 8434.43 ms | -100.0% bf16 MFU | 62125 tok/s +step 16463/19560 | loss 3.388398 (+1.25z)| norm 0.2467 (-0.66z)| lr 3.91e-05 | 8440.20 ms | -100.0% bf16 MFU | 62124 tok/s +step 16464/19560 | loss 3.303963 (-0.53z)| norm 0.2378 (-1.19z)| lr 3.90e-05 | 8436.35 ms | -100.0% bf16 MFU | 62126 tok/s +step 16465/19560 | loss 3.445125 (+2.38z)| norm 0.2672 (+0.60z)| lr 3.90e-05 | 8436.32 ms | -100.0% bf16 MFU | 62127 tok/s +step 16466/19560 | loss 3.298675 (-0.65z)| norm 0.2546 (-0.17z)| lr 3.90e-05 | 8437.38 ms | -100.0% bf16 MFU | 62127 tok/s +step 16467/19560 | loss 3.275190 (-1.12z)| norm 0.2616 (+0.25z)| lr 3.90e-05 | 8438.55 ms | -100.0% bf16 MFU | 62127 tok/s +step 16468/19560 | loss 3.311853 (-0.37z)| norm 0.2460 (-0.71z)| lr 3.89e-05 | 8437.47 ms | -100.0% bf16 MFU | 62128 tok/s +step 16469/19560 | loss 3.316304 (-0.28z)| norm 0.2504 (-0.44z)| lr 3.89e-05 | 8438.99 ms | -100.0% bf16 MFU | 62128 tok/s +step 16470/19560 | loss 3.335944 (+0.12z)| norm 0.2487 (-0.55z)| lr 3.89e-05 | 8438.82 ms | -100.0% bf16 MFU | 62128 tok/s +step 16471/19560 | loss 3.325697 (-0.08z)| norm 0.2419 (-0.95z)| lr 3.89e-05 | 8438.18 ms | -100.0% bf16 MFU | 62128 tok/s +step 16472/19560 | loss 3.289586 (-0.82z)| norm 0.2494 (-0.49z)| lr 3.88e-05 | 8439.60 ms | -100.0% bf16 MFU | 62128 tok/s +step 16473/19560 | loss 3.335312 (+0.13z)| norm 0.2443 (-0.79z)| lr 3.88e-05 | 8438.92 ms | -100.0% bf16 MFU | 62128 tok/s +step 16474/19560 | loss 3.280568 (-1.00z)| norm 0.2574 (+0.00z)| lr 3.88e-05 | 8438.82 ms | -100.0% bf16 MFU | 62128 tok/s +step 16475/19560 | loss 3.361306 (+0.66z)| norm 0.2466 (-0.65z)| lr 3.88e-05 | 8440.49 ms | -100.0% bf16 MFU | 62127 tok/s +step 16476/19560 | loss 3.282994 (-0.94z)| norm 0.2378 (-1.17z)| lr 3.87e-05 | 8435.76 ms | -100.0% bf16 MFU | 62128 tok/s +step 16477/19560 | loss 3.306090 (-0.46z)| norm 0.2586 (+0.10z)| lr 3.87e-05 | 8435.27 ms | -100.0% bf16 MFU | 62130 tok/s +step 16478/19560 | loss 3.388093 (+1.23z)| norm 0.2533 (-0.22z)| lr 3.87e-05 | 8439.76 ms | -100.0% bf16 MFU | 62129 tok/s +step 16479/19560 | loss 3.253251 (-1.56z)| norm 0.2430 (-0.83z)| lr 3.87e-05 | 8435.80 ms | -100.0% bf16 MFU | 62130 tok/s +step 16480/19560 | loss 3.287850 (-0.86z)| norm 0.2424 (-0.86z)| lr 3.86e-05 | 8438.54 ms | -100.0% bf16 MFU | 62130 tok/s +step 16481/19560 | loss 3.359305 (+0.64z)| norm 0.2555 (-0.06z)| lr 3.86e-05 | 8437.48 ms | -100.0% bf16 MFU | 62131 tok/s +step 16482/19560 | loss 3.358669 (+0.62z)| norm 0.2524 (-0.24z)| lr 3.86e-05 | 8438.07 ms | -100.0% bf16 MFU | 62131 tok/s +step 16483/19560 | loss 3.407673 (+1.62z)| norm 0.2754 (+1.14z)| lr 3.86e-05 | 8438.07 ms | -100.0% bf16 MFU | 62131 tok/s +step 16484/19560 | loss 3.354407 (+0.51z)| norm 0.2499 (-0.41z)| lr 3.86e-05 | 8437.98 ms | -100.0% bf16 MFU | 62131 tok/s +step 16485/19560 | loss 3.317710 (-0.25z)| norm 0.2880 (+1.87z)| lr 3.85e-05 | 8435.61 ms | -100.0% bf16 MFU | 62132 tok/s +step 16486/19560 | loss 3.320287 (-0.20z)| norm 0.2416 (-0.91z)| lr 3.85e-05 | 8438.57 ms | -100.0% bf16 MFU | 62132 tok/s +step 16487/19560 | loss 3.332254 (+0.05z)| norm 0.2728 (+0.95z)| lr 3.85e-05 | 8437.72 ms | -100.0% bf16 MFU | 62132 tok/s +step 16488/19560 | loss 3.304036 (-0.53z)| norm 0.2539 (-0.18z)| lr 3.85e-05 | 8437.98 ms | -100.0% bf16 MFU | 62132 tok/s +step 16489/19560 | loss 3.271676 (-1.19z)| norm 0.2495 (-0.45z)| lr 3.84e-05 | 8435.97 ms | -100.0% bf16 MFU | 62133 tok/s +step 16490/19560 | loss 3.268797 (-1.24z)| norm 0.2667 (+0.57z)| lr 3.84e-05 | 8436.54 ms | -100.0% bf16 MFU | 62134 tok/s +step 16491/19560 | loss 3.308502 (-0.42z)| norm 0.2579 (+0.04z)| lr 3.84e-05 | 8438.88 ms | -100.0% bf16 MFU | 62133 tok/s +step 16492/19560 | loss 3.265165 (-1.30z)| norm 0.2471 (-0.60z)| lr 3.84e-05 | 8437.72 ms | -100.0% bf16 MFU | 62134 tok/s +step 16493/19560 | loss 3.321533 (-0.15z)| norm 0.2601 (+0.17z)| lr 3.83e-05 | 8438.18 ms | -100.0% bf16 MFU | 62134 tok/s +step 16494/19560 | loss 3.352180 (+0.47z)| norm 0.2787 (+1.26z)| lr 3.83e-05 | 8437.38 ms | -100.0% bf16 MFU | 62134 tok/s +step 16495/19560 | loss 3.276769 (-1.06z)| norm 0.2596 (+0.12z)| lr 3.83e-05 | 8434.89 ms | -100.0% bf16 MFU | 62135 tok/s +step 16496/19560 | loss 3.313204 (-0.31z)| norm 0.2441 (-0.80z)| lr 3.83e-05 | 8436.05 ms | -100.0% bf16 MFU | 62136 tok/s +step 16497/19560 | loss 3.337235 (+0.19z)| norm 0.2496 (-0.47z)| lr 3.82e-05 | 8436.71 ms | -100.0% bf16 MFU | 62136 tok/s +step 16498/19560 | loss 3.362144 (+0.70z)| norm 0.2579 (+0.03z)| lr 3.82e-05 | 8437.90 ms | -100.0% bf16 MFU | 62136 tok/s +step 16499/19560 | loss 3.309368 (-0.38z)| norm 0.2673 (+0.57z)| lr 3.82e-05 | 8437.65 ms | -100.0% bf16 MFU | 62136 tok/s +step 16500/19560 | loss 3.275680 (-1.06z)| norm 0.2584 (+0.04z)| lr 3.82e-05 | 8436.51 ms | -100.0% bf16 MFU | 62136 tok/s +val loss 3.276960 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 3000/10042 = 0.298745 +step 16501/19560 | loss 3.326149 (-0.02z)| norm 0.2596 (+0.11z)| lr 3.81e-05 | 8435.95 ms | -100.0% bf16 MFU | 62137 tok/s +step 16502/19560 | loss 3.372464 (+0.95z)| norm 0.2509 (-0.40z)| lr 3.81e-05 | 8437.87 ms | -100.0% bf16 MFU | 62137 tok/s +step 16503/19560 | loss 3.314226 (-0.26z)| norm 0.2658 (+0.51z)| lr 3.81e-05 | 8435.75 ms | -100.0% bf16 MFU | 62138 tok/s +step 16504/19560 | loss 3.309063 (-0.36z)| norm 0.2572 (-0.02z)| lr 3.81e-05 | 8438.04 ms | -100.0% bf16 MFU | 62138 tok/s +step 16505/19560 | loss 3.291755 (-0.71z)| norm 0.2404 (-1.05z)| lr 3.80e-05 | 8447.78 ms | -100.0% bf16 MFU | 62134 tok/s +step 16506/19560 | loss 3.294373 (-0.65z)| norm 0.2455 (-0.72z)| lr 3.80e-05 | 8467.22 ms | -100.0% bf16 MFU | 62123 tok/s +step 16507/19560 | loss 3.333658 (+0.18z)| norm 0.2668 (+0.71z)| lr 3.80e-05 | 8462.37 ms | -100.0% bf16 MFU | 62115 tok/s +step 16508/19560 | loss 3.354274 (+0.61z)| norm 0.2518 (-0.33z)| lr 3.80e-05 | 8468.78 ms | -100.0% bf16 MFU | 62104 tok/s +step 16509/19560 | loss 3.302117 (-0.49z)| norm 0.2577 (+0.09z)| lr 3.79e-05 | 8463.74 ms | -100.0% bf16 MFU | 62096 tok/s +step 16510/19560 | loss 3.302986 (-0.47z)| norm 0.2463 (-0.71z)| lr 3.79e-05 | 8459.35 ms | -100.0% bf16 MFU | 62090 tok/s +step 16511/19560 | loss 3.324216 (-0.02z)| norm 0.2462 (-0.71z)| lr 3.79e-05 | 8465.63 ms | -100.0% bf16 MFU | 62082 tok/s +step 16512/19560 | loss 3.346754 (+0.46z)| norm 0.2644 (+0.57z)| lr 3.79e-05 | 8464.35 ms | -100.0% bf16 MFU | 62075 tok/s +step 16513/19560 | loss 3.291555 (-0.73z)| norm 0.2459 (-0.74z)| lr 3.78e-05 | 8465.60 ms | -100.0% bf16 MFU | 62068 tok/s +step 16514/19560 | loss 3.309639 (-0.35z)| norm 0.2501 (-0.45z)| lr 3.78e-05 | 8457.67 ms | -100.0% bf16 MFU | 62064 tok/s +step 16515/19560 | loss 3.271304 (-1.17z)| norm 0.2419 (-1.03z)| lr 3.78e-05 | 8457.35 ms | -100.0% bf16 MFU | 62061 tok/s +step 16516/19560 | loss 3.296429 (-0.62z)| norm 0.2535 (-0.21z)| lr 3.78e-05 | 8459.00 ms | -100.0% bf16 MFU | 62057 tok/s +step 16517/19560 | loss 3.278394 (-1.01z)| norm 0.2606 (+0.29z)| lr 3.77e-05 | 8456.11 ms | -100.0% bf16 MFU | 62054 tok/s +step 16518/19560 | loss 3.302448 (-0.50z)| norm 0.2429 (-1.06z)| lr 3.77e-05 | 8457.60 ms | -100.0% bf16 MFU | 62051 tok/s +step 16519/19560 | loss 3.305380 (-0.42z)| norm 0.2532 (-0.23z)| lr 3.77e-05 | 8458.19 ms | -100.0% bf16 MFU | 62047 tok/s +step 16520/19560 | loss 3.299075 (-0.57z)| norm 0.2427 (-1.06z)| lr 3.77e-05 | 8457.04 ms | -100.0% bf16 MFU | 62045 tok/s +step 16521/19560 | loss 3.259567 (-1.51z)| norm 0.2518 (-0.31z)| lr 3.76e-05 | 8456.73 ms | -100.0% bf16 MFU | 62042 tok/s +step 16522/19560 | loss 3.381320 (+1.44z)| norm 0.2418 (-1.15z)| lr 3.76e-05 | 8446.44 ms | -100.0% bf16 MFU | 62044 tok/s +step 16523/19560 | loss 3.345481 (+0.57z)| norm 0.2512 (-0.35z)| lr 3.76e-05 | 8456.87 ms | -100.0% bf16 MFU | 62041 tok/s +step 16524/19560 | loss 3.313831 (-0.20z)| norm 0.2412 (-1.18z)| lr 3.76e-05 | 8452.47 ms | -100.0% bf16 MFU | 62041 tok/s +step 16525/19560 | loss 3.276501 (-1.10z)| norm 0.2473 (-0.67z)| lr 3.76e-05 | 8456.50 ms | -100.0% bf16 MFU | 62039 tok/s +step 16526/19560 | loss 3.293840 (-0.67z)| norm 0.2483 (-0.59z)| lr 3.75e-05 | 8453.16 ms | -100.0% bf16 MFU | 62038 tok/s +step 16527/19560 | loss 3.337708 (+0.38z)| norm 0.2569 (+0.13z)| lr 3.75e-05 | 8456.44 ms | -100.0% bf16 MFU | 62036 tok/s +step 16528/19560 | loss 3.298901 (-0.57z)| norm 0.2541 (-0.11z)| lr 3.75e-05 | 8456.50 ms | -100.0% bf16 MFU | 62034 tok/s +step 16529/19560 | loss 3.412444 (+2.15z)| norm 0.2563 (+0.07z)| lr 3.75e-05 | 8457.90 ms | -100.0% bf16 MFU | 62032 tok/s +step 16530/19560 | loss 3.243919 (-1.87z)| norm 0.2514 (-0.34z)| lr 3.74e-05 | 8457.08 ms | -100.0% bf16 MFU | 62030 tok/s +step 16531/19560 | loss 3.322864 (+0.01z)| norm 0.2556 (+0.02z)| lr 3.74e-05 | 8452.88 ms | -100.0% bf16 MFU | 62030 tok/s +step 16532/19560 | loss 3.365685 (+1.03z)| norm 0.2626 (+0.65z)| lr 3.74e-05 | 8449.99 ms | -100.0% bf16 MFU | 62030 tok/s +step 16533/19560 | loss 3.284120 (-0.92z)| norm 0.2539 (-0.11z)| lr 3.74e-05 | 8454.03 ms | -100.0% bf16 MFU | 62030 tok/s +step 16534/19560 | loss 3.277994 (-1.08z)| norm 0.2679 (+1.10z)| lr 3.73e-05 | 8447.21 ms | -100.0% bf16 MFU | 62031 tok/s +step 16535/19560 | loss 3.256318 (-1.57z)| norm 0.2534 (-0.16z)| lr 3.73e-05 | 8448.47 ms | -100.0% bf16 MFU | 62033 tok/s +step 16536/19560 | loss 3.267960 (-1.27z)| norm 0.2414 (-1.20z)| lr 3.73e-05 | 8457.81 ms | -100.0% bf16 MFU | 62031 tok/s +step 16537/19560 | loss 3.320913 (+0.00z)| norm 0.2513 (-0.32z)| lr 3.73e-05 | 8456.20 ms | -100.0% bf16 MFU | 62029 tok/s +step 16538/19560 | loss 3.282092 (-0.92z)| norm 0.2577 (+0.25z)| lr 3.72e-05 | 8456.67 ms | -100.0% bf16 MFU | 62027 tok/s +step 16539/19560 | loss 3.287013 (-0.80z)| norm 0.2534 (-0.13z)| lr 3.72e-05 | 8453.21 ms | -100.0% bf16 MFU | 62027 tok/s +step 16540/19560 | loss 3.319009 (-0.00z)| norm 0.2443 (-0.93z)| lr 3.72e-05 | 8453.64 ms | -100.0% bf16 MFU | 62027 tok/s +step 16541/19560 | loss 3.302670 (-0.40z)| norm 0.2525 (-0.21z)| lr 3.72e-05 | 8452.44 ms | -100.0% bf16 MFU | 62027 tok/s +step 16542/19560 | loss 3.233663 (-2.05z)| norm 0.2579 (+0.26z)| lr 3.71e-05 | 8451.69 ms | -100.0% bf16 MFU | 62027 tok/s +step 16543/19560 | loss 3.341324 (+0.54z)| norm 0.2564 (+0.13z)| lr 3.71e-05 | 8441.89 ms | -100.0% bf16 MFU | 62031 tok/s +step 16544/19560 | loss 3.305282 (-0.31z)| norm 0.2490 (-0.52z)| lr 3.71e-05 | 8452.53 ms | -100.0% bf16 MFU | 62031 tok/s +step 16545/19560 | loss 3.299666 (-0.45z)| norm 0.2557 (+0.06z)| lr 3.71e-05 | 8451.90 ms | -100.0% bf16 MFU | 62031 tok/s +step 16546/19560 | loss 3.296208 (-0.54z)| norm 0.2424 (-1.13z)| lr 3.70e-05 | 8451.18 ms | -100.0% bf16 MFU | 62031 tok/s +step 16547/19560 | loss 3.327732 (+0.24z)| norm 0.2558 (+0.06z)| lr 3.70e-05 | 8450.31 ms | -100.0% bf16 MFU | 62032 tok/s +step 16548/19560 | loss 3.260369 (-1.41z)| norm 0.2542 (-0.09z)| lr 3.70e-05 | 8447.47 ms | -100.0% bf16 MFU | 62034 tok/s +step 16549/19560 | loss 3.271823 (-1.14z)| norm 0.2557 (+0.04z)| lr 3.70e-05 | 8449.84 ms | -100.0% bf16 MFU | 62034 tok/s +step 16550/19560 | loss 3.257592 (-1.47z)| norm 0.2400 (-1.36z)| lr 3.69e-05 | 8449.38 ms | -100.0% bf16 MFU | 62035 tok/s +step 16551/19560 | loss 3.266032 (-1.25z)| norm 0.2501 (-0.45z)| lr 3.69e-05 | 8456.59 ms | -100.0% bf16 MFU | 62033 tok/s +step 16552/19560 | loss 3.301337 (-0.38z)| norm 0.2568 (+0.14z)| lr 3.69e-05 | 8447.79 ms | -100.0% bf16 MFU | 62035 tok/s +step 16553/19560 | loss 3.249350 (-1.64z)| norm 0.2471 (-0.73z)| lr 3.69e-05 | 8445.91 ms | -100.0% bf16 MFU | 62037 tok/s +step 16554/19560 | loss 3.282346 (-0.82z)| norm 0.2480 (-0.66z)| lr 3.69e-05 | 8450.79 ms | -100.0% bf16 MFU | 62037 tok/s +step 16555/19560 | loss 3.253112 (-1.51z)| norm 0.2603 (+0.45z)| lr 3.68e-05 | 8445.34 ms | -100.0% bf16 MFU | 62039 tok/s +step 16556/19560 | loss 3.309758 (-0.13z)| norm 0.2626 (+0.64z)| lr 3.68e-05 | 8455.50 ms | -100.0% bf16 MFU | 62037 tok/s +step 16557/19560 | loss 3.249472 (-1.60z)| norm 0.2419 (-1.21z)| lr 3.68e-05 | 8443.76 ms | -100.0% bf16 MFU | 62040 tok/s +step 16558/19560 | loss 3.339271 (+0.64z)| norm 0.2573 (+0.19z)| lr 3.68e-05 | 8438.96 ms | -100.0% bf16 MFU | 62044 tok/s +step 16559/19560 | loss 3.297872 (-0.39z)| norm 0.2660 (+1.00z)| lr 3.67e-05 | 8445.78 ms | -100.0% bf16 MFU | 62046 tok/s +step 16560/19560 | loss 3.328274 (+0.38z)| norm 0.2957 (+3.52z)| lr 3.67e-05 | 8443.81 ms | -100.0% bf16 MFU | 62048 tok/s +step 16561/19560 | loss 3.288721 (-0.62z)| norm 0.2462 (-0.79z)| lr 3.67e-05 | 8444.70 ms | -100.0% bf16 MFU | 62050 tok/s +step 16562/19560 | loss 3.345672 (+0.82z)| norm 0.2574 (+0.20z)| lr 3.67e-05 | 8442.42 ms | -100.0% bf16 MFU | 62053 tok/s +step 16563/19560 | loss 3.286690 (-0.66z)| norm 0.2547 (-0.03z)| lr 3.66e-05 | 8448.43 ms | -100.0% bf16 MFU | 62053 tok/s +step 16564/19560 | loss 3.237318 (-1.86z)| norm 0.2449 (-0.90z)| lr 3.66e-05 | 8450.57 ms | -100.0% bf16 MFU | 62052 tok/s +step 16565/19560 | loss 3.287562 (-0.60z)| norm 0.2609 (+0.56z)| lr 3.66e-05 | 8444.40 ms | -100.0% bf16 MFU | 62054 tok/s +step 16566/19560 | loss 3.335879 (+0.61z)| norm 0.2447 (-0.92z)| lr 3.66e-05 | 8446.43 ms | -100.0% bf16 MFU | 62055 tok/s +step 16567/19560 | loss 3.297513 (-0.36z)| norm 0.2489 (-0.52z)| lr 3.65e-05 | 8449.94 ms | -100.0% bf16 MFU | 62055 tok/s +step 16568/19560 | loss 3.336144 (+0.63z)| norm 0.2484 (-0.56z)| lr 3.65e-05 | 8451.27 ms | -100.0% bf16 MFU | 62054 tok/s +step 16569/19560 | loss 3.311875 (+0.01z)| norm 0.2475 (-0.63z)| lr 3.65e-05 | 8442.50 ms | -100.0% bf16 MFU | 62056 tok/s +step 16570/19560 | loss 3.320555 (+0.24z)| norm 0.2559 (+0.18z)| lr 3.65e-05 | 8446.85 ms | -100.0% bf16 MFU | 62057 tok/s +step 16571/19560 | loss 3.302699 (-0.22z)| norm 0.2392 (-1.40z)| lr 3.64e-05 | 8445.62 ms | -100.0% bf16 MFU | 62058 tok/s +step 16572/19560 | loss 3.333780 (+0.56z)| norm 0.2794 (+2.37z)| lr 3.64e-05 | 8449.00 ms | -100.0% bf16 MFU | 62058 tok/s +step 16573/19560 | loss 3.251330 (-1.51z)| norm 0.2509 (-0.29z)| lr 3.64e-05 | 8440.90 ms | -100.0% bf16 MFU | 62060 tok/s +step 16574/19560 | loss 3.340934 (+0.76z)| norm 0.2486 (-0.50z)| lr 3.64e-05 | 8445.11 ms | -100.0% bf16 MFU | 62061 tok/s +step 16575/19560 | loss 3.348502 (+0.94z)| norm 0.2590 (+0.51z)| lr 3.64e-05 | 8446.34 ms | -100.0% bf16 MFU | 62062 tok/s +step 16576/19560 | loss 3.316362 (+0.13z)| norm 0.2524 (-0.15z)| lr 3.63e-05 | 8443.47 ms | -100.0% bf16 MFU | 62064 tok/s +step 16577/19560 | loss 3.288870 (-0.56z)| norm 0.2458 (-0.79z)| lr 3.63e-05 | 8442.61 ms | -100.0% bf16 MFU | 62065 tok/s +step 16578/19560 | loss 3.368654 (+1.49z)| norm 0.2606 (+0.70z)| lr 3.63e-05 | 8443.07 ms | -100.0% bf16 MFU | 62067 tok/s +step 16579/19560 | loss 3.346517 (+0.91z)| norm 0.2662 (+1.26z)| lr 3.63e-05 | 8444.68 ms | -100.0% bf16 MFU | 62068 tok/s +step 16580/19560 | loss 3.332599 (+0.55z)| norm 0.2501 (-0.39z)| lr 3.62e-05 | 8440.10 ms | -100.0% bf16 MFU | 62070 tok/s +step 16581/19560 | loss 3.319413 (+0.20z)| norm 0.2470 (-0.71z)| lr 3.62e-05 | 8442.08 ms | -100.0% bf16 MFU | 62072 tok/s +step 16582/19560 | loss 3.253515 (-1.50z)| norm 0.2453 (-0.88z)| lr 3.62e-05 | 8443.37 ms | -100.0% bf16 MFU | 62073 tok/s +step 16583/19560 | loss 3.345909 (+0.89z)| norm 0.2595 (+0.59z)| lr 3.62e-05 | 8446.34 ms | -100.0% bf16 MFU | 62073 tok/s +step 16584/19560 | loss 3.311552 (+0.00z)| norm 0.2555 (+0.17z)| lr 3.61e-05 | 8443.71 ms | -100.0% bf16 MFU | 62074 tok/s +step 16585/19560 | loss 3.339660 (+0.74z)| norm 0.2455 (-0.84z)| lr 3.61e-05 | 8446.12 ms | -100.0% bf16 MFU | 62074 tok/s +step 16586/19560 | loss 3.400059 (+2.32z)| norm 0.2438 (-1.00z)| lr 3.61e-05 | 8441.12 ms | -100.0% bf16 MFU | 62076 tok/s +step 16587/19560 | loss 3.364192 (+1.36z)| norm 0.2624 (+0.92z)| lr 3.61e-05 | 8440.71 ms | -100.0% bf16 MFU | 62078 tok/s +step 16588/19560 | loss 3.322260 (+0.26z)| norm 0.2765 (+2.30z)| lr 3.60e-05 | 8438.43 ms | -100.0% bf16 MFU | 62081 tok/s +step 16589/19560 | loss 3.280996 (-0.80z)| norm 0.2422 (-1.15z)| lr 3.60e-05 | 8440.88 ms | -100.0% bf16 MFU | 62082 tok/s +step 16590/19560 | loss 3.332688 (+0.55z)| norm 0.2690 (+1.53z)| lr 3.60e-05 | 8446.53 ms | -100.0% bf16 MFU | 62082 tok/s +step 16591/19560 | loss 3.316695 (+0.15z)| norm 0.2569 (+0.32z)| lr 3.60e-05 | 8445.04 ms | -100.0% bf16 MFU | 62082 tok/s +step 16592/19560 | loss 3.267969 (-1.13z)| norm 0.2627 (+0.88z)| lr 3.59e-05 | 8440.53 ms | -100.0% bf16 MFU | 62083 tok/s +step 16593/19560 | loss 3.302167 (-0.21z)| norm 0.2517 (-0.22z)| lr 3.59e-05 | 8444.49 ms | -100.0% bf16 MFU | 62084 tok/s +step 16594/19560 | loss 3.418759 (+2.89z)| norm 0.2445 (-0.93z)| lr 3.59e-05 | 8437.83 ms | -100.0% bf16 MFU | 62086 tok/s +step 16595/19560 | loss 3.292559 (-0.49z)| norm 0.2631 (+0.94z)| lr 3.59e-05 | 8440.23 ms | -100.0% bf16 MFU | 62088 tok/s +step 16596/19560 | loss 3.306895 (-0.10z)| norm 0.2569 (+0.31z)| lr 3.59e-05 | 8438.97 ms | -100.0% bf16 MFU | 62090 tok/s +step 16597/19560 | loss 3.285861 (-0.66z)| norm 0.2533 (-0.06z)| lr 3.58e-05 | 8440.58 ms | -100.0% bf16 MFU | 62091 tok/s +step 16598/19560 | loss 3.341811 (+0.84z)| norm 0.2430 (-1.10z)| lr 3.58e-05 | 8442.00 ms | -100.0% bf16 MFU | 62092 tok/s +step 16599/19560 | loss 3.348356 (+1.00z)| norm 0.2535 (-0.04z)| lr 3.58e-05 | 8440.99 ms | -100.0% bf16 MFU | 62093 tok/s +step 16600/19560 | loss 3.308674 (-0.06z)| norm 0.2775 (+2.32z)| lr 3.58e-05 | 8445.09 ms | -100.0% bf16 MFU | 62092 tok/s +step 16601/19560 | loss 3.299853 (-0.29z)| norm 0.2598 (+0.55z)| lr 3.57e-05 | 8441.65 ms | -100.0% bf16 MFU | 62093 tok/s +step 16602/19560 | loss 3.294482 (-0.44z)| norm 0.2532 (-0.10z)| lr 3.57e-05 | 8442.96 ms | -100.0% bf16 MFU | 62093 tok/s +step 16603/19560 | loss 3.277626 (-0.87z)| norm 0.2452 (-0.91z)| lr 3.57e-05 | 8443.24 ms | -100.0% bf16 MFU | 62093 tok/s +step 16604/19560 | loss 3.317566 (+0.19z)| norm 0.2703 (+1.58z)| lr 3.57e-05 | 8440.15 ms | -100.0% bf16 MFU | 62094 tok/s +step 16605/19560 | loss 3.304937 (-0.15z)| norm 0.2605 (+0.59z)| lr 3.56e-05 | 8444.54 ms | -100.0% bf16 MFU | 62094 tok/s +step 16606/19560 | loss 3.265751 (-1.19z)| norm 0.2687 (+1.39z)| lr 3.56e-05 | 8440.42 ms | -100.0% bf16 MFU | 62095 tok/s +step 16607/19560 | loss 3.270967 (-1.06z)| norm 0.2444 (-1.02z)| lr 3.56e-05 | 8439.25 ms | -100.0% bf16 MFU | 62097 tok/s +step 16608/19560 | loss 3.311065 (+0.04z)| norm 0.2691 (+1.41z)| lr 3.56e-05 | 8442.96 ms | -100.0% bf16 MFU | 62097 tok/s +step 16609/19560 | loss 3.317087 (+0.21z)| norm 0.2825 (+2.64z)| lr 3.55e-05 | 8439.96 ms | -100.0% bf16 MFU | 62098 tok/s +step 16610/19560 | loss 3.350853 (+1.15z)| norm 0.2705 (+1.46z)| lr 3.55e-05 | 8439.33 ms | -100.0% bf16 MFU | 62099 tok/s +step 16611/19560 | loss 3.292809 (-0.45z)| norm 0.2412 (-1.32z)| lr 3.55e-05 | 8438.99 ms | -100.0% bf16 MFU | 62101 tok/s +step 16612/19560 | loss 3.255714 (-1.48z)| norm 0.3002 (+4.03z)| lr 3.55e-05 | 8443.05 ms | -100.0% bf16 MFU | 62100 tok/s +step 16613/19560 | loss 3.318531 (+0.31z)| norm 0.2645 (+0.87z)| lr 3.55e-05 | 8440.21 ms | -100.0% bf16 MFU | 62101 tok/s +step 16614/19560 | loss 3.305902 (-0.05z)| norm 0.2444 (-1.00z)| lr 3.54e-05 | 8437.70 ms | -100.0% bf16 MFU | 62103 tok/s +step 16615/19560 | loss 3.290764 (-0.47z)| norm 0.2648 (+0.91z)| lr 3.54e-05 | 8438.97 ms | -100.0% bf16 MFU | 62104 tok/s +step 16616/19560 | loss 3.299638 (-0.22z)| norm 0.2587 (+0.34z)| lr 3.54e-05 | 8440.07 ms | -100.0% bf16 MFU | 62105 tok/s +step 16617/19560 | loss 3.307393 (-0.00z)| norm 0.2573 (+0.20z)| lr 3.54e-05 | 8437.67 ms | -100.0% bf16 MFU | 62107 tok/s +step 16618/19560 | loss 3.347605 (+1.13z)| norm 0.2672 (+1.12z)| lr 3.53e-05 | 8438.99 ms | -100.0% bf16 MFU | 62108 tok/s +step 16619/19560 | loss 3.264839 (-1.22z)| norm 0.2471 (-0.75z)| lr 3.53e-05 | 8439.16 ms | -100.0% bf16 MFU | 62108 tok/s +step 16620/19560 | loss 3.253473 (-1.54z)| norm 0.2500 (-0.48z)| lr 3.53e-05 | 8435.67 ms | -100.0% bf16 MFU | 62111 tok/s +step 16621/19560 | loss 3.346268 (+1.08z)| norm 0.2634 (+0.77z)| lr 3.53e-05 | 8438.21 ms | -100.0% bf16 MFU | 62112 tok/s +step 16622/19560 | loss 3.270275 (-1.04z)| norm 0.2625 (+0.71z)| lr 3.52e-05 | 8434.75 ms | -100.0% bf16 MFU | 62114 tok/s +step 16623/19560 | loss 3.341990 (+0.97z)| norm 0.2796 (+2.27z)| lr 3.52e-05 | 8436.40 ms | -100.0% bf16 MFU | 62116 tok/s +step 16624/19560 | loss 3.301744 (-0.17z)| norm 0.2587 (+0.31z)| lr 3.52e-05 | 8437.83 ms | -100.0% bf16 MFU | 62117 tok/s +step 16625/19560 | loss 3.355196 (+1.33z)| norm 0.2690 (+1.26z)| lr 3.52e-05 | 8441.44 ms | -100.0% bf16 MFU | 62116 tok/s +step 16626/19560 | loss 3.273804 (-0.94z)| norm 0.2696 (+1.30z)| lr 3.51e-05 | 8439.80 ms | -100.0% bf16 MFU | 62116 tok/s +step 16627/19560 | loss 3.359584 (+1.46z)| norm 0.2657 (+0.94z)| lr 3.51e-05 | 8435.40 ms | -100.0% bf16 MFU | 62118 tok/s +step 16628/19560 | loss 3.365928 (+1.61z)| norm 0.2658 (+0.94z)| lr 3.51e-05 | 8438.59 ms | -100.0% bf16 MFU | 62119 tok/s +step 16629/19560 | loss 3.279808 (-0.78z)| norm 0.2676 (+1.10z)| lr 3.51e-05 | 8437.79 ms | -100.0% bf16 MFU | 62120 tok/s +step 16630/19560 | loss 3.294957 (-0.35z)| norm 0.2683 (+1.15z)| lr 3.51e-05 | 8439.40 ms | -100.0% bf16 MFU | 62120 tok/s +step 16631/19560 | loss 3.312738 (+0.16z)| norm 0.2533 (-0.22z)| lr 3.50e-05 | 8438.08 ms | -100.0% bf16 MFU | 62121 tok/s +step 16632/19560 | loss 3.271440 (-1.00z)| norm 0.2448 (-0.99z)| lr 3.50e-05 | 8436.62 ms | -100.0% bf16 MFU | 62122 tok/s +step 16633/19560 | loss 3.260739 (-1.28z)| norm 0.2745 (+1.70z)| lr 3.50e-05 | 8442.02 ms | -100.0% bf16 MFU | 62121 tok/s +step 16634/19560 | loss 3.312525 (+0.16z)| norm 0.2769 (+1.88z)| lr 3.50e-05 | 8436.62 ms | -100.0% bf16 MFU | 62122 tok/s +step 16635/19560 | loss 3.246255 (-1.66z)| norm 0.2453 (-0.96z)| lr 3.49e-05 | 8439.90 ms | -100.0% bf16 MFU | 62122 tok/s +step 16636/19560 | loss 3.309880 (+0.11z)| norm 0.2501 (-0.53z)| lr 3.49e-05 | 8438.12 ms | -100.0% bf16 MFU | 62123 tok/s +step 16637/19560 | loss 3.297043 (-0.24z)| norm 0.2569 (+0.09z)| lr 3.49e-05 | 8437.22 ms | -100.0% bf16 MFU | 62123 tok/s +step 16638/19560 | loss 3.261501 (-1.22z)| norm 0.2481 (-0.71z)| lr 3.49e-05 | 8436.41 ms | -100.0% bf16 MFU | 62125 tok/s +step 16639/19560 | loss 3.280173 (-0.69z)| norm 0.2584 (+0.21z)| lr 3.48e-05 | 8438.64 ms | -100.0% bf16 MFU | 62125 tok/s +step 16640/19560 | loss 3.312956 (+0.23z)| norm 0.2575 (+0.14z)| lr 3.48e-05 | 8440.25 ms | -100.0% bf16 MFU | 62124 tok/s +step 16641/19560 | loss 3.288731 (-0.45z)| norm 0.2452 (-0.97z)| lr 3.48e-05 | 8441.31 ms | -100.0% bf16 MFU | 62124 tok/s +step 16642/19560 | loss 3.325734 (+0.58z)| norm 0.2559 (-0.01z)| lr 3.48e-05 | 8440.42 ms | -100.0% bf16 MFU | 62123 tok/s +step 16643/19560 | loss 3.300588 (-0.13z)| norm 0.2543 (-0.16z)| lr 3.47e-05 | 8441.34 ms | -100.0% bf16 MFU | 62123 tok/s +step 16644/19560 | loss 3.299659 (-0.15z)| norm 0.2711 (+1.35z)| lr 3.47e-05 | 8440.67 ms | -100.0% bf16 MFU | 62122 tok/s +step 16645/19560 | loss 3.270129 (-0.97z)| norm 0.2566 (+0.03z)| lr 3.47e-05 | 8438.42 ms | -100.0% bf16 MFU | 62123 tok/s +step 16646/19560 | loss 3.291338 (-0.38z)| norm 0.2483 (-0.73z)| lr 3.47e-05 | 8437.50 ms | -100.0% bf16 MFU | 62123 tok/s +step 16647/19560 | loss 3.304749 (-0.01z)| norm 0.2484 (-0.72z)| lr 3.47e-05 | 8439.39 ms | -100.0% bf16 MFU | 62123 tok/s +step 16648/19560 | loss 3.295022 (-0.28z)| norm 0.2543 (-0.19z)| lr 3.46e-05 | 8436.16 ms | -100.0% bf16 MFU | 62125 tok/s +step 16649/19560 | loss 3.283614 (-0.60z)| norm 0.2528 (-0.33z)| lr 3.46e-05 | 8439.15 ms | -100.0% bf16 MFU | 62125 tok/s +step 16650/19560 | loss 3.300514 (-0.11z)| norm 0.2576 (+0.11z)| lr 3.46e-05 | 8436.88 ms | -100.0% bf16 MFU | 62126 tok/s +step 16651/19560 | loss 3.288235 (-0.45z)| norm 0.2417 (-1.34z)| lr 3.46e-05 | 8437.01 ms | -100.0% bf16 MFU | 62126 tok/s +step 16652/19560 | loss 3.276050 (-0.79z)| norm 0.2426 (-1.26z)| lr 3.45e-05 | 8437.87 ms | -100.0% bf16 MFU | 62127 tok/s +step 16653/19560 | loss 3.321181 (+0.49z)| norm 0.2430 (-1.22z)| lr 3.45e-05 | 8437.94 ms | -100.0% bf16 MFU | 62127 tok/s +step 16654/19560 | loss 3.288212 (-0.45z)| norm 0.2612 (+0.44z)| lr 3.45e-05 | 8439.25 ms | -100.0% bf16 MFU | 62127 tok/s +step 16655/19560 | loss 3.265331 (-1.09z)| norm 0.2392 (-1.55z)| lr 3.45e-05 | 8436.42 ms | -100.0% bf16 MFU | 62128 tok/s +step 16656/19560 | loss 3.310450 (+0.19z)| norm 0.2766 (+1.80z)| lr 3.44e-05 | 8434.17 ms | -100.0% bf16 MFU | 62130 tok/s +step 16657/19560 | loss 3.276748 (-0.76z)| norm 0.2506 (-0.52z)| lr 3.44e-05 | 8437.66 ms | -100.0% bf16 MFU | 62130 tok/s +step 16658/19560 | loss 3.333498 (+0.90z)| norm 0.2536 (-0.25z)| lr 3.44e-05 | 8436.84 ms | -100.0% bf16 MFU | 62131 tok/s +step 16659/19560 | loss 3.301350 (-0.05z)| norm 0.2437 (-1.13z)| lr 3.44e-05 | 8439.64 ms | -100.0% bf16 MFU | 62130 tok/s +step 16660/19560 | loss 3.214890 (-2.57z)| norm 0.2568 (+0.04z)| lr 3.44e-05 | 8438.72 ms | -100.0% bf16 MFU | 62130 tok/s +step 16661/19560 | loss 3.350768 (+1.42z)| norm 0.2569 (+0.05z)| lr 3.43e-05 | 8437.19 ms | -100.0% bf16 MFU | 62131 tok/s +step 16662/19560 | loss 3.291055 (-0.34z)| norm 0.2423 (-1.23z)| lr 3.43e-05 | 8436.55 ms | -100.0% bf16 MFU | 62131 tok/s +step 16663/19560 | loss 3.290308 (-0.37z)| norm 0.2401 (-1.41z)| lr 3.43e-05 | 8438.01 ms | -100.0% bf16 MFU | 62132 tok/s +step 16664/19560 | loss 3.298005 (-0.15z)| norm 0.2468 (-0.82z)| lr 3.43e-05 | 8440.31 ms | -100.0% bf16 MFU | 62131 tok/s +step 16665/19560 | loss 3.326776 (+0.70z)| norm 0.2413 (-1.30z)| lr 3.42e-05 | 8436.89 ms | -100.0% bf16 MFU | 62131 tok/s +step 16666/19560 | loss 3.372180 (+2.00z)| norm 0.2480 (-0.70z)| lr 3.42e-05 | 8435.35 ms | -100.0% bf16 MFU | 62133 tok/s +step 16667/19560 | loss 3.276814 (-0.78z)| norm 0.2351 (-1.80z)| lr 3.42e-05 | 8440.23 ms | -100.0% bf16 MFU | 62132 tok/s +step 16668/19560 | loss 3.319821 (+0.47z)| norm 0.2479 (-0.69z)| lr 3.42e-05 | 8436.53 ms | -100.0% bf16 MFU | 62132 tok/s +step 16669/19560 | loss 3.314914 (+0.32z)| norm 0.2514 (-0.38z)| lr 3.41e-05 | 8436.60 ms | -100.0% bf16 MFU | 62133 tok/s +step 16670/19560 | loss 3.257734 (-1.36z)| norm 0.2450 (-0.93z)| lr 3.41e-05 | 8435.03 ms | -100.0% bf16 MFU | 62134 tok/s +step 16671/19560 | loss 3.333700 (+0.88z)| norm 0.2677 (+1.03z)| lr 3.41e-05 | 8436.34 ms | -100.0% bf16 MFU | 62135 tok/s +step 16672/19560 | loss 3.282773 (-0.62z)| norm 0.2467 (-0.78z)| lr 3.41e-05 | 8436.48 ms | -100.0% bf16 MFU | 62135 tok/s +step 16673/19560 | loss 3.345440 (+1.21z)| norm 0.2529 (-0.25z)| lr 3.40e-05 | 8436.29 ms | -100.0% bf16 MFU | 62136 tok/s +step 16674/19560 | loss 3.301657 (-0.07z)| norm 0.2476 (-0.71z)| lr 3.40e-05 | 8439.03 ms | -100.0% bf16 MFU | 62135 tok/s +step 16675/19560 | loss 3.323112 (+0.56z)| norm 0.2493 (-0.56z)| lr 3.40e-05 | 8437.64 ms | -100.0% bf16 MFU | 62135 tok/s +step 16676/19560 | loss 3.353060 (+1.41z)| norm 0.2458 (-0.86z)| lr 3.40e-05 | 8434.68 ms | -100.0% bf16 MFU | 62137 tok/s +step 16677/19560 | loss 3.355365 (+1.45z)| norm 0.2597 (+0.35z)| lr 3.40e-05 | 8438.98 ms | -100.0% bf16 MFU | 62136 tok/s +step 16678/19560 | loss 3.318125 (+0.36z)| norm 0.2542 (-0.14z)| lr 3.39e-05 | 8440.02 ms | -100.0% bf16 MFU | 62135 tok/s +step 16679/19560 | loss 3.309530 (+0.10z)| norm 0.2546 (-0.11z)| lr 3.39e-05 | 8438.93 ms | -100.0% bf16 MFU | 62135 tok/s +step 16680/19560 | loss 3.327064 (+0.61z)| norm 0.2497 (-0.53z)| lr 3.39e-05 | 8437.10 ms | -100.0% bf16 MFU | 62135 tok/s +step 16681/19560 | loss 3.339281 (+0.95z)| norm 0.2414 (-1.24z)| lr 3.39e-05 | 8437.34 ms | -100.0% bf16 MFU | 62135 tok/s +step 16682/19560 | loss 3.301857 (-0.16z)| norm 0.2469 (-0.77z)| lr 3.38e-05 | 8438.51 ms | -100.0% bf16 MFU | 62135 tok/s +step 16683/19560 | loss 3.323481 (+0.47z)| norm 0.2377 (-1.53z)| lr 3.38e-05 | 8436.96 ms | -100.0% bf16 MFU | 62135 tok/s +step 16684/19560 | loss 3.303532 (-0.13z)| norm 0.2476 (-0.67z)| lr 3.38e-05 | 8434.65 ms | -100.0% bf16 MFU | 62137 tok/s +step 16685/19560 | loss 3.272101 (-1.09z)| norm 0.2385 (-1.44z)| lr 3.38e-05 | 8436.03 ms | -100.0% bf16 MFU | 62137 tok/s +step 16686/19560 | loss 3.287483 (-0.61z)| norm 0.2491 (-0.53z)| lr 3.37e-05 | 8437.49 ms | -100.0% bf16 MFU | 62137 tok/s +step 16687/19560 | loss 3.336540 (+0.87z)| norm 0.2486 (-0.57z)| lr 3.37e-05 | 8436.83 ms | -100.0% bf16 MFU | 62138 tok/s +step 16688/19560 | loss 3.339828 (+0.96z)| norm 0.2527 (-0.19z)| lr 3.37e-05 | 8437.51 ms | -100.0% bf16 MFU | 62138 tok/s +step 16689/19560 | loss 3.293354 (-0.44z)| norm 0.2411 (-1.23z)| lr 3.37e-05 | 8437.34 ms | -100.0% bf16 MFU | 62138 tok/s +step 16690/19560 | loss 3.335114 (+0.82z)| norm 0.2329 (-1.92z)| lr 3.37e-05 | 8438.10 ms | -100.0% bf16 MFU | 62137 tok/s +step 16691/19560 | loss 3.330217 (+0.66z)| norm 0.2507 (-0.34z)| lr 3.36e-05 | 8439.20 ms | -100.0% bf16 MFU | 62137 tok/s +step 16692/19560 | loss 3.276951 (-0.97z)| norm 0.2560 (+0.11z)| lr 3.36e-05 | 8436.20 ms | -100.0% bf16 MFU | 62137 tok/s +step 16693/19560 | loss 3.259325 (-1.50z)| norm 0.2418 (-1.13z)| lr 3.36e-05 | 8434.96 ms | -100.0% bf16 MFU | 62138 tok/s +step 16694/19560 | loss 3.306967 (-0.04z)| norm 0.2545 (-0.01z)| lr 3.36e-05 | 8435.51 ms | -100.0% bf16 MFU | 62139 tok/s +step 16695/19560 | loss 3.341981 (+1.02z)| norm 0.2485 (-0.54z)| lr 3.35e-05 | 8437.02 ms | -100.0% bf16 MFU | 62139 tok/s +step 16696/19560 | loss 3.254785 (-1.61z)| norm 0.2400 (-1.28z)| lr 3.35e-05 | 8460.10 ms | -100.0% bf16 MFU | 62131 tok/s +step 16697/19560 | loss 3.339476 (+0.95z)| norm 0.2575 (+0.25z)| lr 3.35e-05 | 8468.17 ms | -100.0% bf16 MFU | 62120 tok/s +step 16698/19560 | loss 3.302173 (-0.17z)| norm 0.2395 (-1.31z)| lr 3.35e-05 | 8459.25 ms | -100.0% bf16 MFU | 62113 tok/s +step 16699/19560 | loss 3.317426 (+0.28z)| norm 0.2496 (-0.44z)| lr 3.35e-05 | 8463.86 ms | -100.0% bf16 MFU | 62104 tok/s +step 16700/19560 | loss 3.253532 (-1.61z)| norm 0.2492 (-0.47z)| lr 3.34e-05 | 8460.23 ms | -100.0% bf16 MFU | 62098 tok/s +step 16701/19560 | loss 3.281922 (-0.78z)| norm 0.2457 (-0.77z)| lr 3.34e-05 | 8459.48 ms | -100.0% bf16 MFU | 62092 tok/s +step 16702/19560 | loss 3.337022 (+0.89z)| norm 0.2563 (+0.17z)| lr 3.34e-05 | 8463.59 ms | -100.0% bf16 MFU | 62084 tok/s +step 16703/19560 | loss 3.318738 (+0.34z)| norm 0.2488 (-0.49z)| lr 3.34e-05 | 8465.41 ms | -100.0% bf16 MFU | 62077 tok/s +step 16704/19560 | loss 3.227428 (-2.36z)| norm 0.2486 (-0.50z)| lr 3.33e-05 | 8460.47 ms | -100.0% bf16 MFU | 62071 tok/s +step 16705/19560 | loss 3.233064 (-2.14z)| norm 0.2525 (-0.16z)| lr 3.33e-05 | 8456.38 ms | -100.0% bf16 MFU | 62068 tok/s +step 16706/19560 | loss 3.289583 (-0.48z)| norm 0.2630 (+0.78z)| lr 3.33e-05 | 8458.05 ms | -100.0% bf16 MFU | 62064 tok/s +step 16707/19560 | loss 3.325274 (+0.59z)| norm 0.2557 (+0.13z)| lr 3.33e-05 | 8462.74 ms | -100.0% bf16 MFU | 62058 tok/s +step 16708/19560 | loss 3.312123 (+0.20z)| norm 0.2510 (-0.29z)| lr 3.32e-05 | 8454.00 ms | -100.0% bf16 MFU | 62056 tok/s +step 16709/19560 | loss 3.305070 (-0.01z)| norm 0.2602 (+0.52z)| lr 3.32e-05 | 8463.04 ms | -100.0% bf16 MFU | 62051 tok/s +step 16710/19560 | loss 3.348724 (+1.28z)| norm 0.2745 (+1.77z)| lr 3.32e-05 | 8456.16 ms | -100.0% bf16 MFU | 62048 tok/s +step 16711/19560 | loss 3.277517 (-0.84z)| norm 0.2556 (+0.09z)| lr 3.32e-05 | 8459.88 ms | -100.0% bf16 MFU | 62045 tok/s +step 16712/19560 | loss 3.273345 (-0.95z)| norm 0.2489 (-0.50z)| lr 3.32e-05 | 8464.35 ms | -100.0% bf16 MFU | 62039 tok/s +step 16713/19560 | loss 3.283288 (-0.64z)| norm 0.2483 (-0.56z)| lr 3.31e-05 | 8456.06 ms | -100.0% bf16 MFU | 62037 tok/s +step 16714/19560 | loss 3.290968 (-0.40z)| norm 0.2558 (+0.10z)| lr 3.31e-05 | 8456.82 ms | -100.0% bf16 MFU | 62035 tok/s +step 16715/19560 | loss 3.325215 (+0.68z)| norm 0.2629 (+0.74z)| lr 3.31e-05 | 8454.95 ms | -100.0% bf16 MFU | 62034 tok/s +step 16716/19560 | loss 3.276609 (-0.83z)| norm 0.2452 (-0.83z)| lr 3.31e-05 | 8453.01 ms | -100.0% bf16 MFU | 62034 tok/s +step 16717/19560 | loss 3.243068 (-1.85z)| norm 0.2483 (-0.56z)| lr 3.30e-05 | 8451.14 ms | -100.0% bf16 MFU | 62034 tok/s +step 16718/19560 | loss 3.343455 (+1.25z)| norm 0.2505 (-0.35z)| lr 3.30e-05 | 8462.40 ms | -100.0% bf16 MFU | 62030 tok/s +step 16719/19560 | loss 3.377113 (+2.23z)| norm 0.2528 (-0.13z)| lr 3.30e-05 | 8449.46 ms | -100.0% bf16 MFU | 62031 tok/s +step 16720/19560 | loss 3.304475 (+0.02z)| norm 0.2498 (-0.40z)| lr 3.30e-05 | 8457.33 ms | -100.0% bf16 MFU | 62029 tok/s +step 16721/19560 | loss 3.258692 (-1.35z)| norm 0.2602 (+0.55z)| lr 3.29e-05 | 8448.20 ms | -100.0% bf16 MFU | 62030 tok/s +step 16722/19560 | loss 3.311030 (+0.27z)| norm 0.2502 (-0.38z)| lr 3.29e-05 | 8450.69 ms | -100.0% bf16 MFU | 62031 tok/s +step 16723/19560 | loss 3.311011 (+0.26z)| norm 0.2534 (-0.08z)| lr 3.29e-05 | 8451.83 ms | -100.0% bf16 MFU | 62031 tok/s +step 16724/19560 | loss 3.318839 (+0.51z)| norm 0.2426 (-1.05z)| lr 3.29e-05 | 8456.15 ms | -100.0% bf16 MFU | 62029 tok/s +step 16725/19560 | loss 3.334236 (+0.98z)| norm 0.2519 (-0.20z)| lr 3.29e-05 | 8450.70 ms | -100.0% bf16 MFU | 62030 tok/s +step 16726/19560 | loss 3.355696 (+1.65z)| norm 0.2727 (+1.68z)| lr 3.28e-05 | 8449.49 ms | -100.0% bf16 MFU | 62031 tok/s +step 16727/19560 | loss 3.255267 (-1.48z)| norm 0.2533 (-0.10z)| lr 3.28e-05 | 8447.62 ms | -100.0% bf16 MFU | 62033 tok/s +step 16728/19560 | loss 3.210181 (-2.79z)| norm 0.2493 (-0.44z)| lr 3.28e-05 | 8449.92 ms | -100.0% bf16 MFU | 62033 tok/s +step 16729/19560 | loss 3.353332 (+1.54z)| norm 0.2541 (+0.01z)| lr 3.28e-05 | 8453.96 ms | -100.0% bf16 MFU | 62033 tok/s +step 16730/19560 | loss 3.331791 (+0.88z)| norm 0.2492 (-0.44z)| lr 3.27e-05 | 8453.99 ms | -100.0% bf16 MFU | 62032 tok/s +step 16731/19560 | loss 3.316373 (+0.41z)| norm 0.2616 (+0.69z)| lr 3.27e-05 | 8455.73 ms | -100.0% bf16 MFU | 62030 tok/s +step 16732/19560 | loss 3.315272 (+0.38z)| norm 0.2603 (+0.58z)| lr 3.27e-05 | 8452.55 ms | -100.0% bf16 MFU | 62030 tok/s +step 16733/19560 | loss 3.287746 (-0.45z)| norm 0.2547 (+0.07z)| lr 3.27e-05 | 8448.75 ms | -100.0% bf16 MFU | 62031 tok/s +step 16734/19560 | loss 3.316595 (+0.41z)| norm 0.2404 (-1.25z)| lr 3.27e-05 | 8453.89 ms | -100.0% bf16 MFU | 62031 tok/s +step 16735/19560 | loss 3.264052 (-1.17z)| norm 0.2403 (-1.26z)| lr 3.26e-05 | 8452.93 ms | -100.0% bf16 MFU | 62030 tok/s +step 16736/19560 | loss 3.305010 (+0.06z)| norm 0.2696 (+1.47z)| lr 3.26e-05 | 8451.31 ms | -100.0% bf16 MFU | 62031 tok/s +step 16737/19560 | loss 3.300628 (-0.07z)| norm 0.2437 (-0.93z)| lr 3.26e-05 | 8450.45 ms | -100.0% bf16 MFU | 62031 tok/s +step 16738/19560 | loss 3.280563 (-0.66z)| norm 0.2407 (-1.21z)| lr 3.26e-05 | 8453.75 ms | -100.0% bf16 MFU | 62031 tok/s +step 16739/19560 | loss 3.269061 (-1.00z)| norm 0.2484 (-0.47z)| lr 3.25e-05 | 8449.50 ms | -100.0% bf16 MFU | 62032 tok/s +step 16740/19560 | loss 3.347585 (+1.36z)| norm 0.2433 (-1.00z)| lr 3.25e-05 | 8447.90 ms | -100.0% bf16 MFU | 62033 tok/s +step 16741/19560 | loss 3.266303 (-1.09z)| norm 0.2475 (-0.55z)| lr 3.25e-05 | 8450.42 ms | -100.0% bf16 MFU | 62034 tok/s +step 16742/19560 | loss 3.259818 (-1.27z)| norm 0.2588 (+0.63z)| lr 3.25e-05 | 8450.53 ms | -100.0% bf16 MFU | 62034 tok/s +step 16743/19560 | loss 3.266910 (-1.04z)| norm 0.2609 (+0.86z)| lr 3.24e-05 | 8446.68 ms | -100.0% bf16 MFU | 62036 tok/s +step 16744/19560 | loss 3.341100 (+1.16z)| norm 0.2679 (+1.58z)| lr 3.24e-05 | 8445.80 ms | -100.0% bf16 MFU | 62038 tok/s +step 16745/19560 | loss 3.221103 (-2.34z)| norm 0.2475 (-0.56z)| lr 3.24e-05 | 8442.99 ms | -100.0% bf16 MFU | 62041 tok/s +step 16746/19560 | loss 3.298792 (-0.07z)| norm 0.2607 (+0.84z)| lr 3.24e-05 | 8449.18 ms | -100.0% bf16 MFU | 62041 tok/s +step 16747/19560 | loss 3.303471 (+0.06z)| norm 0.2531 (+0.04z)| lr 3.24e-05 | 8450.91 ms | -100.0% bf16 MFU | 62041 tok/s +step 16748/19560 | loss 3.259571 (-1.23z)| norm 0.2530 (+0.02z)| lr 3.23e-05 | 8449.37 ms | -100.0% bf16 MFU | 62042 tok/s +step 16749/19560 | loss 3.291585 (-0.28z)| norm 0.2461 (-0.70z)| lr 3.23e-05 | 8441.44 ms | -100.0% bf16 MFU | 62045 tok/s +step 16750/19560 | loss 3.263540 (-1.11z)| norm 0.2543 (+0.18z)| lr 3.23e-05 | 8443.97 ms | -100.0% bf16 MFU | 62047 tok/s +val loss 3.273823 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 3003/10042 = 0.299044 +step 16751/19560 | loss 3.291089 (-0.28z)| norm 0.2641 (+1.27z)| lr 3.23e-05 | 8443.87 ms | -100.0% bf16 MFU | 62050 tok/s +step 16752/19560 | loss 3.253176 (-1.39z)| norm 0.2492 (-0.35z)| lr 3.22e-05 | 8447.85 ms | -100.0% bf16 MFU | 62050 tok/s +step 16753/19560 | loss 3.292027 (-0.23z)| norm 0.2394 (-1.42z)| lr 3.22e-05 | 8445.16 ms | -100.0% bf16 MFU | 62052 tok/s +step 16754/19560 | loss 3.327376 (+0.81z)| norm 0.2393 (-1.40z)| lr 3.22e-05 | 8444.32 ms | -100.0% bf16 MFU | 62054 tok/s +step 16755/19560 | loss 3.350494 (+1.51z)| norm 0.2545 (+0.29z)| lr 3.22e-05 | 8447.20 ms | -100.0% bf16 MFU | 62054 tok/s +step 16756/19560 | loss 3.265444 (-1.03z)| norm 0.2465 (-0.59z)| lr 3.22e-05 | 8444.69 ms | -100.0% bf16 MFU | 62056 tok/s +step 16757/19560 | loss 3.360415 (+1.81z)| norm 0.2505 (-0.13z)| lr 3.21e-05 | 8441.24 ms | -100.0% bf16 MFU | 62058 tok/s +step 16758/19560 | loss 3.269188 (-0.91z)| norm 0.2631 (+1.33z)| lr 3.21e-05 | 8447.60 ms | -100.0% bf16 MFU | 62059 tok/s +step 16759/19560 | loss 3.288536 (-0.33z)| norm 0.2447 (-0.78z)| lr 3.21e-05 | 8448.98 ms | -100.0% bf16 MFU | 62058 tok/s +step 16760/19560 | loss 3.289101 (-0.32z)| norm 0.2470 (-0.51z)| lr 3.21e-05 | 8440.90 ms | -100.0% bf16 MFU | 62061 tok/s +step 16761/19560 | loss 3.301954 (+0.06z)| norm 0.2380 (-1.55z)| lr 3.20e-05 | 8441.50 ms | -100.0% bf16 MFU | 62063 tok/s +step 16762/19560 | loss 3.286139 (-0.41z)| norm 0.2414 (-1.16z)| lr 3.20e-05 | 8442.99 ms | -100.0% bf16 MFU | 62065 tok/s +step 16763/19560 | loss 3.252025 (-1.44z)| norm 0.2526 (+0.19z)| lr 3.20e-05 | 8441.71 ms | -100.0% bf16 MFU | 62067 tok/s +step 16764/19560 | loss 3.283532 (-0.48z)| norm 0.2414 (-1.15z)| lr 3.20e-05 | 8443.59 ms | -100.0% bf16 MFU | 62069 tok/s +step 16765/19560 | loss 3.285869 (-0.41z)| norm 0.2566 (+0.68z)| lr 3.20e-05 | 8451.34 ms | -100.0% bf16 MFU | 62067 tok/s +step 16766/19560 | loss 3.338516 (+1.16z)| norm 0.2589 (+0.96z)| lr 3.19e-05 | 8445.34 ms | -100.0% bf16 MFU | 62068 tok/s +step 16767/19560 | loss 3.284517 (-0.47z)| norm 0.2457 (-0.63z)| lr 3.19e-05 | 8449.31 ms | -100.0% bf16 MFU | 62067 tok/s +step 16768/19560 | loss 3.241522 (-1.73z)| norm 0.2483 (-0.31z)| lr 3.19e-05 | 8440.92 ms | -100.0% bf16 MFU | 62069 tok/s +step 16769/19560 | loss 3.387485 (+2.53z)| norm 0.2567 (+0.70z)| lr 3.19e-05 | 8443.03 ms | -100.0% bf16 MFU | 62070 tok/s +step 16770/19560 | loss 3.455324 (+4.16z)| norm 0.2907 (+4.40z)| lr 3.18e-05 | 8444.65 ms | -100.0% bf16 MFU | 62071 tok/s +step 16771/19560 | loss 3.446813 (+3.69z)| norm 0.2737 (+2.44z)| lr 3.18e-05 | 8446.21 ms | -100.0% bf16 MFU | 62071 tok/s +step 16772/19560 | loss 3.314760 (+0.31z)| norm 0.2590 (+0.86z)| lr 3.18e-05 | 8441.07 ms | -100.0% bf16 MFU | 62073 tok/s +step 16773/19560 | loss 3.285278 (-0.45z)| norm 0.2606 (+1.03z)| lr 3.18e-05 | 8439.52 ms | -100.0% bf16 MFU | 62076 tok/s +step 16774/19560 | loss 3.287939 (-0.38z)| norm 0.2453 (-0.66z)| lr 3.18e-05 | 8437.68 ms | -100.0% bf16 MFU | 62079 tok/s +step 16775/19560 | loss 3.252513 (-1.27z)| norm 0.2418 (-1.04z)| lr 3.17e-05 | 8440.45 ms | -100.0% bf16 MFU | 62081 tok/s +step 16776/19560 | loss 3.303407 (+0.03z)| norm 0.2457 (-0.60z)| lr 3.17e-05 | 8445.01 ms | -100.0% bf16 MFU | 62081 tok/s +step 16777/19560 | loss 3.257333 (-1.14z)| norm 0.2444 (-0.73z)| lr 3.17e-05 | 8441.44 ms | -100.0% bf16 MFU | 62082 tok/s +step 16778/19560 | loss 3.268885 (-0.83z)| norm 0.2575 (+0.70z)| lr 3.17e-05 | 8439.92 ms | -100.0% bf16 MFU | 62084 tok/s +step 16779/19560 | loss 3.322923 (+0.52z)| norm 0.2402 (-1.18z)| lr 3.16e-05 | 8442.31 ms | -100.0% bf16 MFU | 62085 tok/s +step 16780/19560 | loss 3.349182 (+1.17z)| norm 0.2554 (+0.46z)| lr 3.16e-05 | 8442.35 ms | -100.0% bf16 MFU | 62086 tok/s +step 16781/19560 | loss 3.239596 (-1.56z)| norm 0.2527 (+0.17z)| lr 3.16e-05 | 8443.09 ms | -100.0% bf16 MFU | 62086 tok/s +step 16782/19560 | loss 3.322353 (+0.50z)| norm 0.2642 (+1.42z)| lr 3.16e-05 | 8440.16 ms | -100.0% bf16 MFU | 62088 tok/s +step 16783/19560 | loss 3.293291 (-0.23z)| norm 0.2533 (+0.22z)| lr 3.16e-05 | 8438.88 ms | -100.0% bf16 MFU | 62090 tok/s +step 16784/19560 | loss 3.270361 (-0.80z)| norm 0.2430 (-0.92z)| lr 3.15e-05 | 8441.93 ms | -100.0% bf16 MFU | 62091 tok/s +step 16785/19560 | loss 3.292999 (-0.24z)| norm 0.2466 (-0.51z)| lr 3.15e-05 | 8445.88 ms | -100.0% bf16 MFU | 62090 tok/s +step 16786/19560 | loss 3.328215 (+0.64z)| norm 0.2575 (+0.73z)| lr 3.15e-05 | 8446.12 ms | -100.0% bf16 MFU | 62089 tok/s +step 16787/19560 | loss 3.313891 (+0.28z)| norm 0.2535 (+0.26z)| lr 3.15e-05 | 8442.30 ms | -100.0% bf16 MFU | 62090 tok/s +step 16788/19560 | loss 3.341103 (+0.95z)| norm 0.2364 (-1.64z)| lr 3.14e-05 | 8442.58 ms | -100.0% bf16 MFU | 62090 tok/s +step 16789/19560 | loss 3.213184 (-2.23z)| norm 0.2460 (-0.55z)| lr 3.14e-05 | 8438.50 ms | -100.0% bf16 MFU | 62092 tok/s +step 16790/19560 | loss 3.289959 (-0.31z)| norm 0.2379 (-1.45z)| lr 3.14e-05 | 8440.65 ms | -100.0% bf16 MFU | 62094 tok/s +step 16791/19560 | loss 3.346855 (+1.10z)| norm 0.2308 (-2.21z)| lr 3.14e-05 | 8440.47 ms | -100.0% bf16 MFU | 62095 tok/s +step 16792/19560 | loss 3.317779 (+0.37z)| norm 0.2700 (+2.06z)| lr 3.14e-05 | 8439.96 ms | -100.0% bf16 MFU | 62096 tok/s +step 16793/19560 | loss 3.395175 (+2.24z)| norm 0.2431 (-0.86z)| lr 3.13e-05 | 8439.96 ms | -100.0% bf16 MFU | 62097 tok/s +step 16794/19560 | loss 3.322597 (+0.48z)| norm 0.2422 (-0.95z)| lr 3.13e-05 | 8442.02 ms | -100.0% bf16 MFU | 62097 tok/s +step 16795/19560 | loss 3.293827 (-0.23z)| norm 0.2472 (-0.42z)| lr 3.13e-05 | 8436.20 ms | -100.0% bf16 MFU | 62100 tok/s +step 16796/19560 | loss 3.283208 (-0.49z)| norm 0.2569 (+0.63z)| lr 3.13e-05 | 8440.83 ms | -100.0% bf16 MFU | 62101 tok/s +step 16797/19560 | loss 3.268529 (-0.84z)| norm 0.2347 (-1.77z)| lr 3.12e-05 | 8442.26 ms | -100.0% bf16 MFU | 62101 tok/s +step 16798/19560 | loss 3.347325 (+1.08z)| norm 0.2398 (-1.20z)| lr 3.12e-05 | 8439.31 ms | -100.0% bf16 MFU | 62102 tok/s +step 16799/19560 | loss 3.249935 (-1.29z)| norm 0.2455 (-0.58z)| lr 3.12e-05 | 8439.15 ms | -100.0% bf16 MFU | 62103 tok/s +step 16800/19560 | loss 3.319595 (+0.41z)| norm 0.2390 (-1.28z)| lr 3.12e-05 | 8441.27 ms | -100.0% bf16 MFU | 62103 tok/s +step 16801/19560 | loss 3.279040 (-0.58z)| norm 0.2458 (-0.53z)| lr 3.12e-05 | 8444.13 ms | -100.0% bf16 MFU | 62103 tok/s +step 16802/19560 | loss 3.275832 (-0.65z)| norm 0.2373 (-1.43z)| lr 3.11e-05 | 8438.49 ms | -100.0% bf16 MFU | 62104 tok/s +step 16803/19560 | loss 3.304345 (+0.06z)| norm 0.2446 (-0.64z)| lr 3.11e-05 | 8435.97 ms | -100.0% bf16 MFU | 62106 tok/s +step 16804/19560 | loss 3.344835 (+1.05z)| norm 0.2593 (+0.93z)| lr 3.11e-05 | 8439.94 ms | -100.0% bf16 MFU | 62107 tok/s +step 16805/19560 | loss 3.285317 (-0.40z)| norm 0.2384 (-1.29z)| lr 3.11e-05 | 8437.98 ms | -100.0% bf16 MFU | 62108 tok/s +step 16806/19560 | loss 3.295658 (-0.14z)| norm 0.2325 (-1.88z)| lr 3.10e-05 | 8439.23 ms | -100.0% bf16 MFU | 62109 tok/s +step 16807/19560 | loss 3.307667 (+0.16z)| norm 0.2467 (-0.38z)| lr 3.10e-05 | 8437.01 ms | -100.0% bf16 MFU | 62111 tok/s +step 16808/19560 | loss 3.285893 (-0.37z)| norm 0.2407 (-1.00z)| lr 3.10e-05 | 8438.27 ms | -100.0% bf16 MFU | 62112 tok/s +step 16809/19560 | loss 3.254681 (-1.13z)| norm 0.2523 (+0.22z)| lr 3.10e-05 | 8442.03 ms | -100.0% bf16 MFU | 62112 tok/s +step 16810/19560 | loss 3.270706 (-0.73z)| norm 0.2374 (-1.34z)| lr 3.10e-05 | 8440.84 ms | -100.0% bf16 MFU | 62112 tok/s +step 16811/19560 | loss 3.283147 (-0.41z)| norm 0.2656 (+1.59z)| lr 3.09e-05 | 8439.86 ms | -100.0% bf16 MFU | 62112 tok/s +step 16812/19560 | loss 3.285324 (-0.35z)| norm 0.2623 (+1.22z)| lr 3.09e-05 | 8436.40 ms | -100.0% bf16 MFU | 62114 tok/s +step 16813/19560 | loss 3.299738 (-0.00z)| norm 0.2535 (+0.30z)| lr 3.09e-05 | 8435.37 ms | -100.0% bf16 MFU | 62116 tok/s +step 16814/19560 | loss 3.325118 (+0.62z)| norm 0.2693 (+1.90z)| lr 3.09e-05 | 8438.50 ms | -100.0% bf16 MFU | 62116 tok/s +step 16815/19560 | loss 3.303738 (+0.10z)| norm 0.2556 (+0.48z)| lr 3.08e-05 | 8440.27 ms | -100.0% bf16 MFU | 62117 tok/s +step 16816/19560 | loss 3.299740 (+0.00z)| norm 0.2458 (-0.51z)| lr 3.08e-05 | 8437.53 ms | -100.0% bf16 MFU | 62118 tok/s +step 16817/19560 | loss 3.282357 (-0.43z)| norm 0.2519 (+0.11z)| lr 3.08e-05 | 8443.85 ms | -100.0% bf16 MFU | 62116 tok/s +step 16818/19560 | loss 3.315313 (+0.40z)| norm 0.2444 (-0.68z)| lr 3.08e-05 | 8441.16 ms | -100.0% bf16 MFU | 62116 tok/s +step 16819/19560 | loss 3.305356 (+0.16z)| norm 0.2459 (-0.53z)| lr 3.08e-05 | 8435.82 ms | -100.0% bf16 MFU | 62118 tok/s +step 16820/19560 | loss 3.302926 (+0.09z)| norm 0.2359 (-1.54z)| lr 3.07e-05 | 8436.16 ms | -100.0% bf16 MFU | 62119 tok/s +step 16821/19560 | loss 3.356248 (+1.40z)| norm 0.2527 (+0.18z)| lr 3.07e-05 | 8436.40 ms | -100.0% bf16 MFU | 62121 tok/s +step 16822/19560 | loss 3.256988 (-1.06z)| norm 0.2511 (+0.02z)| lr 3.07e-05 | 8435.72 ms | -100.0% bf16 MFU | 62122 tok/s +step 16823/19560 | loss 3.247974 (-1.27z)| norm 0.2504 (-0.05z)| lr 3.07e-05 | 8436.28 ms | -100.0% bf16 MFU | 62123 tok/s +step 16824/19560 | loss 3.301008 (+0.04z)| norm 0.2620 (+1.14z)| lr 3.06e-05 | 8437.46 ms | -100.0% bf16 MFU | 62124 tok/s +step 16825/19560 | loss 3.272640 (-0.65z)| norm 0.2436 (-0.76z)| lr 3.06e-05 | 8438.48 ms | -100.0% bf16 MFU | 62124 tok/s +step 16826/19560 | loss 3.257682 (-1.02z)| norm 0.2490 (-0.21z)| lr 3.06e-05 | 8442.28 ms | -100.0% bf16 MFU | 62123 tok/s +step 16827/19560 | loss 3.285136 (-0.33z)| norm 0.2546 (+0.37z)| lr 3.06e-05 | 8440.63 ms | -100.0% bf16 MFU | 62123 tok/s +step 16828/19560 | loss 3.321196 (+0.56z)| norm 0.2502 (-0.09z)| lr 3.06e-05 | 8438.76 ms | -100.0% bf16 MFU | 62123 tok/s +step 16829/19560 | loss 3.282329 (-0.41z)| norm 0.2598 (+0.90z)| lr 3.05e-05 | 8441.01 ms | -100.0% bf16 MFU | 62123 tok/s +step 16830/19560 | loss 3.290221 (-0.20z)| norm 0.2542 (+0.32z)| lr 3.05e-05 | 8439.74 ms | -100.0% bf16 MFU | 62123 tok/s +step 16831/19560 | loss 3.285569 (-0.31z)| norm 0.2414 (-1.00z)| lr 3.05e-05 | 8438.25 ms | -100.0% bf16 MFU | 62123 tok/s +step 16832/19560 | loss 3.294449 (-0.11z)| norm 0.2553 (+0.43z)| lr 3.05e-05 | 8438.92 ms | -100.0% bf16 MFU | 62123 tok/s +step 16833/19560 | loss 3.277072 (-0.56z)| norm 0.2531 (+0.20z)| lr 3.04e-05 | 8440.47 ms | -100.0% bf16 MFU | 62123 tok/s +step 16834/19560 | loss 3.280918 (-0.46z)| norm 0.2588 (+0.80z)| lr 3.04e-05 | 8438.92 ms | -100.0% bf16 MFU | 62123 tok/s +step 16835/19560 | loss 3.342600 (+1.11z)| norm 0.2460 (-0.52z)| lr 3.04e-05 | 8438.99 ms | -100.0% bf16 MFU | 62123 tok/s +step 16836/19560 | loss 3.292963 (-0.15z)| norm 0.2460 (-0.52z)| lr 3.04e-05 | 8437.29 ms | -100.0% bf16 MFU | 62124 tok/s +step 16837/19560 | loss 3.286994 (-0.30z)| norm 0.2752 (+2.45z)| lr 3.04e-05 | 8440.44 ms | -100.0% bf16 MFU | 62124 tok/s +step 16838/19560 | loss 3.304247 (+0.15z)| norm 0.2568 (+0.60z)| lr 3.03e-05 | 8439.32 ms | -100.0% bf16 MFU | 62124 tok/s +step 16839/19560 | loss 3.262481 (-0.92z)| norm 0.2310 (-2.02z)| lr 3.03e-05 | 8438.03 ms | -100.0% bf16 MFU | 62124 tok/s +step 16840/19560 | loss 3.289040 (-0.24z)| norm 0.2599 (+0.92z)| lr 3.03e-05 | 8437.12 ms | -100.0% bf16 MFU | 62125 tok/s +step 16841/19560 | loss 3.269216 (-0.75z)| norm 0.2808 (+2.92z)| lr 3.03e-05 | 8436.59 ms | -100.0% bf16 MFU | 62126 tok/s +step 16842/19560 | loss 3.294522 (-0.10z)| norm 0.2510 (-0.01z)| lr 3.02e-05 | 8439.09 ms | -100.0% bf16 MFU | 62126 tok/s +step 16843/19560 | loss 3.323656 (+0.65z)| norm 0.2523 (+0.13z)| lr 3.02e-05 | 8441.09 ms | -100.0% bf16 MFU | 62125 tok/s +step 16844/19560 | loss 3.268272 (-0.77z)| norm 0.2669 (+1.55z)| lr 3.02e-05 | 8437.18 ms | -100.0% bf16 MFU | 62126 tok/s +step 16845/19560 | loss 3.325813 (+0.70z)| norm 0.2385 (-1.23z)| lr 3.02e-05 | 8442.27 ms | -100.0% bf16 MFU | 62125 tok/s +step 16846/19560 | loss 3.283860 (-0.38z)| norm 0.2495 (-0.15z)| lr 3.02e-05 | 8436.40 ms | -100.0% bf16 MFU | 62126 tok/s +step 16847/19560 | loss 3.293064 (-0.12z)| norm 0.2438 (-0.70z)| lr 3.01e-05 | 8435.09 ms | -100.0% bf16 MFU | 62127 tok/s +step 16848/19560 | loss 3.302748 (+0.13z)| norm 0.2331 (-1.71z)| lr 3.01e-05 | 8436.04 ms | -100.0% bf16 MFU | 62128 tok/s +step 16849/19560 | loss 3.331770 (+0.88z)| norm 0.2463 (-0.43z)| lr 3.01e-05 | 8436.29 ms | -100.0% bf16 MFU | 62129 tok/s +step 16850/19560 | loss 3.289872 (-0.22z)| norm 0.2412 (-0.91z)| lr 3.01e-05 | 8433.71 ms | -100.0% bf16 MFU | 62131 tok/s +step 16851/19560 | loss 3.276776 (-0.56z)| norm 0.2392 (-1.09z)| lr 3.01e-05 | 8436.76 ms | -100.0% bf16 MFU | 62132 tok/s +step 16852/19560 | loss 3.251015 (-1.22z)| norm 0.2447 (-0.57z)| lr 3.00e-05 | 8439.15 ms | -100.0% bf16 MFU | 62131 tok/s +step 16853/19560 | loss 3.292194 (-0.13z)| norm 0.2449 (-0.55z)| lr 3.00e-05 | 8435.13 ms | -100.0% bf16 MFU | 62133 tok/s +step 16854/19560 | loss 3.290649 (-0.16z)| norm 0.2347 (-1.51z)| lr 3.00e-05 | 8438.45 ms | -100.0% bf16 MFU | 62133 tok/s +step 16855/19560 | loss 3.235058 (-1.62z)| norm 0.2431 (-0.68z)| lr 3.00e-05 | 8440.23 ms | -100.0% bf16 MFU | 62132 tok/s +step 16856/19560 | loss 3.323434 (+0.71z)| norm 0.2437 (-0.62z)| lr 2.99e-05 | 8436.87 ms | -100.0% bf16 MFU | 62132 tok/s +step 16857/19560 | loss 3.269515 (-0.74z)| norm 0.2374 (-1.21z)| lr 2.99e-05 | 8439.09 ms | -100.0% bf16 MFU | 62132 tok/s +step 16858/19560 | loss 3.264038 (-0.87z)| norm 0.2370 (-1.23z)| lr 2.99e-05 | 8437.09 ms | -100.0% bf16 MFU | 62133 tok/s +step 16859/19560 | loss 3.291233 (-0.13z)| norm 0.2470 (-0.27z)| lr 2.99e-05 | 8436.32 ms | -100.0% bf16 MFU | 62133 tok/s +step 16860/19560 | loss 3.226239 (-1.85z)| norm 0.2338 (-1.50z)| lr 2.99e-05 | 8441.74 ms | -100.0% bf16 MFU | 62132 tok/s +step 16861/19560 | loss 3.335769 (+1.08z)| norm 0.2463 (-0.31z)| lr 2.98e-05 | 8437.16 ms | -100.0% bf16 MFU | 62132 tok/s +step 16862/19560 | loss 3.359407 (+1.68z)| norm 0.2799 (+2.78z)| lr 2.98e-05 | 8436.95 ms | -100.0% bf16 MFU | 62133 tok/s +step 16863/19560 | loss 3.334839 (+1.02z)| norm 0.2735 (+2.13z)| lr 2.98e-05 | 8435.50 ms | -100.0% bf16 MFU | 62134 tok/s +step 16864/19560 | loss 3.304251 (+0.21z)| norm 0.2543 (+0.40z)| lr 2.98e-05 | 8437.68 ms | -100.0% bf16 MFU | 62134 tok/s +step 16865/19560 | loss 3.317057 (+0.54z)| norm 0.2410 (-0.83z)| lr 2.97e-05 | 8439.91 ms | -100.0% bf16 MFU | 62133 tok/s +step 16866/19560 | loss 3.252587 (-1.15z)| norm 0.2540 (+0.36z)| lr 2.97e-05 | 8435.88 ms | -100.0% bf16 MFU | 62134 tok/s +step 16867/19560 | loss 3.331187 (+0.90z)| norm 0.2536 (+0.33z)| lr 2.97e-05 | 8436.68 ms | -100.0% bf16 MFU | 62135 tok/s +step 16868/19560 | loss 3.260441 (-0.94z)| norm 0.2535 (+0.31z)| lr 2.97e-05 | 8434.65 ms | -100.0% bf16 MFU | 62136 tok/s +step 16869/19560 | loss 3.360420 (+1.66z)| norm 0.2430 (-0.66z)| lr 2.97e-05 | 8435.79 ms | -100.0% bf16 MFU | 62136 tok/s +step 16870/19560 | loss 3.309579 (+0.32z)| norm 0.2483 (-0.16z)| lr 2.96e-05 | 8437.03 ms | -100.0% bf16 MFU | 62137 tok/s +step 16871/19560 | loss 3.320368 (+0.60z)| norm 0.2576 (+0.70z)| lr 2.96e-05 | 8440.76 ms | -100.0% bf16 MFU | 62136 tok/s +step 16872/19560 | loss 3.306102 (+0.23z)| norm 0.2592 (+0.86z)| lr 2.96e-05 | 8440.36 ms | -100.0% bf16 MFU | 62135 tok/s +step 16873/19560 | loss 3.248085 (-1.32z)| norm 0.2441 (-0.55z)| lr 2.96e-05 | 8439.82 ms | -100.0% bf16 MFU | 62134 tok/s +step 16874/19560 | loss 3.326003 (+0.75z)| norm 0.2470 (-0.27z)| lr 2.96e-05 | 8436.73 ms | -100.0% bf16 MFU | 62134 tok/s +step 16875/19560 | loss 3.256551 (-1.08z)| norm 0.2445 (-0.50z)| lr 2.95e-05 | 8439.13 ms | -100.0% bf16 MFU | 62134 tok/s +step 16876/19560 | loss 3.302457 (+0.12z)| norm 0.2428 (-0.65z)| lr 2.95e-05 | 8439.37 ms | -100.0% bf16 MFU | 62133 tok/s +step 16877/19560 | loss 3.336638 (+1.02z)| norm 0.2509 (+0.11z)| lr 2.95e-05 | 8439.00 ms | -100.0% bf16 MFU | 62133 tok/s +step 16878/19560 | loss 3.311370 (+0.34z)| norm 0.2446 (-0.47z)| lr 2.95e-05 | 8438.30 ms | -100.0% bf16 MFU | 62133 tok/s +step 16879/19560 | loss 3.288963 (-0.25z)| norm 0.2400 (-0.89z)| lr 2.94e-05 | 8434.82 ms | -100.0% bf16 MFU | 62134 tok/s +step 16880/19560 | loss 3.293144 (-0.15z)| norm 0.2534 (+0.37z)| lr 2.94e-05 | 8439.55 ms | -100.0% bf16 MFU | 62134 tok/s +step 16881/19560 | loss 3.317538 (+0.49z)| norm 0.2614 (+1.10z)| lr 2.94e-05 | 8435.53 ms | -100.0% bf16 MFU | 62135 tok/s +step 16882/19560 | loss 3.320583 (+0.58z)| norm 0.2446 (-0.48z)| lr 2.94e-05 | 8436.19 ms | -100.0% bf16 MFU | 62135 tok/s +step 16883/19560 | loss 3.275334 (-0.62z)| norm 0.2422 (-0.70z)| lr 2.94e-05 | 8435.99 ms | -100.0% bf16 MFU | 62136 tok/s +step 16884/19560 | loss 3.213861 (-2.22z)| norm 0.2392 (-0.98z)| lr 2.93e-05 | 8437.05 ms | -100.0% bf16 MFU | 62136 tok/s +step 16885/19560 | loss 3.381572 (+2.18z)| norm 0.2434 (-0.57z)| lr 2.93e-05 | 8438.04 ms | -100.0% bf16 MFU | 62136 tok/s +step 16886/19560 | loss 3.302657 (+0.11z)| norm 0.2545 (+0.48z)| lr 2.93e-05 | 8438.08 ms | -100.0% bf16 MFU | 62136 tok/s +step 16887/19560 | loss 3.308825 (+0.27z)| norm 0.2475 (-0.19z)| lr 2.93e-05 | 8462.82 ms | -100.0% bf16 MFU | 62127 tok/s +step 16888/19560 | loss 3.306993 (+0.22z)| norm 0.2526 (+0.30z)| lr 2.92e-05 | 8465.13 ms | -100.0% bf16 MFU | 62117 tok/s +step 16889/19560 | loss 3.268966 (-0.77z)| norm 0.2554 (+0.54z)| lr 2.92e-05 | 8463.17 ms | -100.0% bf16 MFU | 62109 tok/s +step 16890/19560 | loss 3.289899 (-0.22z)| norm 0.2419 (-0.74z)| lr 2.92e-05 | 8458.99 ms | -100.0% bf16 MFU | 62102 tok/s +step 16891/19560 | loss 3.281414 (-0.46z)| norm 0.2385 (-1.04z)| lr 2.92e-05 | 8466.94 ms | -100.0% bf16 MFU | 62093 tok/s +step 16892/19560 | loss 3.255432 (-1.13z)| norm 0.2661 (+1.53z)| lr 2.92e-05 | 8461.91 ms | -100.0% bf16 MFU | 62087 tok/s +step 16893/19560 | loss 3.282511 (-0.42z)| norm 0.2569 (+0.68z)| lr 2.91e-05 | 8461.15 ms | -100.0% bf16 MFU | 62080 tok/s +step 16894/19560 | loss 3.282032 (-0.42z)| norm 0.2517 (+0.19z)| lr 2.91e-05 | 8458.14 ms | -100.0% bf16 MFU | 62076 tok/s +step 16895/19560 | loss 3.228308 (-1.80z)| norm 0.2537 (+0.37z)| lr 2.91e-05 | 8460.86 ms | -100.0% bf16 MFU | 62070 tok/s +step 16896/19560 | loss 3.276989 (-0.55z)| norm 0.2627 (+1.20z)| lr 2.91e-05 | 8460.21 ms | -100.0% bf16 MFU | 62065 tok/s +step 16897/19560 | loss 3.305663 (+0.23z)| norm 0.2344 (-1.42z)| lr 2.91e-05 | 8459.68 ms | -100.0% bf16 MFU | 62061 tok/s +step 16898/19560 | loss 3.311135 (+0.43z)| norm 0.2349 (-1.40z)| lr 2.90e-05 | 8454.01 ms | -100.0% bf16 MFU | 62059 tok/s +step 16899/19560 | loss 3.226680 (-2.08z)| norm 0.2565 (+0.74z)| lr 2.90e-05 | 8454.38 ms | -100.0% bf16 MFU | 62056 tok/s +step 16900/19560 | loss 3.362746 (+2.05z)| norm 0.2402 (-0.87z)| lr 2.90e-05 | 8457.70 ms | -100.0% bf16 MFU | 62053 tok/s +step 16901/19560 | loss 3.335713 (+1.22z)| norm 0.2469 (-0.20z)| lr 2.90e-05 | 8463.30 ms | -100.0% bf16 MFU | 62048 tok/s +step 16902/19560 | loss 3.287995 (-0.22z)| norm 0.2392 (-0.96z)| lr 2.89e-05 | 8458.11 ms | -100.0% bf16 MFU | 62045 tok/s +step 16903/19560 | loss 3.301223 (+0.17z)| norm 0.2407 (-0.81z)| lr 2.89e-05 | 8457.14 ms | -100.0% bf16 MFU | 62042 tok/s +step 16904/19560 | loss 3.316666 (+0.64z)| norm 0.2378 (-1.09z)| lr 2.89e-05 | 8462.15 ms | -100.0% bf16 MFU | 62038 tok/s +step 16905/19560 | loss 3.242723 (-1.59z)| norm 0.2571 (+0.82z)| lr 2.89e-05 | 8460.15 ms | -100.0% bf16 MFU | 62035 tok/s +step 16906/19560 | loss 3.292280 (-0.10z)| norm 0.3271 (+6.38z)| lr 2.89e-05 | 8452.13 ms | -100.0% bf16 MFU | 62034 tok/s +step 16907/19560 | loss 3.266769 (-0.86z)| norm 0.2442 (-0.43z)| lr 2.88e-05 | 8448.57 ms | -100.0% bf16 MFU | 62035 tok/s +step 16908/19560 | loss 3.258651 (-1.09z)| norm 0.2398 (-0.78z)| lr 2.88e-05 | 8459.10 ms | -100.0% bf16 MFU | 62033 tok/s +step 16909/19560 | loss 3.221129 (-2.20z)| norm 0.2370 (-0.99z)| lr 2.88e-05 | 8460.46 ms | -100.0% bf16 MFU | 62029 tok/s +step 16910/19560 | loss 3.317979 (+0.71z)| norm 0.2439 (-0.42z)| lr 2.88e-05 | 8450.86 ms | -100.0% bf16 MFU | 62030 tok/s +step 16911/19560 | loss 3.277753 (-0.50z)| norm 0.2403 (-0.71z)| lr 2.88e-05 | 8449.21 ms | -100.0% bf16 MFU | 62031 tok/s +step 16912/19560 | loss 3.258741 (-1.06z)| norm 0.2676 (+1.51z)| lr 2.87e-05 | 8454.02 ms | -100.0% bf16 MFU | 62030 tok/s +step 16913/19560 | loss 3.287192 (-0.21z)| norm 0.2466 (-0.20z)| lr 2.87e-05 | 8451.77 ms | -100.0% bf16 MFU | 62030 tok/s +step 16914/19560 | loss 3.270981 (-0.68z)| norm 0.2455 (-0.29z)| lr 2.87e-05 | 8457.49 ms | -100.0% bf16 MFU | 62028 tok/s +step 16915/19560 | loss 3.228413 (-1.91z)| norm 0.2438 (-0.42z)| lr 2.87e-05 | 8452.28 ms | -100.0% bf16 MFU | 62028 tok/s +step 16916/19560 | loss 3.301021 (+0.25z)| norm 0.2751 (+2.08z)| lr 2.86e-05 | 8451.19 ms | -100.0% bf16 MFU | 62029 tok/s +step 16917/19560 | loss 3.265702 (-0.84z)| norm 0.2413 (-0.64z)| lr 2.86e-05 | 8449.83 ms | -100.0% bf16 MFU | 62030 tok/s +step 16918/19560 | loss 3.280365 (-0.39z)| norm 0.2454 (-0.31z)| lr 2.86e-05 | 8455.47 ms | -100.0% bf16 MFU | 62029 tok/s +step 16919/19560 | loss 3.301975 (+0.29z)| norm 0.2494 (+0.00z)| lr 2.86e-05 | 8449.51 ms | -100.0% bf16 MFU | 62030 tok/s +step 16920/19560 | loss 3.223384 (-2.08z)| norm 0.2521 (+0.23z)| lr 2.86e-05 | 8453.42 ms | -100.0% bf16 MFU | 62029 tok/s +step 16921/19560 | loss 3.282153 (-0.28z)| norm 0.2416 (-0.63z)| lr 2.85e-05 | 8451.70 ms | -100.0% bf16 MFU | 62029 tok/s +step 16922/19560 | loss 3.288844 (-0.06z)| norm 0.2434 (-0.48z)| lr 2.85e-05 | 8448.12 ms | -100.0% bf16 MFU | 62031 tok/s +step 16923/19560 | loss 3.288950 (-0.06z)| norm 0.2445 (-0.39z)| lr 2.85e-05 | 8451.68 ms | -100.0% bf16 MFU | 62031 tok/s +step 16924/19560 | loss 3.270373 (-0.64z)| norm 0.2459 (-0.26z)| lr 2.85e-05 | 8448.25 ms | -100.0% bf16 MFU | 62032 tok/s +step 16925/19560 | loss 3.287947 (-0.09z)| norm 0.2425 (-0.55z)| lr 2.85e-05 | 8450.04 ms | -100.0% bf16 MFU | 62033 tok/s +step 16926/19560 | loss 3.283307 (-0.22z)| norm 0.2658 (+1.35z)| lr 2.84e-05 | 8446.79 ms | -100.0% bf16 MFU | 62035 tok/s +step 16927/19560 | loss 3.358979 (+2.15z)| norm 0.2393 (-0.82z)| lr 2.84e-05 | 8445.76 ms | -100.0% bf16 MFU | 62037 tok/s +step 16928/19560 | loss 3.339436 (+1.52z)| norm 0.2529 (+0.28z)| lr 2.84e-05 | 8446.52 ms | -100.0% bf16 MFU | 62039 tok/s +step 16929/19560 | loss 3.298501 (+0.22z)| norm 0.2403 (-0.75z)| lr 2.84e-05 | 8449.81 ms | -100.0% bf16 MFU | 62039 tok/s +step 16930/19560 | loss 3.308567 (+0.53z)| norm 0.2542 (+0.38z)| lr 2.84e-05 | 8448.84 ms | -100.0% bf16 MFU | 62040 tok/s +step 16931/19560 | loss 3.297303 (+0.18z)| norm 0.2431 (-0.53z)| lr 2.83e-05 | 8452.90 ms | -100.0% bf16 MFU | 62039 tok/s +step 16932/19560 | loss 3.321247 (+0.95z)| norm 0.2299 (-1.59z)| lr 2.83e-05 | 8448.25 ms | -100.0% bf16 MFU | 62040 tok/s +step 16933/19560 | loss 3.267617 (-0.75z)| norm 0.2424 (-0.57z)| lr 2.83e-05 | 8446.85 ms | -100.0% bf16 MFU | 62042 tok/s +step 16934/19560 | loss 3.260605 (-0.96z)| norm 0.2422 (-0.60z)| lr 2.83e-05 | 8438.99 ms | -100.0% bf16 MFU | 62046 tok/s +step 16935/19560 | loss 3.196579 (-2.87z)| norm 0.2551 (+0.46z)| lr 2.82e-05 | 8448.28 ms | -100.0% bf16 MFU | 62047 tok/s +step 16936/19560 | loss 3.268441 (-0.66z)| norm 0.2367 (-1.05z)| lr 2.82e-05 | 8445.13 ms | -100.0% bf16 MFU | 62048 tok/s +step 16937/19560 | loss 3.332184 (+1.27z)| norm 0.2563 (+0.56z)| lr 2.82e-05 | 8444.06 ms | -100.0% bf16 MFU | 62050 tok/s +step 16938/19560 | loss 3.399949 (+3.17z)| norm 0.2546 (+0.41z)| lr 2.82e-05 | 8456.02 ms | -100.0% bf16 MFU | 62048 tok/s +step 16939/19560 | loss 3.280453 (-0.33z)| norm 0.2400 (-0.78z)| lr 2.82e-05 | 8448.07 ms | -100.0% bf16 MFU | 62049 tok/s +step 16940/19560 | loss 3.241326 (-1.45z)| norm 0.2508 (+0.12z)| lr 2.81e-05 | 8448.98 ms | -100.0% bf16 MFU | 62049 tok/s +step 16941/19560 | loss 3.307804 (+0.48z)| norm 0.2379 (-0.94z)| lr 2.81e-05 | 8443.06 ms | -100.0% bf16 MFU | 62051 tok/s +step 16942/19560 | loss 3.322216 (+0.90z)| norm 0.2518 (+0.23z)| lr 2.81e-05 | 8448.90 ms | -100.0% bf16 MFU | 62051 tok/s +step 16943/19560 | loss 3.407480 (+3.21z)| norm 0.2695 (+1.68z)| lr 2.81e-05 | 8445.68 ms | -100.0% bf16 MFU | 62053 tok/s +step 16944/19560 | loss 3.296089 (+0.11z)| norm 0.2393 (-0.82z)| lr 2.81e-05 | 8449.09 ms | -100.0% bf16 MFU | 62053 tok/s +step 16945/19560 | loss 3.316581 (+0.67z)| norm 0.2517 (+0.21z)| lr 2.80e-05 | 8451.37 ms | -100.0% bf16 MFU | 62052 tok/s +step 16946/19560 | loss 3.308074 (+0.44z)| norm 0.2505 (+0.11z)| lr 2.80e-05 | 8442.73 ms | -100.0% bf16 MFU | 62054 tok/s +step 16947/19560 | loss 3.270382 (-0.60z)| norm 0.2480 (-0.10z)| lr 2.80e-05 | 8447.15 ms | -100.0% bf16 MFU | 62055 tok/s +step 16948/19560 | loss 3.281418 (-0.29z)| norm 0.2427 (-0.54z)| lr 2.80e-05 | 8449.97 ms | -100.0% bf16 MFU | 62054 tok/s +step 16949/19560 | loss 3.215195 (-2.09z)| norm 0.3281 (+5.64z)| lr 2.80e-05 | 8447.54 ms | -100.0% bf16 MFU | 62055 tok/s +step 16950/19560 | loss 3.277419 (-0.38z)| norm 0.2343 (-1.10z)| lr 2.79e-05 | 8444.97 ms | -100.0% bf16 MFU | 62056 tok/s +step 16951/19560 | loss 3.257650 (-0.93z)| norm 0.2455 (-0.30z)| lr 2.79e-05 | 8445.75 ms | -100.0% bf16 MFU | 62057 tok/s +step 16952/19560 | loss 3.277503 (-0.37z)| norm 0.2426 (-0.50z)| lr 2.79e-05 | 8442.52 ms | -100.0% bf16 MFU | 62059 tok/s +step 16953/19560 | loss 3.302073 (+0.31z)| norm 0.2432 (-0.45z)| lr 2.79e-05 | 8445.53 ms | -100.0% bf16 MFU | 62060 tok/s +step 16954/19560 | loss 3.327135 (+0.99z)| norm 0.2417 (-0.55z)| lr 2.78e-05 | 8450.64 ms | -100.0% bf16 MFU | 62059 tok/s +step 16955/19560 | loss 3.271518 (-0.56z)| norm 0.2355 (-0.99z)| lr 2.78e-05 | 8446.77 ms | -100.0% bf16 MFU | 62060 tok/s +step 16956/19560 | loss 3.274296 (-0.47z)| norm 0.2426 (-0.47z)| lr 2.78e-05 | 8444.72 ms | -100.0% bf16 MFU | 62061 tok/s +step 16957/19560 | loss 3.276347 (-0.41z)| norm 0.2376 (-0.82z)| lr 2.78e-05 | 8439.52 ms | -100.0% bf16 MFU | 62064 tok/s +step 16958/19560 | loss 3.392156 (+2.71z)| norm 0.2388 (-0.72z)| lr 2.78e-05 | 8445.46 ms | -100.0% bf16 MFU | 62065 tok/s +step 16959/19560 | loss 3.421053 (+3.31z)| norm 0.2481 (-0.07z)| lr 2.77e-05 | 8443.40 ms | -100.0% bf16 MFU | 62067 tok/s +step 16960/19560 | loss 3.311686 (+0.48z)| norm 0.2411 (-0.56z)| lr 2.77e-05 | 8450.97 ms | -100.0% bf16 MFU | 62065 tok/s +step 16961/19560 | loss 3.261514 (-0.81z)| norm 0.2512 (+0.16z)| lr 2.77e-05 | 8442.26 ms | -100.0% bf16 MFU | 62067 tok/s +step 16962/19560 | loss 3.343111 (+1.27z)| norm 0.2518 (+0.21z)| lr 2.77e-05 | 8441.99 ms | -100.0% bf16 MFU | 62069 tok/s +step 16963/19560 | loss 3.346569 (+1.36z)| norm 0.2491 (+0.02z)| lr 2.77e-05 | 8439.33 ms | -100.0% bf16 MFU | 62072 tok/s +step 16964/19560 | loss 3.304173 (+0.27z)| norm 0.2488 (-0.01z)| lr 2.76e-05 | 8442.79 ms | -100.0% bf16 MFU | 62073 tok/s +step 16965/19560 | loss 3.301021 (+0.19z)| norm 0.2581 (+0.67z)| lr 2.76e-05 | 8439.89 ms | -100.0% bf16 MFU | 62075 tok/s +step 16966/19560 | loss 3.306500 (+0.33z)| norm 0.2465 (-0.16z)| lr 2.76e-05 | 8443.19 ms | -100.0% bf16 MFU | 62076 tok/s +step 16967/19560 | loss 3.301607 (+0.19z)| norm 0.2444 (-0.32z)| lr 2.76e-05 | 8443.50 ms | -100.0% bf16 MFU | 62077 tok/s +step 16968/19560 | loss 3.353786 (+1.51z)| norm 0.2522 (+0.26z)| lr 2.76e-05 | 8445.10 ms | -100.0% bf16 MFU | 62078 tok/s +step 16969/19560 | loss 3.318125 (+0.59z)| norm 0.2386 (-0.73z)| lr 2.75e-05 | 8437.60 ms | -100.0% bf16 MFU | 62080 tok/s +step 16970/19560 | loss 3.355621 (+1.52z)| norm 0.2631 (+1.09z)| lr 2.75e-05 | 8441.31 ms | -100.0% bf16 MFU | 62082 tok/s +step 16971/19560 | loss 3.290625 (-0.11z)| norm 0.2481 (-0.02z)| lr 2.75e-05 | 8445.59 ms | -100.0% bf16 MFU | 62082 tok/s +step 16972/19560 | loss 3.300349 (+0.13z)| norm 0.2384 (-0.73z)| lr 2.75e-05 | 8439.01 ms | -100.0% bf16 MFU | 62084 tok/s +step 16973/19560 | loss 3.394013 (+2.43z)| norm 0.2764 (+2.05z)| lr 2.74e-05 | 8442.87 ms | -100.0% bf16 MFU | 62085 tok/s +step 16974/19560 | loss 3.281653 (-0.35z)| norm 0.2495 (+0.07z)| lr 2.74e-05 | 8450.92 ms | -100.0% bf16 MFU | 62082 tok/s +step 16975/19560 | loss 3.368041 (+1.75z)| norm 0.2865 (+2.69z)| lr 2.74e-05 | 8439.40 ms | -100.0% bf16 MFU | 62085 tok/s +step 16976/19560 | loss 3.258702 (-0.91z)| norm 0.2393 (-0.68z)| lr 2.74e-05 | 8441.45 ms | -100.0% bf16 MFU | 62086 tok/s +step 16977/19560 | loss 3.277557 (-0.44z)| norm 0.2521 (+0.22z)| lr 2.74e-05 | 8440.57 ms | -100.0% bf16 MFU | 62087 tok/s +step 16978/19560 | loss 3.286964 (-0.21z)| norm 0.2412 (-0.55z)| lr 2.73e-05 | 8444.79 ms | -100.0% bf16 MFU | 62087 tok/s +step 16979/19560 | loss 3.309468 (+0.33z)| norm 0.2692 (+1.43z)| lr 2.73e-05 | 8438.50 ms | -100.0% bf16 MFU | 62089 tok/s +step 16980/19560 | loss 3.243098 (-1.29z)| norm 0.2502 (+0.07z)| lr 2.73e-05 | 8443.50 ms | -100.0% bf16 MFU | 62089 tok/s +step 16981/19560 | loss 3.292869 (-0.07z)| norm 0.2442 (-0.36z)| lr 2.73e-05 | 8435.60 ms | -100.0% bf16 MFU | 62093 tok/s +step 16982/19560 | loss 3.294799 (-0.03z)| norm 0.2521 (+0.20z)| lr 2.73e-05 | 8439.84 ms | -100.0% bf16 MFU | 62094 tok/s +step 16983/19560 | loss 3.308110 (+0.29z)| norm 0.2421 (-0.52z)| lr 2.72e-05 | 8438.62 ms | -100.0% bf16 MFU | 62096 tok/s +step 16984/19560 | loss 3.218770 (-1.87z)| norm 0.2365 (-0.91z)| lr 2.72e-05 | 8441.62 ms | -100.0% bf16 MFU | 62096 tok/s +step 16985/19560 | loss 3.289990 (-0.14z)| norm 0.2524 (+0.22z)| lr 2.72e-05 | 8440.83 ms | -100.0% bf16 MFU | 62097 tok/s +step 16986/19560 | loss 3.322300 (+0.63z)| norm 0.2499 (+0.03z)| lr 2.72e-05 | 8440.85 ms | -100.0% bf16 MFU | 62098 tok/s +step 16987/19560 | loss 3.369685 (+1.75z)| norm 0.2460 (-0.25z)| lr 2.72e-05 | 8435.41 ms | -100.0% bf16 MFU | 62101 tok/s +step 16988/19560 | loss 3.325083 (+0.67z)| norm 0.2513 (+0.12z)| lr 2.71e-05 | 8444.00 ms | -100.0% bf16 MFU | 62100 tok/s +step 16989/19560 | loss 3.219310 (-1.86z)| norm 0.2348 (-1.06z)| lr 2.71e-05 | 8441.15 ms | -100.0% bf16 MFU | 62101 tok/s +step 16990/19560 | loss 3.303144 (+0.17z)| norm 0.2466 (-0.19z)| lr 2.71e-05 | 8439.82 ms | -100.0% bf16 MFU | 62102 tok/s +step 16991/19560 | loss 3.325161 (+0.70z)| norm 0.2518 (+0.20z)| lr 2.71e-05 | 8441.33 ms | -100.0% bf16 MFU | 62102 tok/s +step 16992/19560 | loss 3.289999 (-0.15z)| norm 0.2437 (-0.39z)| lr 2.71e-05 | 8446.46 ms | -100.0% bf16 MFU | 62101 tok/s +step 16993/19560 | loss 3.274768 (-0.51z)| norm 0.2528 (+0.28z)| lr 2.70e-05 | 8441.00 ms | -100.0% bf16 MFU | 62101 tok/s +step 16994/19560 | loss 3.289458 (-0.16z)| norm 0.2527 (+0.26z)| lr 2.70e-05 | 8442.97 ms | -100.0% bf16 MFU | 62101 tok/s +step 16995/19560 | loss 3.261129 (-0.84z)| norm 0.2391 (-0.73z)| lr 2.70e-05 | 8442.33 ms | -100.0% bf16 MFU | 62101 tok/s +step 16996/19560 | loss 3.264543 (-0.76z)| norm 0.2522 (+0.24z)| lr 2.70e-05 | 8437.52 ms | -100.0% bf16 MFU | 62103 tok/s +step 16997/19560 | loss 3.327012 (+0.78z)| norm 0.2376 (-0.84z)| lr 2.69e-05 | 8441.07 ms | -100.0% bf16 MFU | 62103 tok/s +step 16998/19560 | loss 3.278854 (-0.40z)| norm 0.2388 (-0.74z)| lr 2.69e-05 | 8442.12 ms | -100.0% bf16 MFU | 62103 tok/s +step 16999/19560 | loss 3.273659 (-0.52z)| norm 0.2383 (-0.76z)| lr 2.69e-05 | 8438.69 ms | -100.0% bf16 MFU | 62105 tok/s +step 17000/19560 | loss 3.233454 (-1.48z)| norm 0.2482 (-0.03z)| lr 2.69e-05 | 8442.71 ms | -100.0% bf16 MFU | 62104 tok/s +val loss 3.272290 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 3013/10042 = 0.300040 +step 17001/19560 | loss 3.265504 (-0.70z)| norm 0.2578 (+0.67z)| lr 2.69e-05 | 8441.50 ms | -100.0% bf16 MFU | 62105 tok/s +step 17002/19560 | loss 3.282325 (-0.28z)| norm 0.2473 (-0.11z)| lr 2.68e-05 | 8439.19 ms | -100.0% bf16 MFU | 62106 tok/s +step 17003/19560 | loss 3.298087 (+0.09z)| norm 0.2373 (-0.83z)| lr 2.68e-05 | 8438.34 ms | -100.0% bf16 MFU | 62107 tok/s +step 17004/19560 | loss 3.263793 (-0.74z)| norm 0.2402 (-0.62z)| lr 2.68e-05 | 8434.78 ms | -100.0% bf16 MFU | 62109 tok/s +step 17005/19560 | loss 3.325915 (+0.79z)| norm 0.2485 (-0.01z)| lr 2.68e-05 | 8440.28 ms | -100.0% bf16 MFU | 62110 tok/s +step 17006/19560 | loss 3.258300 (-0.86z)| norm 0.2538 (+0.37z)| lr 2.68e-05 | 8438.93 ms | -100.0% bf16 MFU | 62111 tok/s +step 17007/19560 | loss 3.249519 (-1.07z)| norm 0.2503 (+0.11z)| lr 2.67e-05 | 8438.41 ms | -100.0% bf16 MFU | 62112 tok/s +step 17008/19560 | loss 3.391948 (+2.34z)| norm 0.2555 (+0.49z)| lr 2.67e-05 | 8441.11 ms | -100.0% bf16 MFU | 62112 tok/s +step 17009/19560 | loss 3.278525 (-0.36z)| norm 0.2547 (+0.44z)| lr 2.67e-05 | 8443.01 ms | -100.0% bf16 MFU | 62111 tok/s +step 17010/19560 | loss 3.292197 (-0.03z)| norm 0.2591 (+0.76z)| lr 2.67e-05 | 8436.00 ms | -100.0% bf16 MFU | 62113 tok/s +step 17011/19560 | loss 3.309858 (+0.39z)| norm 0.2671 (+1.32z)| lr 2.67e-05 | 8436.53 ms | -100.0% bf16 MFU | 62115 tok/s +step 17012/19560 | loss 3.439336 (+3.34z)| norm 0.2547 (+0.40z)| lr 2.66e-05 | 8441.99 ms | -100.0% bf16 MFU | 62114 tok/s +step 17013/19560 | loss 3.231903 (-1.46z)| norm 0.2609 (+0.85z)| lr 2.66e-05 | 8436.88 ms | -100.0% bf16 MFU | 62115 tok/s +step 17014/19560 | loss 3.316419 (+0.52z)| norm 0.2530 (+0.27z)| lr 2.66e-05 | 8438.00 ms | -100.0% bf16 MFU | 62116 tok/s +step 17015/19560 | loss 3.294931 (+0.02z)| norm 0.2465 (-0.20z)| lr 2.66e-05 | 8438.69 ms | -100.0% bf16 MFU | 62117 tok/s +step 17016/19560 | loss 3.313928 (+0.46z)| norm 0.2648 (+1.12z)| lr 2.66e-05 | 8438.02 ms | -100.0% bf16 MFU | 62118 tok/s +step 17017/19560 | loss 3.288995 (-0.13z)| norm 0.2511 (+0.13z)| lr 2.65e-05 | 8439.84 ms | -100.0% bf16 MFU | 62118 tok/s +step 17018/19560 | loss 3.241696 (-1.22z)| norm 0.2465 (-0.21z)| lr 2.65e-05 | 8441.08 ms | -100.0% bf16 MFU | 62118 tok/s +step 17019/19560 | loss 3.273240 (-0.48z)| norm 0.2497 (+0.02z)| lr 2.65e-05 | 8436.78 ms | -100.0% bf16 MFU | 62119 tok/s +step 17020/19560 | loss 3.289110 (-0.12z)| norm 0.2410 (-0.61z)| lr 2.65e-05 | 8439.32 ms | -100.0% bf16 MFU | 62119 tok/s +step 17021/19560 | loss 3.274644 (-0.46z)| norm 0.2508 (+0.12z)| lr 2.65e-05 | 8436.28 ms | -100.0% bf16 MFU | 62121 tok/s +step 17022/19560 | loss 3.290633 (-0.09z)| norm 0.2486 (-0.05z)| lr 2.64e-05 | 8437.43 ms | -100.0% bf16 MFU | 62121 tok/s +step 17023/19560 | loss 3.320553 (+0.60z)| norm 0.2440 (-0.38z)| lr 2.64e-05 | 8436.63 ms | -100.0% bf16 MFU | 62123 tok/s +step 17024/19560 | loss 3.262204 (-0.77z)| norm 0.2471 (-0.15z)| lr 2.64e-05 | 8439.32 ms | -100.0% bf16 MFU | 62123 tok/s +step 17025/19560 | loss 3.277328 (-0.41z)| norm 0.2586 (+0.70z)| lr 2.64e-05 | 8439.98 ms | -100.0% bf16 MFU | 62123 tok/s +step 17026/19560 | loss 3.290507 (-0.09z)| norm 0.2414 (-0.58z)| lr 2.64e-05 | 8435.73 ms | -100.0% bf16 MFU | 62124 tok/s +step 17027/19560 | loss 3.305588 (+0.25z)| norm 0.2437 (-0.41z)| lr 2.63e-05 | 8437.84 ms | -100.0% bf16 MFU | 62125 tok/s +step 17028/19560 | loss 3.371480 (+1.80z)| norm 0.2672 (+1.32z)| lr 2.63e-05 | 8438.19 ms | -100.0% bf16 MFU | 62125 tok/s +step 17029/19560 | loss 3.261963 (-0.77z)| norm 0.2402 (-0.67z)| lr 2.63e-05 | 8438.21 ms | -100.0% bf16 MFU | 62125 tok/s +step 17030/19560 | loss 3.284338 (-0.24z)| norm 0.2402 (-0.68z)| lr 2.63e-05 | 8438.88 ms | -100.0% bf16 MFU | 62125 tok/s +step 17031/19560 | loss 3.284071 (-0.25z)| norm 0.2396 (-0.71z)| lr 2.62e-05 | 8441.25 ms | -100.0% bf16 MFU | 62125 tok/s +step 17032/19560 | loss 3.261789 (-0.76z)| norm 0.2368 (-0.92z)| lr 2.62e-05 | 8440.34 ms | -100.0% bf16 MFU | 62124 tok/s +step 17033/19560 | loss 3.298441 (+0.09z)| norm 0.2439 (-0.39z)| lr 2.62e-05 | 8437.79 ms | -100.0% bf16 MFU | 62125 tok/s +step 17034/19560 | loss 3.283475 (-0.26z)| norm 0.2438 (-0.41z)| lr 2.62e-05 | 8437.98 ms | -100.0% bf16 MFU | 62125 tok/s +step 17035/19560 | loss 3.302611 (+0.19z)| norm 0.2413 (-0.62z)| lr 2.62e-05 | 8439.27 ms | -100.0% bf16 MFU | 62125 tok/s +step 17036/19560 | loss 3.305860 (+0.26z)| norm 0.2517 (+0.26z)| lr 2.61e-05 | 8439.28 ms | -100.0% bf16 MFU | 62125 tok/s +step 17037/19560 | loss 3.264788 (-0.74z)| norm 0.2422 (-0.56z)| lr 2.61e-05 | 8435.32 ms | -100.0% bf16 MFU | 62127 tok/s +step 17038/19560 | loss 3.253197 (-1.00z)| norm 0.2544 (+0.48z)| lr 2.61e-05 | 8435.71 ms | -100.0% bf16 MFU | 62128 tok/s +step 17039/19560 | loss 3.318748 (+0.57z)| norm 0.2450 (-0.33z)| lr 2.61e-05 | 8439.62 ms | -100.0% bf16 MFU | 62128 tok/s +step 17040/19560 | loss 3.418900 (+2.86z)| norm 0.2652 (+1.41z)| lr 2.61e-05 | 8440.59 ms | -100.0% bf16 MFU | 62127 tok/s +step 17041/19560 | loss 3.264470 (-0.74z)| norm 0.2445 (-0.36z)| lr 2.60e-05 | 8440.36 ms | -100.0% bf16 MFU | 62126 tok/s +step 17042/19560 | loss 3.275419 (-0.49z)| norm 0.2486 (-0.02z)| lr 2.60e-05 | 8436.25 ms | -100.0% bf16 MFU | 62128 tok/s +step 17043/19560 | loss 3.286389 (-0.24z)| norm 0.2471 (-0.15z)| lr 2.60e-05 | 8436.52 ms | -100.0% bf16 MFU | 62128 tok/s +step 17044/19560 | loss 3.227696 (-1.59z)| norm 0.2454 (-0.28z)| lr 2.60e-05 | 8437.09 ms | -100.0% bf16 MFU | 62129 tok/s +step 17045/19560 | loss 3.289047 (-0.17z)| norm 0.2403 (-0.72z)| lr 2.60e-05 | 8436.83 ms | -100.0% bf16 MFU | 62130 tok/s +step 17046/19560 | loss 3.308623 (+0.28z)| norm 0.2540 (+0.48z)| lr 2.59e-05 | 8437.93 ms | -100.0% bf16 MFU | 62130 tok/s +step 17047/19560 | loss 3.271253 (-0.58z)| norm 0.2573 (+0.75z)| lr 2.59e-05 | 8434.53 ms | -100.0% bf16 MFU | 62131 tok/s +step 17048/19560 | loss 3.290723 (-0.15z)| norm 0.2622 (+1.17z)| lr 2.59e-05 | 8436.26 ms | -100.0% bf16 MFU | 62132 tok/s +step 17049/19560 | loss 3.303641 (+0.16z)| norm 0.2553 (+0.56z)| lr 2.59e-05 | 8440.26 ms | -100.0% bf16 MFU | 62131 tok/s +step 17050/19560 | loss 3.284680 (-0.29z)| norm 0.2547 (+0.50z)| lr 2.59e-05 | 8435.95 ms | -100.0% bf16 MFU | 62132 tok/s +step 17051/19560 | loss 3.329023 (+0.75z)| norm 0.2684 (+1.67z)| lr 2.58e-05 | 8435.77 ms | -100.0% bf16 MFU | 62133 tok/s +step 17052/19560 | loss 3.249535 (-1.12z)| norm 0.2530 (+0.33z)| lr 2.58e-05 | 8436.92 ms | -100.0% bf16 MFU | 62134 tok/s +step 17053/19560 | loss 3.279541 (-0.41z)| norm 0.2461 (-0.28z)| lr 2.58e-05 | 8435.41 ms | -100.0% bf16 MFU | 62135 tok/s +step 17054/19560 | loss 3.333754 (+0.85z)| norm 0.2427 (-0.56z)| lr 2.58e-05 | 8439.14 ms | -100.0% bf16 MFU | 62134 tok/s +step 17055/19560 | loss 3.277170 (-0.46z)| norm 0.2388 (-0.90z)| lr 2.58e-05 | 8435.51 ms | -100.0% bf16 MFU | 62135 tok/s +step 17056/19560 | loss 3.294337 (-0.05z)| norm 0.2608 (+1.01z)| lr 2.57e-05 | 8437.30 ms | -100.0% bf16 MFU | 62135 tok/s +step 17057/19560 | loss 3.340758 (+1.03z)| norm 0.2644 (+1.30z)| lr 2.57e-05 | 8436.63 ms | -100.0% bf16 MFU | 62136 tok/s +step 17058/19560 | loss 3.258189 (-0.90z)| norm 0.2581 (+0.76z)| lr 2.57e-05 | 8436.39 ms | -100.0% bf16 MFU | 62136 tok/s +step 17059/19560 | loss 3.302982 (+0.15z)| norm 0.2449 (-0.38z)| lr 2.57e-05 | 8437.86 ms | -100.0% bf16 MFU | 62136 tok/s +step 17060/19560 | loss 3.289887 (-0.15z)| norm 0.2527 (+0.28z)| lr 2.57e-05 | 8435.01 ms | -100.0% bf16 MFU | 62137 tok/s +step 17061/19560 | loss 3.309747 (+0.31z)| norm 0.2563 (+0.59z)| lr 2.56e-05 | 8435.33 ms | -100.0% bf16 MFU | 62138 tok/s +step 17062/19560 | loss 3.259931 (-0.86z)| norm 0.2472 (-0.22z)| lr 2.56e-05 | 8436.15 ms | -100.0% bf16 MFU | 62139 tok/s +step 17063/19560 | loss 3.303822 (+0.15z)| norm 0.2416 (-0.69z)| lr 2.56e-05 | 8435.56 ms | -100.0% bf16 MFU | 62139 tok/s +step 17064/19560 | loss 3.307008 (+0.22z)| norm 0.2503 (+0.06z)| lr 2.56e-05 | 8437.83 ms | -100.0% bf16 MFU | 62139 tok/s +step 17065/19560 | loss 3.233049 (-1.53z)| norm 0.2436 (-0.52z)| lr 2.56e-05 | 8436.78 ms | -100.0% bf16 MFU | 62139 tok/s +step 17066/19560 | loss 3.299941 (+0.09z)| norm 0.2450 (-0.40z)| lr 2.55e-05 | 8433.56 ms | -100.0% bf16 MFU | 62141 tok/s +step 17067/19560 | loss 3.236182 (-1.45z)| norm 0.2441 (-0.48z)| lr 2.55e-05 | 8430.39 ms | -100.0% bf16 MFU | 62143 tok/s +step 17068/19560 | loss 3.335721 (+0.96z)| norm 0.2400 (-0.83z)| lr 2.55e-05 | 8428.73 ms | -100.0% bf16 MFU | 62146 tok/s +step 17069/19560 | loss 3.320294 (+0.58z)| norm 0.2487 (-0.07z)| lr 2.55e-05 | 8428.53 ms | -100.0% bf16 MFU | 62149 tok/s +step 17070/19560 | loss 3.316158 (+0.48z)| norm 0.2502 (+0.06z)| lr 2.55e-05 | 8430.33 ms | -100.0% bf16 MFU | 62151 tok/s +step 17071/19560 | loss 3.414083 (+2.86z)| norm 0.2445 (-0.43z)| lr 2.54e-05 | 8430.78 ms | -100.0% bf16 MFU | 62153 tok/s +step 17072/19560 | loss 3.282043 (-0.35z)| norm 0.2670 (+1.55z)| lr 2.54e-05 | 8428.11 ms | -100.0% bf16 MFU | 62156 tok/s +step 17073/19560 | loss 3.257356 (-0.94z)| norm 0.2551 (+0.49z)| lr 2.54e-05 | 8427.77 ms | -100.0% bf16 MFU | 62158 tok/s +step 17074/19560 | loss 3.256613 (-0.94z)| norm 0.2585 (+0.79z)| lr 2.54e-05 | 8427.70 ms | -100.0% bf16 MFU | 62161 tok/s +step 17075/19560 | loss 3.302338 (+0.16z)| norm 0.2391 (-0.92z)| lr 2.54e-05 | 8430.55 ms | -100.0% bf16 MFU | 62162 tok/s +step 17076/19560 | loss 3.294607 (-0.03z)| norm 0.2539 (+0.38z)| lr 2.53e-05 | 8429.21 ms | -100.0% bf16 MFU | 62164 tok/s +step 17077/19560 | loss 3.336706 (+0.97z)| norm 0.2392 (-1.09z)| lr 2.53e-05 | 8435.30 ms | -100.0% bf16 MFU | 62164 tok/s +step 17078/19560 | loss 3.310997 (+0.34z)| norm 0.2386 (-1.16z)| lr 2.53e-05 | 8456.71 ms | -100.0% bf16 MFU | 62155 tok/s +step 17079/19560 | loss 3.265611 (-0.78z)| norm 0.2550 (+0.66z)| lr 2.53e-05 | 8456.61 ms | -100.0% bf16 MFU | 62147 tok/s +step 17080/19560 | loss 3.354287 (+1.38z)| norm 0.2600 (+1.20z)| lr 2.53e-05 | 8455.11 ms | -100.0% bf16 MFU | 62140 tok/s +step 17081/19560 | loss 3.291437 (-0.16z)| norm 0.2391 (-1.11z)| lr 2.52e-05 | 8452.80 ms | -100.0% bf16 MFU | 62135 tok/s +step 17082/19560 | loss 3.245956 (-1.24z)| norm 0.2533 (+0.45z)| lr 2.52e-05 | 8455.38 ms | -100.0% bf16 MFU | 62128 tok/s +step 17083/19560 | loss 3.272195 (-0.61z)| norm 0.2358 (-1.49z)| lr 2.52e-05 | 8451.80 ms | -100.0% bf16 MFU | 62123 tok/s +step 17084/19560 | loss 3.274628 (-0.55z)| norm 0.2361 (-1.45z)| lr 2.52e-05 | 8453.82 ms | -100.0% bf16 MFU | 62118 tok/s +step 17085/19560 | loss 3.261647 (-0.86z)| norm 0.2587 (+1.03z)| lr 2.52e-05 | 8451.94 ms | -100.0% bf16 MFU | 62114 tok/s +step 17086/19560 | loss 3.315019 (+0.46z)| norm 0.2662 (+1.83z)| lr 2.51e-05 | 8454.67 ms | -100.0% bf16 MFU | 62109 tok/s +step 17087/19560 | loss 3.269751 (-0.66z)| norm 0.2478 (-0.20z)| lr 2.51e-05 | 8447.90 ms | -100.0% bf16 MFU | 62106 tok/s +step 17088/19560 | loss 3.318235 (+0.59z)| norm 0.2805 (+3.24z)| lr 2.51e-05 | 8452.55 ms | -100.0% bf16 MFU | 62102 tok/s +step 17089/19560 | loss 3.295959 (+0.01z)| norm 0.2523 (+0.25z)| lr 2.51e-05 | 8453.13 ms | -100.0% bf16 MFU | 62098 tok/s +step 17090/19560 | loss 3.302633 (+0.19z)| norm 0.2523 (+0.25z)| lr 2.51e-05 | 8450.49 ms | -100.0% bf16 MFU | 62096 tok/s +step 17091/19560 | loss 3.280873 (-0.36z)| norm 0.2456 (-0.45z)| lr 2.50e-05 | 8448.11 ms | -100.0% bf16 MFU | 62094 tok/s +step 17092/19560 | loss 3.265210 (-0.76z)| norm 0.2501 (+0.02z)| lr 2.50e-05 | 8447.44 ms | -100.0% bf16 MFU | 62092 tok/s +step 17093/19560 | loss 3.309557 (+0.39z)| norm 0.2507 (+0.09z)| lr 2.50e-05 | 8446.33 ms | -100.0% bf16 MFU | 62091 tok/s +step 17094/19560 | loss 3.273819 (-0.53z)| norm 0.2480 (-0.19z)| lr 2.50e-05 | 8445.64 ms | -100.0% bf16 MFU | 62091 tok/s +step 17095/19560 | loss 3.217192 (-1.96z)| norm 0.2399 (-1.05z)| lr 2.50e-05 | 8446.27 ms | -100.0% bf16 MFU | 62090 tok/s +step 17096/19560 | loss 3.297106 (+0.10z)| norm 0.2542 (+0.46z)| lr 2.49e-05 | 8446.21 ms | -100.0% bf16 MFU | 62089 tok/s +step 17097/19560 | loss 3.241028 (-1.32z)| norm 0.2372 (-1.33z)| lr 2.49e-05 | 8448.53 ms | -100.0% bf16 MFU | 62087 tok/s +step 17098/19560 | loss 3.320076 (+0.72z)| norm 0.2425 (-0.75z)| lr 2.49e-05 | 8443.62 ms | -100.0% bf16 MFU | 62088 tok/s +step 17099/19560 | loss 3.284246 (-0.21z)| norm 0.2350 (-1.53z)| lr 2.49e-05 | 8450.91 ms | -100.0% bf16 MFU | 62085 tok/s +step 17100/19560 | loss 3.350301 (+1.48z)| norm 0.2547 (+0.53z)| lr 2.49e-05 | 8444.49 ms | -100.0% bf16 MFU | 62085 tok/s +step 17101/19560 | loss 3.229403 (-1.62z)| norm 0.2395 (-1.07z)| lr 2.48e-05 | 8447.28 ms | -100.0% bf16 MFU | 62084 tok/s +step 17102/19560 | loss 3.309378 (+0.46z)| norm 0.2441 (-0.57z)| lr 2.48e-05 | 8450.21 ms | -100.0% bf16 MFU | 62082 tok/s +step 17103/19560 | loss 3.281381 (-0.25z)| norm 0.2446 (-0.51z)| lr 2.48e-05 | 8446.83 ms | -100.0% bf16 MFU | 62082 tok/s +step 17104/19560 | loss 3.306844 (+0.41z)| norm 0.2426 (-0.75z)| lr 2.48e-05 | 8443.43 ms | -100.0% bf16 MFU | 62082 tok/s +step 17105/19560 | loss 3.301986 (+0.28z)| norm 0.2310 (-2.04z)| lr 2.48e-05 | 8444.88 ms | -100.0% bf16 MFU | 62082 tok/s +step 17106/19560 | loss 3.208008 (-2.16z)| norm 0.2366 (-1.39z)| lr 2.47e-05 | 8446.06 ms | -100.0% bf16 MFU | 62082 tok/s +step 17107/19560 | loss 3.250297 (-1.04z)| norm 0.2492 (+0.06z)| lr 2.47e-05 | 8448.58 ms | -100.0% bf16 MFU | 62081 tok/s +step 17108/19560 | loss 3.251822 (-1.01z)| norm 0.2510 (+0.26z)| lr 2.47e-05 | 8442.32 ms | -100.0% bf16 MFU | 62082 tok/s +step 17109/19560 | loss 3.332406 (+1.08z)| norm 0.2543 (+0.64z)| lr 2.47e-05 | 8449.03 ms | -100.0% bf16 MFU | 62080 tok/s +step 17110/19560 | loss 3.281039 (-0.25z)| norm 0.2434 (-0.61z)| lr 2.47e-05 | 8451.59 ms | -100.0% bf16 MFU | 62078 tok/s +step 17111/19560 | loss 3.251990 (-0.99z)| norm 0.2403 (-0.98z)| lr 2.46e-05 | 8450.73 ms | -100.0% bf16 MFU | 62076 tok/s +step 17112/19560 | loss 3.235616 (-1.42z)| norm 0.2517 (+0.34z)| lr 2.46e-05 | 8445.04 ms | -100.0% bf16 MFU | 62076 tok/s +step 17113/19560 | loss 3.220613 (-1.78z)| norm 0.2416 (-0.83z)| lr 2.46e-05 | 8449.67 ms | -100.0% bf16 MFU | 62075 tok/s +step 17114/19560 | loss 3.240260 (-1.25z)| norm 0.2852 (+3.95z)| lr 2.46e-05 | 8447.12 ms | -100.0% bf16 MFU | 62075 tok/s +step 17115/19560 | loss 3.202423 (-2.18z)| norm 0.2464 (-0.29z)| lr 2.46e-05 | 8444.56 ms | -100.0% bf16 MFU | 62075 tok/s +step 17116/19560 | loss 3.228957 (-1.48z)| norm 0.2427 (-0.68z)| lr 2.45e-05 | 8447.43 ms | -100.0% bf16 MFU | 62075 tok/s +step 17117/19560 | loss 3.262329 (-0.64z)| norm 0.2528 (+0.41z)| lr 2.45e-05 | 8448.86 ms | -100.0% bf16 MFU | 62074 tok/s +step 17118/19560 | loss 3.315928 (+0.73z)| norm 0.2346 (-1.57z)| lr 2.45e-05 | 8453.50 ms | -100.0% bf16 MFU | 62071 tok/s +step 17119/19560 | loss 3.223512 (-1.61z)| norm 0.2678 (+2.01z)| lr 2.45e-05 | 8450.51 ms | -100.0% bf16 MFU | 62070 tok/s +step 17120/19560 | loss 3.259131 (-0.70z)| norm 0.2505 (+0.14z)| lr 2.45e-05 | 8451.49 ms | -100.0% bf16 MFU | 62068 tok/s +step 17121/19560 | loss 3.220553 (-1.65z)| norm 0.2427 (-0.68z)| lr 2.44e-05 | 8453.54 ms | -100.0% bf16 MFU | 62065 tok/s +step 17122/19560 | loss 3.275378 (-0.27z)| norm 0.2425 (-0.70z)| lr 2.44e-05 | 8447.42 ms | -100.0% bf16 MFU | 62065 tok/s +step 17123/19560 | loss 3.252771 (-0.83z)| norm 0.2431 (-0.63z)| lr 2.44e-05 | 8442.18 ms | -100.0% bf16 MFU | 62067 tok/s +step 17124/19560 | loss 3.294520 (+0.21z)| norm 0.2453 (-0.40z)| lr 2.44e-05 | 8446.11 ms | -100.0% bf16 MFU | 62068 tok/s +step 17125/19560 | loss 3.291825 (+0.15z)| norm 0.2462 (-0.31z)| lr 2.44e-05 | 8450.06 ms | -100.0% bf16 MFU | 62067 tok/s +step 17126/19560 | loss 3.286566 (+0.02z)| norm 0.2522 (+0.33z)| lr 2.43e-05 | 8442.22 ms | -100.0% bf16 MFU | 62068 tok/s +step 17127/19560 | loss 3.273463 (-0.31z)| norm 0.2323 (-1.81z)| lr 2.43e-05 | 8441.92 ms | -100.0% bf16 MFU | 62070 tok/s +step 17128/19560 | loss 3.296782 (+0.26z)| norm 0.2375 (-1.23z)| lr 2.43e-05 | 8449.17 ms | -100.0% bf16 MFU | 62069 tok/s +step 17129/19560 | loss 3.238782 (-1.20z)| norm 0.2385 (-1.10z)| lr 2.43e-05 | 8445.27 ms | -100.0% bf16 MFU | 62070 tok/s +step 17130/19560 | loss 3.252811 (-0.84z)| norm 0.2519 (+0.32z)| lr 2.43e-05 | 8447.70 ms | -100.0% bf16 MFU | 62070 tok/s +step 17131/19560 | loss 3.269906 (-0.40z)| norm 0.2649 (+1.67z)| lr 2.42e-05 | 8442.84 ms | -100.0% bf16 MFU | 62071 tok/s +step 17132/19560 | loss 3.301817 (+0.40z)| norm 0.2552 (+0.64z)| lr 2.42e-05 | 8443.80 ms | -100.0% bf16 MFU | 62072 tok/s +step 17133/19560 | loss 3.292642 (+0.17z)| norm 0.2396 (-1.02z)| lr 2.42e-05 | 8449.40 ms | -100.0% bf16 MFU | 62071 tok/s +step 17134/19560 | loss 3.292760 (+0.17z)| norm 0.2517 (+0.27z)| lr 2.42e-05 | 8445.29 ms | -100.0% bf16 MFU | 62071 tok/s +step 17135/19560 | loss 3.305738 (+0.49z)| norm 0.2529 (+0.40z)| lr 2.42e-05 | 8442.86 ms | -100.0% bf16 MFU | 62073 tok/s +step 17136/19560 | loss 3.264639 (-0.55z)| norm 0.2498 (+0.07z)| lr 2.41e-05 | 8445.09 ms | -100.0% bf16 MFU | 62073 tok/s +step 17137/19560 | loss 3.295719 (+0.26z)| norm 0.2540 (+0.52z)| lr 2.41e-05 | 8444.20 ms | -100.0% bf16 MFU | 62074 tok/s +step 17138/19560 | loss 3.216059 (-1.78z)| norm 0.2573 (+0.87z)| lr 2.41e-05 | 8449.07 ms | -100.0% bf16 MFU | 62073 tok/s +step 17139/19560 | loss 3.261754 (-0.59z)| norm 0.2400 (-0.95z)| lr 2.41e-05 | 8449.23 ms | -100.0% bf16 MFU | 62072 tok/s +step 17140/19560 | loss 3.260929 (-0.62z)| norm 0.2522 (+0.36z)| lr 2.41e-05 | 8442.16 ms | -100.0% bf16 MFU | 62073 tok/s +step 17141/19560 | loss 3.282130 (-0.04z)| norm 0.2695 (+2.18z)| lr 2.40e-05 | 8450.49 ms | -100.0% bf16 MFU | 62072 tok/s +step 17142/19560 | loss 3.262390 (-0.58z)| norm 0.2480 (-0.10z)| lr 2.40e-05 | 8447.32 ms | -100.0% bf16 MFU | 62072 tok/s +step 17143/19560 | loss 3.268682 (-0.40z)| norm 0.2395 (-0.99z)| lr 2.40e-05 | 8444.14 ms | -100.0% bf16 MFU | 62072 tok/s +step 17144/19560 | loss 3.275183 (-0.21z)| norm 0.2418 (-0.73z)| lr 2.40e-05 | 8447.58 ms | -100.0% bf16 MFU | 62072 tok/s +step 17145/19560 | loss 3.317281 (+0.95z)| norm 0.2408 (-0.83z)| lr 2.40e-05 | 8444.71 ms | -100.0% bf16 MFU | 62073 tok/s +step 17146/19560 | loss 3.269975 (-0.37z)| norm 0.2455 (-0.33z)| lr 2.39e-05 | 8441.49 ms | -100.0% bf16 MFU | 62074 tok/s +step 17147/19560 | loss 3.308646 (+0.70z)| norm 0.2454 (-0.33z)| lr 2.39e-05 | 8445.85 ms | -100.0% bf16 MFU | 62075 tok/s +step 17148/19560 | loss 3.251133 (-0.89z)| norm 0.2486 (-0.00z)| lr 2.39e-05 | 8441.02 ms | -100.0% bf16 MFU | 62076 tok/s +step 17149/19560 | loss 3.297907 (+0.40z)| norm 0.2503 (+0.18z)| lr 2.39e-05 | 8443.53 ms | -100.0% bf16 MFU | 62077 tok/s +step 17150/19560 | loss 3.292909 (+0.27z)| norm 0.2420 (-0.70z)| lr 2.39e-05 | 8439.88 ms | -100.0% bf16 MFU | 62079 tok/s +step 17151/19560 | loss 3.254222 (-0.80z)| norm 0.2410 (-0.80z)| lr 2.39e-05 | 8443.08 ms | -100.0% bf16 MFU | 62080 tok/s +step 17152/19560 | loss 3.269426 (-0.38z)| norm 0.2595 (+1.15z)| lr 2.38e-05 | 8444.23 ms | -100.0% bf16 MFU | 62081 tok/s +step 17153/19560 | loss 3.242242 (-1.12z)| norm 0.2486 (+0.00z)| lr 2.38e-05 | 8446.71 ms | -100.0% bf16 MFU | 62080 tok/s +step 17154/19560 | loss 3.238074 (-1.22z)| norm 0.2402 (-0.89z)| lr 2.38e-05 | 8447.16 ms | -100.0% bf16 MFU | 62079 tok/s +step 17155/19560 | loss 3.286346 (+0.12z)| norm 0.2358 (-1.34z)| lr 2.38e-05 | 8441.53 ms | -100.0% bf16 MFU | 62081 tok/s +step 17156/19560 | loss 3.284565 (+0.09z)| norm 0.2573 (+0.96z)| lr 2.38e-05 | 8442.91 ms | -100.0% bf16 MFU | 62082 tok/s +step 17157/19560 | loss 3.298137 (+0.47z)| norm 0.2431 (-0.57z)| lr 2.37e-05 | 8440.37 ms | -100.0% bf16 MFU | 62084 tok/s +step 17158/19560 | loss 3.221056 (-1.68z)| norm 0.2319 (-1.75z)| lr 2.37e-05 | 8440.42 ms | -100.0% bf16 MFU | 62085 tok/s +step 17159/19560 | loss 3.262949 (-0.51z)| norm 0.2463 (-0.23z)| lr 2.37e-05 | 8448.89 ms | -100.0% bf16 MFU | 62084 tok/s +step 17160/19560 | loss 3.233364 (-1.32z)| norm 0.2406 (-0.84z)| lr 2.37e-05 | 8438.57 ms | -100.0% bf16 MFU | 62086 tok/s +step 17161/19560 | loss 3.288524 (+0.22z)| norm 0.2438 (-0.50z)| lr 2.37e-05 | 8445.70 ms | -100.0% bf16 MFU | 62085 tok/s +step 17162/19560 | loss 3.250573 (-0.83z)| norm 0.2308 (-1.85z)| lr 2.36e-05 | 8441.00 ms | -100.0% bf16 MFU | 62087 tok/s +step 17163/19560 | loss 3.261763 (-0.51z)| norm 0.2494 (+0.10z)| lr 2.36e-05 | 8438.69 ms | -100.0% bf16 MFU | 62089 tok/s +step 17164/19560 | loss 3.275456 (-0.12z)| norm 0.2545 (+0.64z)| lr 2.36e-05 | 8444.64 ms | -100.0% bf16 MFU | 62089 tok/s +step 17165/19560 | loss 3.281183 (+0.03z)| norm 0.2489 (+0.05z)| lr 2.36e-05 | 8441.50 ms | -100.0% bf16 MFU | 62090 tok/s +step 17166/19560 | loss 3.288542 (+0.23z)| norm 0.2506 (+0.23z)| lr 2.36e-05 | 8439.81 ms | -100.0% bf16 MFU | 62091 tok/s +step 17167/19560 | loss 3.352590 (+1.99z)| norm 0.2362 (-1.28z)| lr 2.35e-05 | 8442.56 ms | -100.0% bf16 MFU | 62092 tok/s +step 17168/19560 | loss 3.271713 (-0.23z)| norm 0.2572 (+0.95z)| lr 2.35e-05 | 8439.34 ms | -100.0% bf16 MFU | 62093 tok/s +step 17169/19560 | loss 3.261546 (-0.52z)| norm 0.2498 (+0.15z)| lr 2.35e-05 | 8446.38 ms | -100.0% bf16 MFU | 62092 tok/s +step 17170/19560 | loss 3.285016 (+0.16z)| norm 0.2681 (+2.05z)| lr 2.35e-05 | 8445.84 ms | -100.0% bf16 MFU | 62092 tok/s +step 17171/19560 | loss 3.305576 (+0.76z)| norm 0.2412 (-0.75z)| lr 2.35e-05 | 8441.98 ms | -100.0% bf16 MFU | 62092 tok/s +step 17172/19560 | loss 3.285279 (+0.15z)| norm 0.2510 (+0.26z)| lr 2.34e-05 | 8439.15 ms | -100.0% bf16 MFU | 62094 tok/s +step 17173/19560 | loss 3.304881 (+0.73z)| norm 0.2508 (+0.23z)| lr 2.34e-05 | 8439.23 ms | -100.0% bf16 MFU | 62095 tok/s +step 17174/19560 | loss 3.291674 (+0.34z)| norm 0.2532 (+0.48z)| lr 2.34e-05 | 8439.64 ms | -100.0% bf16 MFU | 62097 tok/s +step 17175/19560 | loss 3.342911 (+1.82z)| norm 0.2456 (-0.31z)| lr 2.34e-05 | 8435.72 ms | -100.0% bf16 MFU | 62099 tok/s +step 17176/19560 | loss 3.311091 (+0.88z)| norm 0.2423 (-0.63z)| lr 2.34e-05 | 8445.95 ms | -100.0% bf16 MFU | 62098 tok/s +step 17177/19560 | loss 3.300900 (+0.59z)| norm 0.2502 (+0.20z)| lr 2.33e-05 | 8437.45 ms | -100.0% bf16 MFU | 62100 tok/s +step 17178/19560 | loss 3.385161 (+2.91z)| norm 0.2397 (-0.89z)| lr 2.33e-05 | 8440.57 ms | -100.0% bf16 MFU | 62101 tok/s +step 17179/19560 | loss 3.314968 (+0.95z)| norm 0.2484 (+0.04z)| lr 2.33e-05 | 8439.80 ms | -100.0% bf16 MFU | 62102 tok/s +step 17180/19560 | loss 3.270662 (-0.31z)| norm 0.2382 (-1.04z)| lr 2.33e-05 | 8439.76 ms | -100.0% bf16 MFU | 62103 tok/s +step 17181/19560 | loss 3.282802 (+0.03z)| norm 0.2410 (-0.73z)| lr 2.33e-05 | 8441.33 ms | -100.0% bf16 MFU | 62103 tok/s +step 17182/19560 | loss 3.267305 (-0.39z)| norm 0.2429 (-0.54z)| lr 2.32e-05 | 8439.96 ms | -100.0% bf16 MFU | 62104 tok/s +step 17183/19560 | loss 3.286221 (+0.14z)| norm 0.2607 (+1.35z)| lr 2.32e-05 | 8435.41 ms | -100.0% bf16 MFU | 62107 tok/s +step 17184/19560 | loss 3.289055 (+0.23z)| norm 0.2484 (+0.05z)| lr 2.32e-05 | 8440.33 ms | -100.0% bf16 MFU | 62107 tok/s +step 17185/19560 | loss 3.338209 (+1.63z)| norm 0.2381 (-1.05z)| lr 2.32e-05 | 8438.25 ms | -100.0% bf16 MFU | 62108 tok/s +step 17186/19560 | loss 3.245408 (-1.01z)| norm 0.2392 (-0.91z)| lr 2.32e-05 | 8435.86 ms | -100.0% bf16 MFU | 62110 tok/s +step 17187/19560 | loss 3.304407 (+0.67z)| norm 0.2310 (-1.77z)| lr 2.32e-05 | 8443.92 ms | -100.0% bf16 MFU | 62109 tok/s +step 17188/19560 | loss 3.289416 (+0.24z)| norm 0.2432 (-0.45z)| lr 2.31e-05 | 8440.17 ms | -100.0% bf16 MFU | 62110 tok/s +step 17189/19560 | loss 3.257819 (-0.65z)| norm 0.2361 (-1.19z)| lr 2.31e-05 | 8441.55 ms | -100.0% bf16 MFU | 62110 tok/s +step 17190/19560 | loss 3.251310 (-0.83z)| norm 0.2330 (-1.50z)| lr 2.31e-05 | 8436.96 ms | -100.0% bf16 MFU | 62111 tok/s +step 17191/19560 | loss 3.302112 (+0.61z)| norm 0.2339 (-1.39z)| lr 2.31e-05 | 8436.78 ms | -100.0% bf16 MFU | 62113 tok/s +step 17192/19560 | loss 3.261497 (-0.53z)| norm 0.2417 (-0.56z)| lr 2.31e-05 | 8437.23 ms | -100.0% bf16 MFU | 62114 tok/s +step 17193/19560 | loss 3.303950 (+0.67z)| norm 0.2505 (+0.37z)| lr 2.30e-05 | 8430.38 ms | -100.0% bf16 MFU | 62118 tok/s +step 17194/19560 | loss 3.274244 (-0.18z)| norm 0.2470 (-0.01z)| lr 2.30e-05 | 8430.60 ms | -100.0% bf16 MFU | 62122 tok/s +step 17195/19560 | loss 3.220310 (-1.72z)| norm 0.2336 (-1.40z)| lr 2.30e-05 | 8432.09 ms | -100.0% bf16 MFU | 62124 tok/s +step 17196/19560 | loss 3.258976 (-0.60z)| norm 0.2339 (-1.36z)| lr 2.30e-05 | 8429.68 ms | -100.0% bf16 MFU | 62128 tok/s +step 17197/19560 | loss 3.278745 (-0.02z)| norm 0.2407 (-0.64z)| lr 2.30e-05 | 8426.61 ms | -100.0% bf16 MFU | 62133 tok/s +step 17198/19560 | loss 3.267758 (-0.33z)| norm 0.2301 (-1.71z)| lr 2.29e-05 | 8432.24 ms | -100.0% bf16 MFU | 62135 tok/s +step 17199/19560 | loss 3.349115 (+2.15z)| norm 0.2447 (-0.21z)| lr 2.29e-05 | 8430.12 ms | -100.0% bf16 MFU | 62138 tok/s +step 17200/19560 | loss 3.333231 (+1.63z)| norm 0.2506 (+0.42z)| lr 2.29e-05 | 8429.82 ms | -100.0% bf16 MFU | 62140 tok/s +step 17201/19560 | loss 3.268721 (-0.31z)| norm 0.2489 (+0.25z)| lr 2.29e-05 | 8430.87 ms | -100.0% bf16 MFU | 62143 tok/s +step 17202/19560 | loss 3.280500 (+0.04z)| norm 0.2512 (+0.50z)| lr 2.29e-05 | 8429.85 ms | -100.0% bf16 MFU | 62145 tok/s +step 17203/19560 | loss 3.288200 (+0.27z)| norm 0.2452 (-0.14z)| lr 2.28e-05 | 8431.35 ms | -100.0% bf16 MFU | 62147 tok/s +step 17204/19560 | loss 3.358019 (+2.32z)| norm 0.2658 (+2.00z)| lr 2.28e-05 | 8429.67 ms | -100.0% bf16 MFU | 62150 tok/s +step 17205/19560 | loss 3.238580 (-1.20z)| norm 0.2286 (-1.85z)| lr 2.28e-05 | 8429.69 ms | -100.0% bf16 MFU | 62152 tok/s +step 17206/19560 | loss 3.242699 (-1.06z)| norm 0.2383 (-0.85z)| lr 2.28e-05 | 8428.71 ms | -100.0% bf16 MFU | 62154 tok/s +step 17207/19560 | loss 3.212387 (-1.92z)| norm 0.2324 (-1.43z)| lr 2.28e-05 | 8430.51 ms | -100.0% bf16 MFU | 62156 tok/s +step 17208/19560 | loss 3.266839 (-0.31z)| norm 0.2335 (-1.30z)| lr 2.28e-05 | 8428.71 ms | -100.0% bf16 MFU | 62159 tok/s +step 17209/19560 | loss 3.323031 (+1.35z)| norm 0.2494 (+0.32z)| lr 2.27e-05 | 8429.12 ms | -100.0% bf16 MFU | 62161 tok/s +step 17210/19560 | loss 3.285131 (+0.22z)| norm 0.2406 (-0.57z)| lr 2.27e-05 | 8429.62 ms | -100.0% bf16 MFU | 62162 tok/s +step 17211/19560 | loss 3.264579 (-0.39z)| norm 0.2530 (+0.69z)| lr 2.27e-05 | 8434.08 ms | -100.0% bf16 MFU | 62162 tok/s +step 17212/19560 | loss 3.307481 (+0.88z)| norm 0.2309 (-1.58z)| lr 2.27e-05 | 8433.83 ms | -100.0% bf16 MFU | 62162 tok/s +step 17213/19560 | loss 3.277679 (-0.01z)| norm 0.2521 (+0.61z)| lr 2.27e-05 | 8435.11 ms | -100.0% bf16 MFU | 62162 tok/s +step 17214/19560 | loss 3.257825 (-0.59z)| norm 0.2399 (-0.64z)| lr 2.26e-05 | 8436.16 ms | -100.0% bf16 MFU | 62161 tok/s +step 17215/19560 | loss 3.293629 (+0.47z)| norm 0.2408 (-0.54z)| lr 2.26e-05 | 8434.87 ms | -100.0% bf16 MFU | 62161 tok/s +step 17216/19560 | loss 3.223861 (-1.58z)| norm 0.2381 (-0.82z)| lr 2.26e-05 | 8435.88 ms | -100.0% bf16 MFU | 62161 tok/s +step 17217/19560 | loss 3.353981 (+2.23z)| norm 0.2448 (-0.08z)| lr 2.26e-05 | 8436.59 ms | -100.0% bf16 MFU | 62160 tok/s +step 17218/19560 | loss 3.241390 (-1.04z)| norm 0.2532 (+0.85z)| lr 2.26e-05 | 8434.59 ms | -100.0% bf16 MFU | 62160 tok/s +step 17219/19560 | loss 3.268210 (-0.26z)| norm 0.2400 (-0.61z)| lr 2.25e-05 | 8434.10 ms | -100.0% bf16 MFU | 62160 tok/s +step 17220/19560 | loss 3.268450 (-0.25z)| norm 0.2499 (+0.49z)| lr 2.25e-05 | 8438.46 ms | -100.0% bf16 MFU | 62158 tok/s +step 17221/19560 | loss 3.263636 (-0.38z)| norm 0.2445 (-0.10z)| lr 2.25e-05 | 8438.47 ms | -100.0% bf16 MFU | 62157 tok/s +step 17222/19560 | loss 3.267755 (-0.26z)| norm 0.2489 (+0.38z)| lr 2.25e-05 | 8437.60 ms | -100.0% bf16 MFU | 62156 tok/s +step 17223/19560 | loss 3.215507 (-1.78z)| norm 0.2507 (+0.57z)| lr 2.25e-05 | 8438.52 ms | -100.0% bf16 MFU | 62155 tok/s +step 17224/19560 | loss 3.280325 (+0.11z)| norm 0.2285 (-1.83z)| lr 2.24e-05 | 8434.99 ms | -100.0% bf16 MFU | 62155 tok/s +step 17225/19560 | loss 3.286057 (+0.27z)| norm 0.2456 (+0.03z)| lr 2.24e-05 | 8433.18 ms | -100.0% bf16 MFU | 62156 tok/s +step 17226/19560 | loss 3.217663 (-1.70z)| norm 0.2455 (+0.01z)| lr 2.24e-05 | 8436.99 ms | -100.0% bf16 MFU | 62155 tok/s +step 17227/19560 | loss 3.247463 (-0.82z)| norm 0.2437 (-0.19z)| lr 2.24e-05 | 8438.31 ms | -100.0% bf16 MFU | 62154 tok/s +step 17228/19560 | loss 3.319914 (+1.31z)| norm 0.2417 (-0.41z)| lr 2.24e-05 | 8435.22 ms | -100.0% bf16 MFU | 62154 tok/s +step 17229/19560 | loss 3.206295 (-2.02z)| norm 0.2524 (+0.76z)| lr 2.24e-05 | 8441.34 ms | -100.0% bf16 MFU | 62152 tok/s +step 17230/19560 | loss 3.244028 (-0.90z)| norm 0.2514 (+0.64z)| lr 2.23e-05 | 8436.83 ms | -100.0% bf16 MFU | 62151 tok/s +step 17231/19560 | loss 3.274652 (-0.00z)| norm 0.2421 (-0.38z)| lr 2.23e-05 | 8436.29 ms | -100.0% bf16 MFU | 62151 tok/s +step 17232/19560 | loss 3.237839 (-1.06z)| norm 0.2520 (+0.70z)| lr 2.23e-05 | 8439.60 ms | -100.0% bf16 MFU | 62150 tok/s +step 17233/19560 | loss 3.221161 (-1.52z)| norm 0.2446 (-0.13z)| lr 2.23e-05 | 8438.38 ms | -100.0% bf16 MFU | 62149 tok/s +step 17234/19560 | loss 3.298721 (+0.72z)| norm 0.2504 (+0.51z)| lr 2.23e-05 | 8437.75 ms | -100.0% bf16 MFU | 62148 tok/s +step 17235/19560 | loss 3.281766 (+0.21z)| norm 0.2525 (+0.74z)| lr 2.22e-05 | 8436.50 ms | -100.0% bf16 MFU | 62148 tok/s +step 17236/19560 | loss 3.348907 (+2.13z)| norm 0.2470 (+0.13z)| lr 2.22e-05 | 8437.37 ms | -100.0% bf16 MFU | 62147 tok/s +step 17237/19560 | loss 3.243546 (-0.91z)| norm 0.2559 (+1.12z)| lr 2.22e-05 | 8434.57 ms | -100.0% bf16 MFU | 62148 tok/s +step 17238/19560 | loss 3.266142 (-0.24z)| norm 0.2442 (-0.18z)| lr 2.22e-05 | 8438.06 ms | -100.0% bf16 MFU | 62147 tok/s +step 17239/19560 | loss 3.262377 (-0.36z)| norm 0.2587 (+1.40z)| lr 2.22e-05 | 8436.68 ms | -100.0% bf16 MFU | 62147 tok/s +step 17240/19560 | loss 3.255306 (-0.57z)| norm 0.2393 (-0.72z)| lr 2.21e-05 | 8438.74 ms | -100.0% bf16 MFU | 62146 tok/s +step 17241/19560 | loss 3.306030 (+0.90z)| norm 0.2658 (+2.14z)| lr 2.21e-05 | 8437.11 ms | -100.0% bf16 MFU | 62146 tok/s +step 17242/19560 | loss 3.287413 (+0.34z)| norm 0.2623 (+1.89z)| lr 2.21e-05 | 8437.33 ms | -100.0% bf16 MFU | 62146 tok/s +step 17243/19560 | loss 3.251364 (-0.75z)| norm 0.2322 (-1.56z)| lr 2.21e-05 | 8437.13 ms | -100.0% bf16 MFU | 62145 tok/s +step 17244/19560 | loss 3.234429 (-1.26z)| norm 0.2326 (-1.49z)| lr 2.21e-05 | 8438.32 ms | -100.0% bf16 MFU | 62145 tok/s +step 17245/19560 | loss 3.250178 (-0.78z)| norm 0.2411 (-0.51z)| lr 2.20e-05 | 8436.16 ms | -100.0% bf16 MFU | 62145 tok/s +step 17246/19560 | loss 3.298712 (+0.69z)| norm 0.2485 (+0.32z)| lr 2.20e-05 | 8437.68 ms | -100.0% bf16 MFU | 62144 tok/s +step 17247/19560 | loss 3.299267 (+0.69z)| norm 0.2531 (+0.87z)| lr 2.20e-05 | 8432.62 ms | -100.0% bf16 MFU | 62146 tok/s +step 17248/19560 | loss 3.284943 (+0.25z)| norm 0.2448 (-0.08z)| lr 2.20e-05 | 8436.65 ms | -100.0% bf16 MFU | 62146 tok/s +step 17249/19560 | loss 3.243799 (-1.02z)| norm 0.2412 (-0.51z)| lr 2.20e-05 | 8438.26 ms | -100.0% bf16 MFU | 62145 tok/s +step 17250/19560 | loss 3.308333 (+0.95z)| norm 0.2306 (-1.73z)| lr 2.20e-05 | 8435.82 ms | -100.0% bf16 MFU | 62145 tok/s +val loss 3.270900 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 3008/10042 = 0.299542 +step 17251/19560 | loss 3.254337 (-0.70z)| norm 0.2355 (-1.14z)| lr 2.19e-05 | 8435.60 ms | -100.0% bf16 MFU | 62146 tok/s +step 17252/19560 | loss 3.246851 (-0.92z)| norm 0.2489 (+0.40z)| lr 2.19e-05 | 8435.70 ms | -100.0% bf16 MFU | 62146 tok/s +step 17253/19560 | loss 3.345518 (+2.05z)| norm 0.2550 (+1.09z)| lr 2.19e-05 | 8434.72 ms | -100.0% bf16 MFU | 62147 tok/s +step 17254/19560 | loss 3.251489 (-0.77z)| norm 0.2317 (-1.55z)| lr 2.19e-05 | 8437.40 ms | -100.0% bf16 MFU | 62146 tok/s +step 17255/19560 | loss 3.237530 (-1.17z)| norm 0.2462 (+0.08z)| lr 2.19e-05 | 8436.86 ms | -100.0% bf16 MFU | 62146 tok/s +step 17256/19560 | loss 3.296654 (+0.60z)| norm 0.2426 (-0.34z)| lr 2.18e-05 | 8434.68 ms | -100.0% bf16 MFU | 62147 tok/s +step 17257/19560 | loss 3.202877 (-2.17z)| norm 0.2436 (-0.22z)| lr 2.18e-05 | 8437.34 ms | -100.0% bf16 MFU | 62146 tok/s +step 17258/19560 | loss 3.282526 (+0.17z)| norm 0.2456 (+0.02z)| lr 2.18e-05 | 8436.92 ms | -100.0% bf16 MFU | 62146 tok/s +step 17259/19560 | loss 3.295790 (+0.56z)| norm 0.2441 (-0.14z)| lr 2.18e-05 | 8439.90 ms | -100.0% bf16 MFU | 62145 tok/s +step 17260/19560 | loss 3.301182 (+0.72z)| norm 0.2436 (-0.19z)| lr 2.18e-05 | 8435.37 ms | -100.0% bf16 MFU | 62145 tok/s +step 17261/19560 | loss 3.296707 (+0.58z)| norm 0.2368 (-0.99z)| lr 2.17e-05 | 8437.51 ms | -100.0% bf16 MFU | 62145 tok/s +step 17262/19560 | loss 3.302087 (+0.74z)| norm 0.2838 (+4.22z)| lr 2.17e-05 | 8435.02 ms | -100.0% bf16 MFU | 62145 tok/s +step 17263/19560 | loss 3.211160 (-1.89z)| norm 0.2351 (-1.12z)| lr 2.17e-05 | 8434.89 ms | -100.0% bf16 MFU | 62146 tok/s +step 17264/19560 | loss 3.210598 (-1.87z)| norm 0.2388 (-0.70z)| lr 2.17e-05 | 8432.32 ms | -100.0% bf16 MFU | 62147 tok/s +step 17265/19560 | loss 3.318591 (+1.22z)| norm 0.2353 (-1.07z)| lr 2.17e-05 | 8440.03 ms | -100.0% bf16 MFU | 62146 tok/s +step 17266/19560 | loss 3.270062 (-0.18z)| norm 0.2335 (-1.25z)| lr 2.17e-05 | 8435.02 ms | -100.0% bf16 MFU | 62147 tok/s +step 17267/19560 | loss 3.321290 (+1.28z)| norm 0.2377 (-0.79z)| lr 2.16e-05 | 8436.33 ms | -100.0% bf16 MFU | 62147 tok/s +step 17268/19560 | loss 3.242292 (-0.99z)| norm 0.2539 (+0.98z)| lr 2.16e-05 | 8450.22 ms | -100.0% bf16 MFU | 62141 tok/s +step 17269/19560 | loss 3.309808 (+0.94z)| norm 0.2443 (-0.04z)| lr 2.16e-05 | 8467.93 ms | -100.0% bf16 MFU | 62130 tok/s +step 17270/19560 | loss 3.283034 (+0.17z)| norm 0.2457 (+0.11z)| lr 2.16e-05 | 8465.25 ms | -100.0% bf16 MFU | 62120 tok/s +step 17271/19560 | loss 3.301101 (+0.68z)| norm 0.2509 (+0.70z)| lr 2.16e-05 | 8462.11 ms | -100.0% bf16 MFU | 62112 tok/s +step 17272/19560 | loss 3.258915 (-0.52z)| norm 0.2457 (+0.10z)| lr 2.15e-05 | 8460.40 ms | -100.0% bf16 MFU | 62105 tok/s +step 17273/19560 | loss 3.287364 (+0.30z)| norm 0.2477 (+0.33z)| lr 2.15e-05 | 8464.21 ms | -100.0% bf16 MFU | 62097 tok/s +step 17274/19560 | loss 3.223824 (-1.50z)| norm 0.2435 (-0.15z)| lr 2.15e-05 | 8461.89 ms | -100.0% bf16 MFU | 62090 tok/s +step 17275/19560 | loss 3.293214 (+0.47z)| norm 0.2346 (-1.14z)| lr 2.15e-05 | 8456.50 ms | -100.0% bf16 MFU | 62085 tok/s +step 17276/19560 | loss 3.258403 (-0.52z)| norm 0.2328 (-1.32z)| lr 2.15e-05 | 8460.61 ms | -100.0% bf16 MFU | 62079 tok/s +step 17277/19560 | loss 3.216125 (-1.69z)| norm 0.2351 (-1.05z)| lr 2.15e-05 | 8453.68 ms | -100.0% bf16 MFU | 62076 tok/s +step 17278/19560 | loss 3.279211 (+0.10z)| norm 0.2292 (-1.68z)| lr 2.14e-05 | 8461.25 ms | -100.0% bf16 MFU | 62071 tok/s +step 17279/19560 | loss 3.233633 (-1.18z)| norm 0.2372 (-0.79z)| lr 2.14e-05 | 8454.42 ms | -100.0% bf16 MFU | 62068 tok/s +step 17280/19560 | loss 3.233874 (-1.16z)| norm 0.2381 (-0.67z)| lr 2.14e-05 | 8461.35 ms | -100.0% bf16 MFU | 62063 tok/s +step 17281/19560 | loss 3.321669 (+1.27z)| norm 0.2529 (+0.96z)| lr 2.14e-05 | 8459.85 ms | -100.0% bf16 MFU | 62058 tok/s +step 17282/19560 | loss 3.277874 (+0.04z)| norm 0.2427 (-0.18z)| lr 2.14e-05 | 8455.55 ms | -100.0% bf16 MFU | 62056 tok/s +step 17283/19560 | loss 3.306380 (+0.83z)| norm 0.2418 (-0.28z)| lr 2.13e-05 | 8454.87 ms | -100.0% bf16 MFU | 62053 tok/s +step 17284/19560 | loss 3.289415 (+0.36z)| norm 0.2347 (-1.06z)| lr 2.13e-05 | 8455.12 ms | -100.0% bf16 MFU | 62051 tok/s +step 17285/19560 | loss 3.356446 (+2.18z)| norm 0.2416 (-0.28z)| lr 2.13e-05 | 8445.28 ms | -100.0% bf16 MFU | 62053 tok/s +step 17286/19560 | loss 3.285278 (+0.22z)| norm 0.2368 (-0.83z)| lr 2.13e-05 | 8451.53 ms | -100.0% bf16 MFU | 62052 tok/s +step 17287/19560 | loss 3.237704 (-1.09z)| norm 0.2359 (-0.91z)| lr 2.13e-05 | 8448.83 ms | -100.0% bf16 MFU | 62052 tok/s +step 17288/19560 | loss 3.342338 (+1.76z)| norm 0.2307 (-1.47z)| lr 2.12e-05 | 8448.50 ms | -100.0% bf16 MFU | 62052 tok/s +step 17289/19560 | loss 3.294142 (+0.44z)| norm 0.2769 (+3.44z)| lr 2.12e-05 | 8435.76 ms | -100.0% bf16 MFU | 62057 tok/s +step 17290/19560 | loss 3.254242 (-0.66z)| norm 0.2415 (-0.30z)| lr 2.12e-05 | 8446.18 ms | -100.0% bf16 MFU | 62058 tok/s +step 17291/19560 | loss 3.283286 (+0.13z)| norm 0.2424 (-0.20z)| lr 2.12e-05 | 8444.97 ms | -100.0% bf16 MFU | 62059 tok/s +step 17292/19560 | loss 3.287140 (+0.24z)| norm 0.2457 (+0.15z)| lr 2.12e-05 | 8445.81 ms | -100.0% bf16 MFU | 62060 tok/s +step 17293/19560 | loss 3.335809 (+1.55z)| norm 0.2485 (+0.46z)| lr 2.12e-05 | 8441.43 ms | -100.0% bf16 MFU | 62062 tok/s +step 17294/19560 | loss 3.328519 (+1.33z)| norm 0.2477 (+0.37z)| lr 2.11e-05 | 8447.94 ms | -100.0% bf16 MFU | 62062 tok/s +step 17295/19560 | loss 3.288541 (+0.27z)| norm 0.2405 (-0.40z)| lr 2.11e-05 | 8441.82 ms | -100.0% bf16 MFU | 62065 tok/s +step 17296/19560 | loss 3.313412 (+0.94z)| norm 0.2411 (-0.33z)| lr 2.11e-05 | 8445.37 ms | -100.0% bf16 MFU | 62065 tok/s +step 17297/19560 | loss 3.269078 (-0.27z)| norm 0.2399 (-0.44z)| lr 2.11e-05 | 8440.20 ms | -100.0% bf16 MFU | 62068 tok/s +step 17298/19560 | loss 3.267123 (-0.32z)| norm 0.2355 (-0.91z)| lr 2.11e-05 | 8443.70 ms | -100.0% bf16 MFU | 62069 tok/s +step 17299/19560 | loss 3.224414 (-1.46z)| norm 0.2378 (-0.66z)| lr 2.10e-05 | 8447.50 ms | -100.0% bf16 MFU | 62069 tok/s +step 17300/19560 | loss 3.189845 (-2.33z)| norm 0.2369 (-0.74z)| lr 2.10e-05 | 8444.50 ms | -100.0% bf16 MFU | 62070 tok/s +step 17301/19560 | loss 3.195293 (-2.13z)| norm 0.2485 (+0.54z)| lr 2.10e-05 | 8441.40 ms | -100.0% bf16 MFU | 62072 tok/s +step 17302/19560 | loss 3.390939 (+2.87z)| norm 0.2444 (+0.09z)| lr 2.10e-05 | 8443.45 ms | -100.0% bf16 MFU | 62073 tok/s +step 17303/19560 | loss 3.236390 (-1.02z)| norm 0.2320 (-1.27z)| lr 2.10e-05 | 8444.56 ms | -100.0% bf16 MFU | 62074 tok/s +step 17304/19560 | loss 3.297520 (+0.53z)| norm 0.2545 (+1.20z)| lr 2.10e-05 | 8438.44 ms | -100.0% bf16 MFU | 62076 tok/s +step 17305/19560 | loss 3.270791 (-0.14z)| norm 0.2540 (+1.15z)| lr 2.09e-05 | 8446.95 ms | -100.0% bf16 MFU | 62076 tok/s +step 17306/19560 | loss 3.225701 (-1.29z)| norm 0.2482 (+0.50z)| lr 2.09e-05 | 8439.10 ms | -100.0% bf16 MFU | 62078 tok/s +step 17307/19560 | loss 3.234429 (-1.05z)| norm 0.2365 (-0.77z)| lr 2.09e-05 | 8448.38 ms | -100.0% bf16 MFU | 62077 tok/s +step 17308/19560 | loss 3.299451 (+0.65z)| norm 0.2330 (-1.14z)| lr 2.09e-05 | 8443.68 ms | -100.0% bf16 MFU | 62078 tok/s +step 17309/19560 | loss 3.270853 (-0.10z)| norm 0.2361 (-0.80z)| lr 2.09e-05 | 8447.37 ms | -100.0% bf16 MFU | 62078 tok/s +step 17310/19560 | loss 3.229707 (-1.16z)| norm 0.2298 (-1.46z)| lr 2.08e-05 | 8450.36 ms | -100.0% bf16 MFU | 62076 tok/s +step 17311/19560 | loss 3.266834 (-0.19z)| norm 0.2357 (-0.81z)| lr 2.08e-05 | 8452.81 ms | -100.0% bf16 MFU | 62073 tok/s +step 17312/19560 | loss 3.282219 (+0.21z)| norm 0.2295 (-1.46z)| lr 2.08e-05 | 8448.25 ms | -100.0% bf16 MFU | 62073 tok/s +step 17313/19560 | loss 3.221038 (-1.36z)| norm 0.2355 (-0.81z)| lr 2.08e-05 | 8446.13 ms | -100.0% bf16 MFU | 62073 tok/s +step 17314/19560 | loss 3.281200 (+0.20z)| norm 0.2392 (-0.41z)| lr 2.08e-05 | 8441.76 ms | -100.0% bf16 MFU | 62074 tok/s +step 17315/19560 | loss 3.249740 (-0.61z)| norm 0.2320 (-1.19z)| lr 2.08e-05 | 8447.94 ms | -100.0% bf16 MFU | 62074 tok/s +step 17316/19560 | loss 3.263246 (-0.25z)| norm 0.2447 (+0.18z)| lr 2.07e-05 | 8438.99 ms | -100.0% bf16 MFU | 62076 tok/s +step 17317/19560 | loss 3.233724 (-1.02z)| norm 0.2427 (-0.04z)| lr 2.07e-05 | 8439.54 ms | -100.0% bf16 MFU | 62079 tok/s +step 17318/19560 | loss 3.235180 (-0.97z)| norm 0.2411 (-0.22z)| lr 2.07e-05 | 8436.37 ms | -100.0% bf16 MFU | 62082 tok/s +step 17319/19560 | loss 3.239219 (-0.85z)| norm 0.2309 (-1.33z)| lr 2.07e-05 | 8436.21 ms | -100.0% bf16 MFU | 62085 tok/s +step 17320/19560 | loss 3.238738 (-0.86z)| norm 0.2364 (-0.73z)| lr 2.07e-05 | 8440.22 ms | -100.0% bf16 MFU | 62087 tok/s +step 17321/19560 | loss 3.246337 (-0.65z)| norm 0.2358 (-0.77z)| lr 2.06e-05 | 8437.62 ms | -100.0% bf16 MFU | 62089 tok/s +step 17322/19560 | loss 3.294833 (+0.61z)| norm 0.2358 (-0.76z)| lr 2.06e-05 | 8430.34 ms | -100.0% bf16 MFU | 62094 tok/s +step 17323/19560 | loss 3.222200 (-1.28z)| norm 0.2243 (-1.98z)| lr 2.06e-05 | 8440.62 ms | -100.0% bf16 MFU | 62096 tok/s +step 17324/19560 | loss 3.281188 (+0.25z)| norm 0.2433 (+0.05z)| lr 2.06e-05 | 8437.26 ms | -100.0% bf16 MFU | 62098 tok/s +step 17325/19560 | loss 3.263804 (-0.20z)| norm 0.2497 (+0.73z)| lr 2.06e-05 | 8434.34 ms | -100.0% bf16 MFU | 62101 tok/s +step 17326/19560 | loss 3.299355 (+0.71z)| norm 0.2348 (-0.88z)| lr 2.06e-05 | 8438.00 ms | -100.0% bf16 MFU | 62103 tok/s +step 17327/19560 | loss 3.247166 (-0.63z)| norm 0.2363 (-0.71z)| lr 2.05e-05 | 8433.28 ms | -100.0% bf16 MFU | 62106 tok/s +step 17328/19560 | loss 3.212791 (-1.51z)| norm 0.2366 (-0.67z)| lr 2.05e-05 | 8435.10 ms | -100.0% bf16 MFU | 62108 tok/s +step 17329/19560 | loss 3.361584 (+2.34z)| norm 0.2437 (+0.10z)| lr 2.05e-05 | 8436.83 ms | -100.0% bf16 MFU | 62110 tok/s +step 17330/19560 | loss 3.213604 (-1.45z)| norm 0.2473 (+0.50z)| lr 2.05e-05 | 8439.59 ms | -100.0% bf16 MFU | 62111 tok/s +step 17331/19560 | loss 3.218574 (-1.30z)| norm 0.2168 (-2.69z)| lr 2.05e-05 | 8436.69 ms | -100.0% bf16 MFU | 62112 tok/s +step 17332/19560 | loss 3.222282 (-1.20z)| norm 0.2376 (-0.50z)| lr 2.04e-05 | 8434.98 ms | -100.0% bf16 MFU | 62115 tok/s +step 17333/19560 | loss 3.329936 (+1.55z)| norm 0.2573 (+1.59z)| lr 2.04e-05 | 8430.30 ms | -100.0% bf16 MFU | 62118 tok/s +step 17334/19560 | loss 3.281647 (+0.31z)| norm 0.2409 (-0.17z)| lr 2.04e-05 | 8444.34 ms | -100.0% bf16 MFU | 62117 tok/s +step 17335/19560 | loss 3.256022 (-0.36z)| norm 0.2352 (-0.79z)| lr 2.04e-05 | 8437.94 ms | -100.0% bf16 MFU | 62118 tok/s +step 17336/19560 | loss 3.257394 (-0.33z)| norm 0.2326 (-1.07z)| lr 2.04e-05 | 8440.82 ms | -100.0% bf16 MFU | 62117 tok/s +step 17337/19560 | loss 3.282233 (+0.33z)| norm 0.2493 (+0.73z)| lr 2.04e-05 | 8436.00 ms | -100.0% bf16 MFU | 62119 tok/s +step 17338/19560 | loss 3.259586 (-0.26z)| norm 0.2515 (+0.95z)| lr 2.03e-05 | 8439.06 ms | -100.0% bf16 MFU | 62119 tok/s +step 17339/19560 | loss 3.302941 (+0.86z)| norm 0.2288 (-1.45z)| lr 2.03e-05 | 8438.16 ms | -100.0% bf16 MFU | 62120 tok/s +step 17340/19560 | loss 3.260423 (-0.23z)| norm 0.2521 (+1.02z)| lr 2.03e-05 | 8442.72 ms | -100.0% bf16 MFU | 62119 tok/s +step 17341/19560 | loss 3.314039 (+1.15z)| norm 0.2605 (+1.88z)| lr 2.03e-05 | 8440.04 ms | -100.0% bf16 MFU | 62119 tok/s +step 17342/19560 | loss 3.325170 (+1.41z)| norm 0.2469 (+0.45z)| lr 2.03e-05 | 8439.94 ms | -100.0% bf16 MFU | 62119 tok/s +step 17343/19560 | loss 3.284552 (+0.37z)| norm 0.2581 (+1.59z)| lr 2.02e-05 | 8443.47 ms | -100.0% bf16 MFU | 62118 tok/s +step 17344/19560 | loss 3.239670 (-0.79z)| norm 0.2575 (+1.50z)| lr 2.02e-05 | 8440.45 ms | -100.0% bf16 MFU | 62118 tok/s +step 17345/19560 | loss 3.311902 (+1.10z)| norm 0.2680 (+2.51z)| lr 2.02e-05 | 8439.70 ms | -100.0% bf16 MFU | 62118 tok/s +step 17346/19560 | loss 3.246103 (-0.63z)| norm 0.2550 (+1.19z)| lr 2.02e-05 | 8440.88 ms | -100.0% bf16 MFU | 62118 tok/s +step 17347/19560 | loss 3.247435 (-0.59z)| norm 0.2547 (+1.14z)| lr 2.02e-05 | 8441.37 ms | -100.0% bf16 MFU | 62117 tok/s +step 17348/19560 | loss 3.297384 (+0.71z)| norm 0.2615 (+1.80z)| lr 2.02e-05 | 8440.59 ms | -100.0% bf16 MFU | 62117 tok/s +step 17349/19560 | loss 3.284998 (+0.39z)| norm 0.2301 (-1.30z)| lr 2.01e-05 | 8442.08 ms | -100.0% bf16 MFU | 62117 tok/s +step 17350/19560 | loss 3.189363 (-2.06z)| norm 0.2490 (+0.56z)| lr 2.01e-05 | 8445.21 ms | -100.0% bf16 MFU | 62115 tok/s +step 17351/19560 | loss 3.261350 (-0.22z)| norm 0.2572 (+1.36z)| lr 2.01e-05 | 8441.33 ms | -100.0% bf16 MFU | 62114 tok/s +step 17352/19560 | loss 3.222929 (-1.20z)| norm 0.2700 (+2.54z)| lr 2.01e-05 | 8445.67 ms | -100.0% bf16 MFU | 62113 tok/s +step 17353/19560 | loss 3.308044 (+0.98z)| norm 0.2476 (+0.38z)| lr 2.01e-05 | 8440.85 ms | -100.0% bf16 MFU | 62113 tok/s +step 17354/19560 | loss 3.280654 (+0.27z)| norm 0.2414 (-0.21z)| lr 2.00e-05 | 8448.61 ms | -100.0% bf16 MFU | 62110 tok/s +step 17355/19560 | loss 3.329279 (+1.50z)| norm 0.2855 (+3.78z)| lr 2.00e-05 | 8441.72 ms | -100.0% bf16 MFU | 62110 tok/s +step 17356/19560 | loss 3.217912 (-1.33z)| norm 0.2430 (-0.09z)| lr 2.00e-05 | 8441.02 ms | -100.0% bf16 MFU | 62110 tok/s +step 17357/19560 | loss 3.319544 (+1.25z)| norm 0.2660 (+1.97z)| lr 2.00e-05 | 8448.72 ms | -100.0% bf16 MFU | 62107 tok/s +step 17358/19560 | loss 3.233980 (-0.95z)| norm 0.2333 (-0.95z)| lr 2.00e-05 | 8444.74 ms | -100.0% bf16 MFU | 62106 tok/s +step 17359/19560 | loss 3.222083 (-1.23z)| norm 0.2437 (-0.02z)| lr 2.00e-05 | 8446.41 ms | -100.0% bf16 MFU | 62104 tok/s +step 17360/19560 | loss 3.254480 (-0.41z)| norm 0.2422 (-0.15z)| lr 1.99e-05 | 8438.51 ms | -100.0% bf16 MFU | 62106 tok/s +step 17361/19560 | loss 3.299281 (+0.72z)| norm 0.2392 (-0.41z)| lr 1.99e-05 | 8441.94 ms | -100.0% bf16 MFU | 62106 tok/s +step 17362/19560 | loss 3.280003 (+0.23z)| norm 0.2426 (-0.10z)| lr 1.99e-05 | 8447.79 ms | -100.0% bf16 MFU | 62103 tok/s +step 17363/19560 | loss 3.236619 (-0.87z)| norm 0.2337 (-0.89z)| lr 1.99e-05 | 8439.96 ms | -100.0% bf16 MFU | 62104 tok/s +step 17364/19560 | loss 3.271453 (+0.04z)| norm 0.2416 (-0.18z)| lr 1.99e-05 | 8441.03 ms | -100.0% bf16 MFU | 62105 tok/s +step 17365/19560 | loss 3.293973 (+0.61z)| norm 0.2499 (+0.57z)| lr 1.98e-05 | 8445.11 ms | -100.0% bf16 MFU | 62103 tok/s +step 17366/19560 | loss 3.227443 (-1.11z)| norm 0.2425 (-0.10z)| lr 1.98e-05 | 8441.27 ms | -100.0% bf16 MFU | 62104 tok/s +step 17367/19560 | loss 3.227541 (-1.09z)| norm 0.2401 (-0.29z)| lr 1.98e-05 | 8438.07 ms | -100.0% bf16 MFU | 62105 tok/s +step 17368/19560 | loss 3.294464 (+0.63z)| norm 0.2313 (-1.09z)| lr 1.98e-05 | 8441.38 ms | -100.0% bf16 MFU | 62105 tok/s +step 17369/19560 | loss 3.328027 (+1.48z)| norm 0.2388 (-0.39z)| lr 1.98e-05 | 8436.82 ms | -100.0% bf16 MFU | 62107 tok/s +step 17370/19560 | loss 3.241147 (-0.74z)| norm 0.2509 (+0.73z)| lr 1.98e-05 | 8441.02 ms | -100.0% bf16 MFU | 62108 tok/s +step 17371/19560 | loss 3.227311 (-1.08z)| norm 0.2409 (-0.21z)| lr 1.97e-05 | 8434.36 ms | -100.0% bf16 MFU | 62110 tok/s +step 17372/19560 | loss 3.260838 (-0.23z)| norm 0.2393 (-0.36z)| lr 1.97e-05 | 8438.92 ms | -100.0% bf16 MFU | 62111 tok/s +step 17373/19560 | loss 3.274527 (+0.11z)| norm 0.2404 (-0.26z)| lr 1.97e-05 | 8443.20 ms | -100.0% bf16 MFU | 62110 tok/s +step 17374/19560 | loss 3.253799 (-0.41z)| norm 0.2417 (-0.13z)| lr 1.97e-05 | 8436.92 ms | -100.0% bf16 MFU | 62112 tok/s +step 17375/19560 | loss 3.247468 (-0.56z)| norm 0.2505 (+0.70z)| lr 1.97e-05 | 8438.16 ms | -100.0% bf16 MFU | 62113 tok/s +step 17376/19560 | loss 3.259324 (-0.25z)| norm 0.2372 (-0.55z)| lr 1.97e-05 | 8443.19 ms | -100.0% bf16 MFU | 62112 tok/s +step 17377/19560 | loss 3.294753 (+0.65z)| norm 0.2377 (-0.50z)| lr 1.96e-05 | 8439.99 ms | -100.0% bf16 MFU | 62112 tok/s +step 17378/19560 | loss 3.266298 (-0.08z)| norm 0.2433 (+0.02z)| lr 1.96e-05 | 8436.60 ms | -100.0% bf16 MFU | 62114 tok/s +step 17379/19560 | loss 3.233542 (-0.92z)| norm 0.2614 (+1.69z)| lr 1.96e-05 | 8439.15 ms | -100.0% bf16 MFU | 62115 tok/s +step 17380/19560 | loss 3.220523 (-1.24z)| norm 0.2414 (-0.17z)| lr 1.96e-05 | 8437.68 ms | -100.0% bf16 MFU | 62116 tok/s +step 17381/19560 | loss 3.252667 (-0.40z)| norm 0.2489 (+0.54z)| lr 1.96e-05 | 8440.70 ms | -100.0% bf16 MFU | 62116 tok/s +step 17382/19560 | loss 3.249927 (-0.47z)| norm 0.2265 (-1.55z)| lr 1.95e-05 | 8436.64 ms | -100.0% bf16 MFU | 62117 tok/s +step 17383/19560 | loss 3.319815 (+1.32z)| norm 0.2448 (+0.16z)| lr 1.95e-05 | 8445.71 ms | -100.0% bf16 MFU | 62115 tok/s +step 17384/19560 | loss 3.310338 (+1.07z)| norm 0.2426 (-0.05z)| lr 1.95e-05 | 8440.56 ms | -100.0% bf16 MFU | 62115 tok/s +step 17385/19560 | loss 3.257641 (-0.31z)| norm 0.2485 (+0.50z)| lr 1.95e-05 | 8438.22 ms | -100.0% bf16 MFU | 62116 tok/s +step 17386/19560 | loss 3.311302 (+1.08z)| norm 0.2519 (+0.81z)| lr 1.95e-05 | 8440.81 ms | -100.0% bf16 MFU | 62116 tok/s +step 17387/19560 | loss 3.261668 (-0.20z)| norm 0.2207 (-2.03z)| lr 1.95e-05 | 8435.30 ms | -100.0% bf16 MFU | 62118 tok/s +step 17388/19560 | loss 3.297975 (+0.75z)| norm 0.2402 (-0.26z)| lr 1.94e-05 | 8437.01 ms | -100.0% bf16 MFU | 62119 tok/s +step 17389/19560 | loss 3.297937 (+0.75z)| norm 0.2243 (-1.68z)| lr 1.94e-05 | 8439.53 ms | -100.0% bf16 MFU | 62119 tok/s +step 17390/19560 | loss 3.204508 (-1.66z)| norm 0.2431 (+0.05z)| lr 1.94e-05 | 8436.83 ms | -100.0% bf16 MFU | 62120 tok/s +step 17391/19560 | loss 3.268380 (-0.02z)| norm 0.2289 (-1.30z)| lr 1.94e-05 | 8439.62 ms | -100.0% bf16 MFU | 62120 tok/s +step 17392/19560 | loss 3.258082 (-0.30z)| norm 0.2361 (-0.61z)| lr 1.94e-05 | 8438.67 ms | -100.0% bf16 MFU | 62121 tok/s +step 17393/19560 | loss 3.293790 (+0.65z)| norm 0.2382 (-0.41z)| lr 1.94e-05 | 8439.72 ms | -100.0% bf16 MFU | 62121 tok/s +step 17394/19560 | loss 3.222874 (-1.21z)| norm 0.2318 (-1.02z)| lr 1.93e-05 | 8439.09 ms | -100.0% bf16 MFU | 62121 tok/s +step 17395/19560 | loss 3.337646 (+1.80z)| norm 0.2410 (-0.15z)| lr 1.93e-05 | 8439.56 ms | -100.0% bf16 MFU | 62121 tok/s +step 17396/19560 | loss 3.265798 (-0.09z)| norm 0.2366 (-0.55z)| lr 1.93e-05 | 8437.60 ms | -100.0% bf16 MFU | 62122 tok/s +step 17397/19560 | loss 3.262333 (-0.17z)| norm 0.2410 (-0.13z)| lr 1.93e-05 | 8437.05 ms | -100.0% bf16 MFU | 62123 tok/s +step 17398/19560 | loss 3.261053 (-0.20z)| norm 0.2283 (-1.32z)| lr 1.93e-05 | 8437.28 ms | -100.0% bf16 MFU | 62124 tok/s +step 17399/19560 | loss 3.193716 (-1.93z)| norm 0.2328 (-0.88z)| lr 1.92e-05 | 8434.11 ms | -100.0% bf16 MFU | 62126 tok/s +step 17400/19560 | loss 3.296566 (+0.74z)| norm 0.2315 (-0.99z)| lr 1.92e-05 | 8436.48 ms | -100.0% bf16 MFU | 62127 tok/s +step 17401/19560 | loss 3.282100 (+0.37z)| norm 0.2483 (+0.59z)| lr 1.92e-05 | 8436.58 ms | -100.0% bf16 MFU | 62128 tok/s +step 17402/19560 | loss 3.264536 (-0.10z)| norm 0.2365 (-0.52z)| lr 1.92e-05 | 8434.88 ms | -100.0% bf16 MFU | 62129 tok/s +step 17403/19560 | loss 3.314660 (+1.20z)| norm 0.2353 (-0.63z)| lr 1.92e-05 | 8434.29 ms | -100.0% bf16 MFU | 62131 tok/s +step 17404/19560 | loss 3.277055 (+0.22z)| norm 0.2269 (-1.41z)| lr 1.92e-05 | 8440.13 ms | -100.0% bf16 MFU | 62130 tok/s +step 17405/19560 | loss 3.299449 (+0.79z)| norm 0.2457 (+0.35z)| lr 1.91e-05 | 8437.29 ms | -100.0% bf16 MFU | 62131 tok/s +step 17406/19560 | loss 3.279094 (+0.26z)| norm 0.2524 (+0.96z)| lr 1.91e-05 | 8438.28 ms | -100.0% bf16 MFU | 62131 tok/s +step 17407/19560 | loss 3.248981 (-0.53z)| norm 0.2356 (-0.62z)| lr 1.91e-05 | 8434.14 ms | -100.0% bf16 MFU | 62132 tok/s +step 17408/19560 | loss 3.303222 (+0.87z)| norm 0.2368 (-0.50z)| lr 1.91e-05 | 8437.47 ms | -100.0% bf16 MFU | 62133 tok/s +step 17409/19560 | loss 3.276034 (+0.17z)| norm 0.2371 (-0.46z)| lr 1.91e-05 | 8439.53 ms | -100.0% bf16 MFU | 62132 tok/s +step 17410/19560 | loss 3.291425 (+0.57z)| norm 0.2354 (-0.62z)| lr 1.91e-05 | 8439.21 ms | -100.0% bf16 MFU | 62132 tok/s +step 17411/19560 | loss 3.286056 (+0.44z)| norm 0.2452 (+0.30z)| lr 1.90e-05 | 8436.58 ms | -100.0% bf16 MFU | 62132 tok/s +step 17412/19560 | loss 3.229819 (-1.03z)| norm 0.2340 (-0.75z)| lr 1.90e-05 | 8437.14 ms | -100.0% bf16 MFU | 62133 tok/s +step 17413/19560 | loss 3.242786 (-0.68z)| norm 0.2384 (-0.33z)| lr 1.90e-05 | 8438.46 ms | -100.0% bf16 MFU | 62133 tok/s +step 17414/19560 | loss 3.251753 (-0.43z)| norm 0.2448 (+0.26z)| lr 1.90e-05 | 8436.99 ms | -100.0% bf16 MFU | 62133 tok/s +step 17415/19560 | loss 3.264344 (-0.10z)| norm 0.2428 (+0.07z)| lr 1.90e-05 | 8436.09 ms | -100.0% bf16 MFU | 62134 tok/s +step 17416/19560 | loss 3.322242 (+1.47z)| norm 0.2620 (+1.84z)| lr 1.89e-05 | 8439.33 ms | -100.0% bf16 MFU | 62133 tok/s +step 17417/19560 | loss 3.301123 (+0.90z)| norm 0.2472 (+0.50z)| lr 1.89e-05 | 8436.17 ms | -100.0% bf16 MFU | 62134 tok/s +step 17418/19560 | loss 3.245510 (-0.61z)| norm 0.2388 (-0.32z)| lr 1.89e-05 | 8436.13 ms | -100.0% bf16 MFU | 62135 tok/s +step 17419/19560 | loss 3.316292 (+1.29z)| norm 0.2345 (-0.73z)| lr 1.89e-05 | 8432.44 ms | -100.0% bf16 MFU | 62137 tok/s +step 17420/19560 | loss 3.285654 (+0.47z)| norm 0.2369 (-0.50z)| lr 1.89e-05 | 8436.60 ms | -100.0% bf16 MFU | 62137 tok/s +step 17421/19560 | loss 3.233185 (-0.93z)| norm 0.2336 (-0.80z)| lr 1.89e-05 | 8434.70 ms | -100.0% bf16 MFU | 62138 tok/s +step 17422/19560 | loss 3.293068 (+0.71z)| norm 0.2627 (+1.99z)| lr 1.88e-05 | 8437.61 ms | -100.0% bf16 MFU | 62138 tok/s +step 17423/19560 | loss 3.274436 (+0.20z)| norm 0.2296 (-1.17z)| lr 1.88e-05 | 8432.94 ms | -100.0% bf16 MFU | 62140 tok/s +step 17424/19560 | loss 3.273791 (+0.20z)| norm 0.2384 (-0.33z)| lr 1.88e-05 | 8434.32 ms | -100.0% bf16 MFU | 62141 tok/s +step 17425/19560 | loss 3.238030 (-0.78z)| norm 0.2326 (-0.87z)| lr 1.88e-05 | 8433.74 ms | -100.0% bf16 MFU | 62142 tok/s +step 17426/19560 | loss 3.317632 (+1.39z)| norm 0.2462 (+0.41z)| lr 1.88e-05 | 8434.96 ms | -100.0% bf16 MFU | 62143 tok/s +step 17427/19560 | loss 3.322370 (+1.49z)| norm 0.2385 (-0.33z)| lr 1.88e-05 | 8437.84 ms | -100.0% bf16 MFU | 62142 tok/s +step 17428/19560 | loss 3.249388 (-0.52z)| norm 0.2365 (-0.51z)| lr 1.87e-05 | 8431.32 ms | -100.0% bf16 MFU | 62145 tok/s +step 17429/19560 | loss 3.421836 (+4.01z)| norm 0.2443 (+0.23z)| lr 1.87e-05 | 8436.94 ms | -100.0% bf16 MFU | 62144 tok/s +step 17430/19560 | loss 3.227411 (-1.13z)| norm 0.2367 (-0.48z)| lr 1.87e-05 | 8435.74 ms | -100.0% bf16 MFU | 62145 tok/s +step 17431/19560 | loss 3.218104 (-1.37z)| norm 0.2320 (-0.93z)| lr 1.87e-05 | 8437.13 ms | -100.0% bf16 MFU | 62145 tok/s +step 17432/19560 | loss 3.271968 (+0.10z)| norm 0.2404 (-0.12z)| lr 1.87e-05 | 8439.80 ms | -100.0% bf16 MFU | 62143 tok/s +step 17433/19560 | loss 3.295897 (+0.75z)| norm 0.2354 (-0.59z)| lr 1.87e-05 | 8433.81 ms | -100.0% bf16 MFU | 62144 tok/s +step 17434/19560 | loss 3.261817 (-0.19z)| norm 0.2404 (-0.10z)| lr 1.86e-05 | 8433.40 ms | -100.0% bf16 MFU | 62146 tok/s +step 17435/19560 | loss 3.303214 (+0.93z)| norm 0.2283 (-1.26z)| lr 1.86e-05 | 8435.62 ms | -100.0% bf16 MFU | 62146 tok/s +step 17436/19560 | loss 3.270589 (+0.04z)| norm 0.2550 (+1.27z)| lr 1.86e-05 | 8433.98 ms | -100.0% bf16 MFU | 62147 tok/s +step 17437/19560 | loss 3.281546 (+0.34z)| norm 0.2282 (-1.27z)| lr 1.86e-05 | 8437.05 ms | -100.0% bf16 MFU | 62146 tok/s +step 17438/19560 | loss 3.261272 (-0.22z)| norm 0.2350 (-0.63z)| lr 1.86e-05 | 8435.61 ms | -100.0% bf16 MFU | 62147 tok/s +step 17439/19560 | loss 3.256160 (-0.36z)| norm 0.2296 (-1.13z)| lr 1.85e-05 | 8437.74 ms | -100.0% bf16 MFU | 62146 tok/s +step 17440/19560 | loss 3.260190 (-0.25z)| norm 0.2312 (-0.98z)| lr 1.85e-05 | 8436.79 ms | -100.0% bf16 MFU | 62146 tok/s +step 17441/19560 | loss 3.270163 (+0.02z)| norm 0.2505 (+0.84z)| lr 1.85e-05 | 8435.90 ms | -100.0% bf16 MFU | 62146 tok/s +step 17442/19560 | loss 3.252733 (-0.46z)| norm 0.2361 (-0.52z)| lr 1.85e-05 | 8437.39 ms | -100.0% bf16 MFU | 62146 tok/s +step 17443/19560 | loss 3.247378 (-0.61z)| norm 0.2359 (-0.54z)| lr 1.85e-05 | 8432.57 ms | -100.0% bf16 MFU | 62147 tok/s +step 17444/19560 | loss 3.252002 (-0.47z)| norm 0.2379 (-0.35z)| lr 1.85e-05 | 8435.36 ms | -100.0% bf16 MFU | 62148 tok/s +step 17445/19560 | loss 3.308640 (+1.08z)| norm 0.2370 (-0.43z)| lr 1.84e-05 | 8439.02 ms | -100.0% bf16 MFU | 62147 tok/s +step 17446/19560 | loss 3.259999 (-0.28z)| norm 0.2378 (-0.35z)| lr 1.84e-05 | 8437.83 ms | -100.0% bf16 MFU | 62146 tok/s +step 17447/19560 | loss 3.194938 (-2.04z)| norm 0.2235 (-1.69z)| lr 1.84e-05 | 8439.24 ms | -100.0% bf16 MFU | 62145 tok/s +step 17448/19560 | loss 3.338665 (+1.85z)| norm 0.2447 (+0.30z)| lr 1.84e-05 | 8435.95 ms | -100.0% bf16 MFU | 62145 tok/s +step 17449/19560 | loss 3.240182 (-0.82z)| norm 0.2534 (+1.10z)| lr 1.84e-05 | 8437.13 ms | -100.0% bf16 MFU | 62145 tok/s +step 17450/19560 | loss 3.346387 (+2.02z)| norm 0.2408 (-0.09z)| lr 1.84e-05 | 8437.99 ms | -100.0% bf16 MFU | 62144 tok/s +step 17451/19560 | loss 3.254005 (-0.45z)| norm 0.2301 (-1.10z)| lr 1.83e-05 | 8436.28 ms | -100.0% bf16 MFU | 62145 tok/s +step 17452/19560 | loss 3.263994 (-0.18z)| norm 0.2459 (+0.39z)| lr 1.83e-05 | 8434.67 ms | -100.0% bf16 MFU | 62145 tok/s +step 17453/19560 | loss 3.292881 (+0.59z)| norm 0.2430 (+0.12z)| lr 1.83e-05 | 8435.44 ms | -100.0% bf16 MFU | 62146 tok/s +step 17454/19560 | loss 3.303532 (+0.87z)| norm 0.2415 (-0.02z)| lr 1.83e-05 | 8438.11 ms | -100.0% bf16 MFU | 62145 tok/s +step 17455/19560 | loss 3.351179 (+2.09z)| norm 0.2379 (-0.37z)| lr 1.83e-05 | 8436.78 ms | -100.0% bf16 MFU | 62145 tok/s +step 17456/19560 | loss 3.304920 (+0.86z)| norm 0.2473 (+0.51z)| lr 1.83e-05 | 8434.14 ms | -100.0% bf16 MFU | 62146 tok/s +step 17457/19560 | loss 3.244678 (-0.73z)| norm 0.2499 (+0.75z)| lr 1.82e-05 | 8437.38 ms | -100.0% bf16 MFU | 62145 tok/s +step 17458/19560 | loss 3.261488 (-0.29z)| norm 0.2436 (+0.16z)| lr 1.82e-05 | 8438.35 ms | -100.0% bf16 MFU | 62145 tok/s +step 17459/19560 | loss 3.291588 (+0.52z)| norm 0.2406 (-0.15z)| lr 1.82e-05 | 8461.63 ms | -100.0% bf16 MFU | 62136 tok/s +step 17460/19560 | loss 3.340604 (+1.84z)| norm 0.2494 (+0.70z)| lr 1.82e-05 | 8464.54 ms | -100.0% bf16 MFU | 62126 tok/s +step 17461/19560 | loss 3.326104 (+1.44z)| norm 0.2500 (+0.76z)| lr 1.82e-05 | 8465.78 ms | -100.0% bf16 MFU | 62116 tok/s +step 17462/19560 | loss 3.192167 (-2.17z)| norm 0.2429 (+0.08z)| lr 1.82e-05 | 8462.96 ms | -100.0% bf16 MFU | 62108 tok/s +step 17463/19560 | loss 3.266335 (-0.18z)| norm 0.2407 (-0.14z)| lr 1.81e-05 | 8466.06 ms | -100.0% bf16 MFU | 62099 tok/s +step 17464/19560 | loss 3.269372 (-0.10z)| norm 0.2364 (-0.56z)| lr 1.81e-05 | 8465.79 ms | -100.0% bf16 MFU | 62090 tok/s +step 17465/19560 | loss 3.343808 (+1.87z)| norm 0.2401 (-0.20z)| lr 1.81e-05 | 8462.79 ms | -100.0% bf16 MFU | 62083 tok/s +step 17466/19560 | loss 3.270035 (-0.09z)| norm 0.2492 (+0.70z)| lr 1.81e-05 | 8456.78 ms | -100.0% bf16 MFU | 62079 tok/s +step 17467/19560 | loss 3.365808 (+2.39z)| norm 0.2582 (+1.55z)| lr 1.81e-05 | 8461.02 ms | -100.0% bf16 MFU | 62073 tok/s +step 17468/19560 | loss 3.291318 (+0.44z)| norm 0.2416 (-0.06z)| lr 1.80e-05 | 8465.60 ms | -100.0% bf16 MFU | 62066 tok/s +step 17469/19560 | loss 3.317562 (+1.12z)| norm 0.2422 (+0.01z)| lr 1.80e-05 | 8457.58 ms | -100.0% bf16 MFU | 62062 tok/s +step 17470/19560 | loss 3.247498 (-0.69z)| norm 0.2534 (+1.11z)| lr 1.80e-05 | 8461.85 ms | -100.0% bf16 MFU | 62057 tok/s +step 17471/19560 | loss 3.280441 (+0.18z)| norm 0.2753 (+3.16z)| lr 1.80e-05 | 8459.17 ms | -100.0% bf16 MFU | 62053 tok/s +step 17472/19560 | loss 3.261257 (-0.33z)| norm 0.2492 (+0.67z)| lr 1.80e-05 | 8456.60 ms | -100.0% bf16 MFU | 62051 tok/s +step 17473/19560 | loss 3.318237 (+1.16z)| norm 0.3032 (+5.30z)| lr 1.80e-05 | 8457.50 ms | -100.0% bf16 MFU | 62048 tok/s +step 17474/19560 | loss 3.253429 (-0.54z)| norm 0.2457 (+0.29z)| lr 1.79e-05 | 8458.17 ms | -100.0% bf16 MFU | 62044 tok/s +step 17475/19560 | loss 3.272810 (-0.04z)| norm 0.2310 (-0.99z)| lr 1.79e-05 | 8454.60 ms | -100.0% bf16 MFU | 62043 tok/s +step 17476/19560 | loss 3.268475 (-0.14z)| norm 0.2433 (+0.10z)| lr 1.79e-05 | 8454.09 ms | -100.0% bf16 MFU | 62042 tok/s +step 17477/19560 | loss 3.343681 (+1.80z)| norm 0.2656 (+2.04z)| lr 1.79e-05 | 8454.25 ms | -100.0% bf16 MFU | 62040 tok/s +step 17478/19560 | loss 3.292952 (+0.47z)| norm 0.2494 (+0.62z)| lr 1.79e-05 | 8457.79 ms | -100.0% bf16 MFU | 62038 tok/s +step 17479/19560 | loss 3.254342 (-0.55z)| norm 0.2494 (+0.62z)| lr 1.79e-05 | 8457.08 ms | -100.0% bf16 MFU | 62035 tok/s +step 17480/19560 | loss 3.296134 (+0.54z)| norm 0.2334 (-0.77z)| lr 1.78e-05 | 8453.39 ms | -100.0% bf16 MFU | 62035 tok/s +step 17481/19560 | loss 3.268243 (-0.19z)| norm 0.2488 (+0.60z)| lr 1.78e-05 | 8453.45 ms | -100.0% bf16 MFU | 62034 tok/s +step 17482/19560 | loss 3.396695 (+3.09z)| norm 0.2502 (+0.72z)| lr 1.78e-05 | 8455.92 ms | -100.0% bf16 MFU | 62032 tok/s +step 17483/19560 | loss 3.253524 (-0.57z)| norm 0.2368 (-0.48z)| lr 1.78e-05 | 8453.14 ms | -100.0% bf16 MFU | 62032 tok/s +step 17484/19560 | loss 3.291941 (+0.41z)| norm 0.2479 (+0.58z)| lr 1.78e-05 | 8456.75 ms | -100.0% bf16 MFU | 62030 tok/s +step 17485/19560 | loss 3.331439 (+1.43z)| norm 0.2350 (-0.63z)| lr 1.78e-05 | 8450.21 ms | -100.0% bf16 MFU | 62031 tok/s +step 17486/19560 | loss 3.284905 (+0.21z)| norm 0.2305 (-1.07z)| lr 1.77e-05 | 8452.26 ms | -100.0% bf16 MFU | 62031 tok/s +step 17487/19560 | loss 3.261250 (-0.42z)| norm 0.2469 (+0.52z)| lr 1.77e-05 | 8451.65 ms | -100.0% bf16 MFU | 62031 tok/s +step 17488/19560 | loss 3.287183 (+0.26z)| norm 0.2357 (-0.56z)| lr 1.77e-05 | 8451.66 ms | -100.0% bf16 MFU | 62031 tok/s +step 17489/19560 | loss 3.282223 (+0.13z)| norm 0.2461 (+0.44z)| lr 1.77e-05 | 8460.21 ms | -100.0% bf16 MFU | 62028 tok/s +step 17490/19560 | loss 3.352938 (+1.95z)| norm 0.2579 (+1.56z)| lr 1.77e-05 | 8458.31 ms | -100.0% bf16 MFU | 62026 tok/s +step 17491/19560 | loss 3.308579 (+0.79z)| norm 0.2340 (-0.73z)| lr 1.77e-05 | 8453.10 ms | -100.0% bf16 MFU | 62026 tok/s +step 17492/19560 | loss 3.272459 (-0.15z)| norm 0.2263 (-1.45z)| lr 1.76e-05 | 8452.74 ms | -100.0% bf16 MFU | 62026 tok/s +step 17493/19560 | loss 3.309139 (+0.80z)| norm 0.2441 (+0.25z)| lr 1.76e-05 | 8450.88 ms | -100.0% bf16 MFU | 62026 tok/s +step 17494/19560 | loss 3.265383 (-0.35z)| norm 0.2482 (+0.63z)| lr 1.76e-05 | 8452.83 ms | -100.0% bf16 MFU | 62026 tok/s +step 17495/19560 | loss 3.306321 (+0.71z)| norm 0.2415 (-0.00z)| lr 1.76e-05 | 8452.29 ms | -100.0% bf16 MFU | 62027 tok/s +step 17496/19560 | loss 3.280457 (+0.03z)| norm 0.2415 (-0.01z)| lr 1.76e-05 | 8451.37 ms | -100.0% bf16 MFU | 62027 tok/s +step 17497/19560 | loss 3.302564 (+0.62z)| norm 0.2431 (+0.14z)| lr 1.76e-05 | 8450.70 ms | -100.0% bf16 MFU | 62028 tok/s +step 17498/19560 | loss 3.259508 (-0.52z)| norm 0.2321 (-0.90z)| lr 1.75e-05 | 8451.34 ms | -100.0% bf16 MFU | 62028 tok/s +step 17499/19560 | loss 3.277070 (-0.07z)| norm 0.2412 (-0.03z)| lr 1.75e-05 | 8449.00 ms | -100.0% bf16 MFU | 62029 tok/s +step 17500/19560 | loss 3.268049 (-0.31z)| norm 0.2531 (+1.09z)| lr 1.75e-05 | 8455.29 ms | -100.0% bf16 MFU | 62028 tok/s +val loss 3.269192 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2989/10042 = 0.297650 +step 17501/19560 | loss 3.283176 (+0.09z)| norm 0.2424 (+0.07z)| lr 1.75e-05 | 8448.98 ms | -100.0% bf16 MFU | 62030 tok/s +step 17502/19560 | loss 3.255291 (-0.65z)| norm 0.2387 (-0.28z)| lr 1.75e-05 | 8449.71 ms | -100.0% bf16 MFU | 62030 tok/s +step 17503/19560 | loss 3.295214 (+0.40z)| norm 0.2334 (-0.77z)| lr 1.75e-05 | 8454.92 ms | -100.0% bf16 MFU | 62029 tok/s +step 17504/19560 | loss 3.249716 (-0.81z)| norm 0.2642 (+2.11z)| lr 1.74e-05 | 8443.65 ms | -100.0% bf16 MFU | 62033 tok/s +step 17505/19560 | loss 3.253098 (-0.71z)| norm 0.2484 (+0.61z)| lr 1.74e-05 | 8447.99 ms | -100.0% bf16 MFU | 62034 tok/s +step 17506/19560 | loss 3.309652 (+0.79z)| norm 0.2393 (-0.23z)| lr 1.74e-05 | 8447.84 ms | -100.0% bf16 MFU | 62035 tok/s +step 17507/19560 | loss 3.297511 (+0.45z)| norm 0.2344 (-0.67z)| lr 1.74e-05 | 8439.03 ms | -100.0% bf16 MFU | 62040 tok/s +step 17508/19560 | loss 3.263416 (-0.47z)| norm 0.2450 (+0.32z)| lr 1.74e-05 | 8438.28 ms | -100.0% bf16 MFU | 62045 tok/s +step 17509/19560 | loss 3.303261 (+0.59z)| norm 0.2363 (-0.49z)| lr 1.74e-05 | 8434.11 ms | -100.0% bf16 MFU | 62050 tok/s +step 17510/19560 | loss 3.283499 (+0.05z)| norm 0.2355 (-0.57z)| lr 1.73e-05 | 8438.13 ms | -100.0% bf16 MFU | 62055 tok/s +step 17511/19560 | loss 3.261520 (-0.53z)| norm 0.2353 (-0.59z)| lr 1.73e-05 | 8432.57 ms | -100.0% bf16 MFU | 62061 tok/s +step 17512/19560 | loss 3.278826 (-0.06z)| norm 0.2430 (+0.14z)| lr 1.73e-05 | 8434.37 ms | -100.0% bf16 MFU | 62066 tok/s +step 17513/19560 | loss 3.304275 (+0.63z)| norm 0.2440 (+0.25z)| lr 1.73e-05 | 8435.79 ms | -100.0% bf16 MFU | 62070 tok/s +step 17514/19560 | loss 3.284475 (+0.09z)| norm 0.2417 (+0.03z)| lr 1.73e-05 | 8439.57 ms | -100.0% bf16 MFU | 62072 tok/s +step 17515/19560 | loss 3.330835 (+1.34z)| norm 0.2323 (-0.89z)| lr 1.73e-05 | 8438.28 ms | -100.0% bf16 MFU | 62075 tok/s +step 17516/19560 | loss 3.339609 (+1.56z)| norm 0.2458 (+0.42z)| lr 1.72e-05 | 8434.07 ms | -100.0% bf16 MFU | 62080 tok/s +step 17517/19560 | loss 3.305614 (+0.64z)| norm 0.2456 (+0.38z)| lr 1.72e-05 | 8436.70 ms | -100.0% bf16 MFU | 62083 tok/s +step 17518/19560 | loss 3.313054 (+0.83z)| norm 0.2373 (-0.42z)| lr 1.72e-05 | 8435.93 ms | -100.0% bf16 MFU | 62086 tok/s +step 17519/19560 | loss 3.236127 (-1.26z)| norm 0.2458 (+0.40z)| lr 1.72e-05 | 8437.50 ms | -100.0% bf16 MFU | 62089 tok/s +step 17520/19560 | loss 3.258134 (-0.66z)| norm 0.2441 (+0.22z)| lr 1.72e-05 | 8437.66 ms | -100.0% bf16 MFU | 62091 tok/s +step 17521/19560 | loss 3.231156 (-1.37z)| norm 0.2432 (+0.13z)| lr 1.72e-05 | 8434.81 ms | -100.0% bf16 MFU | 62095 tok/s +step 17522/19560 | loss 3.232655 (-1.34z)| norm 0.2495 (+0.74z)| lr 1.71e-05 | 8442.13 ms | -100.0% bf16 MFU | 62095 tok/s +step 17523/19560 | loss 3.314915 (+0.90z)| norm 0.2293 (-1.23z)| lr 1.71e-05 | 8443.15 ms | -100.0% bf16 MFU | 62095 tok/s +step 17524/19560 | loss 3.287071 (+0.13z)| norm 0.2356 (-0.62z)| lr 1.71e-05 | 8439.80 ms | -100.0% bf16 MFU | 62096 tok/s +step 17525/19560 | loss 3.284303 (+0.05z)| norm 0.2467 (+0.47z)| lr 1.71e-05 | 8440.92 ms | -100.0% bf16 MFU | 62097 tok/s +step 17526/19560 | loss 3.289900 (+0.20z)| norm 0.2441 (+0.20z)| lr 1.71e-05 | 8434.98 ms | -100.0% bf16 MFU | 62100 tok/s +step 17527/19560 | loss 3.252537 (-0.85z)| norm 0.2481 (+0.59z)| lr 1.71e-05 | 8439.91 ms | -100.0% bf16 MFU | 62101 tok/s +step 17528/19560 | loss 3.244234 (-1.06z)| norm 0.2398 (-0.24z)| lr 1.70e-05 | 8440.62 ms | -100.0% bf16 MFU | 62102 tok/s +step 17529/19560 | loss 3.331649 (+1.34z)| norm 0.2418 (-0.04z)| lr 1.70e-05 | 8439.27 ms | -100.0% bf16 MFU | 62103 tok/s +step 17530/19560 | loss 3.239114 (-1.20z)| norm 0.2662 (+2.31z)| lr 1.70e-05 | 8441.48 ms | -100.0% bf16 MFU | 62103 tok/s +step 17531/19560 | loss 3.295024 (+0.34z)| norm 0.2371 (-0.52z)| lr 1.70e-05 | 8438.70 ms | -100.0% bf16 MFU | 62105 tok/s +step 17532/19560 | loss 3.283393 (+0.02z)| norm 0.2366 (-0.58z)| lr 1.70e-05 | 8440.26 ms | -100.0% bf16 MFU | 62105 tok/s +step 17533/19560 | loss 3.251698 (-0.84z)| norm 0.2317 (-1.04z)| lr 1.70e-05 | 8446.79 ms | -100.0% bf16 MFU | 62103 tok/s +step 17534/19560 | loss 3.269194 (-0.36z)| norm 0.2346 (-0.75z)| lr 1.69e-05 | 8437.05 ms | -100.0% bf16 MFU | 62105 tok/s +step 17535/19560 | loss 3.281626 (-0.02z)| norm 0.2430 (+0.07z)| lr 1.69e-05 | 8444.55 ms | -100.0% bf16 MFU | 62104 tok/s +step 17536/19560 | loss 3.308998 (+0.73z)| norm 0.2411 (-0.12z)| lr 1.69e-05 | 8438.51 ms | -100.0% bf16 MFU | 62106 tok/s +step 17537/19560 | loss 3.290767 (+0.22z)| norm 0.2398 (-0.25z)| lr 1.69e-05 | 8443.07 ms | -100.0% bf16 MFU | 62105 tok/s +step 17538/19560 | loss 3.272848 (-0.26z)| norm 0.2264 (-1.55z)| lr 1.69e-05 | 8440.76 ms | -100.0% bf16 MFU | 62106 tok/s +step 17539/19560 | loss 3.227069 (-1.50z)| norm 0.2419 (-0.04z)| lr 1.69e-05 | 8440.86 ms | -100.0% bf16 MFU | 62106 tok/s +step 17540/19560 | loss 3.249520 (-0.89z)| norm 0.2297 (-1.22z)| lr 1.68e-05 | 8447.54 ms | -100.0% bf16 MFU | 62104 tok/s +step 17541/19560 | loss 3.233444 (-1.33z)| norm 0.2214 (-1.99z)| lr 1.68e-05 | 8442.83 ms | -100.0% bf16 MFU | 62104 tok/s +step 17542/19560 | loss 3.281566 (-0.02z)| norm 0.2380 (-0.39z)| lr 1.68e-05 | 8443.00 ms | -100.0% bf16 MFU | 62103 tok/s +step 17543/19560 | loss 3.287675 (+0.14z)| norm 0.2308 (-1.07z)| lr 1.68e-05 | 8439.01 ms | -100.0% bf16 MFU | 62104 tok/s +step 17544/19560 | loss 3.259047 (-0.63z)| norm 0.2482 (+0.61z)| lr 1.68e-05 | 8442.19 ms | -100.0% bf16 MFU | 62104 tok/s +step 17545/19560 | loss 3.239725 (-1.15z)| norm 0.2287 (-1.25z)| lr 1.68e-05 | 8437.86 ms | -100.0% bf16 MFU | 62106 tok/s +step 17546/19560 | loss 3.219781 (-1.68z)| norm 0.2358 (-0.56z)| lr 1.67e-05 | 8438.26 ms | -100.0% bf16 MFU | 62107 tok/s +step 17547/19560 | loss 3.286595 (+0.15z)| norm 0.2461 (+0.41z)| lr 1.67e-05 | 8440.04 ms | -100.0% bf16 MFU | 62108 tok/s +step 17548/19560 | loss 3.258186 (-0.62z)| norm 0.2424 (+0.05z)| lr 1.67e-05 | 8439.55 ms | -100.0% bf16 MFU | 62109 tok/s +step 17549/19560 | loss 3.261644 (-0.53z)| norm 0.2428 (+0.08z)| lr 1.67e-05 | 8436.65 ms | -100.0% bf16 MFU | 62110 tok/s +step 17550/19560 | loss 3.325847 (+1.21z)| norm 0.2443 (+0.25z)| lr 1.67e-05 | 8437.63 ms | -100.0% bf16 MFU | 62112 tok/s +step 17551/19560 | loss 3.317788 (+0.98z)| norm 0.2413 (-0.05z)| lr 1.67e-05 | 8444.04 ms | -100.0% bf16 MFU | 62111 tok/s +step 17552/19560 | loss 3.265443 (-0.44z)| norm 0.2329 (-0.87z)| lr 1.66e-05 | 8443.55 ms | -100.0% bf16 MFU | 62110 tok/s +step 17553/19560 | loss 3.306033 (+0.65z)| norm 0.2501 (+0.81z)| lr 1.66e-05 | 8438.70 ms | -100.0% bf16 MFU | 62111 tok/s +step 17554/19560 | loss 3.263501 (-0.50z)| norm 0.2322 (-0.94z)| lr 1.66e-05 | 8439.74 ms | -100.0% bf16 MFU | 62111 tok/s +step 17555/19560 | loss 3.285298 (+0.11z)| norm 0.2446 (+0.27z)| lr 1.66e-05 | 8440.13 ms | -100.0% bf16 MFU | 62112 tok/s +step 17556/19560 | loss 3.258834 (-0.62z)| norm 0.2486 (+0.65z)| lr 1.66e-05 | 8437.37 ms | -100.0% bf16 MFU | 62113 tok/s +step 17557/19560 | loss 3.197072 (-2.37z)| norm 0.2429 (+0.09z)| lr 1.66e-05 | 8439.45 ms | -100.0% bf16 MFU | 62113 tok/s +step 17558/19560 | loss 3.263154 (-0.49z)| norm 0.2387 (-0.32z)| lr 1.65e-05 | 8439.24 ms | -100.0% bf16 MFU | 62114 tok/s +step 17559/19560 | loss 3.250160 (-0.88z)| norm 0.2439 (+0.18z)| lr 1.65e-05 | 8438.77 ms | -100.0% bf16 MFU | 62115 tok/s +step 17560/19560 | loss 3.243613 (-1.06z)| norm 0.2416 (-0.04z)| lr 1.65e-05 | 8439.77 ms | -100.0% bf16 MFU | 62115 tok/s +step 17561/19560 | loss 3.312824 (+0.94z)| norm 0.2448 (+0.26z)| lr 1.65e-05 | 8439.70 ms | -100.0% bf16 MFU | 62115 tok/s +step 17562/19560 | loss 3.251783 (-0.82z)| norm 0.2271 (-1.46z)| lr 1.65e-05 | 8441.91 ms | -100.0% bf16 MFU | 62115 tok/s +step 17563/19560 | loss 3.252038 (-0.80z)| norm 0.2353 (-0.66z)| lr 1.65e-05 | 8441.67 ms | -100.0% bf16 MFU | 62115 tok/s +step 17564/19560 | loss 3.282499 (+0.08z)| norm 0.2729 (+2.94z)| lr 1.64e-05 | 8438.82 ms | -100.0% bf16 MFU | 62115 tok/s +step 17565/19560 | loss 3.291260 (+0.33z)| norm 0.2271 (-1.45z)| lr 1.64e-05 | 8439.70 ms | -100.0% bf16 MFU | 62116 tok/s +step 17566/19560 | loss 3.242655 (-1.07z)| norm 0.2409 (-0.13z)| lr 1.64e-05 | 8440.01 ms | -100.0% bf16 MFU | 62116 tok/s +step 17567/19560 | loss 3.244107 (-1.02z)| norm 0.2362 (-0.58z)| lr 1.64e-05 | 8437.71 ms | -100.0% bf16 MFU | 62117 tok/s +step 17568/19560 | loss 3.309863 (+0.85z)| norm 0.2626 (+1.90z)| lr 1.64e-05 | 8440.38 ms | -100.0% bf16 MFU | 62117 tok/s +step 17569/19560 | loss 3.354918 (+2.09z)| norm 0.2539 (+1.07z)| lr 1.64e-05 | 8437.89 ms | -100.0% bf16 MFU | 62118 tok/s +step 17570/19560 | loss 3.286467 (+0.15z)| norm 0.2526 (+0.94z)| lr 1.63e-05 | 8440.79 ms | -100.0% bf16 MFU | 62117 tok/s +step 17571/19560 | loss 3.361237 (+2.21z)| norm 0.2354 (-0.69z)| lr 1.63e-05 | 8439.57 ms | -100.0% bf16 MFU | 62118 tok/s +step 17572/19560 | loss 3.264805 (-0.48z)| norm 0.2394 (-0.31z)| lr 1.63e-05 | 8440.50 ms | -100.0% bf16 MFU | 62118 tok/s +step 17573/19560 | loss 3.318972 (+1.03z)| norm 0.2435 (+0.06z)| lr 1.63e-05 | 8438.60 ms | -100.0% bf16 MFU | 62118 tok/s +step 17574/19560 | loss 3.326276 (+1.21z)| norm 0.2344 (-0.79z)| lr 1.63e-05 | 8442.53 ms | -100.0% bf16 MFU | 62117 tok/s +step 17575/19560 | loss 3.315070 (+0.89z)| norm 0.2437 (+0.07z)| lr 1.63e-05 | 8436.22 ms | -100.0% bf16 MFU | 62119 tok/s +step 17576/19560 | loss 3.268012 (-0.43z)| norm 0.2361 (-0.65z)| lr 1.63e-05 | 8435.59 ms | -100.0% bf16 MFU | 62120 tok/s +step 17577/19560 | loss 3.302222 (+0.54z)| norm 0.2380 (-0.45z)| lr 1.62e-05 | 8435.81 ms | -100.0% bf16 MFU | 62122 tok/s +step 17578/19560 | loss 3.282139 (-0.02z)| norm 0.2303 (-1.18z)| lr 1.62e-05 | 8439.18 ms | -100.0% bf16 MFU | 62122 tok/s +step 17579/19560 | loss 3.207891 (-2.13z)| norm 0.2388 (-0.37z)| lr 1.62e-05 | 8438.19 ms | -100.0% bf16 MFU | 62123 tok/s +step 17580/19560 | loss 3.283928 (+0.03z)| norm 0.2438 (+0.10z)| lr 1.62e-05 | 8436.42 ms | -100.0% bf16 MFU | 62124 tok/s +step 17581/19560 | loss 3.255091 (-0.78z)| norm 0.2305 (-1.16z)| lr 1.62e-05 | 8437.55 ms | -100.0% bf16 MFU | 62125 tok/s +step 17582/19560 | loss 3.267254 (-0.43z)| norm 0.2358 (-0.65z)| lr 1.62e-05 | 8444.58 ms | -100.0% bf16 MFU | 62123 tok/s +step 17583/19560 | loss 3.287468 (+0.17z)| norm 0.2458 (+0.30z)| lr 1.61e-05 | 8435.34 ms | -100.0% bf16 MFU | 62124 tok/s +step 17584/19560 | loss 3.295120 (+0.39z)| norm 0.2322 (-0.98z)| lr 1.61e-05 | 8439.19 ms | -100.0% bf16 MFU | 62124 tok/s +step 17585/19560 | loss 3.308732 (+0.78z)| norm 0.2323 (-0.96z)| lr 1.61e-05 | 8439.82 ms | -100.0% bf16 MFU | 62124 tok/s +step 17586/19560 | loss 3.308633 (+0.76z)| norm 0.2408 (-0.15z)| lr 1.61e-05 | 8436.45 ms | -100.0% bf16 MFU | 62125 tok/s +step 17587/19560 | loss 3.294390 (+0.35z)| norm 0.3185 (+6.06z)| lr 1.61e-05 | 8436.47 ms | -100.0% bf16 MFU | 62126 tok/s +step 17588/19560 | loss 3.248338 (-0.98z)| norm 0.2409 (-0.16z)| lr 1.61e-05 | 8436.73 ms | -100.0% bf16 MFU | 62127 tok/s +step 17589/19560 | loss 3.308821 (+0.80z)| norm 0.2504 (+0.61z)| lr 1.60e-05 | 8437.80 ms | -100.0% bf16 MFU | 62127 tok/s +step 17590/19560 | loss 3.279663 (-0.08z)| norm 0.2522 (+0.74z)| lr 1.60e-05 | 8437.12 ms | -100.0% bf16 MFU | 62128 tok/s +step 17591/19560 | loss 3.224343 (-1.72z)| norm 0.2340 (-0.71z)| lr 1.60e-05 | 8436.52 ms | -100.0% bf16 MFU | 62129 tok/s +step 17592/19560 | loss 3.317469 (+1.05z)| norm 0.2462 (+0.26z)| lr 1.60e-05 | 8438.05 ms | -100.0% bf16 MFU | 62129 tok/s +step 17593/19560 | loss 3.232226 (-1.47z)| norm 0.2362 (-0.54z)| lr 1.60e-05 | 8437.47 ms | -100.0% bf16 MFU | 62130 tok/s +step 17594/19560 | loss 3.320287 (+1.14z)| norm 0.2389 (-0.32z)| lr 1.60e-05 | 8436.57 ms | -100.0% bf16 MFU | 62130 tok/s +step 17595/19560 | loss 3.326997 (+1.38z)| norm 0.2405 (-0.18z)| lr 1.59e-05 | 8436.97 ms | -100.0% bf16 MFU | 62131 tok/s +step 17596/19560 | loss 3.304365 (+0.69z)| norm 0.2327 (-0.80z)| lr 1.59e-05 | 8435.96 ms | -100.0% bf16 MFU | 62132 tok/s +step 17597/19560 | loss 3.313893 (+0.98z)| norm 0.2423 (-0.03z)| lr 1.59e-05 | 8438.37 ms | -100.0% bf16 MFU | 62132 tok/s +step 17598/19560 | loss 3.310068 (+0.85z)| norm 0.2282 (-1.14z)| lr 1.59e-05 | 8437.64 ms | -100.0% bf16 MFU | 62132 tok/s +step 17599/19560 | loss 3.282184 (+0.00z)| norm 0.2483 (+0.50z)| lr 1.59e-05 | 8434.32 ms | -100.0% bf16 MFU | 62134 tok/s +step 17600/19560 | loss 3.304657 (+0.67z)| norm 0.2291 (-1.06z)| lr 1.59e-05 | 8436.45 ms | -100.0% bf16 MFU | 62134 tok/s +step 17601/19560 | loss 3.282940 (+0.02z)| norm 0.2421 (+0.04z)| lr 1.58e-05 | 8437.58 ms | -100.0% bf16 MFU | 62134 tok/s +step 17602/19560 | loss 3.279834 (-0.08z)| norm 0.2277 (-1.26z)| lr 1.58e-05 | 8437.82 ms | -100.0% bf16 MFU | 62134 tok/s +step 17603/19560 | loss 3.264369 (-0.55z)| norm 0.2421 (+0.05z)| lr 1.58e-05 | 8439.79 ms | -100.0% bf16 MFU | 62134 tok/s +step 17604/19560 | loss 3.256023 (-0.80z)| norm 0.2276 (-1.26z)| lr 1.58e-05 | 8434.10 ms | -100.0% bf16 MFU | 62135 tok/s +step 17605/19560 | loss 3.266743 (-0.46z)| norm 0.2547 (+1.23z)| lr 1.58e-05 | 8434.56 ms | -100.0% bf16 MFU | 62136 tok/s +step 17606/19560 | loss 3.333616 (+1.58z)| norm 0.2600 (+1.69z)| lr 1.58e-05 | 8436.03 ms | -100.0% bf16 MFU | 62137 tok/s +step 17607/19560 | loss 3.308506 (+0.80z)| norm 0.2307 (-0.96z)| lr 1.58e-05 | 8437.48 ms | -100.0% bf16 MFU | 62137 tok/s +step 17608/19560 | loss 3.352235 (+2.09z)| norm 0.2500 (+0.78z)| lr 1.57e-05 | 8438.81 ms | -100.0% bf16 MFU | 62137 tok/s +step 17609/19560 | loss 3.324953 (+1.25z)| norm 0.2360 (-0.48z)| lr 1.57e-05 | 8434.31 ms | -100.0% bf16 MFU | 62138 tok/s +step 17610/19560 | loss 3.322611 (+1.25z)| norm 0.2395 (-0.16z)| lr 1.57e-05 | 8439.39 ms | -100.0% bf16 MFU | 62137 tok/s +step 17611/19560 | loss 3.245512 (-1.16z)| norm 0.2271 (-1.27z)| lr 1.57e-05 | 8435.58 ms | -100.0% bf16 MFU | 62138 tok/s +step 17612/19560 | loss 3.302063 (+0.60z)| norm 0.2351 (-0.54z)| lr 1.57e-05 | 8437.97 ms | -100.0% bf16 MFU | 62138 tok/s +step 17613/19560 | loss 3.238409 (-1.36z)| norm 0.2319 (-0.83z)| lr 1.57e-05 | 8436.15 ms | -100.0% bf16 MFU | 62138 tok/s +step 17614/19560 | loss 3.309513 (+0.85z)| norm 0.2556 (+1.30z)| lr 1.56e-05 | 8440.62 ms | -100.0% bf16 MFU | 62137 tok/s +step 17615/19560 | loss 3.300769 (+0.57z)| norm 0.2329 (-0.74z)| lr 1.56e-05 | 8435.24 ms | -100.0% bf16 MFU | 62138 tok/s +step 17616/19560 | loss 3.300749 (+0.56z)| norm 0.2435 (+0.21z)| lr 1.56e-05 | 8431.87 ms | -100.0% bf16 MFU | 62140 tok/s +step 17617/19560 | loss 3.267457 (-0.47z)| norm 0.2398 (-0.12z)| lr 1.56e-05 | 8437.66 ms | -100.0% bf16 MFU | 62140 tok/s +step 17618/19560 | loss 3.226196 (-1.73z)| norm 0.2388 (-0.20z)| lr 1.56e-05 | 8436.67 ms | -100.0% bf16 MFU | 62140 tok/s +step 17619/19560 | loss 3.260713 (-0.64z)| norm 0.2351 (-0.54z)| lr 1.56e-05 | 8437.64 ms | -100.0% bf16 MFU | 62140 tok/s +step 17620/19560 | loss 3.276945 (-0.13z)| norm 0.2430 (+0.17z)| lr 1.55e-05 | 8436.52 ms | -100.0% bf16 MFU | 62140 tok/s +step 17621/19560 | loss 3.252759 (-0.87z)| norm 0.2361 (-0.46z)| lr 1.55e-05 | 8438.37 ms | -100.0% bf16 MFU | 62140 tok/s +step 17622/19560 | loss 3.260429 (-0.63z)| norm 0.2313 (-0.89z)| lr 1.55e-05 | 8435.82 ms | -100.0% bf16 MFU | 62140 tok/s +step 17623/19560 | loss 3.287992 (+0.23z)| norm 0.2290 (-1.08z)| lr 1.55e-05 | 8432.98 ms | -100.0% bf16 MFU | 62142 tok/s +step 17624/19560 | loss 3.425911 (+4.20z)| norm 0.2412 (+0.04z)| lr 1.55e-05 | 8435.72 ms | -100.0% bf16 MFU | 62142 tok/s +step 17625/19560 | loss 3.306450 (+0.72z)| norm 0.2337 (-0.64z)| lr 1.55e-05 | 8437.80 ms | -100.0% bf16 MFU | 62142 tok/s +step 17626/19560 | loss 3.318586 (+1.06z)| norm 0.2445 (+0.33z)| lr 1.54e-05 | 8437.84 ms | -100.0% bf16 MFU | 62142 tok/s +step 17627/19560 | loss 3.315050 (+0.94z)| norm 0.2497 (+0.80z)| lr 1.54e-05 | 8435.92 ms | -100.0% bf16 MFU | 62142 tok/s +step 17628/19560 | loss 3.266478 (-0.46z)| norm 0.2395 (-0.12z)| lr 1.54e-05 | 8436.98 ms | -100.0% bf16 MFU | 62142 tok/s +step 17629/19560 | loss 3.298441 (+0.46z)| norm 0.2292 (-1.05z)| lr 1.54e-05 | 8436.14 ms | -100.0% bf16 MFU | 62142 tok/s +step 17630/19560 | loss 3.303923 (+0.61z)| norm 0.2353 (-0.49z)| lr 1.54e-05 | 8434.63 ms | -100.0% bf16 MFU | 62143 tok/s +step 17631/19560 | loss 3.306736 (+0.68z)| norm 0.2548 (+1.27z)| lr 1.54e-05 | 8434.18 ms | -100.0% bf16 MFU | 62144 tok/s +step 17632/19560 | loss 3.173660 (-3.04z)| norm 0.2405 (-0.02z)| lr 1.54e-05 | 8435.17 ms | -100.0% bf16 MFU | 62145 tok/s +step 17633/19560 | loss 3.311880 (+0.81z)| norm 0.2378 (-0.26z)| lr 1.53e-05 | 8436.03 ms | -100.0% bf16 MFU | 62145 tok/s +step 17634/19560 | loss 3.280589 (-0.06z)| norm 0.2386 (-0.18z)| lr 1.53e-05 | 8434.27 ms | -100.0% bf16 MFU | 62146 tok/s +step 17635/19560 | loss 3.271625 (-0.30z)| norm 0.2353 (-0.49z)| lr 1.53e-05 | 8437.48 ms | -100.0% bf16 MFU | 62145 tok/s +step 17636/19560 | loss 3.292171 (+0.27z)| norm 0.2369 (-0.34z)| lr 1.53e-05 | 8436.94 ms | -100.0% bf16 MFU | 62145 tok/s +step 17637/19560 | loss 3.266924 (-0.43z)| norm 0.2378 (-0.26z)| lr 1.53e-05 | 8435.54 ms | -100.0% bf16 MFU | 62145 tok/s +step 17638/19560 | loss 3.303133 (+0.58z)| norm 0.2397 (-0.08z)| lr 1.53e-05 | 8434.79 ms | -100.0% bf16 MFU | 62146 tok/s +step 17639/19560 | loss 3.286772 (+0.11z)| norm 0.2436 (+0.28z)| lr 1.52e-05 | 8436.72 ms | -100.0% bf16 MFU | 62146 tok/s +step 17640/19560 | loss 3.289915 (+0.20z)| norm 0.2368 (-0.35z)| lr 1.52e-05 | 8436.44 ms | -100.0% bf16 MFU | 62146 tok/s +step 17641/19560 | loss 3.248711 (-0.94z)| norm 0.2316 (-0.83z)| lr 1.52e-05 | 8437.57 ms | -100.0% bf16 MFU | 62145 tok/s +step 17642/19560 | loss 3.263078 (-0.53z)| norm 0.2356 (-0.45z)| lr 1.52e-05 | 8435.97 ms | -100.0% bf16 MFU | 62146 tok/s +step 17643/19560 | loss 3.238658 (-1.20z)| norm 0.2358 (-0.43z)| lr 1.52e-05 | 8436.30 ms | -100.0% bf16 MFU | 62146 tok/s +step 17644/19560 | loss 3.264486 (-0.46z)| norm 0.2323 (-0.74z)| lr 1.52e-05 | 8437.35 ms | -100.0% bf16 MFU | 62145 tok/s +step 17645/19560 | loss 3.280485 (-0.01z)| norm 0.2312 (-0.83z)| lr 1.51e-05 | 8436.73 ms | -100.0% bf16 MFU | 62145 tok/s +step 17646/19560 | loss 3.282007 (+0.04z)| norm 0.2410 (+0.06z)| lr 1.51e-05 | 8436.54 ms | -100.0% bf16 MFU | 62145 tok/s +step 17647/19560 | loss 3.224858 (-1.57z)| norm 0.2258 (-1.31z)| lr 1.51e-05 | 8436.53 ms | -100.0% bf16 MFU | 62145 tok/s +step 17648/19560 | loss 3.315012 (+0.96z)| norm 0.2401 (-0.00z)| lr 1.51e-05 | 8433.58 ms | -100.0% bf16 MFU | 62146 tok/s +step 17649/19560 | loss 3.257941 (-0.66z)| norm 0.2327 (-0.67z)| lr 1.51e-05 | 8443.58 ms | -100.0% bf16 MFU | 62144 tok/s +step 17650/19560 | loss 3.311170 (+0.84z)| norm 0.2415 (+0.15z)| lr 1.51e-05 | 8465.00 ms | -100.0% bf16 MFU | 62133 tok/s +step 17651/19560 | loss 3.324066 (+1.21z)| norm 0.2328 (-0.66z)| lr 1.51e-05 | 8468.94 ms | -100.0% bf16 MFU | 62122 tok/s +step 17652/19560 | loss 3.301485 (+0.56z)| norm 0.2391 (-0.08z)| lr 1.50e-05 | 8463.42 ms | -100.0% bf16 MFU | 62113 tok/s +step 17653/19560 | loss 3.257141 (-0.70z)| norm 0.2372 (-0.25z)| lr 1.50e-05 | 8463.30 ms | -100.0% bf16 MFU | 62105 tok/s +step 17654/19560 | loss 3.290137 (+0.24z)| norm 0.2366 (-0.31z)| lr 1.50e-05 | 8459.43 ms | -100.0% bf16 MFU | 62099 tok/s +step 17655/19560 | loss 3.303178 (+0.60z)| norm 0.2272 (-1.15z)| lr 1.50e-05 | 8458.11 ms | -100.0% bf16 MFU | 62093 tok/s +step 17656/19560 | loss 3.253641 (-0.81z)| norm 0.2304 (-0.84z)| lr 1.50e-05 | 8456.86 ms | -100.0% bf16 MFU | 62088 tok/s +step 17657/19560 | loss 3.333292 (+1.46z)| norm 0.2372 (-0.22z)| lr 1.50e-05 | 8456.98 ms | -100.0% bf16 MFU | 62083 tok/s +step 17658/19560 | loss 3.264292 (-0.52z)| norm 0.2432 (+0.36z)| lr 1.49e-05 | 8464.20 ms | -100.0% bf16 MFU | 62076 tok/s +step 17659/19560 | loss 3.309758 (+0.78z)| norm 0.2478 (+0.77z)| lr 1.49e-05 | 8448.82 ms | -100.0% bf16 MFU | 62075 tok/s +step 17660/19560 | loss 3.357589 (+2.09z)| norm 0.2484 (+0.82z)| lr 1.49e-05 | 8452.09 ms | -100.0% bf16 MFU | 62073 tok/s +step 17661/19560 | loss 3.344854 (+1.70z)| norm 0.2376 (-0.19z)| lr 1.49e-05 | 8452.73 ms | -100.0% bf16 MFU | 62071 tok/s +step 17662/19560 | loss 3.264694 (-0.53z)| norm 0.2436 (+0.37z)| lr 1.49e-05 | 8462.48 ms | -100.0% bf16 MFU | 62065 tok/s +step 17663/19560 | loss 3.303497 (+0.54z)| norm 0.2318 (-0.73z)| lr 1.49e-05 | 8458.09 ms | -100.0% bf16 MFU | 62061 tok/s +step 17664/19560 | loss 3.279736 (-0.11z)| norm 0.2295 (-0.94z)| lr 1.49e-05 | 8454.21 ms | -100.0% bf16 MFU | 62059 tok/s +step 17665/19560 | loss 3.287371 (+0.10z)| norm 0.2233 (-1.49z)| lr 1.48e-05 | 8460.94 ms | -100.0% bf16 MFU | 62054 tok/s +step 17666/19560 | loss 3.358292 (+2.03z)| norm 0.2454 (+0.54z)| lr 1.48e-05 | 8453.53 ms | -100.0% bf16 MFU | 62052 tok/s +step 17667/19560 | loss 3.265387 (-0.53z)| norm 0.2336 (-0.55z)| lr 1.48e-05 | 8453.23 ms | -100.0% bf16 MFU | 62051 tok/s +step 17668/19560 | loss 3.321488 (+1.00z)| norm 0.2537 (+1.29z)| lr 1.48e-05 | 8457.01 ms | -100.0% bf16 MFU | 62048 tok/s +step 17669/19560 | loss 3.383203 (+2.62z)| norm 0.2445 (+0.44z)| lr 1.48e-05 | 8458.07 ms | -100.0% bf16 MFU | 62045 tok/s +step 17670/19560 | loss 3.297446 (+0.30z)| norm 0.2267 (-1.21z)| lr 1.48e-05 | 8455.32 ms | -100.0% bf16 MFU | 62043 tok/s +step 17671/19560 | loss 3.296604 (+0.27z)| norm 0.2389 (-0.09z)| lr 1.47e-05 | 8450.41 ms | -100.0% bf16 MFU | 62043 tok/s +step 17672/19560 | loss 3.295875 (+0.25z)| norm 0.2466 (+0.63z)| lr 1.47e-05 | 8450.51 ms | -100.0% bf16 MFU | 62043 tok/s +step 17673/19560 | loss 3.397435 (+2.89z)| norm 0.2394 (-0.05z)| lr 1.47e-05 | 8451.79 ms | -100.0% bf16 MFU | 62042 tok/s +step 17674/19560 | loss 3.274218 (-0.38z)| norm 0.2341 (-0.54z)| lr 1.47e-05 | 8457.42 ms | -100.0% bf16 MFU | 62040 tok/s +step 17675/19560 | loss 3.254282 (-0.90z)| norm 0.2390 (-0.07z)| lr 1.47e-05 | 8454.42 ms | -100.0% bf16 MFU | 62039 tok/s +step 17676/19560 | loss 3.251528 (-0.97z)| norm 0.2381 (-0.16z)| lr 1.47e-05 | 8453.92 ms | -100.0% bf16 MFU | 62038 tok/s +step 17677/19560 | loss 3.292576 (+0.11z)| norm 0.2353 (-0.41z)| lr 1.47e-05 | 8453.35 ms | -100.0% bf16 MFU | 62037 tok/s +step 17678/19560 | loss 3.270140 (-0.48z)| norm 0.2346 (-0.48z)| lr 1.46e-05 | 8451.79 ms | -100.0% bf16 MFU | 62037 tok/s +step 17679/19560 | loss 3.288486 (+0.02z)| norm 0.2425 (+0.27z)| lr 1.46e-05 | 8444.49 ms | -100.0% bf16 MFU | 62039 tok/s +step 17680/19560 | loss 3.324608 (+0.97z)| norm 0.2448 (+0.47z)| lr 1.46e-05 | 8455.66 ms | -100.0% bf16 MFU | 62037 tok/s +step 17681/19560 | loss 3.269934 (-0.48z)| norm 0.2345 (-0.49z)| lr 1.46e-05 | 8451.10 ms | -100.0% bf16 MFU | 62037 tok/s +step 17682/19560 | loss 3.216095 (-1.89z)| norm 0.2308 (-0.83z)| lr 1.46e-05 | 8450.45 ms | -100.0% bf16 MFU | 62038 tok/s +step 17683/19560 | loss 3.321691 (+0.89z)| norm 0.2388 (-0.07z)| lr 1.46e-05 | 8453.83 ms | -100.0% bf16 MFU | 62037 tok/s +step 17684/19560 | loss 3.361830 (+1.90z)| norm 0.2483 (+0.82z)| lr 1.45e-05 | 8450.28 ms | -100.0% bf16 MFU | 62037 tok/s +step 17685/19560 | loss 3.289016 (-0.01z)| norm 0.2388 (-0.07z)| lr 1.45e-05 | 8454.61 ms | -100.0% bf16 MFU | 62036 tok/s +step 17686/19560 | loss 3.340775 (+1.34z)| norm 0.2489 (+0.87z)| lr 1.45e-05 | 8446.33 ms | -100.0% bf16 MFU | 62038 tok/s +step 17687/19560 | loss 3.249092 (-1.08z)| norm 0.2348 (-0.45z)| lr 1.45e-05 | 8448.36 ms | -100.0% bf16 MFU | 62039 tok/s +step 17688/19560 | loss 3.299114 (+0.23z)| norm 0.2385 (-0.09z)| lr 1.45e-05 | 8453.21 ms | -100.0% bf16 MFU | 62038 tok/s +step 17689/19560 | loss 3.248094 (-1.11z)| norm 0.2207 (-1.73z)| lr 1.45e-05 | 8453.48 ms | -100.0% bf16 MFU | 62037 tok/s +step 17690/19560 | loss 3.266212 (-0.63z)| norm 0.2225 (-1.55z)| lr 1.45e-05 | 8454.21 ms | -100.0% bf16 MFU | 62036 tok/s +step 17691/19560 | loss 3.394906 (+2.69z)| norm 0.2435 (+0.38z)| lr 1.44e-05 | 8442.80 ms | -100.0% bf16 MFU | 62039 tok/s +step 17692/19560 | loss 3.298707 (+0.19z)| norm 0.2411 (+0.19z)| lr 1.44e-05 | 8450.43 ms | -100.0% bf16 MFU | 62039 tok/s +step 17693/19560 | loss 3.298812 (+0.19z)| norm 0.2429 (+0.35z)| lr 1.44e-05 | 8452.50 ms | -100.0% bf16 MFU | 62039 tok/s +step 17694/19560 | loss 3.310247 (+0.48z)| norm 0.2429 (+0.35z)| lr 1.44e-05 | 8447.86 ms | -100.0% bf16 MFU | 62040 tok/s +step 17695/19560 | loss 3.245132 (-1.22z)| norm 0.2396 (+0.03z)| lr 1.44e-05 | 8452.29 ms | -100.0% bf16 MFU | 62039 tok/s +step 17696/19560 | loss 3.325387 (+0.87z)| norm 0.2372 (-0.19z)| lr 1.44e-05 | 8446.80 ms | -100.0% bf16 MFU | 62041 tok/s +step 17697/19560 | loss 3.334041 (+1.10z)| norm 0.2253 (-1.34z)| lr 1.43e-05 | 8443.95 ms | -100.0% bf16 MFU | 62043 tok/s +step 17698/19560 | loss 3.272120 (-0.51z)| norm 0.2359 (-0.29z)| lr 1.43e-05 | 8438.19 ms | -100.0% bf16 MFU | 62048 tok/s +step 17699/19560 | loss 3.297597 (+0.17z)| norm 0.2342 (-0.45z)| lr 1.43e-05 | 8441.38 ms | -100.0% bf16 MFU | 62051 tok/s +step 17700/19560 | loss 3.288412 (-0.08z)| norm 0.2275 (-1.10z)| lr 1.43e-05 | 8443.42 ms | -100.0% bf16 MFU | 62053 tok/s +step 17701/19560 | loss 3.272501 (-0.49z)| norm 0.2384 (-0.02z)| lr 1.43e-05 | 8435.60 ms | -100.0% bf16 MFU | 62058 tok/s +step 17702/19560 | loss 3.336217 (+1.20z)| norm 0.2550 (+1.59z)| lr 1.43e-05 | 8446.39 ms | -100.0% bf16 MFU | 62059 tok/s +step 17703/19560 | loss 3.304991 (+0.37z)| norm 0.2297 (-0.88z)| lr 1.43e-05 | 8438.50 ms | -100.0% bf16 MFU | 62062 tok/s +step 17704/19560 | loss 3.256293 (-0.92z)| norm 0.2294 (-0.89z)| lr 1.42e-05 | 8430.94 ms | -100.0% bf16 MFU | 62068 tok/s +step 17705/19560 | loss 3.290460 (-0.01z)| norm 0.2305 (-0.78z)| lr 1.42e-05 | 8433.04 ms | -100.0% bf16 MFU | 62074 tok/s +step 17706/19560 | loss 3.274604 (-0.43z)| norm 0.2270 (-1.11z)| lr 1.42e-05 | 8433.99 ms | -100.0% bf16 MFU | 62078 tok/s +step 17707/19560 | loss 3.263362 (-0.75z)| norm 0.2309 (-0.73z)| lr 1.42e-05 | 8440.07 ms | -100.0% bf16 MFU | 62080 tok/s +step 17708/19560 | loss 3.267900 (-0.62z)| norm 0.2176 (-1.97z)| lr 1.42e-05 | 8439.41 ms | -100.0% bf16 MFU | 62082 tok/s +step 17709/19560 | loss 3.316922 (+0.68z)| norm 0.2367 (-0.16z)| lr 1.42e-05 | 8438.41 ms | -100.0% bf16 MFU | 62085 tok/s +step 17710/19560 | loss 3.345589 (+1.43z)| norm 0.2322 (-0.58z)| lr 1.41e-05 | 8437.57 ms | -100.0% bf16 MFU | 62087 tok/s +step 17711/19560 | loss 3.303691 (+0.31z)| norm 0.2296 (-0.82z)| lr 1.41e-05 | 8438.96 ms | -100.0% bf16 MFU | 62089 tok/s +step 17712/19560 | loss 3.221740 (-1.85z)| norm 0.2323 (-0.56z)| lr 1.41e-05 | 8439.49 ms | -100.0% bf16 MFU | 62091 tok/s +step 17713/19560 | loss 3.276030 (-0.41z)| norm 0.2363 (-0.18z)| lr 1.41e-05 | 8430.58 ms | -100.0% bf16 MFU | 62096 tok/s +step 17714/19560 | loss 3.276341 (-0.39z)| norm 0.2348 (-0.32z)| lr 1.41e-05 | 8434.96 ms | -100.0% bf16 MFU | 62099 tok/s +step 17715/19560 | loss 3.350235 (+1.53z)| norm 0.2365 (-0.13z)| lr 1.41e-05 | 8437.00 ms | -100.0% bf16 MFU | 62101 tok/s +step 17716/19560 | loss 3.292539 (+0.01z)| norm 0.2396 (+0.28z)| lr 1.41e-05 | 8435.39 ms | -100.0% bf16 MFU | 62104 tok/s +step 17717/19560 | loss 3.233679 (-1.50z)| norm 0.2333 (-0.53z)| lr 1.40e-05 | 8434.98 ms | -100.0% bf16 MFU | 62106 tok/s +step 17718/19560 | loss 3.268339 (-0.60z)| norm 0.2355 (-0.23z)| lr 1.40e-05 | 8437.09 ms | -100.0% bf16 MFU | 62108 tok/s +step 17719/19560 | loss 3.293329 (+0.04z)| norm 0.2368 (-0.06z)| lr 1.40e-05 | 8440.29 ms | -100.0% bf16 MFU | 62108 tok/s +step 17720/19560 | loss 3.358284 (+1.73z)| norm 0.2667 (+3.72z)| lr 1.40e-05 | 8445.37 ms | -100.0% bf16 MFU | 62107 tok/s +step 17721/19560 | loss 3.256866 (-0.93z)| norm 0.2298 (-0.95z)| lr 1.40e-05 | 8438.44 ms | -100.0% bf16 MFU | 62108 tok/s +step 17722/19560 | loss 3.327524 (+0.92z)| norm 0.2326 (-0.60z)| lr 1.40e-05 | 8440.09 ms | -100.0% bf16 MFU | 62109 tok/s +step 17723/19560 | loss 3.351775 (+1.54z)| norm 0.2539 (+2.05z)| lr 1.40e-05 | 8442.66 ms | -100.0% bf16 MFU | 62108 tok/s +step 17724/19560 | loss 3.297126 (+0.12z)| norm 0.2330 (-0.55z)| lr 1.39e-05 | 8438.14 ms | -100.0% bf16 MFU | 62110 tok/s +step 17725/19560 | loss 3.396508 (+2.62z)| norm 0.2493 (+1.46z)| lr 1.39e-05 | 8446.29 ms | -100.0% bf16 MFU | 62108 tok/s +step 17726/19560 | loss 3.318996 (+0.65z)| norm 0.2331 (-0.54z)| lr 1.39e-05 | 8439.01 ms | -100.0% bf16 MFU | 62109 tok/s +step 17727/19560 | loss 3.284729 (-0.22z)| norm 0.2395 (+0.25z)| lr 1.39e-05 | 8442.51 ms | -100.0% bf16 MFU | 62108 tok/s +step 17728/19560 | loss 3.284824 (-0.21z)| norm 0.2470 (+1.18z)| lr 1.39e-05 | 8443.38 ms | -100.0% bf16 MFU | 62108 tok/s +step 17729/19560 | loss 3.275195 (-0.45z)| norm 0.2330 (-0.56z)| lr 1.39e-05 | 8444.67 ms | -100.0% bf16 MFU | 62106 tok/s +step 17730/19560 | loss 3.293469 (+0.01z)| norm 0.2340 (-0.45z)| lr 1.38e-05 | 8435.74 ms | -100.0% bf16 MFU | 62109 tok/s +step 17731/19560 | loss 3.363488 (+1.75z)| norm 0.2557 (+2.22z)| lr 1.38e-05 | 8439.55 ms | -100.0% bf16 MFU | 62109 tok/s +step 17732/19560 | loss 3.252434 (-1.04z)| norm 0.2314 (-0.78z)| lr 1.38e-05 | 8441.71 ms | -100.0% bf16 MFU | 62109 tok/s +step 17733/19560 | loss 3.290689 (-0.09z)| norm 0.2300 (-0.94z)| lr 1.38e-05 | 8436.51 ms | -100.0% bf16 MFU | 62111 tok/s +step 17734/19560 | loss 3.276221 (-0.44z)| norm 0.2378 (+0.06z)| lr 1.38e-05 | 8441.30 ms | -100.0% bf16 MFU | 62111 tok/s +step 17735/19560 | loss 3.291538 (-0.05z)| norm 0.2523 (+1.90z)| lr 1.38e-05 | 8443.88 ms | -100.0% bf16 MFU | 62110 tok/s +step 17736/19560 | loss 3.284532 (-0.22z)| norm 0.2605 (+2.86z)| lr 1.38e-05 | 8441.61 ms | -100.0% bf16 MFU | 62110 tok/s +step 17737/19560 | loss 3.284867 (-0.20z)| norm 0.2529 (+1.87z)| lr 1.37e-05 | 8437.47 ms | -100.0% bf16 MFU | 62111 tok/s +step 17738/19560 | loss 3.266859 (-0.65z)| norm 0.2344 (-0.40z)| lr 1.37e-05 | 8441.74 ms | -100.0% bf16 MFU | 62111 tok/s +step 17739/19560 | loss 3.279184 (-0.34z)| norm 0.2391 (+0.16z)| lr 1.37e-05 | 8442.81 ms | -100.0% bf16 MFU | 62110 tok/s +step 17740/19560 | loss 3.334517 (+1.07z)| norm 0.2519 (+1.71z)| lr 1.37e-05 | 8438.72 ms | -100.0% bf16 MFU | 62111 tok/s +step 17741/19560 | loss 3.215590 (-1.95z)| norm 0.2532 (+1.83z)| lr 1.37e-05 | 8439.73 ms | -100.0% bf16 MFU | 62112 tok/s +step 17742/19560 | loss 3.244735 (-1.20z)| norm 0.2349 (-0.37z)| lr 1.37e-05 | 8438.77 ms | -100.0% bf16 MFU | 62113 tok/s +step 17743/19560 | loss 3.266949 (-0.63z)| norm 0.2340 (-0.48z)| lr 1.37e-05 | 8442.47 ms | -100.0% bf16 MFU | 62112 tok/s +step 17744/19560 | loss 3.298027 (+0.16z)| norm 0.2456 (+0.95z)| lr 1.36e-05 | 8437.60 ms | -100.0% bf16 MFU | 62113 tok/s +step 17745/19560 | loss 3.282645 (-0.23z)| norm 0.2283 (-1.16z)| lr 1.36e-05 | 8438.10 ms | -100.0% bf16 MFU | 62114 tok/s +step 17746/19560 | loss 3.311817 (+0.49z)| norm 0.2524 (+1.75z)| lr 1.36e-05 | 8438.14 ms | -100.0% bf16 MFU | 62115 tok/s +step 17747/19560 | loss 3.303998 (+0.28z)| norm 0.2305 (-0.89z)| lr 1.36e-05 | 8440.14 ms | -100.0% bf16 MFU | 62115 tok/s +step 17748/19560 | loss 3.261490 (-0.80z)| norm 0.2321 (-0.69z)| lr 1.36e-05 | 8440.69 ms | -100.0% bf16 MFU | 62115 tok/s +step 17749/19560 | loss 3.373423 (+2.01z)| norm 0.2530 (+1.79z)| lr 1.36e-05 | 8443.69 ms | -100.0% bf16 MFU | 62114 tok/s +step 17750/19560 | loss 3.269965 (-0.60z)| norm 0.2253 (-1.50z)| lr 1.35e-05 | 8440.38 ms | -100.0% bf16 MFU | 62114 tok/s +val loss 3.267943 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2994/10042 = 0.298148 +step 17751/19560 | loss 3.231362 (-1.55z)| norm 0.2333 (-0.55z)| lr 1.35e-05 | 8435.94 ms | -100.0% bf16 MFU | 62116 tok/s +step 17752/19560 | loss 3.281731 (-0.28z)| norm 0.2404 (+0.29z)| lr 1.35e-05 | 8436.83 ms | -100.0% bf16 MFU | 62117 tok/s +step 17753/19560 | loss 3.280843 (-0.29z)| norm 0.2266 (-1.34z)| lr 1.35e-05 | 8441.62 ms | -100.0% bf16 MFU | 62117 tok/s +step 17754/19560 | loss 3.244570 (-1.22z)| norm 0.2409 (+0.37z)| lr 1.35e-05 | 8435.91 ms | -100.0% bf16 MFU | 62119 tok/s +step 17755/19560 | loss 3.306564 (+0.40z)| norm 0.2344 (-0.40z)| lr 1.35e-05 | 8438.29 ms | -100.0% bf16 MFU | 62119 tok/s +step 17756/19560 | loss 3.235547 (-1.44z)| norm 0.2382 (+0.06z)| lr 1.35e-05 | 8436.46 ms | -100.0% bf16 MFU | 62121 tok/s +step 17757/19560 | loss 3.238052 (-1.35z)| norm 0.2428 (+0.60z)| lr 1.34e-05 | 8436.43 ms | -100.0% bf16 MFU | 62122 tok/s +step 17758/19560 | loss 3.264113 (-0.67z)| norm 0.2317 (-0.73z)| lr 1.34e-05 | 8437.64 ms | -100.0% bf16 MFU | 62123 tok/s +step 17759/19560 | loss 3.286034 (-0.11z)| norm 0.2346 (-0.37z)| lr 1.34e-05 | 8434.82 ms | -100.0% bf16 MFU | 62124 tok/s +step 17760/19560 | loss 3.290038 (-0.03z)| norm 0.2399 (+0.28z)| lr 1.34e-05 | 8442.39 ms | -100.0% bf16 MFU | 62123 tok/s +step 17761/19560 | loss 3.293396 (+0.06z)| norm 0.2359 (-0.21z)| lr 1.34e-05 | 8439.67 ms | -100.0% bf16 MFU | 62123 tok/s +step 17762/19560 | loss 3.256437 (-0.91z)| norm 0.2330 (-0.55z)| lr 1.34e-05 | 8442.84 ms | -100.0% bf16 MFU | 62122 tok/s +step 17763/19560 | loss 3.288390 (-0.07z)| norm 0.2417 (+0.50z)| lr 1.34e-05 | 8439.93 ms | -100.0% bf16 MFU | 62122 tok/s +step 17764/19560 | loss 3.280148 (-0.28z)| norm 0.2336 (-0.48z)| lr 1.33e-05 | 8443.04 ms | -100.0% bf16 MFU | 62121 tok/s +step 17765/19560 | loss 3.240936 (-1.32z)| norm 0.2380 (+0.05z)| lr 1.33e-05 | 8439.84 ms | -100.0% bf16 MFU | 62121 tok/s +step 17766/19560 | loss 3.225696 (-1.69z)| norm 0.2271 (-1.25z)| lr 1.33e-05 | 8440.04 ms | -100.0% bf16 MFU | 62120 tok/s +step 17767/19560 | loss 3.282600 (-0.19z)| norm 0.2283 (-1.08z)| lr 1.33e-05 | 8441.34 ms | -100.0% bf16 MFU | 62120 tok/s +step 17768/19560 | loss 3.252102 (-0.98z)| norm 0.2249 (-1.47z)| lr 1.33e-05 | 8438.90 ms | -100.0% bf16 MFU | 62120 tok/s +step 17769/19560 | loss 3.232532 (-1.48z)| norm 0.2268 (-1.24z)| lr 1.33e-05 | 8439.67 ms | -100.0% bf16 MFU | 62120 tok/s +step 17770/19560 | loss 3.184755 (-2.64z)| norm 0.2320 (-0.61z)| lr 1.33e-05 | 8436.21 ms | -100.0% bf16 MFU | 62122 tok/s +step 17771/19560 | loss 3.370718 (+2.02z)| norm 0.2448 (+0.89z)| lr 1.32e-05 | 8439.20 ms | -100.0% bf16 MFU | 62122 tok/s +step 17772/19560 | loss 3.292389 (+0.05z)| norm 0.2352 (-0.25z)| lr 1.32e-05 | 8438.22 ms | -100.0% bf16 MFU | 62122 tok/s +step 17773/19560 | loss 3.273551 (-0.42z)| norm 0.2342 (-0.37z)| lr 1.32e-05 | 8440.63 ms | -100.0% bf16 MFU | 62122 tok/s +step 17774/19560 | loss 3.280839 (-0.23z)| norm 0.2470 (+1.13z)| lr 1.32e-05 | 8441.71 ms | -100.0% bf16 MFU | 62121 tok/s +step 17775/19560 | loss 3.212710 (-1.93z)| norm 0.2350 (-0.29z)| lr 1.32e-05 | 8440.22 ms | -100.0% bf16 MFU | 62121 tok/s +step 17776/19560 | loss 3.271605 (-0.45z)| norm 0.2500 (+1.46z)| lr 1.32e-05 | 8442.12 ms | -100.0% bf16 MFU | 62120 tok/s +step 17777/19560 | loss 3.207588 (-2.02z)| norm 0.2559 (+2.10z)| lr 1.31e-05 | 8442.48 ms | -100.0% bf16 MFU | 62119 tok/s +step 17778/19560 | loss 3.269659 (-0.48z)| norm 0.2426 (+0.57z)| lr 1.31e-05 | 8440.82 ms | -100.0% bf16 MFU | 62119 tok/s +step 17779/19560 | loss 3.295223 (+0.16z)| norm 0.2300 (-0.89z)| lr 1.31e-05 | 8435.92 ms | -100.0% bf16 MFU | 62121 tok/s +step 17780/19560 | loss 3.372017 (+2.01z)| norm 0.2499 (+1.39z)| lr 1.31e-05 | 8439.66 ms | -100.0% bf16 MFU | 62121 tok/s +step 17781/19560 | loss 3.283368 (-0.15z)| norm 0.2498 (+1.36z)| lr 1.31e-05 | 8439.03 ms | -100.0% bf16 MFU | 62121 tok/s +step 17782/19560 | loss 3.302177 (+0.31z)| norm 0.2503 (+1.38z)| lr 1.31e-05 | 8437.93 ms | -100.0% bf16 MFU | 62122 tok/s +step 17783/19560 | loss 3.284994 (-0.11z)| norm 0.2312 (-0.77z)| lr 1.31e-05 | 8442.57 ms | -100.0% bf16 MFU | 62121 tok/s +step 17784/19560 | loss 3.291488 (+0.04z)| norm 0.2341 (-0.45z)| lr 1.30e-05 | 8440.04 ms | -100.0% bf16 MFU | 62120 tok/s +step 17785/19560 | loss 3.330404 (+1.00z)| norm 0.2305 (-0.85z)| lr 1.30e-05 | 8440.33 ms | -100.0% bf16 MFU | 62120 tok/s +step 17786/19560 | loss 3.243492 (-1.13z)| norm 0.2313 (-0.74z)| lr 1.30e-05 | 8437.59 ms | -100.0% bf16 MFU | 62121 tok/s +step 17787/19560 | loss 3.307335 (+0.43z)| norm 0.2375 (-0.04z)| lr 1.30e-05 | 8436.48 ms | -100.0% bf16 MFU | 62122 tok/s +step 17788/19560 | loss 3.288150 (-0.02z)| norm 0.2371 (-0.07z)| lr 1.30e-05 | 8439.49 ms | -100.0% bf16 MFU | 62122 tok/s +step 17789/19560 | loss 3.298144 (+0.24z)| norm 0.2351 (-0.30z)| lr 1.30e-05 | 8434.13 ms | -100.0% bf16 MFU | 62124 tok/s +step 17790/19560 | loss 3.286399 (-0.06z)| norm 0.2387 (+0.12z)| lr 1.30e-05 | 8439.38 ms | -100.0% bf16 MFU | 62124 tok/s +step 17791/19560 | loss 3.393170 (+2.52z)| norm 0.2395 (+0.20z)| lr 1.29e-05 | 8436.72 ms | -100.0% bf16 MFU | 62125 tok/s +step 17792/19560 | loss 3.272179 (-0.42z)| norm 0.2352 (-0.30z)| lr 1.29e-05 | 8438.34 ms | -100.0% bf16 MFU | 62126 tok/s +step 17793/19560 | loss 3.321927 (+0.78z)| norm 0.2388 (+0.11z)| lr 1.29e-05 | 8437.90 ms | -100.0% bf16 MFU | 62126 tok/s +step 17794/19560 | loss 3.328610 (+0.95z)| norm 0.2391 (+0.15z)| lr 1.29e-05 | 8437.79 ms | -100.0% bf16 MFU | 62127 tok/s +step 17795/19560 | loss 3.275316 (-0.35z)| norm 0.2341 (-0.44z)| lr 1.29e-05 | 8436.35 ms | -100.0% bf16 MFU | 62128 tok/s +step 17796/19560 | loss 3.304987 (+0.38z)| norm 0.2377 (-0.00z)| lr 1.29e-05 | 8435.85 ms | -100.0% bf16 MFU | 62129 tok/s +step 17797/19560 | loss 3.285686 (-0.08z)| norm 0.2333 (-0.52z)| lr 1.29e-05 | 8438.86 ms | -100.0% bf16 MFU | 62129 tok/s +step 17798/19560 | loss 3.312565 (+0.59z)| norm 0.2431 (+0.64z)| lr 1.28e-05 | 8441.24 ms | -100.0% bf16 MFU | 62128 tok/s +step 17799/19560 | loss 3.241588 (-1.16z)| norm 0.2398 (+0.24z)| lr 1.28e-05 | 8438.81 ms | -100.0% bf16 MFU | 62128 tok/s +step 17800/19560 | loss 3.280625 (-0.19z)| norm 0.2284 (-1.10z)| lr 1.28e-05 | 8436.70 ms | -100.0% bf16 MFU | 62129 tok/s +step 17801/19560 | loss 3.274938 (-0.32z)| norm 0.2361 (-0.18z)| lr 1.28e-05 | 8436.33 ms | -100.0% bf16 MFU | 62129 tok/s +step 17802/19560 | loss 3.282140 (-0.13z)| norm 0.2341 (-0.42z)| lr 1.28e-05 | 8437.48 ms | -100.0% bf16 MFU | 62130 tok/s +step 17803/19560 | loss 3.283673 (-0.10z)| norm 0.2319 (-0.68z)| lr 1.28e-05 | 8437.24 ms | -100.0% bf16 MFU | 62130 tok/s +step 17804/19560 | loss 3.244957 (-1.09z)| norm 0.2311 (-0.76z)| lr 1.28e-05 | 8436.64 ms | -100.0% bf16 MFU | 62131 tok/s +step 17805/19560 | loss 3.327228 (+1.01z)| norm 0.2320 (-0.64z)| lr 1.27e-05 | 8439.52 ms | -100.0% bf16 MFU | 62131 tok/s +step 17806/19560 | loss 3.281922 (-0.15z)| norm 0.2357 (-0.21z)| lr 1.27e-05 | 8437.33 ms | -100.0% bf16 MFU | 62131 tok/s +step 17807/19560 | loss 3.334334 (+1.17z)| norm 0.2518 (+1.66z)| lr 1.27e-05 | 8436.95 ms | -100.0% bf16 MFU | 62132 tok/s +step 17808/19560 | loss 3.317125 (+0.73z)| norm 0.2398 (+0.27z)| lr 1.27e-05 | 8438.88 ms | -100.0% bf16 MFU | 62131 tok/s +step 17809/19560 | loss 3.296140 (+0.20z)| norm 0.2446 (+0.82z)| lr 1.27e-05 | 8435.76 ms | -100.0% bf16 MFU | 62132 tok/s +step 17810/19560 | loss 3.303947 (+0.38z)| norm 0.2364 (-0.14z)| lr 1.27e-05 | 8437.09 ms | -100.0% bf16 MFU | 62133 tok/s +step 17811/19560 | loss 3.240468 (-1.23z)| norm 0.2351 (-0.29z)| lr 1.27e-05 | 8437.98 ms | -100.0% bf16 MFU | 62133 tok/s +step 17812/19560 | loss 3.302539 (+0.38z)| norm 0.2347 (-0.33z)| lr 1.26e-05 | 8437.14 ms | -100.0% bf16 MFU | 62133 tok/s +step 17813/19560 | loss 3.336764 (+1.25z)| norm 0.2450 (+0.88z)| lr 1.26e-05 | 8435.83 ms | -100.0% bf16 MFU | 62134 tok/s +step 17814/19560 | loss 3.329125 (+1.06z)| norm 0.2415 (+0.47z)| lr 1.26e-05 | 8439.93 ms | -100.0% bf16 MFU | 62133 tok/s +step 17815/19560 | loss 3.280302 (-0.21z)| norm 0.2256 (-1.39z)| lr 1.26e-05 | 8438.02 ms | -100.0% bf16 MFU | 62133 tok/s +step 17816/19560 | loss 3.354818 (+1.70z)| norm 0.2387 (+0.15z)| lr 1.26e-05 | 8435.06 ms | -100.0% bf16 MFU | 62135 tok/s +step 17817/19560 | loss 3.223336 (-1.67z)| norm 0.2387 (+0.14z)| lr 1.26e-05 | 8435.69 ms | -100.0% bf16 MFU | 62135 tok/s +step 17818/19560 | loss 3.312978 (+0.61z)| norm 0.2393 (+0.20z)| lr 1.26e-05 | 8438.12 ms | -100.0% bf16 MFU | 62135 tok/s +step 17819/19560 | loss 3.255389 (-0.86z)| norm 0.2363 (-0.16z)| lr 1.25e-05 | 8435.08 ms | -100.0% bf16 MFU | 62136 tok/s +step 17820/19560 | loss 3.290484 (+0.07z)| norm 0.2331 (-0.54z)| lr 1.25e-05 | 8435.53 ms | -100.0% bf16 MFU | 62137 tok/s +step 17821/19560 | loss 3.319244 (+0.82z)| norm 0.2249 (-1.51z)| lr 1.25e-05 | 8438.14 ms | -100.0% bf16 MFU | 62137 tok/s +step 17822/19560 | loss 3.249302 (-1.00z)| norm 0.2356 (-0.21z)| lr 1.25e-05 | 8436.38 ms | -100.0% bf16 MFU | 62137 tok/s +step 17823/19560 | loss 3.302057 (+0.37z)| norm 0.2348 (-0.30z)| lr 1.25e-05 | 8436.70 ms | -100.0% bf16 MFU | 62138 tok/s +step 17824/19560 | loss 3.250374 (-0.98z)| norm 0.2259 (-1.36z)| lr 1.25e-05 | 8437.68 ms | -100.0% bf16 MFU | 62138 tok/s +step 17825/19560 | loss 3.316742 (+0.77z)| norm 0.2481 (+1.28z)| lr 1.25e-05 | 8439.62 ms | -100.0% bf16 MFU | 62137 tok/s +step 17826/19560 | loss 3.377580 (+2.31z)| norm 0.2338 (-0.43z)| lr 1.24e-05 | 8437.33 ms | -100.0% bf16 MFU | 62137 tok/s +step 17827/19560 | loss 3.286556 (-0.04z)| norm 0.2300 (-0.88z)| lr 1.24e-05 | 8436.15 ms | -100.0% bf16 MFU | 62137 tok/s +step 17828/19560 | loss 3.286349 (-0.05z)| norm 0.2354 (-0.25z)| lr 1.24e-05 | 8438.12 ms | -100.0% bf16 MFU | 62137 tok/s +step 17829/19560 | loss 3.261644 (-0.68z)| norm 0.2368 (-0.08z)| lr 1.24e-05 | 8437.07 ms | -100.0% bf16 MFU | 62137 tok/s +step 17830/19560 | loss 3.359738 (+1.83z)| norm 0.2363 (-0.12z)| lr 1.24e-05 | 8435.22 ms | -100.0% bf16 MFU | 62138 tok/s +step 17831/19560 | loss 3.329894 (+1.06z)| norm 0.2324 (-0.60z)| lr 1.24e-05 | 8439.14 ms | -100.0% bf16 MFU | 62138 tok/s +step 17832/19560 | loss 3.349832 (+1.54z)| norm 0.2441 (+0.81z)| lr 1.24e-05 | 8435.56 ms | -100.0% bf16 MFU | 62138 tok/s +step 17833/19560 | loss 3.325224 (+0.90z)| norm 0.2384 (+0.11z)| lr 1.23e-05 | 8436.56 ms | -100.0% bf16 MFU | 62139 tok/s +step 17834/19560 | loss 3.310698 (+0.53z)| norm 0.2234 (-1.72z)| lr 1.23e-05 | 8439.42 ms | -100.0% bf16 MFU | 62138 tok/s +step 17835/19560 | loss 3.283222 (-0.17z)| norm 0.2290 (-1.04z)| lr 1.23e-05 | 8435.91 ms | -100.0% bf16 MFU | 62139 tok/s +step 17836/19560 | loss 3.322581 (+0.82z)| norm 0.2439 (+0.78z)| lr 1.23e-05 | 8438.31 ms | -100.0% bf16 MFU | 62138 tok/s +step 17837/19560 | loss 3.251314 (-0.97z)| norm 0.2281 (-1.18z)| lr 1.23e-05 | 8437.77 ms | -100.0% bf16 MFU | 62138 tok/s +step 17838/19560 | loss 3.347006 (+1.44z)| norm 0.2430 (+0.67z)| lr 1.23e-05 | 8437.17 ms | -100.0% bf16 MFU | 62138 tok/s +step 17839/19560 | loss 3.298930 (+0.23z)| norm 0.2376 (-0.01z)| lr 1.23e-05 | 8437.97 ms | -100.0% bf16 MFU | 62138 tok/s +step 17840/19560 | loss 3.333233 (+1.08z)| norm 0.2468 (+1.11z)| lr 1.22e-05 | 8453.50 ms | -100.0% bf16 MFU | 62132 tok/s +step 17841/19560 | loss 3.288022 (-0.07z)| norm 0.2386 (+0.09z)| lr 1.22e-05 | 8468.48 ms | -100.0% bf16 MFU | 62121 tok/s +step 17842/19560 | loss 3.328681 (+0.95z)| norm 0.2333 (-0.56z)| lr 1.22e-05 | 8465.65 ms | -100.0% bf16 MFU | 62112 tok/s +step 17843/19560 | loss 3.307694 (+0.43z)| norm 0.2353 (-0.31z)| lr 1.22e-05 | 8465.35 ms | -100.0% bf16 MFU | 62103 tok/s +step 17844/19560 | loss 3.283576 (-0.18z)| norm 0.2370 (-0.11z)| lr 1.22e-05 | 8463.85 ms | -100.0% bf16 MFU | 62095 tok/s +step 17845/19560 | loss 3.353806 (+1.58z)| norm 0.2311 (-0.82z)| lr 1.22e-05 | 8463.10 ms | -100.0% bf16 MFU | 62087 tok/s +step 17846/19560 | loss 3.358163 (+1.66z)| norm 0.2387 (+0.11z)| lr 1.22e-05 | 8461.62 ms | -100.0% bf16 MFU | 62081 tok/s +step 17847/19560 | loss 3.348957 (+1.41z)| norm 0.2550 (+2.08z)| lr 1.21e-05 | 8466.68 ms | -100.0% bf16 MFU | 62073 tok/s +step 17848/19560 | loss 3.301743 (+0.24z)| norm 0.2369 (-0.11z)| lr 1.21e-05 | 8457.35 ms | -100.0% bf16 MFU | 62069 tok/s +step 17849/19560 | loss 3.292376 (-0.01z)| norm 0.2360 (-0.23z)| lr 1.21e-05 | 8460.05 ms | -100.0% bf16 MFU | 62064 tok/s +step 17850/19560 | loss 3.430443 (+3.33z)| norm 0.2477 (+1.25z)| lr 1.21e-05 | 8458.17 ms | -100.0% bf16 MFU | 62060 tok/s +step 17851/19560 | loss 3.367697 (+1.79z)| norm 0.2351 (-0.34z)| lr 1.21e-05 | 8464.08 ms | -100.0% bf16 MFU | 62055 tok/s +step 17852/19560 | loss 3.302366 (+0.21z)| norm 0.2236 (-1.81z)| lr 1.21e-05 | 8465.41 ms | -100.0% bf16 MFU | 62048 tok/s +step 17853/19560 | loss 3.298988 (+0.15z)| norm 0.2665 (+3.54z)| lr 1.21e-05 | 8461.73 ms | -100.0% bf16 MFU | 62044 tok/s +step 17854/19560 | loss 3.281008 (-0.29z)| norm 0.2475 (+1.17z)| lr 1.20e-05 | 8459.66 ms | -100.0% bf16 MFU | 62041 tok/s +step 17855/19560 | loss 3.319304 (+0.66z)| norm 0.2241 (-1.66z)| lr 1.20e-05 | 8463.67 ms | -100.0% bf16 MFU | 62036 tok/s +step 17856/19560 | loss 3.390592 (+2.35z)| norm 0.2478 (+1.21z)| lr 1.20e-05 | 8460.14 ms | -100.0% bf16 MFU | 62033 tok/s +step 17857/19560 | loss 3.325136 (+0.75z)| norm 0.2397 (+0.23z)| lr 1.20e-05 | 8457.79 ms | -100.0% bf16 MFU | 62030 tok/s +step 17858/19560 | loss 3.291293 (-0.07z)| norm 0.2237 (-1.69z)| lr 1.20e-05 | 8454.94 ms | -100.0% bf16 MFU | 62029 tok/s +step 17859/19560 | loss 3.395659 (+2.43z)| norm 0.2422 (+0.55z)| lr 1.20e-05 | 8457.07 ms | -100.0% bf16 MFU | 62028 tok/s +step 17860/19560 | loss 3.286744 (-0.19z)| norm 0.2360 (-0.21z)| lr 1.20e-05 | 8458.13 ms | -100.0% bf16 MFU | 62026 tok/s +step 17861/19560 | loss 3.336535 (+1.00z)| norm 0.2296 (-0.99z)| lr 1.19e-05 | 8459.25 ms | -100.0% bf16 MFU | 62023 tok/s +step 17862/19560 | loss 3.327439 (+0.77z)| norm 0.2499 (+1.48z)| lr 1.19e-05 | 8457.97 ms | -100.0% bf16 MFU | 62021 tok/s +step 17863/19560 | loss 3.415195 (+2.76z)| norm 0.2494 (+1.42z)| lr 1.19e-05 | 8449.96 ms | -100.0% bf16 MFU | 62023 tok/s +step 17864/19560 | loss 3.309531 (+0.30z)| norm 0.2300 (-0.95z)| lr 1.19e-05 | 8456.55 ms | -100.0% bf16 MFU | 62021 tok/s +step 17865/19560 | loss 3.285102 (-0.26z)| norm 0.2330 (-0.55z)| lr 1.19e-05 | 8454.20 ms | -100.0% bf16 MFU | 62021 tok/s +step 17866/19560 | loss 3.347924 (+1.17z)| norm 0.2396 (+0.27z)| lr 1.19e-05 | 8461.62 ms | -100.0% bf16 MFU | 62018 tok/s +step 17867/19560 | loss 3.266367 (-0.71z)| norm 0.2350 (-0.30z)| lr 1.19e-05 | 8454.34 ms | -100.0% bf16 MFU | 62018 tok/s +step 17868/19560 | loss 3.321887 (+0.58z)| norm 0.2341 (-0.40z)| lr 1.19e-05 | 8452.85 ms | -100.0% bf16 MFU | 62018 tok/s +step 17869/19560 | loss 3.301246 (+0.09z)| norm 0.2314 (-0.74z)| lr 1.18e-05 | 8449.62 ms | -100.0% bf16 MFU | 62020 tok/s +step 17870/19560 | loss 3.294805 (-0.07z)| norm 0.2237 (-1.72z)| lr 1.18e-05 | 8452.01 ms | -100.0% bf16 MFU | 62020 tok/s +step 17871/19560 | loss 3.286388 (-0.28z)| norm 0.2238 (-1.68z)| lr 1.18e-05 | 8449.29 ms | -100.0% bf16 MFU | 62022 tok/s +step 17872/19560 | loss 3.241708 (-1.31z)| norm 0.2292 (-0.97z)| lr 1.18e-05 | 8454.64 ms | -100.0% bf16 MFU | 62021 tok/s +step 17873/19560 | loss 3.327221 (+0.68z)| norm 0.2320 (-0.62z)| lr 1.18e-05 | 8448.92 ms | -100.0% bf16 MFU | 62023 tok/s +step 17874/19560 | loss 3.291989 (-0.14z)| norm 0.2261 (-1.37z)| lr 1.18e-05 | 8456.07 ms | -100.0% bf16 MFU | 62022 tok/s +step 17875/19560 | loss 3.294758 (-0.07z)| norm 0.2379 (+0.16z)| lr 1.18e-05 | 8448.59 ms | -100.0% bf16 MFU | 62024 tok/s +step 17876/19560 | loss 3.318362 (+0.47z)| norm 0.2407 (+0.51z)| lr 1.17e-05 | 8452.65 ms | -100.0% bf16 MFU | 62024 tok/s +step 17877/19560 | loss 3.331772 (+0.80z)| norm 0.2225 (-1.83z)| lr 1.17e-05 | 8448.27 ms | -100.0% bf16 MFU | 62026 tok/s +step 17878/19560 | loss 3.315060 (+0.40z)| norm 0.2400 (+0.45z)| lr 1.17e-05 | 8454.62 ms | -100.0% bf16 MFU | 62025 tok/s +step 17879/19560 | loss 3.301922 (+0.08z)| norm 0.2401 (+0.45z)| lr 1.17e-05 | 8449.40 ms | -100.0% bf16 MFU | 62026 tok/s +step 17880/19560 | loss 3.283224 (-0.37z)| norm 0.2307 (-0.77z)| lr 1.17e-05 | 8444.13 ms | -100.0% bf16 MFU | 62029 tok/s +step 17881/19560 | loss 3.336516 (+0.89z)| norm 0.2452 (+1.12z)| lr 1.17e-05 | 8452.26 ms | -100.0% bf16 MFU | 62029 tok/s +step 17882/19560 | loss 3.286417 (-0.32z)| norm 0.2450 (+1.08z)| lr 1.17e-05 | 8443.45 ms | -100.0% bf16 MFU | 62032 tok/s +step 17883/19560 | loss 3.356249 (+1.34z)| norm 0.2387 (+0.25z)| lr 1.16e-05 | 8448.37 ms | -100.0% bf16 MFU | 62034 tok/s +step 17884/19560 | loss 3.292361 (-0.19z)| norm 0.2307 (-0.79z)| lr 1.16e-05 | 8448.14 ms | -100.0% bf16 MFU | 62035 tok/s +step 17885/19560 | loss 3.323919 (+0.56z)| norm 0.2307 (-0.78z)| lr 1.16e-05 | 8458.12 ms | -100.0% bf16 MFU | 62033 tok/s +step 17886/19560 | loss 3.328720 (+0.66z)| norm 0.2264 (-1.33z)| lr 1.16e-05 | 8449.88 ms | -100.0% bf16 MFU | 62033 tok/s +step 17887/19560 | loss 3.359200 (+1.38z)| norm 0.2282 (-1.08z)| lr 1.16e-05 | 8448.54 ms | -100.0% bf16 MFU | 62034 tok/s +step 17888/19560 | loss 3.286502 (-0.38z)| norm 0.2426 (+0.77z)| lr 1.16e-05 | 8453.48 ms | -100.0% bf16 MFU | 62034 tok/s +step 17889/19560 | loss 3.313709 (+0.28z)| norm 0.2363 (-0.03z)| lr 1.16e-05 | 8452.19 ms | -100.0% bf16 MFU | 62034 tok/s +step 17890/19560 | loss 3.266991 (-0.86z)| norm 0.2261 (-1.34z)| lr 1.15e-05 | 8449.81 ms | -100.0% bf16 MFU | 62034 tok/s +step 17891/19560 | loss 3.315959 (+0.32z)| norm 0.2299 (-0.84z)| lr 1.15e-05 | 8443.64 ms | -100.0% bf16 MFU | 62037 tok/s +step 17892/19560 | loss 3.307972 (+0.13z)| norm 0.2353 (-0.15z)| lr 1.15e-05 | 8439.98 ms | -100.0% bf16 MFU | 62041 tok/s +step 17893/19560 | loss 3.309557 (+0.15z)| norm 0.2310 (-0.70z)| lr 1.15e-05 | 8447.92 ms | -100.0% bf16 MFU | 62042 tok/s +step 17894/19560 | loss 3.285523 (-0.45z)| norm 0.2278 (-1.11z)| lr 1.15e-05 | 8449.16 ms | -100.0% bf16 MFU | 62043 tok/s +step 17895/19560 | loss 3.329367 (+0.62z)| norm 0.2324 (-0.52z)| lr 1.15e-05 | 8451.03 ms | -100.0% bf16 MFU | 62043 tok/s +step 17896/19560 | loss 3.377267 (+1.78z)| norm 0.2379 (+0.18z)| lr 1.15e-05 | 8448.37 ms | -100.0% bf16 MFU | 62043 tok/s +step 17897/19560 | loss 3.305725 (+0.00z)| norm 0.2333 (-0.43z)| lr 1.15e-05 | 8445.13 ms | -100.0% bf16 MFU | 62045 tok/s +step 17898/19560 | loss 3.254821 (-1.32z)| norm 0.2307 (-0.76z)| lr 1.14e-05 | 8449.09 ms | -100.0% bf16 MFU | 62046 tok/s +step 17899/19560 | loss 3.324068 (+0.47z)| norm 0.2397 (+0.41z)| lr 1.14e-05 | 8449.59 ms | -100.0% bf16 MFU | 62046 tok/s +step 17900/19560 | loss 3.266875 (-1.00z)| norm 0.2328 (-0.48z)| lr 1.14e-05 | 8443.17 ms | -100.0% bf16 MFU | 62048 tok/s +step 17901/19560 | loss 3.315659 (+0.25z)| norm 0.2316 (-0.64z)| lr 1.14e-05 | 8443.30 ms | -100.0% bf16 MFU | 62051 tok/s +step 17902/19560 | loss 3.352216 (+1.18z)| norm 0.2647 (+3.51z)| lr 1.14e-05 | 8443.84 ms | -100.0% bf16 MFU | 62053 tok/s +step 17903/19560 | loss 3.312555 (+0.14z)| norm 0.2276 (-1.12z)| lr 1.14e-05 | 8443.04 ms | -100.0% bf16 MFU | 62055 tok/s +step 17904/19560 | loss 3.264674 (-1.13z)| norm 0.2294 (-0.89z)| lr 1.14e-05 | 8448.89 ms | -100.0% bf16 MFU | 62055 tok/s +step 17905/19560 | loss 3.314563 (+0.17z)| norm 0.2263 (-1.26z)| lr 1.13e-05 | 8446.43 ms | -100.0% bf16 MFU | 62056 tok/s +step 17906/19560 | loss 3.289518 (-0.51z)| norm 0.2384 (+0.29z)| lr 1.13e-05 | 8442.31 ms | -100.0% bf16 MFU | 62058 tok/s +step 17907/19560 | loss 3.324345 (+0.43z)| norm 0.2213 (-1.87z)| lr 1.13e-05 | 8447.87 ms | -100.0% bf16 MFU | 62058 tok/s +step 17908/19560 | loss 3.282173 (-0.71z)| norm 0.2199 (-2.02z)| lr 1.13e-05 | 8453.99 ms | -100.0% bf16 MFU | 62056 tok/s +step 17909/19560 | loss 3.290596 (-0.48z)| norm 0.2363 (+0.07z)| lr 1.13e-05 | 8451.66 ms | -100.0% bf16 MFU | 62055 tok/s +step 17910/19560 | loss 3.318641 (+0.29z)| norm 0.2318 (-0.49z)| lr 1.13e-05 | 8439.30 ms | -100.0% bf16 MFU | 62058 tok/s +step 17911/19560 | loss 3.316329 (+0.22z)| norm 0.2606 (+3.09z)| lr 1.13e-05 | 8444.23 ms | -100.0% bf16 MFU | 62060 tok/s +step 17912/19560 | loss 3.380033 (+1.93z)| norm 0.2478 (+1.46z)| lr 1.12e-05 | 8443.50 ms | -100.0% bf16 MFU | 62062 tok/s +step 17913/19560 | loss 3.287946 (-0.56z)| norm 0.2317 (-0.52z)| lr 1.12e-05 | 8445.50 ms | -100.0% bf16 MFU | 62063 tok/s +step 17914/19560 | loss 3.301962 (-0.20z)| norm 0.2357 (-0.03z)| lr 1.12e-05 | 8445.68 ms | -100.0% bf16 MFU | 62063 tok/s +step 17915/19560 | loss 3.322304 (+0.36z)| norm 0.2451 (+1.12z)| lr 1.12e-05 | 8442.98 ms | -100.0% bf16 MFU | 62065 tok/s +step 17916/19560 | loss 3.330338 (+0.57z)| norm 0.2357 (-0.04z)| lr 1.12e-05 | 8443.43 ms | -100.0% bf16 MFU | 62066 tok/s +step 17917/19560 | loss 3.316316 (+0.18z)| norm 0.2351 (-0.11z)| lr 1.12e-05 | 8437.98 ms | -100.0% bf16 MFU | 62070 tok/s +step 17918/19560 | loss 3.269206 (-1.11z)| norm 0.2216 (-1.74z)| lr 1.12e-05 | 8442.70 ms | -100.0% bf16 MFU | 62071 tok/s +step 17919/19560 | loss 3.290474 (-0.51z)| norm 0.2251 (-1.30z)| lr 1.12e-05 | 8443.79 ms | -100.0% bf16 MFU | 62072 tok/s +step 17920/19560 | loss 3.300112 (-0.25z)| norm 0.2446 (+1.06z)| lr 1.11e-05 | 8441.32 ms | -100.0% bf16 MFU | 62074 tok/s +step 17921/19560 | loss 3.290230 (-0.52z)| norm 0.2250 (-1.28z)| lr 1.11e-05 | 8443.29 ms | -100.0% bf16 MFU | 62075 tok/s +step 17922/19560 | loss 3.289835 (-0.52z)| norm 0.2310 (-0.56z)| lr 1.11e-05 | 8446.57 ms | -100.0% bf16 MFU | 62075 tok/s +step 17923/19560 | loss 3.292922 (-0.44z)| norm 0.2236 (-1.43z)| lr 1.11e-05 | 8447.07 ms | -100.0% bf16 MFU | 62075 tok/s +step 17924/19560 | loss 3.303084 (-0.15z)| norm 0.2288 (-0.80z)| lr 1.11e-05 | 8447.37 ms | -100.0% bf16 MFU | 62074 tok/s +step 17925/19560 | loss 3.339280 (+0.85z)| norm 0.2311 (-0.52z)| lr 1.11e-05 | 8440.59 ms | -100.0% bf16 MFU | 62076 tok/s +step 17926/19560 | loss 3.304527 (-0.12z)| norm 0.2457 (+1.21z)| lr 1.11e-05 | 8443.41 ms | -100.0% bf16 MFU | 62077 tok/s +step 17927/19560 | loss 3.347198 (+1.07z)| norm 0.2300 (-0.64z)| lr 1.10e-05 | 8442.95 ms | -100.0% bf16 MFU | 62078 tok/s +step 17928/19560 | loss 3.282203 (-0.78z)| norm 0.2297 (-0.68z)| lr 1.10e-05 | 8449.86 ms | -100.0% bf16 MFU | 62077 tok/s +step 17929/19560 | loss 3.353310 (+1.22z)| norm 0.2359 (+0.05z)| lr 1.10e-05 | 8444.25 ms | -100.0% bf16 MFU | 62077 tok/s +step 17930/19560 | loss 3.321477 (+0.31z)| norm 0.2270 (-0.99z)| lr 1.10e-05 | 8442.46 ms | -100.0% bf16 MFU | 62078 tok/s +step 17931/19560 | loss 3.265862 (-1.26z)| norm 0.2321 (-0.39z)| lr 1.10e-05 | 8445.99 ms | -100.0% bf16 MFU | 62078 tok/s +step 17932/19560 | loss 3.314697 (+0.11z)| norm 0.2255 (-1.16z)| lr 1.10e-05 | 8443.58 ms | -100.0% bf16 MFU | 62079 tok/s +step 17933/19560 | loss 3.300879 (-0.29z)| norm 0.2321 (-0.38z)| lr 1.10e-05 | 8445.22 ms | -100.0% bf16 MFU | 62079 tok/s +step 17934/19560 | loss 3.315484 (+0.13z)| norm 0.2409 (+0.64z)| lr 1.10e-05 | 8442.14 ms | -100.0% bf16 MFU | 62080 tok/s +step 17935/19560 | loss 3.277935 (-0.94z)| norm 0.2325 (-0.33z)| lr 1.09e-05 | 8446.06 ms | -100.0% bf16 MFU | 62080 tok/s +step 17936/19560 | loss 3.296916 (-0.39z)| norm 0.2408 (+0.66z)| lr 1.09e-05 | 8437.96 ms | -100.0% bf16 MFU | 62083 tok/s +step 17937/19560 | loss 3.333154 (+0.64z)| norm 0.2466 (+1.34z)| lr 1.09e-05 | 8441.30 ms | -100.0% bf16 MFU | 62084 tok/s +step 17938/19560 | loss 3.323967 (+0.37z)| norm 0.2413 (+0.71z)| lr 1.09e-05 | 8443.58 ms | -100.0% bf16 MFU | 62085 tok/s +step 17939/19560 | loss 3.329991 (+0.53z)| norm 0.2345 (-0.10z)| lr 1.09e-05 | 8437.68 ms | -100.0% bf16 MFU | 62087 tok/s +step 17940/19560 | loss 3.262877 (-1.40z)| norm 0.2326 (-0.32z)| lr 1.09e-05 | 8442.25 ms | -100.0% bf16 MFU | 62088 tok/s +step 17941/19560 | loss 3.256627 (-1.55z)| norm 0.2492 (+1.63z)| lr 1.09e-05 | 8445.18 ms | -100.0% bf16 MFU | 62088 tok/s +step 17942/19560 | loss 3.286528 (-0.68z)| norm 0.2341 (-0.14z)| lr 1.08e-05 | 8439.25 ms | -100.0% bf16 MFU | 62089 tok/s +step 17943/19560 | loss 3.302261 (-0.24z)| norm 0.2304 (-0.58z)| lr 1.08e-05 | 8438.63 ms | -100.0% bf16 MFU | 62091 tok/s +step 17944/19560 | loss 3.299759 (-0.30z)| norm 0.2270 (-0.96z)| lr 1.08e-05 | 8439.72 ms | -100.0% bf16 MFU | 62093 tok/s +step 17945/19560 | loss 3.293891 (-0.50z)| norm 0.2318 (-0.40z)| lr 1.08e-05 | 8439.61 ms | -100.0% bf16 MFU | 62094 tok/s +step 17946/19560 | loss 3.324954 (+0.42z)| norm 0.2388 (+0.43z)| lr 1.08e-05 | 8436.57 ms | -100.0% bf16 MFU | 62097 tok/s +step 17947/19560 | loss 3.244123 (-1.96z)| norm 0.2423 (+0.84z)| lr 1.08e-05 | 8439.97 ms | -100.0% bf16 MFU | 62098 tok/s +step 17948/19560 | loss 3.298362 (-0.36z)| norm 0.2293 (-0.70z)| lr 1.08e-05 | 8440.19 ms | -100.0% bf16 MFU | 62099 tok/s +step 17949/19560 | loss 3.314168 (+0.10z)| norm 0.2272 (-0.94z)| lr 1.08e-05 | 8439.45 ms | -100.0% bf16 MFU | 62100 tok/s +step 17950/19560 | loss 3.284165 (-0.80z)| norm 0.2431 (+0.92z)| lr 1.07e-05 | 8443.70 ms | -100.0% bf16 MFU | 62100 tok/s +step 17951/19560 | loss 3.234810 (-2.21z)| norm 0.2255 (-1.13z)| lr 1.07e-05 | 8438.88 ms | -100.0% bf16 MFU | 62101 tok/s +step 17952/19560 | loss 3.322134 (+0.33z)| norm 0.2421 (+0.79z)| lr 1.07e-05 | 8439.64 ms | -100.0% bf16 MFU | 62102 tok/s +step 17953/19560 | loss 3.354949 (+1.28z)| norm 0.2398 (+0.54z)| lr 1.07e-05 | 8442.12 ms | -100.0% bf16 MFU | 62102 tok/s +step 17954/19560 | loss 3.241669 (-2.01z)| norm 0.2483 (+1.51z)| lr 1.07e-05 | 8442.78 ms | -100.0% bf16 MFU | 62102 tok/s +step 17955/19560 | loss 3.279608 (-0.90z)| norm 0.2277 (-0.89z)| lr 1.07e-05 | 8437.98 ms | -100.0% bf16 MFU | 62104 tok/s +step 17956/19560 | loss 3.351763 (+1.20z)| norm 0.2354 (+0.01z)| lr 1.07e-05 | 8443.32 ms | -100.0% bf16 MFU | 62103 tok/s +step 17957/19560 | loss 3.312690 (+0.05z)| norm 0.2250 (-1.19z)| lr 1.06e-05 | 8438.42 ms | -100.0% bf16 MFU | 62105 tok/s +step 17958/19560 | loss 3.281337 (-0.86z)| norm 0.2308 (-0.51z)| lr 1.06e-05 | 8441.53 ms | -100.0% bf16 MFU | 62105 tok/s +step 17959/19560 | loss 3.218337 (-2.63z)| norm 0.2288 (-0.73z)| lr 1.06e-05 | 8441.17 ms | -100.0% bf16 MFU | 62105 tok/s +step 17960/19560 | loss 3.333141 (+0.69z)| norm 0.2334 (-0.19z)| lr 1.06e-05 | 8435.99 ms | -100.0% bf16 MFU | 62107 tok/s +step 17961/19560 | loss 3.264924 (-1.27z)| norm 0.2261 (-1.03z)| lr 1.06e-05 | 8439.03 ms | -100.0% bf16 MFU | 62108 tok/s +step 17962/19560 | loss 3.282403 (-0.76z)| norm 0.2355 (+0.04z)| lr 1.06e-05 | 8441.14 ms | -100.0% bf16 MFU | 62109 tok/s +step 17963/19560 | loss 3.332395 (+0.67z)| norm 0.2412 (+0.70z)| lr 1.06e-05 | 8438.86 ms | -100.0% bf16 MFU | 62109 tok/s +step 17964/19560 | loss 3.310271 (+0.03z)| norm 0.2324 (-0.32z)| lr 1.06e-05 | 8438.63 ms | -100.0% bf16 MFU | 62110 tok/s +step 17965/19560 | loss 3.317791 (+0.24z)| norm 0.2178 (-1.98z)| lr 1.05e-05 | 8441.57 ms | -100.0% bf16 MFU | 62110 tok/s +step 17966/19560 | loss 3.232224 (-2.19z)| norm 0.2783 (+4.55z)| lr 1.05e-05 | 8444.61 ms | -100.0% bf16 MFU | 62109 tok/s +step 17967/19560 | loss 3.303350 (-0.15z)| norm 0.2311 (-0.43z)| lr 1.05e-05 | 8442.15 ms | -100.0% bf16 MFU | 62109 tok/s +step 17968/19560 | loss 3.279941 (-0.81z)| norm 0.2311 (-0.43z)| lr 1.05e-05 | 8441.53 ms | -100.0% bf16 MFU | 62109 tok/s +step 17969/19560 | loss 3.258209 (-1.42z)| norm 0.2385 (+0.36z)| lr 1.05e-05 | 8435.75 ms | -100.0% bf16 MFU | 62111 tok/s +step 17970/19560 | loss 3.242566 (-1.82z)| norm 0.2344 (-0.08z)| lr 1.05e-05 | 8441.01 ms | -100.0% bf16 MFU | 62111 tok/s +step 17971/19560 | loss 3.269343 (-1.06z)| norm 0.2334 (-0.19z)| lr 1.05e-05 | 8438.18 ms | -100.0% bf16 MFU | 62112 tok/s +step 17972/19560 | loss 3.295827 (-0.32z)| norm 0.2243 (-1.14z)| lr 1.04e-05 | 8443.21 ms | -100.0% bf16 MFU | 62111 tok/s +step 17973/19560 | loss 3.362733 (+1.55z)| norm 0.2292 (-0.61z)| lr 1.04e-05 | 8439.99 ms | -100.0% bf16 MFU | 62112 tok/s +step 17974/19560 | loss 3.274825 (-0.89z)| norm 0.2382 (+0.34z)| lr 1.04e-05 | 8442.88 ms | -100.0% bf16 MFU | 62111 tok/s +step 17975/19560 | loss 3.287643 (-0.52z)| norm 0.2399 (+0.54z)| lr 1.04e-05 | 8438.21 ms | -100.0% bf16 MFU | 62112 tok/s +step 17976/19560 | loss 3.298249 (-0.22z)| norm 0.2276 (-0.77z)| lr 1.04e-05 | 8435.74 ms | -100.0% bf16 MFU | 62114 tok/s +step 17977/19560 | loss 3.289986 (-0.45z)| norm 0.2347 (-0.01z)| lr 1.04e-05 | 8438.93 ms | -100.0% bf16 MFU | 62115 tok/s +step 17978/19560 | loss 3.260381 (-1.31z)| norm 0.2221 (-1.34z)| lr 1.04e-05 | 8437.57 ms | -100.0% bf16 MFU | 62116 tok/s +step 17979/19560 | loss 3.308944 (+0.14z)| norm 0.2463 (+1.24z)| lr 1.04e-05 | 8438.99 ms | -100.0% bf16 MFU | 62116 tok/s +step 17980/19560 | loss 3.267709 (-1.08z)| norm 0.2281 (-0.71z)| lr 1.03e-05 | 8438.60 ms | -100.0% bf16 MFU | 62117 tok/s +step 17981/19560 | loss 3.279399 (-0.73z)| norm 0.2407 (+0.69z)| lr 1.03e-05 | 8436.80 ms | -100.0% bf16 MFU | 62118 tok/s +step 17982/19560 | loss 3.261692 (-1.24z)| norm 0.2351 (+0.08z)| lr 1.03e-05 | 8437.00 ms | -100.0% bf16 MFU | 62119 tok/s +step 17983/19560 | loss 3.292833 (-0.32z)| norm 0.2447 (+1.14z)| lr 1.03e-05 | 8438.02 ms | -100.0% bf16 MFU | 62120 tok/s +step 17984/19560 | loss 3.300449 (-0.07z)| norm 0.2415 (+0.80z)| lr 1.03e-05 | 8439.90 ms | -100.0% bf16 MFU | 62120 tok/s +step 17985/19560 | loss 3.342466 (+1.20z)| norm 0.2422 (+0.86z)| lr 1.03e-05 | 8436.45 ms | -100.0% bf16 MFU | 62121 tok/s +step 17986/19560 | loss 3.224500 (-2.31z)| norm 0.2317 (-0.34z)| lr 1.03e-05 | 8440.92 ms | -100.0% bf16 MFU | 62121 tok/s +step 17987/19560 | loss 3.297267 (-0.13z)| norm 0.2267 (-0.89z)| lr 1.03e-05 | 8438.79 ms | -100.0% bf16 MFU | 62121 tok/s +step 17988/19560 | loss 3.304201 (+0.07z)| norm 0.2262 (-0.94z)| lr 1.02e-05 | 8435.77 ms | -100.0% bf16 MFU | 62123 tok/s +step 17989/19560 | loss 3.323142 (+0.66z)| norm 0.2334 (-0.12z)| lr 1.02e-05 | 8440.31 ms | -100.0% bf16 MFU | 62123 tok/s +step 17990/19560 | loss 3.259966 (-1.26z)| norm 0.2322 (-0.24z)| lr 1.02e-05 | 8433.53 ms | -100.0% bf16 MFU | 62125 tok/s +step 17991/19560 | loss 3.247098 (-1.68z)| norm 0.2398 (+0.65z)| lr 1.02e-05 | 8443.75 ms | -100.0% bf16 MFU | 62123 tok/s +step 17992/19560 | loss 3.347717 (+1.50z)| norm 0.2319 (-0.27z)| lr 1.02e-05 | 8440.64 ms | -100.0% bf16 MFU | 62123 tok/s +step 17993/19560 | loss 3.317722 (+0.55z)| norm 0.2327 (-0.18z)| lr 1.02e-05 | 8436.85 ms | -100.0% bf16 MFU | 62124 tok/s +step 17994/19560 | loss 3.260950 (-1.22z)| norm 0.2251 (-1.05z)| lr 1.02e-05 | 8435.94 ms | -100.0% bf16 MFU | 62125 tok/s +step 17995/19560 | loss 3.297307 (-0.08z)| norm 0.2338 (-0.04z)| lr 1.01e-05 | 8437.01 ms | -100.0% bf16 MFU | 62126 tok/s +step 17996/19560 | loss 3.313959 (+0.45z)| norm 0.2347 (+0.06z)| lr 1.01e-05 | 8436.34 ms | -100.0% bf16 MFU | 62127 tok/s +step 17997/19560 | loss 3.319318 (+0.61z)| norm 0.2334 (-0.08z)| lr 1.01e-05 | 8437.59 ms | -100.0% bf16 MFU | 62127 tok/s +step 17998/19560 | loss 3.322407 (+0.70z)| norm 0.2222 (-1.38z)| lr 1.01e-05 | 8435.73 ms | -100.0% bf16 MFU | 62129 tok/s +step 17999/19560 | loss 3.271913 (-0.89z)| norm 0.2334 (-0.10z)| lr 1.01e-05 | 8440.34 ms | -100.0% bf16 MFU | 62128 tok/s +step 18000/19560 | loss 3.299482 (-0.04z)| norm 0.2260 (-0.95z)| lr 1.01e-05 | 8437.77 ms | -100.0% bf16 MFU | 62128 tok/s +val loss 3.266748 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2997/10042 = 0.298447 +step 18001/19560 | loss 3.301975 (+0.05z)| norm 0.2308 (-0.40z)| lr 1.01e-05 | 8434.69 ms | -100.0% bf16 MFU | 62130 tok/s +step 18002/19560 | loss 3.355226 (+1.72z)| norm 0.2423 (+0.93z)| lr 1.01e-05 | 8431.39 ms | -100.0% bf16 MFU | 62133 tok/s +step 18003/19560 | loss 3.392998 (+2.81z)| norm 0.2592 (+2.80z)| lr 1.00e-05 | 8431.63 ms | -100.0% bf16 MFU | 62135 tok/s +step 18004/19560 | loss 3.368314 (+2.01z)| norm 0.2520 (+1.95z)| lr 1.00e-05 | 8431.71 ms | -100.0% bf16 MFU | 62137 tok/s +step 18005/19560 | loss 3.266407 (-1.06z)| norm 0.2363 (+0.18z)| lr 1.00e-05 | 8428.38 ms | -100.0% bf16 MFU | 62141 tok/s +step 18006/19560 | loss 3.305681 (+0.13z)| norm 0.2328 (-0.20z)| lr 1.00e-05 | 8430.61 ms | -100.0% bf16 MFU | 62143 tok/s +step 18007/19560 | loss 3.279617 (-0.65z)| norm 0.2367 (+0.24z)| lr 1.00e-05 | 8429.30 ms | -100.0% bf16 MFU | 62146 tok/s +step 18008/19560 | loss 3.298392 (-0.09z)| norm 0.2388 (+0.46z)| lr 9.98e-06 | 8427.15 ms | -100.0% bf16 MFU | 62149 tok/s +step 18009/19560 | loss 3.370151 (+2.04z)| norm 0.2159 (-2.07z)| lr 9.97e-06 | 8429.36 ms | -100.0% bf16 MFU | 62152 tok/s +step 18010/19560 | loss 3.242895 (-1.73z)| norm 0.2292 (-0.57z)| lr 9.96e-06 | 8431.59 ms | -100.0% bf16 MFU | 62153 tok/s +step 18011/19560 | loss 3.265299 (-1.05z)| norm 0.2546 (+2.21z)| lr 9.94e-06 | 8432.42 ms | -100.0% bf16 MFU | 62154 tok/s +step 18012/19560 | loss 3.307178 (+0.19z)| norm 0.2337 (-0.08z)| lr 9.93e-06 | 8433.12 ms | -100.0% bf16 MFU | 62155 tok/s +step 18013/19560 | loss 3.320188 (+0.58z)| norm 0.2355 (+0.12z)| lr 9.92e-06 | 8433.71 ms | -100.0% bf16 MFU | 62156 tok/s +step 18014/19560 | loss 3.280644 (-0.59z)| norm 0.2229 (-1.26z)| lr 9.91e-06 | 8432.29 ms | -100.0% bf16 MFU | 62157 tok/s +step 18015/19560 | loss 3.314396 (+0.43z)| norm 0.2234 (-1.20z)| lr 9.89e-06 | 8430.11 ms | -100.0% bf16 MFU | 62158 tok/s +step 18016/19560 | loss 3.269344 (-0.92z)| norm 0.2378 (+0.37z)| lr 9.88e-06 | 8430.03 ms | -100.0% bf16 MFU | 62160 tok/s +step 18017/19560 | loss 3.235246 (-1.90z)| norm 0.2602 (+2.72z)| lr 9.87e-06 | 8430.30 ms | -100.0% bf16 MFU | 62162 tok/s +step 18018/19560 | loss 3.331278 (+0.93z)| norm 0.2333 (-0.14z)| lr 9.85e-06 | 8430.66 ms | -100.0% bf16 MFU | 62163 tok/s +step 18019/19560 | loss 3.365967 (+1.92z)| norm 0.2285 (-0.65z)| lr 9.84e-06 | 8433.50 ms | -100.0% bf16 MFU | 62163 tok/s +step 18020/19560 | loss 3.289049 (-0.32z)| norm 0.2552 (+2.13z)| lr 9.83e-06 | 8432.02 ms | -100.0% bf16 MFU | 62164 tok/s +step 18021/19560 | loss 3.284319 (-0.45z)| norm 0.2275 (-0.76z)| lr 9.82e-06 | 8431.24 ms | -100.0% bf16 MFU | 62165 tok/s +step 18022/19560 | loss 3.324851 (+0.72z)| norm 0.2334 (-0.14z)| lr 9.80e-06 | 8436.76 ms | -100.0% bf16 MFU | 62164 tok/s +step 18023/19560 | loss 3.302245 (+0.07z)| norm 0.2477 (+1.33z)| lr 9.79e-06 | 8434.04 ms | -100.0% bf16 MFU | 62164 tok/s +step 18024/19560 | loss 3.284593 (-0.43z)| norm 0.2265 (-0.86z)| lr 9.78e-06 | 8436.38 ms | -100.0% bf16 MFU | 62163 tok/s +step 18025/19560 | loss 3.372298 (+2.13z)| norm 0.2197 (-1.54z)| lr 9.77e-06 | 8434.77 ms | -100.0% bf16 MFU | 62163 tok/s +step 18026/19560 | loss 3.282032 (-0.53z)| norm 0.2216 (-1.33z)| lr 9.75e-06 | 8436.22 ms | -100.0% bf16 MFU | 62162 tok/s +step 18027/19560 | loss 3.278288 (-0.63z)| norm 0.2302 (-0.45z)| lr 9.74e-06 | 8437.44 ms | -100.0% bf16 MFU | 62161 tok/s +step 18028/19560 | loss 3.380069 (+2.31z)| norm 0.2475 (+1.30z)| lr 9.73e-06 | 8435.61 ms | -100.0% bf16 MFU | 62160 tok/s +step 18029/19560 | loss 3.404344 (+2.89z)| norm 0.2403 (+0.56z)| lr 9.72e-06 | 8434.66 ms | -100.0% bf16 MFU | 62160 tok/s +step 18030/19560 | loss 3.283255 (-0.49z)| norm 0.2354 (+0.09z)| lr 9.70e-06 | 8433.39 ms | -100.0% bf16 MFU | 62161 tok/s +step 18031/19560 | loss 3.260805 (-1.11z)| norm 0.2301 (-0.46z)| lr 9.69e-06 | 8460.71 ms | -100.0% bf16 MFU | 62151 tok/s +step 18032/19560 | loss 3.305230 (+0.13z)| norm 0.2551 (+2.11z)| lr 9.68e-06 | 8470.23 ms | -100.0% bf16 MFU | 62138 tok/s +step 18033/19560 | loss 3.292178 (-0.23z)| norm 0.2265 (-0.85z)| lr 9.67e-06 | 8459.53 ms | -100.0% bf16 MFU | 62130 tok/s +step 18034/19560 | loss 3.326939 (+0.74z)| norm 0.2413 (+0.68z)| lr 9.65e-06 | 8462.99 ms | -100.0% bf16 MFU | 62121 tok/s +step 18035/19560 | loss 3.272420 (-0.78z)| norm 0.2349 (+0.01z)| lr 9.64e-06 | 8459.40 ms | -100.0% bf16 MFU | 62114 tok/s +step 18036/19560 | loss 3.318413 (+0.50z)| norm 0.2424 (+0.77z)| lr 9.63e-06 | 8457.64 ms | -100.0% bf16 MFU | 62108 tok/s +step 18037/19560 | loss 3.301187 (+0.02z)| norm 0.2316 (-0.36z)| lr 9.61e-06 | 8458.41 ms | -100.0% bf16 MFU | 62102 tok/s +step 18038/19560 | loss 3.276573 (-0.67z)| norm 0.2487 (+1.41z)| lr 9.60e-06 | 8457.51 ms | -100.0% bf16 MFU | 62096 tok/s +step 18039/19560 | loss 3.281525 (-0.52z)| norm 0.2463 (+1.20z)| lr 9.59e-06 | 8458.67 ms | -100.0% bf16 MFU | 62090 tok/s +step 18040/19560 | loss 3.347937 (+1.37z)| norm 0.2260 (-0.95z)| lr 9.58e-06 | 8459.61 ms | -100.0% bf16 MFU | 62085 tok/s +step 18041/19560 | loss 3.312366 (+0.36z)| norm 0.2312 (-0.39z)| lr 9.56e-06 | 8464.89 ms | -100.0% bf16 MFU | 62077 tok/s +step 18042/19560 | loss 3.304976 (+0.14z)| norm 0.2286 (-0.66z)| lr 9.55e-06 | 8466.08 ms | -100.0% bf16 MFU | 62070 tok/s +step 18043/19560 | loss 3.299494 (-0.01z)| norm 0.2445 (+1.03z)| lr 9.54e-06 | 8453.08 ms | -100.0% bf16 MFU | 62067 tok/s +step 18044/19560 | loss 3.306346 (+0.19z)| norm 0.2294 (-0.57z)| lr 9.53e-06 | 8458.91 ms | -100.0% bf16 MFU | 62063 tok/s +step 18045/19560 | loss 3.247492 (-1.46z)| norm 0.2325 (-0.24z)| lr 9.51e-06 | 8460.91 ms | -100.0% bf16 MFU | 62058 tok/s +step 18046/19560 | loss 3.241835 (-1.61z)| norm 0.2402 (+0.57z)| lr 9.50e-06 | 8458.04 ms | -100.0% bf16 MFU | 62055 tok/s +step 18047/19560 | loss 3.273644 (-0.70z)| norm 0.2254 (-1.02z)| lr 9.49e-06 | 8452.25 ms | -100.0% bf16 MFU | 62053 tok/s +step 18048/19560 | loss 3.311591 (+0.36z)| norm 0.2661 (+3.21z)| lr 9.48e-06 | 8455.36 ms | -100.0% bf16 MFU | 62051 tok/s +step 18049/19560 | loss 3.397359 (+2.67z)| norm 0.2614 (+2.63z)| lr 9.46e-06 | 8454.43 ms | -100.0% bf16 MFU | 62049 tok/s +step 18050/19560 | loss 3.288449 (-0.30z)| norm 0.2249 (-1.04z)| lr 9.45e-06 | 8460.92 ms | -100.0% bf16 MFU | 62045 tok/s +step 18051/19560 | loss 3.333340 (+0.91z)| norm 0.3240 (+6.98z)| lr 9.44e-06 | 8452.75 ms | -100.0% bf16 MFU | 62044 tok/s +step 18052/19560 | loss 3.335343 (+0.96z)| norm 0.2486 (+0.98z)| lr 9.43e-06 | 8457.90 ms | -100.0% bf16 MFU | 62041 tok/s +step 18053/19560 | loss 3.300940 (+0.03z)| norm 0.2335 (-0.21z)| lr 9.42e-06 | 8448.89 ms | -100.0% bf16 MFU | 62042 tok/s +step 18054/19560 | loss 3.352958 (+1.43z)| norm 0.2288 (-0.58z)| lr 9.40e-06 | 8453.09 ms | -100.0% bf16 MFU | 62041 tok/s +step 18055/19560 | loss 3.293948 (-0.16z)| norm 0.2364 (+0.02z)| lr 9.39e-06 | 8455.33 ms | -100.0% bf16 MFU | 62039 tok/s +step 18056/19560 | loss 3.297366 (-0.07z)| norm 0.2422 (+0.48z)| lr 9.38e-06 | 8456.76 ms | -100.0% bf16 MFU | 62037 tok/s +step 18057/19560 | loss 3.280904 (-0.51z)| norm 0.2485 (+0.96z)| lr 9.37e-06 | 8452.43 ms | -100.0% bf16 MFU | 62037 tok/s +step 18058/19560 | loss 3.389489 (+2.41z)| norm 0.2384 (+0.16z)| lr 9.35e-06 | 8451.48 ms | -100.0% bf16 MFU | 62037 tok/s +step 18059/19560 | loss 3.308166 (+0.21z)| norm 0.2370 (+0.04z)| lr 9.34e-06 | 8453.14 ms | -100.0% bf16 MFU | 62036 tok/s +step 18060/19560 | loss 3.340973 (+1.09z)| norm 0.2352 (-0.11z)| lr 9.33e-06 | 8449.91 ms | -100.0% bf16 MFU | 62036 tok/s +step 18061/19560 | loss 3.362416 (+1.63z)| norm 0.2374 (+0.07z)| lr 9.32e-06 | 8455.64 ms | -100.0% bf16 MFU | 62035 tok/s +step 18062/19560 | loss 3.312048 (+0.30z)| norm 0.2472 (+0.84z)| lr 9.30e-06 | 8448.76 ms | -100.0% bf16 MFU | 62036 tok/s +step 18063/19560 | loss 3.295658 (-0.14z)| norm 0.2482 (+0.90z)| lr 9.29e-06 | 8449.06 ms | -100.0% bf16 MFU | 62037 tok/s +step 18064/19560 | loss 3.296911 (-0.11z)| norm 0.2209 (-1.23z)| lr 9.28e-06 | 8450.78 ms | -100.0% bf16 MFU | 62037 tok/s +step 18065/19560 | loss 3.278108 (-0.60z)| norm 0.2390 (+0.19z)| lr 9.27e-06 | 8452.04 ms | -100.0% bf16 MFU | 62037 tok/s +step 18066/19560 | loss 3.395223 (+2.45z)| norm 0.2704 (+2.59z)| lr 9.25e-06 | 8449.01 ms | -100.0% bf16 MFU | 62037 tok/s +step 18067/19560 | loss 3.240406 (-1.55z)| norm 0.2963 (+4.21z)| lr 9.24e-06 | 8450.93 ms | -100.0% bf16 MFU | 62037 tok/s +step 18068/19560 | loss 3.250017 (-1.30z)| norm 0.2521 (+1.04z)| lr 9.23e-06 | 8452.36 ms | -100.0% bf16 MFU | 62037 tok/s +step 18069/19560 | loss 3.296860 (-0.10z)| norm 0.2479 (+0.75z)| lr 9.22e-06 | 8449.74 ms | -100.0% bf16 MFU | 62038 tok/s +step 18070/19560 | loss 3.360085 (+1.51z)| norm 0.2577 (+1.42z)| lr 9.21e-06 | 8450.12 ms | -100.0% bf16 MFU | 62038 tok/s +step 18071/19560 | loss 3.340811 (+1.00z)| norm 0.2464 (+0.61z)| lr 9.19e-06 | 8449.90 ms | -100.0% bf16 MFU | 62038 tok/s +step 18072/19560 | loss 3.304125 (+0.06z)| norm 0.2380 (+0.01z)| lr 9.18e-06 | 8451.12 ms | -100.0% bf16 MFU | 62038 tok/s +step 18073/19560 | loss 3.316248 (+0.37z)| norm 0.2561 (+1.27z)| lr 9.17e-06 | 8446.57 ms | -100.0% bf16 MFU | 62040 tok/s +step 18074/19560 | loss 3.261313 (-1.02z)| norm 0.2374 (-0.04z)| lr 9.16e-06 | 8446.87 ms | -100.0% bf16 MFU | 62041 tok/s +step 18075/19560 | loss 3.310193 (+0.22z)| norm 0.2274 (-0.73z)| lr 9.14e-06 | 8451.58 ms | -100.0% bf16 MFU | 62041 tok/s +step 18076/19560 | loss 3.353244 (+1.30z)| norm 0.2270 (-0.76z)| lr 9.13e-06 | 8449.95 ms | -100.0% bf16 MFU | 62041 tok/s +step 18077/19560 | loss 3.293661 (-0.21z)| norm 0.2226 (-1.06z)| lr 9.12e-06 | 8442.14 ms | -100.0% bf16 MFU | 62044 tok/s +step 18078/19560 | loss 3.307794 (+0.14z)| norm 0.2374 (-0.03z)| lr 9.11e-06 | 8446.81 ms | -100.0% bf16 MFU | 62046 tok/s +step 18079/19560 | loss 3.303588 (+0.02z)| norm 0.2244 (-0.93z)| lr 9.09e-06 | 8449.23 ms | -100.0% bf16 MFU | 62046 tok/s +step 18080/19560 | loss 3.298850 (-0.10z)| norm 0.2250 (-0.88z)| lr 9.08e-06 | 8446.78 ms | -100.0% bf16 MFU | 62047 tok/s +step 18081/19560 | loss 3.299012 (-0.08z)| norm 0.2290 (-0.59z)| lr 9.07e-06 | 8447.45 ms | -100.0% bf16 MFU | 62048 tok/s +step 18082/19560 | loss 3.315422 (+0.33z)| norm 0.2327 (-0.33z)| lr 9.06e-06 | 8443.72 ms | -100.0% bf16 MFU | 62050 tok/s +step 18083/19560 | loss 3.277581 (-0.66z)| norm 0.2306 (-0.47z)| lr 9.05e-06 | 8448.38 ms | -100.0% bf16 MFU | 62051 tok/s +step 18084/19560 | loss 3.292661 (-0.25z)| norm 0.2251 (-0.85z)| lr 9.03e-06 | 8449.01 ms | -100.0% bf16 MFU | 62051 tok/s +step 18085/19560 | loss 3.364682 (+1.63z)| norm 0.2315 (-0.41z)| lr 9.02e-06 | 8444.86 ms | -100.0% bf16 MFU | 62052 tok/s +step 18086/19560 | loss 3.296793 (-0.16z)| norm 0.2345 (-0.21z)| lr 9.01e-06 | 8443.91 ms | -100.0% bf16 MFU | 62054 tok/s +step 18087/19560 | loss 3.289148 (-0.38z)| norm 0.2261 (-0.79z)| lr 9.00e-06 | 8445.52 ms | -100.0% bf16 MFU | 62056 tok/s +step 18088/19560 | loss 3.314847 (+0.31z)| norm 0.2277 (-0.67z)| lr 8.99e-06 | 8444.48 ms | -100.0% bf16 MFU | 62057 tok/s +step 18089/19560 | loss 3.295527 (-0.21z)| norm 0.2318 (-0.39z)| lr 8.97e-06 | 8447.25 ms | -100.0% bf16 MFU | 62058 tok/s +step 18090/19560 | loss 3.291797 (-0.32z)| norm 0.2395 (+0.15z)| lr 8.96e-06 | 8446.62 ms | -100.0% bf16 MFU | 62058 tok/s +step 18091/19560 | loss 3.357295 (+1.44z)| norm 0.2475 (+0.70z)| lr 8.95e-06 | 8447.13 ms | -100.0% bf16 MFU | 62059 tok/s +step 18092/19560 | loss 3.328055 (+0.65z)| norm 0.2296 (-0.55z)| lr 8.94e-06 | 8453.85 ms | -100.0% bf16 MFU | 62057 tok/s +step 18093/19560 | loss 3.336644 (+0.87z)| norm 0.2904 (+3.49z)| lr 8.92e-06 | 8444.83 ms | -100.0% bf16 MFU | 62058 tok/s +step 18094/19560 | loss 3.339715 (+0.94z)| norm 0.2300 (-0.53z)| lr 8.91e-06 | 8444.03 ms | -100.0% bf16 MFU | 62060 tok/s +step 18095/19560 | loss 3.278144 (-0.71z)| norm 0.2286 (-0.62z)| lr 8.90e-06 | 8444.98 ms | -100.0% bf16 MFU | 62061 tok/s +step 18096/19560 | loss 3.253621 (-1.36z)| norm 0.2272 (-0.71z)| lr 8.89e-06 | 8445.12 ms | -100.0% bf16 MFU | 62062 tok/s +step 18097/19560 | loss 3.280649 (-0.64z)| norm 0.2398 (+0.15z)| lr 8.88e-06 | 8444.51 ms | -100.0% bf16 MFU | 62063 tok/s +step 18098/19560 | loss 3.238151 (-1.78z)| norm 0.2312 (-0.44z)| lr 8.86e-06 | 8447.35 ms | -100.0% bf16 MFU | 62063 tok/s +step 18099/19560 | loss 3.343599 (+1.03z)| norm 0.2261 (-0.78z)| lr 8.85e-06 | 8448.16 ms | -100.0% bf16 MFU | 62063 tok/s +step 18100/19560 | loss 3.274078 (-0.83z)| norm 0.2245 (-0.89z)| lr 8.84e-06 | 8445.19 ms | -100.0% bf16 MFU | 62064 tok/s +step 18101/19560 | loss 3.245826 (-1.56z)| norm 0.2255 (-0.82z)| lr 8.83e-06 | 8441.74 ms | -100.0% bf16 MFU | 62066 tok/s +step 18102/19560 | loss 3.291685 (-0.34z)| norm 0.2276 (-0.67z)| lr 8.82e-06 | 8444.61 ms | -100.0% bf16 MFU | 62067 tok/s +step 18103/19560 | loss 3.296473 (-0.21z)| norm 0.2459 (+0.58z)| lr 8.80e-06 | 8442.79 ms | -100.0% bf16 MFU | 62069 tok/s +step 18104/19560 | loss 3.344947 (+1.08z)| norm 0.2245 (-0.88z)| lr 8.79e-06 | 8444.07 ms | -100.0% bf16 MFU | 62070 tok/s +step 18105/19560 | loss 3.360957 (+1.48z)| norm 0.2340 (-0.23z)| lr 8.78e-06 | 8447.56 ms | -100.0% bf16 MFU | 62069 tok/s +step 18106/19560 | loss 3.315778 (+0.27z)| norm 0.2230 (-0.98z)| lr 8.77e-06 | 8442.71 ms | -100.0% bf16 MFU | 62071 tok/s +step 18107/19560 | loss 3.488988 (+4.45z)| norm 0.2320 (-0.36z)| lr 8.76e-06 | 8443.08 ms | -100.0% bf16 MFU | 62072 tok/s +step 18108/19560 | loss 3.309237 (+0.05z)| norm 0.2310 (-0.43z)| lr 8.74e-06 | 8446.53 ms | -100.0% bf16 MFU | 62072 tok/s +step 18109/19560 | loss 3.449773 (+3.32z)| norm 0.2540 (+1.12z)| lr 8.73e-06 | 8438.93 ms | -100.0% bf16 MFU | 62075 tok/s +step 18110/19560 | loss 3.265266 (-1.02z)| norm 0.2413 (+0.26z)| lr 8.72e-06 | 8440.49 ms | -100.0% bf16 MFU | 62077 tok/s +step 18111/19560 | loss 3.254514 (-1.26z)| norm 0.2331 (-0.29z)| lr 8.71e-06 | 8445.90 ms | -100.0% bf16 MFU | 62077 tok/s +step 18112/19560 | loss 3.280416 (-0.65z)| norm 0.2321 (-0.35z)| lr 8.70e-06 | 8442.85 ms | -100.0% bf16 MFU | 62078 tok/s +step 18113/19560 | loss 3.340975 (+0.76z)| norm 0.2659 (+1.89z)| lr 8.68e-06 | 8444.96 ms | -100.0% bf16 MFU | 62078 tok/s +step 18114/19560 | loss 3.260701 (-1.13z)| norm 0.2259 (-0.77z)| lr 8.67e-06 | 8447.31 ms | -100.0% bf16 MFU | 62078 tok/s +step 18115/19560 | loss 3.265000 (-1.02z)| norm 0.2310 (-0.44z)| lr 8.66e-06 | 8438.40 ms | -100.0% bf16 MFU | 62080 tok/s +step 18116/19560 | loss 3.293732 (-0.34z)| norm 0.2421 (+0.30z)| lr 8.65e-06 | 8441.44 ms | -100.0% bf16 MFU | 62082 tok/s +step 18117/19560 | loss 3.253493 (-1.26z)| norm 0.2402 (+0.17z)| lr 8.64e-06 | 8444.04 ms | -100.0% bf16 MFU | 62082 tok/s +step 18118/19560 | loss 3.404504 (+2.20z)| norm 0.2299 (-0.52z)| lr 8.62e-06 | 8442.83 ms | -100.0% bf16 MFU | 62083 tok/s +step 18119/19560 | loss 3.355931 (+1.07z)| norm 0.2399 (+0.15z)| lr 8.61e-06 | 8439.98 ms | -100.0% bf16 MFU | 62085 tok/s +step 18120/19560 | loss 3.275019 (-0.79z)| norm 0.2316 (-0.41z)| lr 8.60e-06 | 8442.62 ms | -100.0% bf16 MFU | 62085 tok/s +step 18121/19560 | loss 3.341010 (+0.73z)| norm 0.2471 (+0.62z)| lr 8.59e-06 | 8439.95 ms | -100.0% bf16 MFU | 62087 tok/s +step 18122/19560 | loss 3.323735 (+0.32z)| norm 0.2422 (+0.29z)| lr 8.58e-06 | 8436.44 ms | -100.0% bf16 MFU | 62090 tok/s +step 18123/19560 | loss 3.275976 (-0.78z)| norm 0.2363 (-0.11z)| lr 8.57e-06 | 8433.30 ms | -100.0% bf16 MFU | 62094 tok/s +step 18124/19560 | loss 3.302471 (-0.16z)| norm 0.2410 (+0.20z)| lr 8.55e-06 | 8435.44 ms | -100.0% bf16 MFU | 62097 tok/s +step 18125/19560 | loss 3.320756 (+0.26z)| norm 0.2255 (-0.83z)| lr 8.54e-06 | 8431.08 ms | -100.0% bf16 MFU | 62101 tok/s +step 18126/19560 | loss 3.313629 (+0.10z)| norm 0.2311 (-0.46z)| lr 8.53e-06 | 8432.12 ms | -100.0% bf16 MFU | 62105 tok/s +step 18127/19560 | loss 3.288842 (-0.48z)| norm 0.2282 (-0.65z)| lr 8.52e-06 | 8434.97 ms | -100.0% bf16 MFU | 62108 tok/s +step 18128/19560 | loss 3.274554 (-0.80z)| norm 0.2437 (+0.38z)| lr 8.51e-06 | 8430.51 ms | -100.0% bf16 MFU | 62112 tok/s +step 18129/19560 | loss 3.341756 (+0.74z)| norm 0.2312 (-0.46z)| lr 8.49e-06 | 8431.78 ms | -100.0% bf16 MFU | 62115 tok/s +step 18130/19560 | loss 3.287667 (-0.50z)| norm 0.2405 (+0.16z)| lr 8.48e-06 | 8435.12 ms | -100.0% bf16 MFU | 62117 tok/s +step 18131/19560 | loss 3.253867 (-1.27z)| norm 0.2254 (-0.84z)| lr 8.47e-06 | 8432.65 ms | -100.0% bf16 MFU | 62120 tok/s +step 18132/19560 | loss 3.283755 (-0.56z)| norm 0.2223 (-1.03z)| lr 8.46e-06 | 8435.03 ms | -100.0% bf16 MFU | 62122 tok/s +step 18133/19560 | loss 3.279929 (-0.65z)| norm 0.2224 (-1.01z)| lr 8.45e-06 | 8433.54 ms | -100.0% bf16 MFU | 62124 tok/s +step 18134/19560 | loss 3.291856 (-0.37z)| norm 0.2324 (-0.34z)| lr 8.44e-06 | 8434.90 ms | -100.0% bf16 MFU | 62126 tok/s +step 18135/19560 | loss 3.337216 (+0.69z)| norm 0.2322 (-0.35z)| lr 8.42e-06 | 8438.28 ms | -100.0% bf16 MFU | 62126 tok/s +step 18136/19560 | loss 3.324358 (+0.38z)| norm 0.2245 (-0.86z)| lr 8.41e-06 | 8434.41 ms | -100.0% bf16 MFU | 62128 tok/s +step 18137/19560 | loss 3.366064 (+1.37z)| norm 0.2433 (+0.39z)| lr 8.40e-06 | 8436.22 ms | -100.0% bf16 MFU | 62129 tok/s +step 18138/19560 | loss 3.262554 (-1.08z)| norm 0.2799 (+2.74z)| lr 8.39e-06 | 8437.82 ms | -100.0% bf16 MFU | 62129 tok/s +step 18139/19560 | loss 3.306231 (-0.05z)| norm 0.2230 (-0.97z)| lr 8.38e-06 | 8437.61 ms | -100.0% bf16 MFU | 62130 tok/s +step 18140/19560 | loss 3.289311 (-0.45z)| norm 0.2745 (+2.34z)| lr 8.37e-06 | 8444.90 ms | -100.0% bf16 MFU | 62127 tok/s +step 18141/19560 | loss 3.261995 (-1.09z)| norm 0.2369 (-0.07z)| lr 8.35e-06 | 8434.84 ms | -100.0% bf16 MFU | 62129 tok/s +step 18142/19560 | loss 3.294114 (-0.33z)| norm 0.2411 (+0.19z)| lr 8.34e-06 | 8436.92 ms | -100.0% bf16 MFU | 62129 tok/s +step 18143/19560 | loss 3.331255 (+0.55z)| norm 0.2283 (-0.64z)| lr 8.33e-06 | 8433.83 ms | -100.0% bf16 MFU | 62131 tok/s +step 18144/19560 | loss 3.304995 (-0.08z)| norm 0.2311 (-0.45z)| lr 8.32e-06 | 8444.92 ms | -100.0% bf16 MFU | 62129 tok/s +step 18145/19560 | loss 3.270354 (-0.92z)| norm 0.2309 (-0.46z)| lr 8.31e-06 | 8438.27 ms | -100.0% bf16 MFU | 62129 tok/s +step 18146/19560 | loss 3.245202 (-1.49z)| norm 0.2266 (-0.73z)| lr 8.29e-06 | 8437.88 ms | -100.0% bf16 MFU | 62129 tok/s +step 18147/19560 | loss 3.285067 (-0.53z)| norm 0.2294 (-0.55z)| lr 8.28e-06 | 8441.87 ms | -100.0% bf16 MFU | 62128 tok/s +step 18148/19560 | loss 3.406648 (+2.30z)| norm 0.2338 (-0.25z)| lr 8.27e-06 | 8438.87 ms | -100.0% bf16 MFU | 62128 tok/s +step 18149/19560 | loss 3.289194 (-0.45z)| norm 0.2383 (+0.03z)| lr 8.26e-06 | 8438.82 ms | -100.0% bf16 MFU | 62128 tok/s +step 18150/19560 | loss 3.292660 (-0.36z)| norm 0.2265 (-0.73z)| lr 8.25e-06 | 8434.65 ms | -100.0% bf16 MFU | 62130 tok/s +step 18151/19560 | loss 3.297888 (-0.24z)| norm 0.2259 (-0.76z)| lr 8.24e-06 | 8438.23 ms | -100.0% bf16 MFU | 62130 tok/s +step 18152/19560 | loss 3.333618 (+0.59z)| norm 0.2255 (-0.79z)| lr 8.22e-06 | 8441.37 ms | -100.0% bf16 MFU | 62129 tok/s +step 18153/19560 | loss 3.325359 (+0.41z)| norm 0.2327 (-0.33z)| lr 8.21e-06 | 8438.89 ms | -100.0% bf16 MFU | 62129 tok/s +step 18154/19560 | loss 3.280132 (-0.66z)| norm 0.2436 (+0.38z)| lr 8.20e-06 | 8437.40 ms | -100.0% bf16 MFU | 62129 tok/s +step 18155/19560 | loss 3.273509 (-0.81z)| norm 0.2320 (-0.39z)| lr 8.19e-06 | 8442.55 ms | -100.0% bf16 MFU | 62128 tok/s +step 18156/19560 | loss 3.295502 (-0.28z)| norm 0.2323 (-0.36z)| lr 8.18e-06 | 8435.45 ms | -100.0% bf16 MFU | 62129 tok/s +step 18157/19560 | loss 3.298891 (-0.19z)| norm 0.2333 (-0.29z)| lr 8.17e-06 | 8439.61 ms | -100.0% bf16 MFU | 62129 tok/s +step 18158/19560 | loss 3.243374 (-1.52z)| norm 0.2402 (+0.17z)| lr 8.16e-06 | 8440.83 ms | -100.0% bf16 MFU | 62128 tok/s +step 18159/19560 | loss 3.329824 (+0.56z)| norm 0.2295 (-0.54z)| lr 8.14e-06 | 8438.29 ms | -100.0% bf16 MFU | 62128 tok/s +step 18160/19560 | loss 3.256872 (-1.19z)| norm 0.2256 (-0.78z)| lr 8.13e-06 | 8438.06 ms | -100.0% bf16 MFU | 62128 tok/s +step 18161/19560 | loss 3.356283 (+1.19z)| norm 0.2477 (+0.67z)| lr 8.12e-06 | 8440.49 ms | -100.0% bf16 MFU | 62128 tok/s +step 18162/19560 | loss 3.315092 (+0.20z)| norm 0.2266 (-0.72z)| lr 8.11e-06 | 8441.16 ms | -100.0% bf16 MFU | 62127 tok/s +step 18163/19560 | loss 3.286130 (-0.50z)| norm 0.2352 (-0.15z)| lr 8.10e-06 | 8439.93 ms | -100.0% bf16 MFU | 62127 tok/s +step 18164/19560 | loss 3.301715 (-0.12z)| norm 0.2320 (-0.36z)| lr 8.09e-06 | 8439.75 ms | -100.0% bf16 MFU | 62126 tok/s +step 18165/19560 | loss 3.306772 (-0.00z)| norm 0.2459 (+0.55z)| lr 8.07e-06 | 8439.75 ms | -100.0% bf16 MFU | 62126 tok/s +step 18166/19560 | loss 3.279278 (-0.66z)| norm 0.2335 (-0.26z)| lr 8.06e-06 | 8438.66 ms | -100.0% bf16 MFU | 62126 tok/s +step 18167/19560 | loss 3.367822 (+1.44z)| norm 0.2333 (-0.27z)| lr 8.05e-06 | 8440.04 ms | -100.0% bf16 MFU | 62126 tok/s +step 18168/19560 | loss 3.357021 (+1.18z)| norm 0.2314 (-0.39z)| lr 8.04e-06 | 8438.55 ms | -100.0% bf16 MFU | 62126 tok/s +step 18169/19560 | loss 3.224019 (-1.95z)| norm 0.2415 (+0.27z)| lr 8.03e-06 | 8437.53 ms | -100.0% bf16 MFU | 62127 tok/s +step 18170/19560 | loss 3.358533 (+1.20z)| norm 0.2450 (+0.49z)| lr 8.02e-06 | 8434.65 ms | -100.0% bf16 MFU | 62128 tok/s +step 18171/19560 | loss 3.346950 (+0.92z)| norm 0.2311 (-0.42z)| lr 8.01e-06 | 8431.83 ms | -100.0% bf16 MFU | 62131 tok/s +step 18172/19560 | loss 3.283114 (-0.57z)| norm 0.2296 (-0.52z)| lr 7.99e-06 | 8434.67 ms | -100.0% bf16 MFU | 62132 tok/s +step 18173/19560 | loss 3.336030 (+0.65z)| norm 0.2208 (-1.09z)| lr 7.98e-06 | 8430.17 ms | -100.0% bf16 MFU | 62135 tok/s +step 18174/19560 | loss 3.264676 (-1.03z)| norm 0.2287 (-0.57z)| lr 7.97e-06 | 8431.25 ms | -100.0% bf16 MFU | 62138 tok/s +step 18175/19560 | loss 3.284253 (-0.57z)| norm 0.2518 (+0.94z)| lr 7.96e-06 | 8430.95 ms | -100.0% bf16 MFU | 62140 tok/s +step 18176/19560 | loss 3.284186 (-0.57z)| norm 0.2275 (-0.65z)| lr 7.95e-06 | 8430.63 ms | -100.0% bf16 MFU | 62142 tok/s +step 18177/19560 | loss 3.295214 (-0.29z)| norm 0.2277 (-0.62z)| lr 7.94e-06 | 8424.71 ms | -100.0% bf16 MFU | 62147 tok/s +step 18178/19560 | loss 3.331326 (+0.56z)| norm 0.2326 (-0.30z)| lr 7.93e-06 | 8426.61 ms | -100.0% bf16 MFU | 62151 tok/s +step 18179/19560 | loss 3.259974 (-1.13z)| norm 0.2290 (-0.57z)| lr 7.91e-06 | 8426.78 ms | -100.0% bf16 MFU | 62154 tok/s +step 18180/19560 | loss 3.300899 (-0.14z)| norm 0.2281 (-0.63z)| lr 7.90e-06 | 8429.44 ms | -100.0% bf16 MFU | 62156 tok/s +step 18181/19560 | loss 3.340571 (+0.80z)| norm 0.2295 (-0.51z)| lr 7.89e-06 | 8431.67 ms | -100.0% bf16 MFU | 62157 tok/s +step 18182/19560 | loss 3.228121 (-1.85z)| norm 0.2321 (-0.31z)| lr 7.88e-06 | 8428.98 ms | -100.0% bf16 MFU | 62159 tok/s +step 18183/19560 | loss 3.295907 (-0.24z)| norm 0.2269 (-0.72z)| lr 7.87e-06 | 8430.14 ms | -100.0% bf16 MFU | 62161 tok/s +step 18184/19560 | loss 3.262490 (-1.02z)| norm 0.2312 (-0.37z)| lr 7.86e-06 | 8430.43 ms | -100.0% bf16 MFU | 62162 tok/s +step 18185/19560 | loss 3.343850 (+0.88z)| norm 0.2355 (-0.03z)| lr 7.85e-06 | 8432.69 ms | -100.0% bf16 MFU | 62163 tok/s +step 18186/19560 | loss 3.299681 (-0.15z)| norm 0.2251 (-0.83z)| lr 7.83e-06 | 8430.98 ms | -100.0% bf16 MFU | 62164 tok/s +step 18187/19560 | loss 3.309891 (+0.10z)| norm 0.2294 (-0.49z)| lr 7.82e-06 | 8432.12 ms | -100.0% bf16 MFU | 62165 tok/s +step 18188/19560 | loss 3.267423 (-0.90z)| norm 0.2264 (-0.72z)| lr 7.81e-06 | 8431.46 ms | -100.0% bf16 MFU | 62166 tok/s +step 18189/19560 | loss 3.342010 (+0.89z)| norm 0.2459 (+0.80z)| lr 7.80e-06 | 8433.72 ms | -100.0% bf16 MFU | 62166 tok/s +step 18190/19560 | loss 3.255111 (-1.18z)| norm 0.2286 (-0.54z)| lr 7.79e-06 | 8434.31 ms | -100.0% bf16 MFU | 62165 tok/s +step 18191/19560 | loss 3.307185 (+0.06z)| norm 0.2342 (-0.09z)| lr 7.78e-06 | 8434.92 ms | -100.0% bf16 MFU | 62165 tok/s +step 18192/19560 | loss 3.254842 (-1.17z)| norm 0.2325 (-0.24z)| lr 7.77e-06 | 8436.17 ms | -100.0% bf16 MFU | 62164 tok/s +step 18193/19560 | loss 3.250357 (-1.27z)| norm 0.2144 (-1.63z)| lr 7.76e-06 | 8435.07 ms | -100.0% bf16 MFU | 62164 tok/s +step 18194/19560 | loss 3.273454 (-0.71z)| norm 0.2401 (+0.40z)| lr 7.74e-06 | 8434.84 ms | -100.0% bf16 MFU | 62163 tok/s +step 18195/19560 | loss 3.301628 (-0.05z)| norm 0.2340 (-0.05z)| lr 7.73e-06 | 8436.09 ms | -100.0% bf16 MFU | 62163 tok/s +step 18196/19560 | loss 3.356973 (+1.27z)| norm 0.2286 (-0.53z)| lr 7.72e-06 | 8434.99 ms | -100.0% bf16 MFU | 62162 tok/s +step 18197/19560 | loss 3.330486 (+0.62z)| norm 0.2466 (+1.10z)| lr 7.71e-06 | 8436.00 ms | -100.0% bf16 MFU | 62162 tok/s +step 18198/19560 | loss 3.342801 (+0.93z)| norm 0.2270 (-0.66z)| lr 7.70e-06 | 8435.33 ms | -100.0% bf16 MFU | 62161 tok/s +step 18199/19560 | loss 3.296021 (-0.20z)| norm 0.2352 (+0.10z)| lr 7.69e-06 | 8437.07 ms | -100.0% bf16 MFU | 62160 tok/s +step 18200/19560 | loss 3.353976 (+1.19z)| norm 0.2358 (+0.16z)| lr 7.68e-06 | 8437.56 ms | -100.0% bf16 MFU | 62159 tok/s +step 18201/19560 | loss 3.322514 (+0.43z)| norm 0.2297 (-0.39z)| lr 7.67e-06 | 8439.23 ms | -100.0% bf16 MFU | 62157 tok/s +step 18202/19560 | loss 3.279418 (-0.62z)| norm 0.2257 (-0.76z)| lr 7.65e-06 | 8434.20 ms | -100.0% bf16 MFU | 62158 tok/s +step 18203/19560 | loss 3.340902 (+0.87z)| norm 0.2323 (-0.14z)| lr 7.64e-06 | 8437.82 ms | -100.0% bf16 MFU | 62157 tok/s +step 18204/19560 | loss 3.327577 (+0.55z)| norm 0.2348 (+0.09z)| lr 7.63e-06 | 8435.99 ms | -100.0% bf16 MFU | 62156 tok/s +step 18205/19560 | loss 3.334862 (+0.72z)| norm 0.2414 (+0.70z)| lr 7.62e-06 | 8436.47 ms | -100.0% bf16 MFU | 62156 tok/s +step 18206/19560 | loss 3.277863 (-0.66z)| norm 0.2308 (-0.29z)| lr 7.61e-06 | 8438.42 ms | -100.0% bf16 MFU | 62154 tok/s +step 18207/19560 | loss 3.244360 (-1.45z)| norm 0.2231 (-1.02z)| lr 7.60e-06 | 8434.34 ms | -100.0% bf16 MFU | 62155 tok/s +step 18208/19560 | loss 3.309742 (+0.12z)| norm 0.2497 (+1.46z)| lr 7.59e-06 | 8435.20 ms | -100.0% bf16 MFU | 62155 tok/s +step 18209/19560 | loss 3.271092 (-0.80z)| norm 0.2446 (+0.97z)| lr 7.58e-06 | 8438.80 ms | -100.0% bf16 MFU | 62153 tok/s +step 18210/19560 | loss 3.394649 (+2.11z)| norm 0.2440 (+0.90z)| lr 7.56e-06 | 8436.98 ms | -100.0% bf16 MFU | 62153 tok/s +step 18211/19560 | loss 3.280366 (-0.58z)| norm 0.2364 (+0.19z)| lr 7.55e-06 | 8437.05 ms | -100.0% bf16 MFU | 62152 tok/s +step 18212/19560 | loss 3.269764 (-0.82z)| norm 0.2330 (-0.14z)| lr 7.54e-06 | 8437.69 ms | -100.0% bf16 MFU | 62151 tok/s +step 18213/19560 | loss 3.255193 (-1.15z)| norm 0.2375 (+0.28z)| lr 7.53e-06 | 8439.27 ms | -100.0% bf16 MFU | 62150 tok/s +step 18214/19560 | loss 3.277116 (-0.63z)| norm 0.2278 (-0.62z)| lr 7.52e-06 | 8439.46 ms | -100.0% bf16 MFU | 62149 tok/s +step 18215/19560 | loss 3.372158 (+1.58z)| norm 0.2282 (-0.59z)| lr 7.51e-06 | 8439.14 ms | -100.0% bf16 MFU | 62148 tok/s +step 18216/19560 | loss 3.271193 (-0.77z)| norm 0.2295 (-0.47z)| lr 7.50e-06 | 8438.09 ms | -100.0% bf16 MFU | 62147 tok/s +step 18217/19560 | loss 3.309447 (+0.12z)| norm 0.2301 (-0.41z)| lr 7.49e-06 | 8436.87 ms | -100.0% bf16 MFU | 62147 tok/s +step 18218/19560 | loss 3.322783 (+0.42z)| norm 0.2230 (-1.06z)| lr 7.48e-06 | 8438.35 ms | -100.0% bf16 MFU | 62146 tok/s +step 18219/19560 | loss 3.306149 (+0.05z)| norm 0.2393 (+0.47z)| lr 7.46e-06 | 8436.88 ms | -100.0% bf16 MFU | 62146 tok/s +step 18220/19560 | loss 3.317266 (+0.31z)| norm 0.2529 (+1.71z)| lr 7.45e-06 | 8433.37 ms | -100.0% bf16 MFU | 62147 tok/s +step 18221/19560 | loss 3.266819 (-0.86z)| norm 0.2349 (+0.10z)| lr 7.44e-06 | 8439.69 ms | -100.0% bf16 MFU | 62146 tok/s +step 18222/19560 | loss 3.207934 (-2.18z)| norm 0.2179 (-1.66z)| lr 7.43e-06 | 8460.59 ms | -100.0% bf16 MFU | 62137 tok/s +step 18223/19560 | loss 3.243751 (-1.34z)| norm 0.2252 (-0.90z)| lr 7.42e-06 | 8459.64 ms | -100.0% bf16 MFU | 62129 tok/s +step 18224/19560 | loss 3.224950 (-1.75z)| norm 0.2341 (+0.02z)| lr 7.41e-06 | 8456.81 ms | -100.0% bf16 MFU | 62122 tok/s +step 18225/19560 | loss 3.252166 (-1.12z)| norm 0.2486 (+1.50z)| lr 7.40e-06 | 8455.34 ms | -100.0% bf16 MFU | 62116 tok/s +step 18226/19560 | loss 3.256185 (-1.04z)| norm 0.2258 (-0.84z)| lr 7.39e-06 | 8454.88 ms | -100.0% bf16 MFU | 62111 tok/s +step 18227/19560 | loss 3.276667 (-0.56z)| norm 0.2264 (-0.77z)| lr 7.38e-06 | 8457.51 ms | -100.0% bf16 MFU | 62105 tok/s +step 18228/19560 | loss 3.249213 (-1.18z)| norm 0.2416 (+0.76z)| lr 7.37e-06 | 8454.46 ms | -100.0% bf16 MFU | 62100 tok/s +step 18229/19560 | loss 3.268125 (-0.76z)| norm 0.2367 (+0.26z)| lr 7.35e-06 | 8451.06 ms | -100.0% bf16 MFU | 62097 tok/s +step 18230/19560 | loss 3.284672 (-0.38z)| norm 0.2302 (-0.41z)| lr 7.34e-06 | 8451.83 ms | -100.0% bf16 MFU | 62094 tok/s +step 18231/19560 | loss 3.317281 (+0.36z)| norm 0.2365 (+0.24z)| lr 7.33e-06 | 8452.53 ms | -100.0% bf16 MFU | 62091 tok/s +step 18232/19560 | loss 3.282346 (-0.42z)| norm 0.2342 (-0.00z)| lr 7.32e-06 | 8454.15 ms | -100.0% bf16 MFU | 62087 tok/s +step 18233/19560 | loss 3.303745 (+0.07z)| norm 0.2356 (+0.14z)| lr 7.31e-06 | 8452.26 ms | -100.0% bf16 MFU | 62084 tok/s +step 18234/19560 | loss 3.263918 (-0.83z)| norm 0.2328 (-0.16z)| lr 7.30e-06 | 8453.06 ms | -100.0% bf16 MFU | 62081 tok/s +step 18235/19560 | loss 3.298939 (+0.01z)| norm 0.2217 (-1.30z)| lr 7.29e-06 | 8450.72 ms | -100.0% bf16 MFU | 62079 tok/s +step 18236/19560 | loss 3.307079 (+0.21z)| norm 0.2326 (-0.17z)| lr 7.28e-06 | 8450.60 ms | -100.0% bf16 MFU | 62077 tok/s +step 18237/19560 | loss 3.269485 (-0.73z)| norm 0.2347 (+0.07z)| lr 7.27e-06 | 8450.60 ms | -100.0% bf16 MFU | 62075 tok/s +step 18238/19560 | loss 3.283474 (-0.37z)| norm 0.2298 (-0.45z)| lr 7.26e-06 | 8448.85 ms | -100.0% bf16 MFU | 62074 tok/s +step 18239/19560 | loss 3.247791 (-1.30z)| norm 0.2301 (-0.41z)| lr 7.24e-06 | 8450.70 ms | -100.0% bf16 MFU | 62073 tok/s +step 18240/19560 | loss 3.260404 (-0.96z)| norm 0.2367 (+0.28z)| lr 7.23e-06 | 8455.45 ms | -100.0% bf16 MFU | 62069 tok/s +step 18241/19560 | loss 3.301055 (+0.11z)| norm 0.2295 (-0.47z)| lr 7.22e-06 | 8459.64 ms | -100.0% bf16 MFU | 62065 tok/s +step 18242/19560 | loss 3.349082 (+1.35z)| norm 0.2797 (+4.60z)| lr 7.21e-06 | 8447.83 ms | -100.0% bf16 MFU | 62064 tok/s +step 18243/19560 | loss 3.333591 (+0.93z)| norm 0.2338 (-0.04z)| lr 7.20e-06 | 8459.77 ms | -100.0% bf16 MFU | 62060 tok/s +step 18244/19560 | loss 3.245876 (-1.35z)| norm 0.2207 (-1.34z)| lr 7.19e-06 | 8457.51 ms | -100.0% bf16 MFU | 62056 tok/s +step 18245/19560 | loss 3.295290 (-0.07z)| norm 0.2203 (-1.36z)| lr 7.18e-06 | 8451.72 ms | -100.0% bf16 MFU | 62055 tok/s +step 18246/19560 | loss 3.306169 (+0.24z)| norm 0.2373 (+0.34z)| lr 7.17e-06 | 8453.50 ms | -100.0% bf16 MFU | 62054 tok/s +step 18247/19560 | loss 3.273299 (-0.64z)| norm 0.2315 (-0.23z)| lr 7.16e-06 | 8448.53 ms | -100.0% bf16 MFU | 62054 tok/s +step 18248/19560 | loss 3.246267 (-1.36z)| norm 0.2293 (-0.46z)| lr 7.15e-06 | 8457.25 ms | -100.0% bf16 MFU | 62051 tok/s +step 18249/19560 | loss 3.298955 (+0.08z)| norm 0.2501 (+1.63z)| lr 7.14e-06 | 8450.43 ms | -100.0% bf16 MFU | 62050 tok/s +step 18250/19560 | loss 3.328004 (+0.87z)| norm 0.2379 (+0.40z)| lr 7.13e-06 | 8447.99 ms | -100.0% bf16 MFU | 62051 tok/s +val loss 3.265727 +evaluating HellaSwag: 0/1256 evaluating HellaSwag: 10/1256 evaluating HellaSwag: 20/1256 evaluating HellaSwag: 30/1256 evaluating HellaSwag: 40/1256 evaluating HellaSwag: 50/1256 evaluating HellaSwag: 60/1256 evaluating HellaSwag: 70/1256 evaluating HellaSwag: 80/1256 evaluating HellaSwag: 90/1256 evaluating HellaSwag: 100/1256 evaluating HellaSwag: 110/1256 evaluating HellaSwag: 120/1256 evaluating HellaSwag: 130/1256 evaluating HellaSwag: 140/1256 evaluating HellaSwag: 150/1256 evaluating HellaSwag: 160/1256 evaluating HellaSwag: 170/1256 evaluating HellaSwag: 180/1256 evaluating HellaSwag: 190/1256 evaluating HellaSwag: 200/1256 evaluating HellaSwag: 210/1256 evaluating HellaSwag: 220/1256 evaluating HellaSwag: 230/1256 evaluating HellaSwag: 240/1256 evaluating HellaSwag: 250/1256 evaluating HellaSwag: 260/1256 evaluating HellaSwag: 270/1256 evaluating HellaSwag: 280/1256 evaluating HellaSwag: 290/1256 evaluating HellaSwag: 300/1256 evaluating HellaSwag: 310/1256 evaluating HellaSwag: 320/1256 evaluating HellaSwag: 330/1256 evaluating HellaSwag: 340/1256 evaluating HellaSwag: 350/1256 evaluating HellaSwag: 360/1256 evaluating HellaSwag: 370/1256 evaluating HellaSwag: 380/1256 evaluating HellaSwag: 390/1256 evaluating HellaSwag: 400/1256 evaluating HellaSwag: 410/1256 evaluating HellaSwag: 420/1256 evaluating HellaSwag: 430/1256 evaluating HellaSwag: 440/1256 evaluating HellaSwag: 450/1256 evaluating HellaSwag: 460/1256 evaluating HellaSwag: 470/1256 evaluating HellaSwag: 480/1256 evaluating HellaSwag: 490/1256 evaluating HellaSwag: 500/1256 evaluating HellaSwag: 510/1256 evaluating HellaSwag: 520/1256 evaluating HellaSwag: 530/1256 evaluating HellaSwag: 540/1256 evaluating HellaSwag: 550/1256 evaluating HellaSwag: 560/1256 evaluating HellaSwag: 570/1256 evaluating HellaSwag: 580/1256 evaluating HellaSwag: 590/1256 evaluating HellaSwag: 600/1256 evaluating HellaSwag: 610/1256 evaluating HellaSwag: 620/1256 evaluating HellaSwag: 630/1256 evaluating HellaSwag: 640/1256 evaluating HellaSwag: 650/1256 evaluating HellaSwag: 660/1256 evaluating HellaSwag: 670/1256 evaluating HellaSwag: 680/1256 evaluating HellaSwag: 690/1256 evaluating HellaSwag: 700/1256 evaluating HellaSwag: 710/1256 evaluating HellaSwag: 720/1256 evaluating HellaSwag: 730/1256 evaluating HellaSwag: 740/1256 evaluating HellaSwag: 750/1256 evaluating HellaSwag: 760/1256 evaluating HellaSwag: 770/1256 evaluating HellaSwag: 780/1256 evaluating HellaSwag: 790/1256 evaluating HellaSwag: 800/1256 evaluating HellaSwag: 810/1256 evaluating HellaSwag: 820/1256 evaluating HellaSwag: 830/1256 evaluating HellaSwag: 840/1256 evaluating HellaSwag: 850/1256 evaluating HellaSwag: 860/1256 evaluating HellaSwag: 870/1256 evaluating HellaSwag: 880/1256 evaluating HellaSwag: 890/1256 evaluating HellaSwag: 900/1256 evaluating HellaSwag: 910/1256 evaluating HellaSwag: 920/1256 evaluating HellaSwag: 930/1256 evaluating HellaSwag: 940/1256 evaluating HellaSwag: 950/1256 evaluating HellaSwag: 960/1256 evaluating HellaSwag: 970/1256 evaluating HellaSwag: 980/1256 evaluating HellaSwag: 990/1256 evaluating HellaSwag: 1000/1256 evaluating HellaSwag: 1010/1256 evaluating HellaSwag: 1020/1256 evaluating HellaSwag: 1030/1256 evaluating HellaSwag: 1040/1256 evaluating HellaSwag: 1050/1256 evaluating HellaSwag: 1060/1256 evaluating HellaSwag: 1070/1256 evaluating HellaSwag: 1080/1256 evaluating HellaSwag: 1090/1256 evaluating HellaSwag: 1100/1256 evaluating HellaSwag: 1110/1256 evaluating HellaSwag: 1120/1256 evaluating HellaSwag: 1130/1256 evaluating HellaSwag: 1140/1256 evaluating HellaSwag: 1150/1256 evaluating HellaSwag: 1160/1256 evaluating HellaSwag: 1170/1256 evaluating HellaSwag: 1180/1256 evaluating HellaSwag: 1190/1256 evaluating HellaSwag: 1200/1256 evaluating HellaSwag: 1210/1256 evaluating HellaSwag: 1220/1256 evaluating HellaSwag: 1230/1256 evaluating HellaSwag: 1240/1256 evaluating HellaSwag: 1250/1256 HellaSwag: 2998/10042 = 0.298546 +step 18251/19560 | loss 3.267116 (-0.79z)| norm 0.2410 (+0.72z)| lr 7.11e-06 | 8451.11 ms | -100.0% bf16 MFU | 62050 tok/s +step 18252/19560 | loss 3.292046 (-0.11z)| norm 0.2329 (-0.08z)| lr 7.10e-06 | 8454.14 ms | -100.0% bf16 MFU | 62048 tok/s +step 18253/19560 | loss 3.367064 (+1.90z)| norm 0.2309 (-0.30z)| lr 7.09e-06 | 8452.88 ms | -100.0% bf16 MFU | 62047 tok/s +step 18254/19560 | loss 3.220437 (-1.99z)| norm 0.2331 (-0.07z)| lr 7.08e-06 | 8453.46 ms | -100.0% bf16 MFU | 62046 tok/s +step 18255/19560 | loss 3.264569 (-0.81z)| norm 0.2407 (+0.68z)| lr 7.07e-06 | 8448.41 ms | -100.0% bf16 MFU | 62046 tok/s +step 18256/19560 | loss 3.251300 (-1.15z)| norm 0.2246 (-0.92z)| lr 7.06e-06 | 8448.61 ms | -100.0% bf16 MFU | 62047 tok/s +step 18257/19560 | loss 3.302444 (+0.20z)| norm 0.2266 (-0.72z)| lr 7.05e-06 | 8451.14 ms | -100.0% bf16 MFU | 62046 tok/s +step 18258/19560 | loss 3.295836 (+0.02z)| norm 0.2316 (-0.21z)| lr 7.04e-06 | 8456.96 ms | -100.0% bf16 MFU | 62044 tok/s +step 18259/19560 | loss 3.392211 (+2.49z)| norm 0.2404 (+0.66z)| lr 7.03e-06 | 8447.51 ms | -100.0% bf16 MFU | 62045 tok/s +step 18260/19560 | loss 3.307690 (+0.30z)| norm 0.2327 (-0.12z)| lr 7.02e-06 | 8446.03 ms | -100.0% bf16 MFU | 62046 tok/s +step 18261/19560 | loss 3.333405 (+0.95z)| norm 0.2368 (+0.29z)| lr 7.01e-06 | 8453.32 ms | -100.0% bf16 MFU | 62045 tok/s +step 18262/19560 | loss 3.293978 (-0.07z)| norm 0.2315 (-0.25z)| lr 7.00e-06 | 8448.22 ms | -100.0% bf16 MFU | 62046 tok/s +step 18263/19560 | loss 3.244709 (-1.32z)| norm 0.2245 (-0.95z)| lr 6.98e-06 | 8452.97 ms | -100.0% bf16 MFU | 62045 tok/s +step 18264/19560 | loss 3.262550 (-0.85z)| norm 0.2396 (+0.57z)| lr 6.97e-06 | 8448.54 ms | -100.0% bf16 MFU | 62045 tok/s +step 18265/19560 | loss 3.283129 (-0.30z)| norm 0.2299 (-0.41z)| lr 6.96e-06 | 8453.07 ms | -100.0% bf16 MFU | 62044 tok/s +step 18266/19560 | loss 3.276659 (-0.48z)| norm 0.2237 (-1.09z)| lr 6.95e-06 | 8447.61 ms | -100.0% bf16 MFU | 62045 tok/s +step 18267/19560 | loss 3.269740 (-0.65z)| norm 0.2356 (+0.22z)| lr 6.94e-06 | 8452.50 ms | -100.0% bf16 MFU | 62044 tok/s +step 18268/19560 | loss 3.283213 (-0.30z)| norm 0.2305 (-0.33z)| lr 6.93e-06 | 8447.15 ms | -100.0% bf16 MFU | 62045 tok/s +step 18269/19560 | loss 3.304616 (+0.25z)| norm 0.2436 (+1.25z)| lr 6.92e-06 | 8446.28 ms | -100.0% bf16 MFU | 62047 tok/s +step 18270/19560 | loss 3.331124 (+0.94z)| norm 0.2423 (+1.09z)| lr 6.91e-06 | 8446.82 ms | -100.0% bf16 MFU | 62048 tok/s +step 18271/19560 | loss 3.263286 (-0.82z)| norm 0.2306 (-0.34z)| lr 6.90e-06 | 8450.07 ms | -100.0% bf16 MFU | 62048 tok/s +step 18272/19560 | loss 3.321736 (+0.70z)| norm 0.2198 (-1.62z)| lr 6.89e-06 | 8450.39 ms | -100.0% bf16 MFU | 62048 tok/s +step 18273/19560 | loss 3.374946 (+2.04z)| norm 0.2646 (+3.55z)| lr 6.88e-06 | 8450.27 ms | -100.0% bf16 MFU | 62047 tok/s +step 18274/19560 | loss 3.308494 (+0.32z)| norm 0.2311 (-0.28z)| lr 6.87e-06 | 8441.73 ms | -100.0% bf16 MFU | 62050 tok/s +step 18275/19560 | loss 3.348348 (+1.33z)| norm 0.2424 (+1.00z)| lr 6.86e-06 | 8433.82 ms | -100.0% bf16 MFU | 62056 tok/s +step 18276/19560 | loss 3.254609 (-1.08z)| norm 0.2279 (-0.65z)| lr 6.85e-06 | 8437.83 ms | -100.0% bf16 MFU | 62060 tok/s +step 18277/19560 | loss 3.269273 (-0.68z)| norm 0.2249 (-0.97z)| lr 6.84e-06 | 8439.09 ms | -100.0% bf16 MFU | 62063 tok/s +step 18278/19560 | loss 3.307574 (+0.32z)| norm 0.2249 (-0.97z)| lr 6.83e-06 | 8439.36 ms | -100.0% bf16 MFU | 62066 tok/s +step 18279/19560 | loss 3.334186 (+1.01z)| norm 0.2256 (-0.90z)| lr 6.81e-06 | 8432.08 ms | -100.0% bf16 MFU | 62072 tok/s +step 18280/19560 | loss 3.262258 (-0.86z)| norm 0.2388 (+0.59z)| lr 6.80e-06 | 8439.54 ms | -100.0% bf16 MFU | 62075 tok/s +step 18281/19560 | loss 3.301749 (+0.18z)| norm 0.2273 (-0.71z)| lr 6.79e-06 | 8429.70 ms | -100.0% bf16 MFU | 62081 tok/s +step 18282/19560 | loss 3.256893 (-0.99z)| norm 0.2798 (+4.75z)| lr 6.78e-06 | 8440.14 ms | -100.0% bf16 MFU | 62082 tok/s +step 18283/19560 | loss 3.271736 (-0.60z)| norm 0.2338 (-0.01z)| lr 6.77e-06 | 8438.55 ms | -100.0% bf16 MFU | 62085 tok/s +step 18284/19560 | loss 3.290354 (-0.11z)| norm 0.2240 (-1.01z)| lr 6.76e-06 | 8433.51 ms | -100.0% bf16 MFU | 62089 tok/s +step 18285/19560 | loss 3.278152 (-0.43z)| norm 0.2244 (-0.96z)| lr 6.75e-06 | 8439.37 ms | -100.0% bf16 MFU | 62091 tok/s +step 18286/19560 | loss 3.317836 (+0.60z)| norm 0.2366 (+0.30z)| lr 6.74e-06 | 8439.50 ms | -100.0% bf16 MFU | 62092 tok/s +step 18287/19560 | loss 3.236664 (-1.51z)| norm 0.2257 (-0.82z)| lr 6.73e-06 | 8438.17 ms | -100.0% bf16 MFU | 62094 tok/s +step 18288/19560 | loss 3.324486 (+0.78z)| norm 0.2278 (-0.60z)| lr 6.72e-06 | 8443.87 ms | -100.0% bf16 MFU | 62094 tok/s +step 18289/19560 | loss 3.299315 (+0.13z)| norm 0.2314 (-0.22z)| lr 6.71e-06 | 8438.64 ms | -100.0% bf16 MFU | 62096 tok/s +step 18290/19560 | loss 3.270717 (-0.62z)| norm 0.2271 (-0.67z)| lr 6.70e-06 | 8438.08 ms | -100.0% bf16 MFU | 62098 tok/s +step 18291/19560 | loss 3.276420 (-0.47z)| norm 0.2224 (-1.14z)| lr 6.69e-06 | 8443.57 ms | -100.0% bf16 MFU | 62098 tok/s +step 18292/19560 | loss 3.259635 (-0.90z)| norm 0.2216 (-1.21z)| lr 6.68e-06 | 8443.91 ms | -100.0% bf16 MFU | 62097 tok/s +step 18293/19560 | loss 3.196734 (-2.48z)| norm 0.2296 (-0.38z)| lr 6.67e-06 | 8441.89 ms | -100.0% bf16 MFU | 62098 tok/s +step 18294/19560 | loss 3.351153 (+1.48z)| norm 0.2290 (-0.43z)| lr 6.66e-06 | 8437.71 ms | -100.0% bf16 MFU | 62100 tok/s +step 18295/19560 | loss 3.330404 (+0.96z)| norm 0.2248 (-0.85z)| lr 6.65e-06 | 8445.82 ms | -100.0% bf16 MFU | 62098 tok/s +step 18296/19560 | loss 3.294973 (+0.06z)| norm 0.2253 (-0.80z)| lr 6.64e-06 | 8443.56 ms | -100.0% bf16 MFU | 62098 tok/s +step 18297/19560 | loss 3.291240 (-0.05z)| norm 0.2205 (-1.27z)| lr 6.63e-06 | 8442.54 ms | -100.0% bf16 MFU | 62098 tok/s +step 18298/19560 | loss 3.359740 (+1.76z)| norm 0.2368 (+0.41z)| lr 6.61e-06 | 8443.78 ms | -100.0% bf16 MFU | 62098 tok/s +step 18299/19560 | loss 3.272737 (-0.53z)| norm 0.2311 (-0.18z)| lr 6.60e-06 | 8440.38 ms | -100.0% bf16 MFU | 62099 tok/s +step 18300/19560 | loss 3.283959 (-0.23z)| norm 0.2259 (-0.71z)| lr 6.59e-06 | 8442.09 ms | -100.0% bf16 MFU | 62099 tok/s +step 18301/19560 | loss 3.276724 (-0.41z)| norm 0.2244 (-0.87z)| lr 6.58e-06 | 8448.51 ms | -100.0% bf16 MFU | 62097 tok/s +step 18302/19560 | loss 3.259624 (-0.87z)| norm 0.2280 (-0.49z)| lr 6.57e-06 | 8439.47 ms | -100.0% bf16 MFU | 62098 tok/s +step 18303/19560 | loss 3.293543 (+0.04z)| norm 0.2304 (-0.23z)| lr 6.56e-06 | 8447.45 ms | -100.0% bf16 MFU | 62097 tok/s +step 18304/19560 | loss 3.289752 (-0.07z)| norm 0.2216 (-1.14z)| lr 6.55e-06 | 8441.91 ms | -100.0% bf16 MFU | 62097 tok/s +step 18305/19560 | loss 3.218597 (-1.92z)| norm 0.2281 (-0.47z)| lr 6.54e-06 | 8446.31 ms | -100.0% bf16 MFU | 62096 tok/s +step 18306/19560 | loss 3.333021 (+1.09z)| norm 0.2207 (-1.22z)| lr 6.53e-06 | 8440.37 ms | -100.0% bf16 MFU | 62097 tok/s +step 18307/19560 | loss 3.299214 (+0.19z)| norm 0.2190 (-1.38z)| lr 6.52e-06 | 8441.95 ms | -100.0% bf16 MFU | 62097 tok/s +step 18308/19560 | loss 3.289761 (-0.05z)| norm 0.2361 (+0.36z)| lr 6.51e-06 | 8444.37 ms | -100.0% bf16 MFU | 62097 tok/s +step 18309/19560 | loss 3.364586 (+1.90z)| norm 0.2878 (+5.03z)| lr 6.50e-06 | 8444.54 ms | -100.0% bf16 MFU | 62096 tok/s +step 18310/19560 | loss 3.272902 (-0.52z)| norm 0.2234 (-0.87z)| lr 6.49e-06 | 8441.10 ms | -100.0% bf16 MFU | 62097 tok/s +step 18311/19560 | loss 3.270427 (-0.57z)| norm 0.2375 (+0.41z)| lr 6.48e-06 | 8441.66 ms | -100.0% bf16 MFU | 62098 tok/s +step 18312/19560 | loss 3.307213 (+0.39z)| norm 0.2274 (-0.51z)| lr 6.47e-06 | 8444.60 ms | -100.0% bf16 MFU | 62097 tok/s +step 18313/19560 | loss 3.295623 (+0.09z)| norm 0.2352 (+0.20z)| lr 6.46e-06 | 8446.29 ms | -100.0% bf16 MFU | 62096 tok/s +step 18314/19560 | loss 3.243037 (-1.29z)| norm 0.2301 (-0.26z)| lr 6.45e-06 | 8442.10 ms | -100.0% bf16 MFU | 62096 tok/s +step 18315/19560 | loss 3.243458 (-1.26z)| norm 0.2398 (+0.61z)| lr 6.44e-06 | 8444.45 ms | -100.0% bf16 MFU | 62096 tok/s +step 18316/19560 | loss 3.353802 (+1.62z)| norm 0.2269 (-0.57z)| lr 6.43e-06 | 8441.90 ms | -100.0% bf16 MFU | 62096 tok/s +step 18317/19560 | loss 3.287865 (-0.09z)| norm 0.2500 (+1.54z)| lr 6.42e-06 | 8443.20 ms | -100.0% bf16 MFU | 62096 tok/s +step 18318/19560 | loss 3.295243 (+0.09z)| norm 0.2369 (+0.34z)| lr 6.41e-06 | 8437.68 ms | -100.0% bf16 MFU | 62098 tok/s +step 18319/19560 | loss 3.252310 (-1.03z)| norm 0.2400 (+0.61z)| lr 6.40e-06 | 8439.79 ms | -100.0% bf16 MFU | 62099 tok/s +step 18320/19560 | loss 3.247192 (-1.16z)| norm 0.2358 (+0.23z)| lr 6.39e-06 | 8442.62 ms | -100.0% bf16 MFU | 62099 tok/s +step 18321/19560 | loss 3.215939 (-1.95z)| norm 0.2412 (+0.71z)| lr 6.38e-06 | 8437.54 ms | -100.0% bf16 MFU | 62101 tok/s +step 18322/19560 | loss 3.272887 (-0.47z)| norm 0.2316 (-0.17z)| lr 6.37e-06 | 8442.41 ms | -100.0% bf16 MFU | 62101 tok/s +step 18323/19560 | loss 3.286092 (-0.12z)| norm 0.2317 (-0.16z)| lr 6.36e-06 | 8443.59 ms | -100.0% bf16 MFU | 62101 tok/s +step 18324/19560 | loss 3.249355 (-1.06z)| norm 0.2285 (-0.45z)| lr 6.35e-06 | 8445.47 ms | -100.0% bf16 MFU | 62100 tok/s +step 18325/19560 | loss 3.306010 (+0.43z)| norm 0.2311 (-0.21z)| lr 6.34e-06 | 8442.12 ms | -100.0% bf16 MFU | 62100 tok/s +step 18326/19560 | loss 3.265964 (-0.62z)| norm 0.2315 (-0.17z)| lr 6.33e-06 | 8441.67 ms | -100.0% bf16 MFU | 62100 tok/s +step 18327/19560 | loss 3.315642 (+0.69z)| norm 0.2414 (+0.74z)| lr 6.32e-06 | 8443.08 ms | -100.0% bf16 MFU | 62100 tok/s +step 18328/19560 | loss 3.262416 (-0.70z)| norm 0.2398 (+0.59z)| lr 6.31e-06 | 8445.95 ms | -100.0% bf16 MFU | 62099 tok/s +step 18329/19560 | loss 3.292556 (+0.11z)| norm 0.2350 (+0.14z)| lr 6.30e-06 | 8443.72 ms | -100.0% bf16 MFU | 62099 tok/s +step 18330/19560 | loss 3.280252 (-0.22z)| norm 0.6415 (+10.79z)| lr 6.28e-06 | 8441.14 ms | -100.0% bf16 MFU | 62099 tok/s +step 18331/19560 | loss 3.257223 (-0.82z)| norm 0.2340 (-0.07z)| lr 6.27e-06 | 8441.44 ms | -100.0% bf16 MFU | 62100 tok/s +step 18332/19560 | loss 3.235428 (-1.38z)| norm 0.2308 (-0.16z)| lr 6.2 \ No newline at end of file